awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/types.py ADDED
@@ -0,0 +1,57 @@
1
+ """
2
+ importing ai.chronon.types will bring in all the api's needed to create any chronon object
3
+ """
4
+
5
+ import ai.chronon.api.common.ttypes as common
6
+ import ai.chronon.api.ttypes as ttypes
7
+ import ai.chronon.group_by as group_by
8
+ import ai.chronon.join as join
9
+ import ai.chronon.query as query
10
+ import ai.chronon.source as source
11
+
12
+ # source related concepts
13
+ Query = query.Query
14
+ selects = query.selects
15
+
16
+ Source = ttypes.Source
17
+ EventSource = source.EventSource
18
+ EntitySource = source.EntitySource
19
+ JoinSource = source.JoinSource
20
+
21
+ # Aggregation / GroupBy related concepts
22
+ GroupBy = group_by.GroupBy
23
+ Aggregation = group_by.Aggregation
24
+ Operation = group_by.Operation
25
+ Window = group_by.Window
26
+ TimeUnit = group_by.TimeUnit
27
+ DefaultAggregation = group_by.DefaultAggregation
28
+
29
+ Accuracy = ttypes.Accuracy
30
+ TEMPORAL = ttypes.Accuracy.TEMPORAL
31
+ SNAPSHOT = ttypes.Accuracy.SNAPSHOT
32
+
33
+ Derivation = group_by.Derivation
34
+
35
+ # join related concepts
36
+ Join = join.Join
37
+ JoinPart = join.JoinPart
38
+ BootstrapPart = join.BootstrapPart
39
+ LabelParts = join.LabelParts
40
+ ContextualSource = join.ContextualSource
41
+ ExternalPart = join.ExternalPart
42
+ ExternalSource = join.ExternalSource
43
+ DataType = join.DataType
44
+
45
+
46
+ # Staging Query related concepts
47
+ StagingQuery = ttypes.StagingQuery
48
+ MetaData = ttypes.MetaData
49
+
50
+
51
+ EnvironmentVariables = common.EnvironmentVariables
52
+ ConfigProperties = common.ConfigProperties
53
+ ClusterConfigProperties = common.ClusterConfigProperties
54
+ ExecutionInfo = common.ExecutionInfo
55
+ TableDependency = common.TableDependency
56
+
57
+ Team = ttypes.Team
ai/chronon/utils.py ADDED
@@ -0,0 +1,557 @@
1
+ # Copyright (C) 2023 The Chronon Authors.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import gc
16
+ import importlib
17
+ import json
18
+ import os
19
+ import re
20
+ import shutil
21
+ import subprocess
22
+ import tempfile
23
+ from collections.abc import Iterable
24
+ from typing import List, Optional, Union, cast
25
+
26
+ import ai.chronon.api.ttypes as api
27
+ import ai.chronon.repo.extract_objects as eo
28
+ from ai.chronon.cli.compile import parse_teams
29
+ from ai.chronon.repo import FOLDER_NAME_TO_CLASS
30
+
31
# Union of the top-level config objects that compile into runnable jobs.
ChrononJobTypes = Union[api.GroupBy, api.Join, api.StagingQuery]

# Repo root used to resolve team configs; mutated by compile.py at runtime.
chronon_root_path = ""  # passed from compile.py
34
+
35
+
36
def edit_distance(str1, str2):
    """Levenshtein distance between str1 and str2 (unit-cost insert,
    delete, and substitute), computed with the classic DP table."""
    rows, cols = len(str1) + 1, len(str2) + 1
    table = [[0] * cols for _ in range(rows)]
    for r in range(rows):
        for c in range(cols):
            if r == 0:
                table[r][c] = c  # build str2[:c] from the empty string
            elif c == 0:
                table[r][c] = r  # delete all of str1[:r]
            elif str1[r - 1] == str2[c - 1]:
                table[r][c] = table[r - 1][c - 1]  # characters match: free
            else:
                table[r][c] = 1 + min(
                    table[r][c - 1],      # insert
                    table[r - 1][c],      # delete
                    table[r - 1][c - 1],  # substitute
                )
    return table[rows - 1][cols - 1]
51
+
52
+
53
class JsonDiffer:
    """Diffs two JSON payloads by normalizing them (sorted keys, fixed
    indent) into temp files and shelling out to the system `diff` tool."""

    def __init__(self):
        # Scratch directory holding the two normalized files; the caller
        # is responsible for invoking clean() to remove it.
        self.temp_dir = tempfile.mkdtemp()
        self.new_name = "new.json"
        self.old_name = "old.json"

    def diff(
        self, new_json_str: str, old_json_str: str, skipped_keys=None
    ) -> str:
        """Return the `diff` output between the two JSON strings, ignoring
        any top-level keys listed in skipped_keys (empty string when the
        normalized payloads are identical)."""
        if skipped_keys is None:
            skipped_keys = []
        # Drop skipped top-level keys before normalizing either side.
        new_json = {
            k: v for k, v in json.loads(new_json_str).items() if k not in skipped_keys
        }
        old_json = {
            k: v for k, v in json.loads(old_json_str).items() if k not in skipped_keys
        }

        # sort_keys + fixed indent makes the textual diff insensitive to
        # key ordering and formatting differences in the inputs.
        with open(os.path.join(self.temp_dir, self.old_name), mode="w") as old, open(
            os.path.join(self.temp_dir, self.new_name), mode="w"
        ) as new:
            old.write(json.dumps(old_json, sort_keys=True, indent=2))
            new.write(json.dumps(new_json, sort_keys=True, indent=2))
        # NOTE(review): requires the external `diff` binary on PATH, so
        # this is not portable to Windows — confirm acceptable.
        diff_str = subprocess.run(
            ["diff", old.name, new.name], stdout=subprocess.PIPE
        ).stdout.decode("utf-8")
        return diff_str

    def clean(self):
        """Remove the scratch directory created in __init__."""
        shutil.rmtree(self.temp_dir)
83
+
84
+
85
def check_contains_single(candidate, valid_items, type_name, name, print_function=repr):
    """Assert that `candidate` is one of `valid_items`.

    Args:
        candidate: value to validate.
        valid_items: collection of acceptable values.
        type_name: human-readable kind of item, used in error messages.
        name: optional label of the owning object, for error messages.
        print_function: renders candidates/items for display.

    Raises:
        AssertionError: when `valid_items` is empty or `candidate` is not
            among them. Suggestions are sorted by edit distance to the
            candidate so the closest match appears first.
    """
    name_suffix = f"for {name}" if name else ""
    candidate_str = print_function(candidate)
    if not valid_items:
        # BUG FIX: this used to be `assert f"..."`, which asserts a
        # non-empty string and therefore could never fail — the empty
        # valid_items case silently passed. Raise explicitly instead.
        raise AssertionError(
            f"{candidate_str}, is not a valid {type_name} because no {type_name}s are specified {name_suffix}"
        )
    if candidate not in valid_items:
        sorted_items = sorted(
            map(print_function, valid_items),
            key=lambda item: edit_distance(candidate_str, item),
        )
        printed_items = "\n    ".join(sorted_items)
        raise AssertionError(
            f"""{candidate_str}, is not a valid {type_name} {name_suffix}
Please pick one from:
    {printed_items}
"""
        )
102
+
103
+
104
def check_contains(candidates, *args):
    """Validate one candidate or a collection of candidates.

    Strings count as a single candidate, not an iterable of characters.
    Remaining args are forwarded to check_contains_single unchanged.
    """
    is_collection = isinstance(candidates, Iterable) and not isinstance(candidates, str)
    items = candidates if is_collection else [candidates]
    for item in items:
        check_contains_single(item, *args)
110
+
111
+
112
def get_streaming_sources(group_by: api.GroupBy) -> List[api.Source]:
    """Return the subset of the group-by's sources with streaming enabled."""
    return list(filter(is_streaming, group_by.sources))
115
+
116
+
117
def is_streaming(source: "api.Source") -> bool:
    """Truthy when the source carries a topic — a mutationTopic for an
    entity source, or a topic for an event source."""
    if source.entities and source.entities.mutationTopic is not None:
        return True
    return bool(source.events and source.events.topic is not None)
122
+
123
+
124
def _get_underlying_source(
    source: "api.Source",
) -> "Union[api.EventSource, api.EntitySource, api.JoinSource]":
    """Unwrap the concrete variant held by a Source union wrapper, in the
    same precedence order as before: entities, then events, then joinSource."""
    return source.entities or source.events or source.joinSource
133
+
134
def get_root_source(
    source: "api.Source",
) -> "Union[api.EventSource, api.EntitySource]":
    """Follow joinSource links down to the underlying event/entity source.

    Iterative replacement for the original recursion: keep descending
    through joinSource.join.left until an entity or event source is found.
    """
    current = source
    while not (current.entities or current.events):
        current = current.joinSource.join.left
    return current.entities or current.events
143
+
144
def get_query(source: api.Source) -> api.Query:
    """Query attached to whichever concrete source variant is set."""
    underlying = _get_underlying_source(source)
    return underlying.query
146
+
147
+
148
def get_table(source: "api.Source") -> str:
    """Table behind the source, with any subpartition suffix (text after
    the first '/') stripped. Join sources resolve to the join's output table."""
    if source.entities:
        full = source.entities.snapshotTable
    elif source.events:
        full = source.events.table
    else:
        full = get_join_output_table_name(source.joinSource.join, True)
    return full.split("/", 1)[0]
156
+
157
+
158
def get_topic(source: "api.Source") -> str:
    """Streaming topic of the source: mutationTopic for entity sources,
    topic for event sources."""
    if source.entities:
        return source.entities.mutationTopic
    return source.events.topic
160
+
161
+
162
def get_columns(source: api.Source):
    """Column names selected by the source's query; selects must be set."""
    selects = get_query(source).selects
    assert selects is not None, "Please specify selects in your Source/Query"
    return selects.keys()
167
+
168
+
169
def get_mod_name_from_gc(obj, mod_prefix):
    """get an object's module information from garbage collector"""
    # Strategy: a module-level object is referenced by its module's
    # __dict__. Find that dict among the object's referrers and read its
    # __name__ entry. Inherently fragile — depends on current gc state.
    mod_name = None
    # get obj's module info from garbage collector
    gc.collect()

    referrers = gc.get_referrers(obj)

    # Candidate referrers: dict-like containers that carry a "__name__"
    # key, which module __dict__s do.
    valid_referrers = [
        ref for ref in referrers if (isinstance(ref, Iterable) and "__name__" in ref)
    ]

    # Unambiguous case — a single candidate is used regardless of prefix.
    if len(valid_referrers) == 1:
        return valid_referrers[0]["__name__"]

    # Otherwise prefer the first module whose name starts with mod_prefix
    # (e.g. "group_bys", "joins"); returns None when nothing matches.
    for ref in valid_referrers:
        if ref["__name__"].startswith(mod_prefix):
            mod_name = ref["__name__"]
            break

    return mod_name
190
+
191
+
192
def get_mod_and_var_name_from_gc(obj, mod_prefix):
    """Get the module name and the variable name that point to `obj`.

    Returns:
        None when the defining module cannot be determined (callers must
        handle both this and the tuple shape), otherwise a tuple
        (mod_name, var_name) where var_name is None when no module-level
        variable in that module holds `obj`.
    """
    # Fix: the docstring previously sat *after* the first statement,
    # making it a no-op string literal rather than documentation.
    mod_name = get_mod_name_from_gc(obj, mod_prefix)
    if not mod_name:
        return None

    # Scan the module's globals for a variable bound to this exact object.
    module = importlib.import_module(mod_name)
    for var_name, value in vars(module).items():
        if value is obj:
            return mod_name, var_name

    return mod_name, None
205
+
206
+
207
def __set_name(obj, cls, mod_prefix):
    """Locate obj's defining module via gc, import it, and register the
    object's fully-qualified name through extract_objects."""
    qualifier = get_mod_name_from_gc(obj, mod_prefix)
    eo.import_module_set_name(importlib.import_module(qualifier), cls)
212
+
213
+
214
def sanitize(name):
    """
    From api.Extensions.scala
    Option(name).map(_.replaceAll("[^a-zA-Z0-9_]", "_")).orNull

    Replace every character outside [a-zA-Z0-9_] with '_'; None passes through.
    """
    if name is None:
        return None
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
222
+
223
+
224
def dict_to_bash_commands(d):
    """
    Convert a dict into a bash command substring: each key becomes a
    --flag (underscores turned into dashes), with '=value' appended only
    for truthy values.
    """
    if not d:
        return ""
    flags = []
    for key, value in d.items():
        flag = "--" + key.replace("_", "-")
        flags.append(f"{flag}={value}" if value else flag)
    return " ".join(flags)
239
+
240
+
241
def dict_to_exports(d):
    """Render a dict as chained shell statements 'export KEY=value'
    (keys upper-cased), joined with ' && '. Empty/None dict yields ''."""
    if not d:
        return ""
    return " && ".join(f"export {key.upper()}={value}" for key, value in d.items())
248
+
249
+
250
def output_table_name(obj, full_name: bool):
    """Output table for obj: '<namespace>.<sanitized name>' when full_name
    is True, else just the sanitized name. Falls back to the '{{ db }}'
    template when no outputNamespace is set."""
    table_name = sanitize(obj.metaData.name)
    if not full_name:
        return table_name
    namespace = obj.metaData.outputNamespace or "{{ db }}"
    return namespace + "." + table_name
258
+
259
+
260
def join_part_name(jp):
    """Name of a join part: '<prefix>_<sanitized group-by name>', the
    prefix being optional. Resolves the group-by's name first when unset."""
    if jp.groupBy is None:
        raise NotImplementedError(
            "Join Part names for non group bys is not implemented."
        )
    if not jp.groupBy.metaData.name and isinstance(jp.groupBy, api.GroupBy):
        __set_name(jp.groupBy, api.GroupBy, "group_bys")
    segments = [jp.prefix, sanitize(jp.groupBy.metaData.name)]
    return "_".join(seg for seg in segments if seg is not None)
274
+
275
+
276
def join_part_output_table_name(join, jp, full_name: bool = False):
    """
    From api.Extensions.scala

    Join Part output table name.
    To be synced with Scala API.
    def partOutputTable(jp: JoinPart): String = (Seq(join.metaData.outputTable) ++ Option(jp.prefix) :+
      jp.groupBy.metaData.cleanName).mkString("_")
    """
    # Resolve the join's name from its defining module when not yet set.
    if not join.metaData.name and isinstance(join, api.Join):
        __set_name(join, api.Join, "joins")
    segments = [
        output_table_name(join, full_name),
        join_part_name(jp),
    ]
    return "_".join(seg for seg in segments if seg is not None)
297
+
298
+
299
def group_by_output_table_name(obj, full_name: bool = False):
    """
    Group by backfill output table name
    To be synced with api.Extensions.scala
    """
    # Resolve the group-by's name from its defining module when unset.
    if not obj.metaData.name:
        __set_name(obj, api.GroupBy, "group_bys")
    return output_table_name(obj, full_name)
307
+
308
+
309
def log_table_name(obj, full_name: bool = False):
    """Logged-data table: the output table name with a '_logged' suffix."""
    base = output_table_name(obj, full_name=full_name)
    return base + "_logged"
311
+
312
+
313
def get_staging_query_output_table_name(
    staging_query: api.StagingQuery, full_name: bool = False
):
    """generate output table name for staging query job"""
    # Always resolve the config's fully-qualified name from its defining
    # module (under staging_queries/) before deriving the table name.
    __set_name(staging_query, api.StagingQuery, "staging_queries")
    return output_table_name(staging_query, full_name=full_name)
319
+
320
+
321
def get_team_conf_from_py(team, key):
    """Read attribute `key` from the repo's teams.<team> python module.

    Raises ModuleNotFoundError/AttributeError when the team module or
    the attribute does not exist.
    """
    return getattr(importlib.import_module(f"teams.{team}"), key)
324
+
325
+
326
def get_join_output_table_name(join: api.Join, full_name: bool = False):
    """generate output table name for join backfill job"""
    # join sources could also be created inline alongside groupBy file
    # so we specify fallback module as group_bys
    # NOTE(review): the comment above mentions group_bys but the call
    # passes "joins" — confirm which is intended.
    if isinstance(join, api.Join):
        __set_name(join, api.Join, "joins")
    # set output namespace — default to the owning team's namespace when
    # the join does not declare one. Mutates join.metaData in place.
    if not join.metaData.outputNamespace:
        # team is the leading component of "<team>.<conf name>"
        team_name = join.metaData.name.split(".")[0]
        # NOTE(review): load_teams(...).get(team_name) returns None for an
        # unknown team, which would raise AttributeError on .outputNamespace
        # — confirm upstream validation guarantees the team exists.
        namespace = (
            parse_teams.load_teams(chronon_root_path, print=False)
            .get(team_name)
            .outputNamespace
        )
        join.metaData.outputNamespace = namespace
    return output_table_name(join, full_name=full_name)
342
+
343
+
344
def wait_for_simple_schema(table, lag, start, end):
    """Build a wait-task spec dict for `table`.

    `table` may carry a subpartition suffix after '/', e.g. "ns.tbl/hr=00".
    `lag` shifts the waited-on ds partition back by that many days (0 means
    the current ds). Returns None when table is falsy.
    """
    if not table:
        return None
    clean_name, _, subpartition_spec = table.partition("/")
    lag_suffix = "" if lag == 0 else f"_minus_{lag}"
    ds_expr = "{{ ds }}" if lag == 0 else "{{{{ macros.ds_add(ds, -{}) }}}}".format(lag)
    spec = f"{clean_name}/ds={ds_expr}"
    if subpartition_spec:
        spec += f"/{subpartition_spec}"
    return {
        "name": f"wait_for_{clean_name}_ds{lag_suffix}",
        "spec": spec,
        "start": start,
        "end": end,
    }
362
+
363
+
364
def wait_for_name(dep):
    """Sensor-task name for dependency `dep`: non-alphanumerics become
    underscores, runs of underscores collapse to one, trailing ones drop."""
    slug = re.sub(r"[^a-zA-Z0-9]", "_", dep)
    collapsed = re.sub(r"_+", "_", f"wait_for_{slug}")
    return collapsed.rstrip("_")
368
+
369
+
370
def dedupe_in_order(seq):
    """Return seq's items with duplicates removed, keeping first occurrence.
    Relies on dict preserving insertion order; items must be hashable
    (same requirement as the previous set-based implementation)."""
    return list(dict.fromkeys(seq))
374
+
375
+
376
def has_topic(group_by: "api.GroupBy") -> bool:
    """Find if there's topic or mutationTopic for a source helps define streaming tasks"""
    for source in group_by.sources:
        entities, events = source.entities, source.events
        if entities and entities.mutationTopic:
            return True
        if events and events.topic:
            return True
    return False
383
+
384
+
385
def get_offline_schedule(conf: "ChrononJobTypes") -> Optional[str]:
    """Cron schedule for the conf's offline jobs.

    Defaults to '@daily' when unset; returns None when the conf is
    explicitly scheduled '@never' (meaning: no offline task).
    """
    cron = conf.metaData.executionInfo.scheduleCron or "@daily"
    return None if cron == "@never" else cron
390
+
391
+
392
def requires_log_flattening_task(conf: "ChrononJobTypes") -> bool:
    """True when the conf samples logs (samplePercent set and positive)."""
    sample_percent = conf.metaData.samplePercent or 0
    return sample_percent > 0
394
+
395
+
396
def get_applicable_modes(conf: ChrononJobTypes) -> List[str]:
    """Based on a conf and mode determine if a conf should define a task."""
    modes = []  # type: List[str]

    if isinstance(conf, api.GroupBy):
        group_by = cast(api.GroupBy, conf)

        if group_by.backfillStartDate is not None:
            modes.append("backfill")

        if group_by.metaData.online or False:
            modes.append("upload")

        # NOTE(review): `accuracy or False` treats a zero-valued accuracy
        # enum member the same as unset — confirm that is intended.
        temporal_accuracy = group_by.accuracy or False
        streaming = has_topic(group_by)
        if temporal_accuracy or streaming:
            modes.append("streaming")

    elif isinstance(conf, api.Join):
        join = cast(api.Join, conf)

        if get_offline_schedule(conf) is not None:
            modes.extend(["backfill", "stats-summary"])

        if join.metaData.consistencyCheck is True:
            modes.append("consistency-metrics-compute")

        if requires_log_flattening_task(join):
            modes.append("log-flattener")

        if join.labelParts is not None:
            modes.append("label-join")

    elif isinstance(conf, api.StagingQuery):
        modes.append("backfill")

    else:
        raise ValueError(f"Unsupported job type {type(conf).__name__}")

    return modes
438
+
439
+
440
def get_related_table_names(conf: ChrononJobTypes) -> List[str]:
    """All auxiliary tables this conf's jobs write alongside its output table."""
    table_name = output_table_name(conf, full_name=True)
    modes = set(get_applicable_modes(conf))

    # (mode, table suffixes it produces), in the original emission order.
    mode_suffixes = [
        ("upload", ["_upload"]),
        ("stats-summary", ["_daily_stats"]),
        ("label-join", ["_labels", "_labeled", "_labeled_latest"]),
        ("log-flattener", ["_logged"]),
        ("consistency-metrics-compute", ["_consistency"]),
    ]
    related_tables = [
        table_name + suffix
        for mode, suffixes in mode_suffixes
        if mode in modes
        for suffix in suffixes
    ]

    if isinstance(conf, api.Join) and conf.bootstrapParts:
        related_tables.append(f"{table_name}_bootstrap")

    return related_tables
463
+
464
+
465
class DotDict(dict):
    """A dict whose keys are also readable as attributes.

    Missing attributes resolve to None; nested dict values are wrapped in
    DotDict on access, so chained lookups like d.a.b work.
    """

    def __getattr__(self, attr):
        # __getattr__ only fires when normal attribute lookup fails, so
        # real dict methods/attributes are unaffected.
        try:
            value = self[attr]
        except KeyError:
            return None
        return DotDict(value) if isinstance(value, dict) else value
471
+
472
+
473
def convert_json_to_obj(d):
    """Recursively wrap dicts (and dicts nested inside lists) as DotDicts
    so decoded JSON supports attribute-style access. Scalars pass through."""
    if isinstance(d, dict):
        return DotDict({key: convert_json_to_obj(val) for key, val in d.items()})
    if isinstance(d, list):
        return [convert_json_to_obj(item) for item in d]
    return d
480
+
481
+
482
def chronon_path(file_path: str) -> str:
    """Relative path starting at the first conf-type folder (group_bys,
    joins, ...) that appears in `file_path`.

    Raises:
        AssertionError: when no conf-type folder appears in the path.
    """
    conf_types = FOLDER_NAME_TO_CLASS.keys()
    splits = file_path.split("/")
    conf_occurences = [splits.index(typ) for typ in conf_types if typ in splits]
    assert (
        len(conf_occurences) > 0
    ), f"Path: {file_path} doesn't contain folder with name among {conf_types}"

    # Fix: reuse the indices already computed above instead of re-scanning
    # the path a second time with the same expression.
    index = min(conf_occurences)
    return "/".join(splits[index:])
493
+
494
+
495
def module_path(file_path: str) -> str:
    """Python module path for a conf file, e.g. '.../joins/team/file.py'
    becomes 'joins.team.file'. The path must end in '.py'."""
    rel_path = chronon_path(file_path)
    assert rel_path.endswith(".py"), f"Path: {file_path} doesn't end with '.py'"
    return rel_path[: -len(".py")].replace("/", ".")
501
+
502
+
503
def compose(arg, *methods):
    """
    Allows composing deeply nested method calls - typically used in selects & derivations
    The first arg is what is threaded into methods, methods can have more than one arg.

    Example:

    .. code-block:: python
        compose(
            "user_id_approx_distinct_count_by_query",
            "map_entries",
            "array_sort (x, y) -> IF(y.value > x.value, -1, IF(y.value < x.value, 1, 0))",
            "transform entry -> entry.key"
        )

    would produce (without the new lines or indents):

    .. code-block:: text

        transform(
            array_sort(
                map_entries(
                    user_id_approx_distinct_count_by_query
                ),
                (x, y) -> IF(y.value > x.value, -1, IF(y.value < x.value, 1, 0))
            ),
            entry -> entry.key
        )
    """

    # NOTE(review): one space per method here, but each level below unwinds
    # 4 chars — indentation looks uneven for multiple methods; confirm
    # whether this was meant to be a 4-space unit.
    indent = " " * (len(methods))

    # `result` holds the lines built so far, innermost expression first.
    result = [indent + arg]

    for method in methods:

        # Split "name extra-args" — everything after the first space is
        # passed through as additional argument text for this call level.
        method_parts = method.split(" ", 1)
        method = method_parts[0]

        if len(method_parts) > 1:
            remaining_args = method_parts[1]
            # Append a comma to the current innermost line, then emit the
            # extra args as their own line at this level's indent.
            last = result.pop()
            result = result + [last + ",", indent + remaining_args]

        # Each wrapping level dedents by 4 and brackets the accumulated lines.
        indent = indent[:-4]
        result = [f"{indent}{method}("] + result + [f"{indent})"]

    return "\n".join(result)
551
+
552
+
553
def clean_expression(expr):
    """Collapse every whitespace run (including newlines/tabs) in `expr`
    to a single space and trim leading/trailing whitespace."""
    collapsed = re.sub(r"\s+", " ", expr)
    return collapsed.strip()
ai/chronon/windows.py ADDED
@@ -0,0 +1,50 @@
1
+ import ai.chronon.api.common.ttypes as common
2
+
3
+
4
def _days(length: int) -> common.Window:
    """Window spanning `length` days."""
    unit = common.TimeUnit.DAYS
    return common.Window(length=length, timeUnit=unit)
6
+
7
+
8
def _hours(length: int) -> common.Window:
    """Window spanning `length` hours."""
    unit = common.TimeUnit.HOURS
    return common.Window(length=length, timeUnit=unit)
10
+
11
+
12
def _from_str(s: str) -> common.Window:
    """
    converts strings like "30d", "2h" etc into common.Window

    Args:
        s (str): Duration string in format "<number>(d|h)" where d=days, h=hours

    Returns:
        common.Window: Window object with specified duration

    Raises:
        ValueError: If string format is invalid
    """

    if not s or len(s) < 2:
        raise ValueError(f"Invalid duration format: {s}")

    # Get the numeric value and unit
    value = s[:-1]
    unit = s[-1].lower()

    # Fix: keep the try narrow around the int() parse only. The previous
    # version wrapped all the validation in one try, caught its own
    # ValueErrors, and re-dispatched by string-matching the exception
    # message ("invalid literal for int()") — fragile and convoluted.
    try:
        length = int(value)
    except ValueError as e:
        raise ValueError(f"Invalid numeric value in duration: {value}") from e

    if length <= 0:
        raise ValueError(f"Duration must be positive: {s}")

    if unit == "d":
        return _days(length)
    elif unit == "h":
        return _hours(length)
    else:
        raise ValueError(
            f"Invalid time unit '{unit}'. Must be 'd' for days or 'h' for hours"
        )