awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/__init__.py ADDED
File without changes
ai/chronon/__init__.py ADDED
File without changes
@@ -0,0 +1,251 @@
1
+ import json
2
+ import math
3
+ from typing import OrderedDict
4
+
5
+ import ai.chronon.utils as utils
6
+ from ai.chronon.api.common.ttypes import TimeUnit
7
+ from ai.chronon.api.ttypes import GroupBy, Join
8
+ from ai.chronon.constants import (
9
+ AIRFLOW_DEPENDENCIES_KEY,
10
+ AIRFLOW_LABEL_DEPENDENCIES_KEY,
11
+ PARTITION_COLUMN_KEY,
12
+ )
13
+
14
+
15
def create_airflow_dependency(table, partition_column, additional_partitions=None, offset=0):
    """
    Create an Airflow dependency object for a table.

    Args:
        table: The table name (with namespace)
        partition_column: The partition column to use. Must not be None; it is
            resolved upstream from the `spark.chronon.partition.column` config
            (team default or per-config override).
        additional_partitions: Optional list of extra partition specs, appended
            to the dependency spec separated by '/'.
        offset: Day offset applied via the Airflow `macros.ds_add(ds, offset)`
            macro (0 = same day, negative = earlier days).

    Returns:
        A dictionary with `name` and `spec` keys for the Airflow dependency.
    """
    assert (
        partition_column is not None
    ), """Partition column must be provided via the spark.chronon.partition.column
config. This can be set as a default in teams.py, or at the individual config level. For example:
```
Team(
    conf=ConfigProperties(
        common={
            "spark.chronon.partition.column": "_test_column",
        }
    )
)
```
"""

    additional_partitions_str = ""
    if additional_partitions:
        additional_partitions_str = "/" + "/".join(additional_partitions)

    return {
        "name": f"wf_{utils.sanitize(table)}_with_offset_{offset}",
        "spec": f"{table}/{partition_column}={{{{ macros.ds_add(ds, {offset}) }}}}{additional_partitions_str}",
    }
50
+
51
+
52
+ def _get_partition_col_from_query(query):
53
+ """Gets partition column from query if available"""
54
+ if query:
55
+ return query.partitionColumn
56
+ return None
57
+
58
+ def _get_additional_subPartitionsToWaitFor_from_query(query):
59
+ """Gets additional subPartitionsToWaitFor from query if available"""
60
+ if query:
61
+ return query.subPartitionsToWaitFor
62
+ return None
63
+
64
+
65
def _get_airflow_deps_from_source(source, partition_column=None):
    """
    Given a source, return a list of Airflow dependencies.

    Args:
        source: The source object (events, entities, or joinSource);
            assumed to have already been normalized.
        partition_column: Fallback partition column when the source's own
            query does not declare one.

    Returns:
        A list of Airflow dependency objects (possibly empty).
    """
    if source.events:
        tables = [source.events.table]
        query = source.events.query
    elif source.entities:
        # Given the setup of Query, we currently mandate the same partition
        # column for snapshot and mutations tables.
        tables = [source.entities.snapshotTable]
        if source.entities.mutationTable:
            tables.append(source.entities.mutationTable)
        query = source.entities.query
    elif source.joinSource:
        # TODO: Handle joinSource -- it doesn't work right now because the metadata isn't set on joinSource at this point
        return []
    else:
        # Unknown source type
        return []

    # A partition column declared on the query itself wins over the fallback.
    effective_partition_column = _get_partition_col_from_query(query) or partition_column
    additional_partitions = _get_additional_subPartitionsToWaitFor_from_query(query)

    return [
        create_airflow_dependency(table, effective_partition_column, additional_partitions)
        for table in tables
    ]
104
+
105
+
106
def extract_default_partition_column(obj):
    """Read the config-level default partition column off the object's
    execution conf; returns None when any link in the path is missing."""
    try:
        common_conf = obj.metaData.executionInfo.conf.common
        return common_conf.get("spark.chronon.partition.column")
    except Exception:
        # Missing metadata is tolerated here; the user-facing error is
        # raised later in `create_airflow_dependency`.
        return None
114
+
115
+
116
def _get_distinct_day_windows(group_by):
    """
    Collect the distinct aggregation window lengths of a GroupBy, expressed
    in whole days. Hour and minute windows are rounded UP to full days so the
    resulting offset always covers the window.

    Args:
        group_by: A GroupBy whose aggregations may carry windows.

    Returns:
        A set of ints -- the distinct window lengths in days.
    """
    windows = set()
    # Both `aggregations` and each aggregation's `windows` are optional
    # thrift fields and may be None; guard against iterating None.
    for agg in group_by.aggregations or []:
        for window in agg.windows or []:
            time_unit = window.timeUnit
            length = window.length
            if time_unit == TimeUnit.DAYS:
                windows.add(length)
            elif time_unit == TimeUnit.HOURS:
                windows.add(math.ceil(length / 24))
            elif time_unit == TimeUnit.MINUTES:
                windows.add(math.ceil(length / (24 * 60)))
    return windows
131
+
132
+
133
def _set_join_deps(join):
    """
    Compute Airflow dependencies for a Join and store them in its metadata
    customJson: regular dependencies (left source + every join-part source)
    under AIRFLOW_DEPENDENCIES_KEY, and label dependencies (join output table
    offsets + label sources) under AIRFLOW_LABEL_DEPENDENCIES_KEY. Also sets
    the boolean `label_join` flag in customJson.
    """
    # Team/config-level default, used whenever a source's query does not
    # declare its own partition column.
    default_partition_col = extract_default_partition_column(join)

    deps = []

    # Handle left source
    left_query = utils.get_query(join.left)
    left_partition_column = (
        _get_partition_col_from_query(left_query) or default_partition_col
    )
    deps.extend(_get_airflow_deps_from_source(join.left, left_partition_column))

    # Handle right parts (join parts)
    if join.joinParts:
        for join_part in join.joinParts:
            if join_part.groupBy and join_part.groupBy.sources:
                for source in join_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = (
                        _get_partition_col_from_query(source_query)
                        or default_partition_col
                    )
                    deps.extend(
                        _get_airflow_deps_from_source(source, source_partition_column)
                    )

    label_deps = []
    # Handle label parts
    if join.labelParts and join.labelParts.labels:
        join_output_table = utils.output_table_name(join, full_name=True)
        # NOTE(review): direct [] lookup raises KeyError when the partition
        # column conf is unset -- presumably guaranteed upstream; confirm.
        partition_column = join.metaData.executionInfo.conf.common[PARTITION_COLUMN_KEY]


        # set the dependencies on the label sources
        for label_part in join.labelParts.labels:
            group_by = label_part.groupBy

            # set the dependency on the join output -- one for each distinct window offset
            # (negative offset: the label for day D depends on the join output
            # of D minus the window length in days)
            windows = _get_distinct_day_windows(group_by)
            for window in windows:
                label_deps.append(
                    create_airflow_dependency(join_output_table, partition_column, offset=-1 * window)
                )

            if group_by and group_by.sources:
                for source in label_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = (
                        _get_partition_col_from_query(source_query)
                        or default_partition_col
                    )
                    label_deps.extend(
                        _get_airflow_deps_from_source(source, source_partition_column)
                    )


    # Update the metadata customJson with dependencies
    _dedupe_and_set_airflow_deps_json(join, deps, AIRFLOW_DEPENDENCIES_KEY)

    # Update the metadata customJson with label join deps
    if label_deps:
        _dedupe_and_set_airflow_deps_json(join, label_deps, AIRFLOW_LABEL_DEPENDENCIES_KEY)

    # Set the t/f flag for label_join
    _set_label_join_flag(join)
198
+
199
+
200
def _set_group_by_deps(group_by):
    """Compute Airflow dependencies for every source of a GroupBy and store
    them in its metadata customJson under AIRFLOW_DEPENDENCIES_KEY."""
    if not group_by.sources:
        return

    fallback_partition_col = extract_default_partition_column(group_by)

    deps = []
    for src in group_by.sources:
        # A partition column on the source's own query wins over the default.
        src_query = utils.get_query(src)
        partition_col = _get_partition_col_from_query(src_query) or fallback_partition_col
        deps.extend(_get_airflow_deps_from_source(src, partition_col))

    # Persist the (deduped) dependencies on the metadata customJson.
    _dedupe_and_set_airflow_deps_json(group_by, deps, AIRFLOW_DEPENDENCIES_KEY)
218
+
219
+
220
+ def _set_label_join_flag(join):
221
+ existing_json = join.metaData.customJson or "{}"
222
+ json_map = json.loads(existing_json)
223
+ label_join_flag = False
224
+ if join.labelParts:
225
+ label_join_flag = True
226
+ json_map["label_join"] = label_join_flag
227
+ join.metaData.customJson = json.dumps(json_map)
228
+
229
+
230
+ def _dedupe_and_set_airflow_deps_json(obj, deps, custom_json_key):
231
+ sorted_items = [tuple(sorted(d.items())) for d in deps]
232
+ # Use OrderedDict for re-producible ordering of dependencies
233
+ unique = [OrderedDict(t) for t in sorted_items]
234
+ existing_json = obj.metaData.customJson or "{}"
235
+ json_map = json.loads(existing_json)
236
+ json_map[custom_json_key] = unique
237
+ obj.metaData.customJson = json.dumps(json_map)
238
+
239
+
240
def set_airflow_deps(obj):
    """
    Set Airflow dependencies for a Chronon object.

    Args:
        obj: A Join, GroupBy
    """
    # StagingQuery dependency setting is handled directly in object init;
    # any other type is ignored here.
    if isinstance(obj, Join):
        _set_join_deps(obj)
        return
    if isinstance(obj, GroupBy):
        _set_group_by_deps(obj)
@@ -0,0 +1 @@
1
# Public API of this generated package: the thrift types and constants modules.
__all__ = ['ttypes', 'constants']
@@ -0,0 +1 @@
1
# Public API of this generated package: the thrift types and constants modules.
__all__ = ['ttypes', 'constants']
@@ -0,0 +1,15 @@
1
+ #
2
+ # Autogenerated by Thrift Compiler (0.22.0)
3
+ #
4
+ # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5
+ #
6
+ # options string: py
7
+ #
8
+
9
+ from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
10
+ from thrift.protocol.TProtocol import TProtocolException
11
+ from thrift.TRecursive import fix_spec
12
+ from uuid import UUID
13
+
14
+ import sys
15
+ from .ttypes import *