awx-zipline-ai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
ai/__init__.py
ADDED
|
File without changes
|
ai/chronon/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import math
|
|
3
|
+
from typing import OrderedDict
|
|
4
|
+
|
|
5
|
+
import ai.chronon.utils as utils
|
|
6
|
+
from ai.chronon.api.common.ttypes import TimeUnit
|
|
7
|
+
from ai.chronon.api.ttypes import GroupBy, Join
|
|
8
|
+
from ai.chronon.constants import (
|
|
9
|
+
AIRFLOW_DEPENDENCIES_KEY,
|
|
10
|
+
AIRFLOW_LABEL_DEPENDENCIES_KEY,
|
|
11
|
+
PARTITION_COLUMN_KEY,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_airflow_dependency(table, partition_column, additional_partitions=None, offset=0):
    """
    Create an Airflow dependency object for a table.

    Args:
        table: The table name (with namespace)
        partition_column: The partition column to use; must not be None
            (there is no implicit default -- see the assert below)
        additional_partitions: Additional partition specs appended to the
            sensor spec, separated by "/"
        offset: Day offset applied to the Airflow ds macro via macros.ds_add

    Returns:
        A dictionary with name and spec for the Airflow dependency
    """
    assert (
        partition_column is not None
    ), """Partition column must be provided via the spark.chronon.partition.column
config. This can be set as a default in teams.py, or at the individual config level. For example:
```
Team(
    conf=ConfigProperties(
        common={
            "spark.chronon.partition.column": "_test_column",
        }
    )
)
```
"""

    additional_partitions_str = ""
    if additional_partitions:
        additional_partitions_str = "/" + "/".join(additional_partitions)

    return {
        "name": f"wf_{utils.sanitize(table)}_with_offset_{offset}",
        "spec": f"{table}/{partition_column}={{{{ macros.ds_add(ds, {offset}) }}}}{additional_partitions_str}",
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_partition_col_from_query(query):
|
|
53
|
+
"""Gets partition column from query if available"""
|
|
54
|
+
if query:
|
|
55
|
+
return query.partitionColumn
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
def _get_additional_subPartitionsToWaitFor_from_query(query):
|
|
59
|
+
"""Gets additional subPartitionsToWaitFor from query if available"""
|
|
60
|
+
if query:
|
|
61
|
+
return query.subPartitionsToWaitFor
|
|
62
|
+
return None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _get_airflow_deps_from_source(source, partition_column=None):
    """
    Given a source, return a list of Airflow dependencies.

    Args:
        source: The source object (events, entities, or joinSource)
        partition_column: Fallback partition column used when the source's
            query does not declare one

    Returns:
        A list of Airflow dependency objects (empty for joinSource and
        unrecognized source types)
    """
    # Assumes the source has already been normalized.
    if source.events:
        tables = [source.events.table]
        query = source.events.query
    elif source.entities:
        # The Query shape mandates one partition column shared by the
        # snapshot and mutation tables.
        tables = [source.entities.snapshotTable]
        if source.entities.mutationTable:
            tables.append(source.entities.mutationTable)
        query = source.entities.query
    elif source.joinSource:
        # TODO: Handle joinSource -- it doesn't work right now because the
        # metadata isn't set on joinSource at this point
        return []
    else:
        # Unknown source type
        return []

    # A partition column declared on the query wins over the argument.
    effective_partition_column = _get_partition_col_from_query(query) or partition_column
    additional_partitions = _get_additional_subPartitionsToWaitFor_from_query(query)

    return [
        create_airflow_dependency(table, effective_partition_column, additional_partitions)
        for table in tables
    ]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def extract_default_partition_column(obj):
    """
    Read the config-level default partition column from an object's metadata.

    Returns None when any link in the metaData.executionInfo.conf.common
    chain is missing; the caller surfaces a proper error later in
    `create_airflow_dependency`.
    """
    try:
        common_conf = obj.metaData.executionInfo.conf.common
    except Exception:
        # Error handling occurs in `create_airflow_dependency`
        return None
    try:
        return common_conf.get("spark.chronon.partition.column")
    except Exception:
        # Error handling occurs in `create_airflow_dependency`
        return None
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _get_distinct_day_windows(group_by):
    """
    Collect the distinct aggregation window sizes of a group_by, in days.

    Sub-day windows (hours/minutes) are rounded up to whole days so the
    resulting day offsets are conservative.

    Args:
        group_by: A GroupBy whose aggregations (possibly None) are scanned.

    Returns:
        A set of distinct window lengths expressed in days.
    """
    windows = []
    # `aggregations` and each aggregation's `windows` are optional fields;
    # the original code crashed with TypeError on `windows=None` (unwindowed
    # aggregation), so both are guarded here.
    for agg in group_by.aggregations or []:
        for window in agg.windows or []:
            time_unit = window.timeUnit
            length = window.length
            if time_unit == TimeUnit.DAYS:
                windows.append(length)
            elif time_unit == TimeUnit.HOURS:
                windows.append(math.ceil(length / 24))
            elif time_unit == TimeUnit.MINUTES:
                windows.append(math.ceil(length / (24 * 60)))
    return set(windows)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _set_join_deps(join):
    """
    Compute Airflow dependencies for a Join and write them into
    join.metaData.customJson: regular deps always; label deps and a
    label_join flag when label parts are present.
    """
    default_partition_col = extract_default_partition_column(join)

    deps = []

    # Handle left source
    left_query = utils.get_query(join.left)
    left_partition_column = (
        _get_partition_col_from_query(left_query) or default_partition_col
    )
    deps.extend(_get_airflow_deps_from_source(join.left, left_partition_column))

    # Handle right parts (join parts)
    if join.joinParts:
        for join_part in join.joinParts:
            if join_part.groupBy and join_part.groupBy.sources:
                for source in join_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = (
                        _get_partition_col_from_query(source_query)
                        or default_partition_col
                    )
                    deps.extend(
                        _get_airflow_deps_from_source(source, source_partition_column)
                    )

    label_deps = []
    # Handle label parts
    if join.labelParts and join.labelParts.labels:
        join_output_table = utils.output_table_name(join, full_name=True)
        # NOTE(review): direct indexing raises KeyError when the key is
        # absent, unlike the .get()-based extract_default_partition_column
        # above -- presumably a hard failure is intended for label joins;
        # confirm.
        partition_column = join.metaData.executionInfo.conf.common[PARTITION_COLUMN_KEY]

        # set the dependencies on the label sources
        for label_part in join.labelParts.labels:
            group_by = label_part.groupBy

            # set the dependency on the join output -- one for each distinct window offset
            windows = _get_distinct_day_windows(group_by)
            for window in windows:
                label_deps.append(
                    create_airflow_dependency(join_output_table, partition_column, offset=-1 * window)
                )

            if group_by and group_by.sources:
                for source in label_part.groupBy.sources:
                    source_query = utils.get_query(source)
                    source_partition_column = (
                        _get_partition_col_from_query(source_query)
                        or default_partition_col
                    )
                    label_deps.extend(
                        _get_airflow_deps_from_source(source, source_partition_column)
                    )

    # Update the metadata customJson with dependencies
    _dedupe_and_set_airflow_deps_json(join, deps, AIRFLOW_DEPENDENCIES_KEY)

    # Update the metadata customJson with label join deps
    if label_deps:
        _dedupe_and_set_airflow_deps_json(join, label_deps, AIRFLOW_LABEL_DEPENDENCIES_KEY)

    # Set the t/f flag for label_join
    _set_label_join_flag(join)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _set_group_by_deps(group_by):
    """
    Compute Airflow dependencies for every source of a GroupBy and write
    them into its metaData.customJson. No-op when there are no sources.
    """
    if not group_by.sources:
        return

    fallback_partition_col = extract_default_partition_column(group_by)

    deps = []
    for src in group_by.sources:
        src_query = utils.get_query(src)
        # A query-level partition column wins over the config default.
        partition_col = _get_partition_col_from_query(src_query) or fallback_partition_col
        deps.extend(_get_airflow_deps_from_source(src, partition_col))

    # Update the metadata customJson with dependencies
    _dedupe_and_set_airflow_deps_json(group_by, deps, AIRFLOW_DEPENDENCIES_KEY)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _set_label_join_flag(join):
|
|
221
|
+
existing_json = join.metaData.customJson or "{}"
|
|
222
|
+
json_map = json.loads(existing_json)
|
|
223
|
+
label_join_flag = False
|
|
224
|
+
if join.labelParts:
|
|
225
|
+
label_join_flag = True
|
|
226
|
+
json_map["label_join"] = label_join_flag
|
|
227
|
+
join.metaData.customJson = json.dumps(json_map)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _dedupe_and_set_airflow_deps_json(obj, deps, custom_json_key):
|
|
231
|
+
sorted_items = [tuple(sorted(d.items())) for d in deps]
|
|
232
|
+
# Use OrderedDict for re-producible ordering of dependencies
|
|
233
|
+
unique = [OrderedDict(t) for t in sorted_items]
|
|
234
|
+
existing_json = obj.metaData.customJson or "{}"
|
|
235
|
+
json_map = json.loads(existing_json)
|
|
236
|
+
json_map[custom_json_key] = unique
|
|
237
|
+
obj.metaData.customJson = json.dumps(json_map)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def set_airflow_deps(obj):
    """
    Set Airflow dependencies for a Chronon object.

    Args:
        obj: A Join or a GroupBy; other types are left untouched.
    """
    # StagingQuery dependency setting is handled directly in object init
    if isinstance(obj, Join):
        _set_join_deps(obj)
        return
    if isinstance(obj, GroupBy):
        _set_group_by_deps(obj)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__all__ = ['ttypes', 'constants']
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__all__ = ['ttypes', 'constants']
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Autogenerated by Thrift Compiler (0.22.0)
|
|
3
|
+
#
|
|
4
|
+
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
|
|
5
|
+
#
|
|
6
|
+
# options string: py
|
|
7
|
+
#
|
|
8
|
+
|
|
9
|
+
from thrift.Thrift import TType, TMessageType, TFrozenDict, TException, TApplicationException
|
|
10
|
+
from thrift.protocol.TProtocol import TProtocolException
|
|
11
|
+
from thrift.TRecursive import fix_spec
|
|
12
|
+
from uuid import UUID
|
|
13
|
+
|
|
14
|
+
import sys
|
|
15
|
+
from .ttypes import *
|