awx-zipline-ai 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
import json
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Dict, List, Optional, Union
|
|
5
|
+
|
|
6
|
+
import gen_thrift.api.ttypes as ttypes
|
|
7
|
+
import gen_thrift.common.ttypes as common
|
|
8
|
+
|
|
9
|
+
import ai.chronon.airflow_helpers as airflow_helpers
|
|
10
|
+
from ai.chronon import utils
|
|
11
|
+
from ai.chronon.constants import AIRFLOW_DEPENDENCIES_KEY
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _get_output_table_name(staging_query: ttypes.StagingQuery, full_name: bool = False):
    """Derive the output table name for a staging query job.

    Side effect: stamps the staging query's repo-derived name (via utils)
    before resolving the table name.
    """
    # Make sure the object carries its name before we resolve the table.
    utils.__set_name(staging_query, ttypes.StagingQuery, "staging_queries")
    table_name = utils.output_table_name(staging_query, full_name=full_name)
    return table_name
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Wrapper for EngineType
# Convenience aliases so config authors can write EngineType.SPARK without
# importing the generated thrift module directly.
class EngineType:
    # Mirrors gen_thrift.api.ttypes.EngineType values.
    SPARK = ttypes.EngineType.SPARK
    BIGQUERY = ttypes.EngineType.BIGQUERY
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class TableDependency:
    """User-facing spec of an upstream table dependency.

    Convertible to the thrift `common.TableDependency` form via `to_thrift`;
    `additional_partitions` is consumed by the Airflow dependency path only.
    """

    table: str
    partition_column: Optional[str] = None
    partition_format: Optional[str] = None
    additional_partitions: Optional[List[str]] = None
    offset: Optional[int] = None

    def to_thrift(self):
        """Build the thrift dependency; requires `offset` to be set.

        :raises ValueError: if `offset` was not specified.
        """
        if self.offset is None:
            raise ValueError(f"Dependency offset for table {self.table} must be specified.")
        # The same day-based window is used for both start and end offsets.
        window = common.Window(length=self.offset, timeUnit=common.TimeUnit.DAYS)
        table_info = common.TableInfo(
            table=self.table,
            partitionColumn=self.partition_column,
            partitionFormat=self.partition_format,
            partitionInterval=common.Window(1, common.TimeUnit.DAYS),
        )
        return common.TableDependency(
            tableInfo=table_info,
            startOffset=window,
            endOffset=window,
            startCutOff=None,
            endCutOff=None,
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def Import(
    query: str,
    version: int,
    output_namespace: Optional[str] = None,
    engine_type: Optional[EngineType] = None,
    dependencies: Optional[List[Union[TableDependency, Dict]]] = None,
    conf: Optional[common.ConfigProperties] = None,
    env_vars: Optional[common.EnvironmentVariables] = None,
    offline_schedule: str = "@daily",
):
    """Create a StagingQuery that imports exactly one upstream table.

    Thin wrapper around :func:`StagingQuery` that enforces the single-table
    import contract: exactly one dependency, carrying a partition column.

    :param query: Templated SQL used to import the source table.
    :param version: Integer version of this import definition.
    :param output_namespace: Namespace (database) for the output table.
    :param engine_type: Optional engine override (EngineType.SPARK / BIGQUERY).
    :param dependencies: Exactly one TableDependency (or equivalent dict) for
        the imported table; it must specify a partition column.
    :param conf: Configuration properties passed through to StagingQuery.
    :param env_vars: Environment variables passed through to StagingQuery.
    :param offline_schedule: Batch schedule interval (e.g. "@daily").
    :raises ValueError: If not exactly one dependency is given, the dependency
        has an unsupported type, or it lacks a partition column.
    :return: A configured StagingQuery object.
    """
    # Explicit raises (not assert) so validation survives `python -O`.
    if not dependencies or len(dependencies) != 1:
        raise ValueError(
            f"Import must specify exactly one table dependency. Got: {dependencies}"
        )

    dep = dependencies[0]
    # The signature accepts TableDependency objects OR plain dicts; read the
    # partition column accordingly instead of assuming attribute access
    # (the previous assert crashed with AttributeError on dict dependencies).
    if isinstance(dep, TableDependency):
        partition_column = dep.partition_column
    elif isinstance(dep, dict):
        partition_column = dep.get("partition_column")
    else:
        raise ValueError(
            "Dependencies must be either TableDependency instances or dictionaries."
        )

    if partition_column is None:
        raise ValueError(
            f"Import must specify a partition column for the table dependency. Got: {partition_column}"
        )

    return StagingQuery(
        query=query,
        version=version,
        output_namespace=output_namespace,
        dependencies=dependencies,
        conf=conf,
        env_vars=env_vars,
        engine_type=engine_type,
        offline_schedule=offline_schedule,
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _infer_caller_team() -> str:
    """Infer the team name from the first stack frame outside this module.

    The team is taken to be the parent directory name of the config file that
    invoked StagingQuery. Frames belonging to this module are skipped so that
    wrappers defined here (e.g. ``Import``) attribute the team to the user's
    config file rather than to this module's own directory.
    """
    frames = inspect.stack()
    this_file = frames[0].filename
    for frame_info in frames[1:]:
        if frame_info.filename != this_file:
            return frame_info.filename.split("/")[-2]
    # Degenerate case (no external caller): fall back to the direct caller.
    return frames[1].filename.split("/")[-2]


def _to_airflow_dependencies(dependencies) -> List[Dict]:
    """Convert mixed TableDependency / dict dependencies to Airflow dep dicts.

    :raises ValueError: if an entry is neither a TableDependency nor a dict.
    """
    airflow_dependencies = []
    for d in dependencies or []:
        if isinstance(d, TableDependency):
            # Create an Airflow dependency object for the table.
            airflow_dependencies.append(
                airflow_helpers.create_airflow_dependency(
                    d.table,
                    d.partition_column,
                    d.additional_partitions,
                    d.offset,
                )
            )
        elif isinstance(d, dict):
            # Already in dict form: pass it through unchanged.
            airflow_dependencies.append(d)
        else:
            raise ValueError(
                "Dependencies must be either TableDependency instances or dictionaries."
            )
    return airflow_dependencies


def StagingQuery(
    query: str,
    version: int,
    output_namespace: Optional[str] = None,
    table_properties: Optional[Dict[str, str]] = None,
    setups: Optional[List[str]] = None,
    engine_type: Optional[EngineType] = None,
    dependencies: Optional[List[Union[TableDependency, Dict]]] = None,
    tags: Optional[Dict[str, str]] = None,
    # execution params
    offline_schedule: str = "@daily",
    conf: Optional[common.ConfigProperties] = None,
    env_vars: Optional[common.EnvironmentVariables] = None,
    cluster_conf: Optional[common.ClusterConfigProperties] = None,
    step_days: Optional[int] = None,
    recompute_days: Optional[int] = None,
    additional_partitions: Optional[List[str]] = None,
) -> ttypes.StagingQuery:
    """
    Creates a StagingQuery object for executing arbitrary SQL queries with templated date parameters.

    :param query:
        Arbitrary spark query that should be written with template parameters:
        - `{{ start_date }}`: Initial run uses start_date, future runs use latest partition + 1 day
        - `{{ end_date }}`: The end partition of the computing range
        - `{{ latest_date }}`: End partition independent of the computing range (for cumulative sources)
        - `{{ max_date(table=namespace.my_table) }}`: Max partition available for a given table
        These parameters can be modified with offset and bounds:
        - `{{ start_date(offset=-10, lower_bound='2023-01-01', upper_bound='2024-01-01') }}`
    :type query: str
    :param version:
        Integer version of this staging query definition (stringified into metadata).
    :type version: int
    :param output_namespace:
        Namespace (database) for the output table.
    :type output_namespace: str
    :param table_properties:
        Properties to set on the output table.
    :type table_properties: Dict[str, str]
    :param setups:
        Spark SQL setup statements. Used typically to register UDFs.
    :type setups: List[str]
    :param engine_type:
        By default, spark is the compute engine. You can specify an override (eg. bigquery, etc.)
        Use the EngineType class constants: EngineType.SPARK, EngineType.BIGQUERY, etc.
    :type engine_type: int
    :param tags:
        Additional metadata that does not directly affect computation, but is useful for management.
    :type tags: Dict[str, str]
    :param offline_schedule:
        The offline schedule interval for batch jobs. Format examples:
        '@hourly': '0 * * * *',
        '@daily': '0 0 * * *',
        '@weekly': '0 0 * * 0',
        '@monthly': '0 0 1 * *',
        '@yearly': '0 0 1 1 *'
    :type offline_schedule: str
    :param conf:
        Configuration properties for the StagingQuery.
    :type conf: common.ConfigProperties
    :param env_vars:
        Environment variables for the StagingQuery.
    :type env_vars: common.EnvironmentVariables
    :param cluster_conf:
        Cluster configuration properties for the join.
    :param step_days:
        The maximum number of days to process at once
    :type step_days: int
    :param dependencies:
        List of dependencies for the StagingQuery. Each dependency can be either a TableDependency object
        or a dictionary with 'name' and 'spec' keys.
    :type dependencies: List[Union[TableDependency, Dict]]
    :param recompute_days:
        Used by orchestrator to determine how many days are recomputed on each incremental scheduled run. Should be
        set when the source data is changed in-place (i.e. existing partitions overwritten with new data each day up to
        X days later) or when you want partially mature aggregations (i.e. a 7 day window, but start computing it from
        day 1, and refresh it for the next 6 days)
    :type recompute_days: int
    :param additional_partitions:
        Extra output partition columns recorded in metadata.
    :type additional_partitions: List[str]
    :raises TypeError: if `version` is not an integer.
    :raises ValueError: if a dependency is neither a TableDependency nor a dict.
    :return:
        A StagingQuery object
    """
    # Team is derived from the parent directory of the calling config file;
    # frames within this module (e.g. the Import wrapper) are skipped so the
    # team is never misattributed to this module's own directory.
    team = _infer_caller_team()

    # Explicit raise (not assert) so validation survives `python -O`.
    if not isinstance(version, int):
        raise TypeError(f"Version must be an integer, but found {type(version).__name__}")

    # Create execution info
    exec_info = common.ExecutionInfo(
        scheduleCron=offline_schedule,
        conf=conf,
        env=env_vars,
        stepDays=step_days,
        clusterConf=cluster_conf,
    )

    # Airflow sensor specs travel inside metadata's customJson blob.
    custom_json = json.dumps(
        {AIRFLOW_DEPENDENCIES_KEY: _to_airflow_dependencies(dependencies)}
    )

    # Create metadata
    meta_data = ttypes.MetaData(
        outputNamespace=output_namespace,
        team=team,
        executionInfo=exec_info,
        tags=tags,
        customJson=custom_json,
        tableProperties=table_properties,
        version=str(version),
        additionalOutputPartitionColumns=additional_partitions,
    )

    # Only TableDependency objects carry enough info for the thrift form;
    # dict dependencies are Airflow-only and are intentionally skipped here.
    thrift_deps = [
        d.to_thrift() for d in (dependencies or []) if isinstance(d, TableDependency)
    ]

    # Create and return the StagingQuery object with camelCase parameter names
    staging_query = ttypes.StagingQuery(
        metaData=meta_data,
        query=query,
        setups=setups,
        engineType=engine_type,
        tableDependencies=thrift_deps,
        recomputeDays=recompute_days,
    )

    # Expose `.table` as a computed property. NOTE: this patches the shared
    # ttypes.StagingQuery class (affects all instances); it is idempotent.
    staging_query.__class__.table = property(
        lambda self: _get_output_table_name(self, full_name=True)
    )

    return staging_query
|
ai/chronon/types.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
importing ai.chronon.types will bring in all the api's needed to create any chronon object
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import gen_thrift.api.ttypes as ttypes
|
|
6
|
+
import gen_thrift.common.ttypes as common
|
|
7
|
+
|
|
8
|
+
import ai.chronon.group_by as group_by
|
|
9
|
+
import ai.chronon.join as join
|
|
10
|
+
import ai.chronon.query as query
|
|
11
|
+
import ai.chronon.source as source
|
|
12
|
+
|
|
13
|
+
# source related concepts
|
|
14
|
+
Query = query.Query
|
|
15
|
+
selects = query.selects
|
|
16
|
+
|
|
17
|
+
Source = ttypes.Source
|
|
18
|
+
EventSource = source.EventSource
|
|
19
|
+
EntitySource = source.EntitySource
|
|
20
|
+
JoinSource = source.JoinSource
|
|
21
|
+
|
|
22
|
+
# Aggregation / GroupBy related concepts
|
|
23
|
+
GroupBy = group_by.GroupBy
|
|
24
|
+
Aggregation = group_by.Aggregation
|
|
25
|
+
Operation = group_by.Operation
|
|
26
|
+
Window = group_by.Window
|
|
27
|
+
TimeUnit = group_by.TimeUnit
|
|
28
|
+
DefaultAggregation = group_by.DefaultAggregation
|
|
29
|
+
|
|
30
|
+
Accuracy = ttypes.Accuracy
|
|
31
|
+
TEMPORAL = ttypes.Accuracy.TEMPORAL
|
|
32
|
+
SNAPSHOT = ttypes.Accuracy.SNAPSHOT
|
|
33
|
+
|
|
34
|
+
Derivation = group_by.Derivation
|
|
35
|
+
|
|
36
|
+
# join related concepts
|
|
37
|
+
Join = join.Join
|
|
38
|
+
JoinPart = join.JoinPart
|
|
39
|
+
BootstrapPart = join.BootstrapPart
|
|
40
|
+
LabelParts = join.LabelParts
|
|
41
|
+
ContextualSource = join.ContextualSource
|
|
42
|
+
ExternalPart = join.ExternalPart
|
|
43
|
+
ExternalSource = join.ExternalSource
|
|
44
|
+
DataType = join.DataType
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Staging Query related concepts
|
|
48
|
+
StagingQuery = ttypes.StagingQuery
|
|
49
|
+
MetaData = ttypes.MetaData
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
EnvironmentVariables = common.EnvironmentVariables
|
|
53
|
+
ConfigProperties = common.ConfigProperties
|
|
54
|
+
ClusterConfigProperties = common.ClusterConfigProperties
|
|
55
|
+
ExecutionInfo = common.ExecutionInfo
|
|
56
|
+
TableDependency = common.TableDependency
|
|
57
|
+
|
|
58
|
+
Team = ttypes.Team
|