awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
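The most visible change in the file list above is that the Thrift-generated modules moved out of `ai/chronon/*` into a new top-level `gen_thrift` package (api, common, eval, fetcher, hub, observability, planner), while `ai/chronon/orchestration` and `ai/chronon/eval` were removed. Config code that imports the generated types directly needs the new paths. A minimal sketch of that import migration, based on the removed/added import lines in the diffs below (the alias names are illustrative, not prescribed by the package):

```python
# Old paths in 0.2.1, as seen in the removed lines of the diffs below:
#   from ai.chronon.api.ttypes import EventSource, Source
#   import ai.chronon.api.ttypes as ttypes

# New paths in 0.3.1:
from gen_thrift.api.ttypes import EventSource, Source, Team
import gen_thrift.api.ttypes as ttypes
import gen_thrift.common.ttypes as common

# Hand-written wrapper modules keep their ai.chronon.* locations:
from ai.chronon.join import Join, JoinPart
from ai.chronon.query import Query, selects
```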
ai/chronon/resources/gcp/group_bys/test/data.py CHANGED

@@ -1,34 +1,30 @@
-
 from sources.test.data import source_v1
 
 from ai.chronon.group_by import Aggregation, GroupBy, Operation, TimeUnit, Window
 
-window_sizes = [
+window_sizes = [
+    Window(length=day, time_unit=TimeUnit.DAYS) for day in [3, 14, 30]
+]  # Define some window sizes to use below
 
 group_by_v1 = GroupBy(
     backfill_start_date="2023-11-01",
     sources=[source_v1],
-    keys=["user_id"],
+    keys=["user_id"],  # We are aggregating by user
     online=True,
-    aggregations=[
-        input_column="purchase_price",
-        operation=Operation.SUM,
-        windows=window_sizes
-    ),  # The sum of purchases prices in various windows
+    aggregations=[
         Aggregation(
-            input_column="purchase_price",
-
-            windows=window_sizes
-        ),  # The count of purchases in various windows
+            input_column="purchase_price", operation=Operation.SUM, windows=window_sizes
+        ),  # The sum of purchases prices in various windows
         Aggregation(
-            input_column="purchase_price",
-
-
-
+            input_column="purchase_price", operation=Operation.COUNT, windows=window_sizes
+        ),  # The count of purchases in various windows
+        Aggregation(
+            input_column="purchase_price", operation=Operation.AVERAGE, windows=window_sizes
+        ),  # The average purchases by user in various windows
         Aggregation(
             input_column="purchase_price",
             operation=Operation.LAST_K(10),
         ),
     ],
     version=0,
-)
+)
ai/chronon/resources/gcp/joins/test/data.py CHANGED

@@ -1,6 +1,6 @@
+from gen_thrift.api.ttypes import EventSource, Source
 from group_bys.test.data import group_by_v1
 
-from ai.chronon.api.ttypes import EventSource, Source
 from ai.chronon.join import Join, JoinPart
 from ai.chronon.query import Query, selects
 
@@ -12,9 +12,7 @@ source = Source(
     events=EventSource(
         table="data.checkouts",
         query=Query(
-            selects=selects(
-                "user_id"
-            ),  # The primary key used to join various GroupBys together
+            selects=selects("user_id"),  # The primary key used to join various GroupBys together
             time_column="ts",
         ),  # The event time used to compute feature values as-of
     )
@@ -22,9 +20,7 @@ source = Source(
 
 v1 = Join(
     left=source,
-    right_parts=[
-        JoinPart(group_by=group_by_v1)
-    ],
+    right_parts=[JoinPart(group_by=group_by_v1)],
     row_ids="user_id",
     version=0,
-)
+)
ai/chronon/resources/gcp/sources/test/data.py CHANGED

@@ -1,4 +1,5 @@
-from
+from gen_thrift.api.ttypes import EventSource, Source
+
 from ai.chronon.query import Query, selects
 
 """
@@ -13,11 +14,13 @@ with a clear event time column and selected fields for downstream feature comput
 
 source_v1 = Source(
     events=EventSource(
-        table="data.purchases",
-        topic=None,
+        table="data.purchases",  # This points to the log table in the warehouse with historical purchase events, updated in batch daily
+        topic=None,  # See the 'returns' GroupBy for an example that has a streaming source configured. In this case, this would be the streaming source topic that can be listened to for realtime events
         query=Query(
-            selects=selects("user_id","purchase_price"),
-            time_column="ts"
-
+            selects=selects("user_id", "purchase_price"),  # Select the fields we care about
+            time_column="ts",
+        ),  # The event time
+    )
+)
 
 # The `source_v1` object can now be used in a Chronon join or pipeline definition
ai/chronon/resources/gcp/teams.py CHANGED

@@ -1,4 +1,5 @@
-from
+from gen_thrift.api.ttypes import Team
+
 from ai.chronon.repo.constants import RunMode
 from ai.chronon.types import ConfigProperties, EnvironmentVariables
 
@@ -10,34 +11,25 @@ default = Team(
         common={
             "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider",
             "spark.chronon.table_write.format": "iceberg",
-
             "spark.sql.defaultCatalog": "bigquery_catalog",
-
             "spark.sql.catalog.bigquery_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog",
             "spark.sql.catalog.bigquery_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
             "spark.sql.catalog.bigquery_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO",
-
             "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false",
             "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator",
-
             "spark.chronon.coalesce.factor": "10",
             "spark.default.parallelism": "10",
             "spark.sql.shuffle.partitions": "10",
-
             # TODO: Please fill in the following values
             "spark.sql.catalog.bigquery_catalog.warehouse": "gs://zipline-warehouse-<customer_id>/data/tables/",
-            "spark.sql.catalog.bigquery_catalog.
-            "spark.sql.catalog.bigquery_catalog.
-            "spark.chronon.partition.format": "<date-format>",
-            "spark.chronon.partition.column": "<partition-column-name>",
+            "spark.sql.catalog.bigquery_catalog.gcp.bigquery.location": "<region>",
+            "spark.sql.catalog.bigquery_catalog.gcp.bigquery.project-id": "<project-id>",
+            "spark.chronon.partition.format": "<date-format>",  # ex: "yyyy-MM-dd",
+            "spark.chronon.partition.column": "<partition-column-name>",  # ex: "ds",
         },
     ),
     env=EnvironmentVariables(
         common={
-            "JOB_MODE": "local[*]",
-            "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",
-            "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>",
-
             # TODO: Please fill in the following values
             "CUSTOMER_ID": "<customer_id>",
             "GCP_PROJECT_ID": "<project-id>",
@@ -45,7 +37,7 @@ default = Team(
             "GCP_DATAPROC_CLUSTER_NAME": "<dataproc-cluster-name>",
             "GCP_BIGTABLE_INSTANCE_ID": "<bigtable-instance-id>",
             "ARTIFACT_PREFIX": "<customer-artifact-bucket>",
-            "CLOUD_PROVIDER": "<gcp | aws>"
+            "CLOUD_PROVIDER": "<gcp | aws>",
         },
     ),
 )
@@ -54,11 +46,7 @@ default = Team(
 test = Team(
     outputNamespace="data",
     env=EnvironmentVariables(
-        common={},
-        modeEnvironments={
-            RunMode.BACKFILL: {},
-            RunMode.UPLOAD: {}
-        }
+        common={}, modeEnvironments={RunMode.BACKFILL: {}, RunMode.UPLOAD: {}}
     ),
 )
 
@@ -67,4 +55,4 @@ team_conf = Team(
     env=EnvironmentVariables(
         common={},
     ),
-)
+)
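The `teams.py` template above now names the two BigQuery catalog keys explicitly and annotates the partition keys with example formats. A hypothetical filled-in version of the TODO block for illustration only; the bucket, region, and project values below are placeholders, not defaults shipped with the package:

```python
common = {
    # ... the fixed Spark/Iceberg settings from the template above ...
    "spark.sql.catalog.bigquery_catalog.warehouse": "gs://zipline-warehouse-acme/data/tables/",  # placeholder bucket
    "spark.sql.catalog.bigquery_catalog.gcp.bigquery.location": "us-central1",                   # placeholder region
    "spark.sql.catalog.bigquery_catalog.gcp.bigquery.project-id": "acme-data-prod",              # placeholder project
    "spark.chronon.partition.format": "yyyy-MM-dd",  # example format given in the template
    "spark.chronon.partition.column": "ds",          # example column given in the template
}
```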
ai/chronon/source.py CHANGED

@@ -2,7 +2,7 @@
 Wrappers to directly create Source objects.
 """
 
-import
+import gen_thrift.api.ttypes as ttypes
 
 
 def EventSource(
@@ -29,9 +29,7 @@ def EventSource(
 
     """
     return ttypes.Source(
-        events=ttypes.EventSource(
-            table=table, topic=topic, query=query, isCumulative=is_cumulative
-        )
+        events=ttypes.EventSource(table=table, topic=topic, query=query, isCumulative=is_cumulative)
     )
ai/chronon/staging_query.py CHANGED

@@ -1,20 +1,28 @@
-
 import inspect
 import json
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
+import gen_thrift.api.ttypes as ttypes
+import gen_thrift.common.ttypes as common
+
 import ai.chronon.airflow_helpers as airflow_helpers
-
-import ai.chronon.api.ttypes as ttypes
+from ai.chronon import utils
 from ai.chronon.constants import AIRFLOW_DEPENDENCIES_KEY
 
 
+def _get_output_table_name(staging_query: ttypes.StagingQuery, full_name: bool = False):
+    """generate output table name for staging query job"""
+    utils.__set_name(staging_query, ttypes.StagingQuery, "staging_queries")
+    return utils.output_table_name(staging_query, full_name=full_name)
+
+
 # Wrapper for EngineType
 class EngineType:
     SPARK = ttypes.EngineType.SPARK
     BIGQUERY = ttypes.EngineType.BIGQUERY
 
+
 @dataclass
 class TableDependency:
     table: str
@@ -26,26 +34,54 @@ class TableDependency:
     def to_thrift(self):
         if self.offset is None:
             raise ValueError(f"Dependency offset for table {self.table} must be specified.")
-        offset_window = common.Window(length
+        offset_window = common.Window(length=self.offset, timeUnit=common.TimeUnit.DAYS)
         return common.TableDependency(
             tableInfo=common.TableInfo(
-                table=self.table,
+                table=self.table,
                 partitionColumn=self.partition_column,
                 partitionFormat=self.partition_format,
-                partitionInterval=common.Window(1, common.TimeUnit.DAYS)
+                partitionInterval=common.Window(1, common.TimeUnit.DAYS),
             ),
             startOffset=offset_window,
            endOffset=offset_window,
             startCutOff=None,
-            endCutOff=None
+            endCutOff=None,
         )
 
+
+def Import(
+    query: str,
+    version: int,
+    output_namespace: Optional[str] = None,
+    engine_type: Optional[EngineType] = None,
+    dependencies: Optional[List[Union[TableDependency, Dict]]] = None,
+    conf: Optional[common.ConfigProperties] = None,
+    env_vars: Optional[common.EnvironmentVariables] = None,
+    offline_schedule: str = "@daily",
+):
+    assert dependencies is not None and len(dependencies) == 1, (
+        f"Import must specify exactly one table dependency. Got: {dependencies}"
+    )
+    assert dependencies[0].partition_column is not None, (
+        f"Import must specify a partition column for the table dependency. Got: {dependencies[0].partition_column}"
+    )
+
+    return StagingQuery(
+        query=query,
+        version=version,
+        output_namespace=output_namespace,
+        dependencies=dependencies,
+        conf=conf,
+        env_vars=env_vars,
+        engine_type=engine_type,
+        offline_schedule=offline_schedule,
+    )
+
+
 def StagingQuery(
-    name: str,
     query: str,
     version: int,
     output_namespace: Optional[str] = None,
-    start_partition: Optional[str] = None,
     table_properties: Optional[Dict[str, str]] = None,
     setups: Optional[List[str]] = None,
     engine_type: Optional[EngineType] = None,
@@ -58,23 +94,20 @@ def StagingQuery(
     cluster_conf: common.ClusterConfigProperties = None,
     step_days: Optional[int] = None,
     recompute_days: Optional[int] = None,
+    additional_partitions: List[str] = None,
 ) -> ttypes.StagingQuery:
     """
     Creates a StagingQuery object for executing arbitrary SQL queries with templated date parameters.
 
     :param query:
         Arbitrary spark query that should be written with template parameters:
-        - `{{ start_date }}`: Initial run uses
+        - `{{ start_date }}`: Initial run uses start_date, future runs use latest partition + 1 day
         - `{{ end_date }}`: The end partition of the computing range
         - `{{ latest_date }}`: End partition independent of the computing range (for cumulative sources)
         - `{{ max_date(table=namespace.my_table) }}`: Max partition available for a given table
         These parameters can be modified with offset and bounds:
         - `{{ start_date(offset=-10, lower_bound='2023-01-01', upper_bound='2024-01-01') }}`
     :type query: str
-    :param start_partition:
-        On the first run, `{{ start_date }}` will be set to this user provided start date,
-        future incremental runs will set it to the latest existing partition + 1 day.
-    :type start_partition: str
     :param setups:
         Spark SQL setup statements. Used typically to register UDFs.
     :type setups: List[str]
@@ -121,13 +154,17 @@ def StagingQuery(
     # Get caller's filename to assign team
     team = inspect.stack()[1].filename.split("/")[-2]
 
+    assert isinstance(version, int), (
+        f"Version must be an integer, but found {type(version).__name__}"
+    )
+
     # Create execution info
     exec_info = common.ExecutionInfo(
         scheduleCron=offline_schedule,
         conf=conf,
         env=env_vars,
         stepDays=step_days,
-        clusterConf=cluster_conf
+        clusterConf=cluster_conf,
     )
 
     airflow_dependencies = []
@@ -155,14 +192,14 @@ def StagingQuery(
 
     # Create metadata
     meta_data = ttypes.MetaData(
-        name=name,
         outputNamespace=output_namespace,
         team=team,
         executionInfo=exec_info,
         tags=tags,
         customJson=custom_json,
         tableProperties=table_properties,
-        version=str(version)
+        version=str(version),
+        additionalOutputPartitionColumns=additional_partitions,
     )
 
     thrift_deps = []
@@ -175,11 +212,15 @@ def StagingQuery(
     staging_query = ttypes.StagingQuery(
         metaData=meta_data,
         query=query,
-        startPartition=start_partition,
         setups=setups,
         engineType=engine_type,
         tableDependencies=thrift_deps,
         recomputeDays=recompute_days,
     )
 
-
+    # Add the table property that calls the private function
+    staging_query.__class__.table = property(
+        lambda self: _get_output_table_name(self, full_name=True)
+    )
+
+    return staging_query
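Summarizing the staging_query.py diff: 0.3.1 adds an `Import` wrapper (a `StagingQuery` constrained to exactly one table dependency that must declare its partition column), drops the `name` and `start_partition` parameters, adds `additional_partitions`, and attaches a `table` property that resolves the output table name. A usage sketch based only on the signature shown above; the query, table, and namespace values are placeholders:

```python
from ai.chronon.staging_query import Import, TableDependency

# Placeholder query and table names for illustration. Import requires exactly one
# dependency, and that dependency must set partition_column.
purchases_import = Import(
    query="SELECT user_id, purchase_price, ds FROM raw.purchases "
          "WHERE ds BETWEEN '{{ start_date }}' AND '{{ end_date }}'",
    version=0,
    output_namespace="data",
    dependencies=[TableDependency(table="raw.purchases", partition_column="ds", offset=0)],
)

# Within a Chronon config repo, the new property attached by StagingQuery() exposes
# the fully qualified output table name: purchases_import.table
```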
ai/chronon/types.py CHANGED

@@ -2,8 +2,9 @@
 importing ai.chronon.types will bring in all the api's needed to create any chronon object
 """
 
-import
-import
+import gen_thrift.api.ttypes as ttypes
+import gen_thrift.common.ttypes as common
+
 import ai.chronon.group_by as group_by
 import ai.chronon.join as join
 import ai.chronon.query as query
ai/chronon/utils.py CHANGED

@@ -23,9 +23,9 @@ import tempfile
 from collections.abc import Iterable
 from typing import List, Optional, Union, cast
 
-import
+import gen_thrift.api.ttypes as api
+
 import ai.chronon.repo.extract_objects as eo
-from ai.chronon.cli.compile import parse_teams
 from ai.chronon.repo import FOLDER_NAME_TO_CLASS
 
 ChrononJobTypes = Union[api.GroupBy, api.Join, api.StagingQuery]
@@ -56,21 +56,16 @@ class JsonDiffer:
         self.new_name = "new.json"
         self.old_name = "old.json"
 
-    def diff(
-        self, new_json_str: object, old_json_str: object, skipped_keys=None
-    ) -> str:
+    def diff(self, new_json_str: object, old_json_str: object, skipped_keys=None) -> str:
         if skipped_keys is None:
             skipped_keys = []
-        new_json = {
-
-
-
-
-
-
-        with open(os.path.join(self.temp_dir, self.old_name), mode="w") as old, open(
-            os.path.join(self.temp_dir, self.new_name), mode="w"
-        ) as new:
+        new_json = {k: v for k, v in json.loads(new_json_str).items() if k not in skipped_keys}
+        old_json = {k: v for k, v in json.loads(old_json_str).items() if k not in skipped_keys}
+
+        with (
+            open(os.path.join(self.temp_dir, self.old_name), mode="w") as old,
+            open(os.path.join(self.temp_dir, self.new_name), mode="w") as new,
+        ):
             old.write(json.dumps(old_json, sort_keys=True, indent=2))
             new.write(json.dumps(new_json, sort_keys=True, indent=2))
             diff_str = subprocess.run(
@@ -131,6 +126,7 @@ def _get_underlying_source(
     else:
         return source.joinSource
 
+
 def get_root_source(
     source: api.Source,
 ) -> Union[api.EventSource, api.EntitySource]:
@@ -141,6 +137,7 @@ def get_root_source(
     else:
         return get_root_source(source.joinSource.join.left)
 
+
 def get_query(source: api.Source) -> api.Query:
     return _get_underlying_source(source).query
 
@@ -151,7 +148,9 @@ def get_table(source: api.Source) -> str:
     elif source.events:
         table = source.events.table
     else:
-
+        from ai.chronon.join import _get_output_table_name
+
+        table = _get_output_table_name(source.joinSource.join, True)
     return table.split("/")[0]
 
 
@@ -229,11 +228,7 @@ def dict_to_bash_commands(d):
         return ""
     bash_commands = []
     for key, value in d.items():
-        cmd = (
-            f"--{key.replace('_', '-')}={value}"
-            if value
-            else f"--{key.replace('_', '-')}"
-        )
+        cmd = f"--{key.replace('_', '-')}={value}" if value else f"--{key.replace('_', '-')}"
         bash_commands.append(cmd)
     return " ".join(bash_commands)
 
@@ -259,9 +254,7 @@ def output_table_name(obj, full_name: bool):
 
 def join_part_name(jp):
     if jp.groupBy is None:
-        raise NotImplementedError(
-            "Join Part names for non group bys is not implemented."
-        )
+        raise NotImplementedError("Join Part names for non group bys is not implemented.")
     if not jp.groupBy.metaData.name and isinstance(jp.groupBy, api.GroupBy):
         __set_name(jp.groupBy, api.GroupBy, "group_bys")
     return "_".join(
@@ -296,51 +289,15 @@ def join_part_output_table_name(join, jp, full_name: bool = False):
     )
 
 
-def group_by_output_table_name(obj, full_name: bool = False):
-    """
-    Group by backfill output table name
-    To be synced with api.Extensions.scala
-    """
-    if not obj.metaData.name:
-        __set_name(obj, api.GroupBy, "group_bys")
-    return output_table_name(obj, full_name)
-
-
 def log_table_name(obj, full_name: bool = False):
     return output_table_name(obj, full_name=full_name) + "_logged"
 
 
-def get_staging_query_output_table_name(
-    staging_query: api.StagingQuery, full_name: bool = False
-):
-    """generate output table name for staging query job"""
-    __set_name(staging_query, api.StagingQuery, "staging_queries")
-    return output_table_name(staging_query, full_name=full_name)
-
-
 def get_team_conf_from_py(team, key):
     team_module = importlib.import_module(f"teams.{team}")
     return getattr(team_module, key)
 
 
-def get_join_output_table_name(join: api.Join, full_name: bool = False):
-    """generate output table name for join backfill job"""
-    # join sources could also be created inline alongside groupBy file
-    # so we specify fallback module as group_bys
-    if isinstance(join, api.Join):
-        __set_name(join, api.Join, "joins")
-    # set output namespace
-    if not join.metaData.outputNamespace:
-        team_name = join.metaData.name.split(".")[0]
-        namespace = (
-            parse_teams.load_teams(chronon_root_path, print=False)
-            .get(team_name)
-            .outputNamespace
-        )
-        join.metaData.outputNamespace = namespace
-    return output_table_name(join, full_name=full_name)
-
-
 def wait_for_simple_schema(table, lag, start, end):
     if not table:
         return None
@@ -348,9 +305,7 @@ def wait_for_simple_schema(table, lag, start, end):
     clean_name = table_tokens[0]
     subpartition_spec = "/".join(table_tokens[1:]) if len(table_tokens) > 1 else ""
     return {
-        "name": "wait_for_{}_ds{}".format(
-            clean_name, "" if lag == 0 else f"_minus_{lag}"
-        ),
+        "name": "wait_for_{}_ds{}".format(clean_name, "" if lag == 0 else f"_minus_{lag}"),
         "spec": "{}/ds={}{}".format(
             clean_name,
             "{{ ds }}" if lag == 0 else "{{{{ macros.ds_add(ds, -{}) }}}}".format(lag),
@@ -413,7 +368,6 @@ def get_applicable_modes(conf: ChrononJobTypes) -> List[str]:
         modes.append("streaming")
 
     elif isinstance(conf, api.Join):
-
         join = cast(api.Join, conf)
 
         if get_offline_schedule(conf) is not None:
@@ -483,9 +437,9 @@ def chronon_path(file_path: str) -> str:
     conf_types = FOLDER_NAME_TO_CLASS.keys()
     splits = file_path.split("/")
     conf_occurences = [splits.index(typ) for typ in conf_types if typ in splits]
-    assert (
-
-    )
+    assert len(conf_occurences) > 0, (
+        f"Path: {file_path} doesn't contain folder with name among {conf_types}"
+    )
 
     index = min([splits.index(typ) for typ in conf_types if typ in splits])
     rel_path = "/".join(splits[index:])
@@ -535,7 +489,6 @@ def compose(arg, *methods):
     result = [indent + arg]
 
     for method in methods:
-
         method_parts = method.split(" ", 1)
         method = method_parts[0]
 
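Note that the output-table-name helpers (`group_by_output_table_name`, `get_staging_query_output_table_name`, `get_join_output_table_name`) were removed from utils.py in this release; `get_table` now delegates join output names to a private helper in `ai.chronon.join`, and staging queries expose the `table` property attached in staging_query.py above. A sketch of the apparent replacements, inferred only from those imports; `my_join` and `my_staging_query` are placeholder objects:

```python
# Private helper that utils.get_table now uses for join output table names.
from ai.chronon.join import _get_output_table_name as join_output_table_name

full_join_table = join_output_table_name(my_join, True)  # my_join: a placeholder api.Join
staging_table = my_staging_query.table                   # property attached by StagingQuery()
```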
ai/chronon/windows.py CHANGED

@@ -1,4 +1,4 @@
-import
+import gen_thrift.common.ttypes as common
 
 
 def _days(length: int) -> common.Window:
@@ -40,9 +40,7 @@ def _from_str(s: str) -> common.Window:
         elif unit == "h":
             return _hours(length)
         else:
-            raise ValueError(
-                f"Invalid time unit '{unit}'. Must be 'd' for days or 'h' for hours"
-            )
+            raise ValueError(f"Invalid time unit '{unit}'. Must be 'd' for days or 'h' for hours")
 
     except ValueError as e:
         if "invalid literal for int()" in str(e):
{awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA CHANGED

@@ -1,33 +1,57 @@
 Metadata-Version: 2.4
 Name: awx-zipline-ai
-Version: 0.
-Summary: Zipline
-
+Version: 0.3.1
+Summary: CLI tool for the Zipline AI platform
+Author-email: Zipline AI <hello@zipline.ai>
+License: Apache License 2.0
+Project-URL: homepage, https://zipline.ai
+Project-URL: documentation, https://docs.zipline.ai
+Project-URL: github, https://github.com/zipline-ai/chronon/
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: boto3==1.40.26
+Requires-Dist: botocore==1.40.26
+Requires-Dist: cachetools==5.5.2
+Requires-Dist: certifi==2025.8.3
+Requires-Dist: charset-normalizer==3.4.3
+Requires-Dist: click==8.2.1
 Requires-Dist: crcmod==1.7
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: gitdb==4.0.12
+Requires-Dist: gitpython==3.1.45
+Requires-Dist: google-api-core[grpc]==2.25.1
+Requires-Dist: google-auth==2.40.3
+Requires-Dist: google-cloud-bigquery-storage==2.33.0
+Requires-Dist: google-cloud-core==2.4.3
+Requires-Dist: google-cloud-iam==2.19.1
 Requires-Dist: google-cloud-storage==2.19.0
-Requires-Dist: google-
-Requires-Dist:
-
-Requires-Dist:
-
-
-
-
-
-
-
+Requires-Dist: google-crc32c==1.7.1
+Requires-Dist: google-resumable-media==2.7.2
+Requires-Dist: googleapis-common-protos[grpc]==1.70.0
+Requires-Dist: grpc-google-iam-v1==0.14.2
+Requires-Dist: grpcio<=1.74.0,>=1.66.2
+Requires-Dist: grpcio-status<=1.74.0,>=1.62.3
+Requires-Dist: idna==3.10
+Requires-Dist: importlib-resources==6.5.2
+Requires-Dist: jmespath==1.0.1
+Requires-Dist: markdown-it-py==4.0.0
+Requires-Dist: mdurl==0.1.2
+Requires-Dist: proto-plus==1.26.1
+Requires-Dist: protobuf<=6.32.0,>=4.25.5
+Requires-Dist: py4j==0.10.9.7
+Requires-Dist: pyasn1==0.6.1
+Requires-Dist: pyasn1-modules==0.4.2
+Requires-Dist: pygments==2.19.2
+Requires-Dist: pyspark==3.5.4
+Requires-Dist: python-dateutil==2.9.0.post0
+Requires-Dist: requests==2.32.5
+Requires-Dist: rich==14.1.0
+Requires-Dist: rsa==4.9.1
+Requires-Dist: s3transfer==0.13.1
+Requires-Dist: six==1.17.0
+Requires-Dist: smmap==5.0.2
+Requires-Dist: thrift==0.20.0
+Requires-Dist: urllib3==2.5.0
+Requires-Dist: python-dotenv>=1.0.1
 
 ### Chronon Python API
 