awx-zipline-ai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
ai/chronon/resources/gcp/README.md
ADDED
@@ -0,0 +1,174 @@

# 🧠 Zipline AI: Sample Chronon Project

This repository demonstrates how to author and run [Chronon](https://chronon.ai) pipelines, including GroupBy and Join definitions, using GCP (BigQuery + Iceberg) as the storage backend.

Chronon is a unified platform for **feature engineering**, enabling **online and offline consistency**, **real-time feature generation**, and **historical backfills** from a single codebase.

---

## 📦 Project Structure

```bash
.
├── group_bys/   # GroupBy definitions (feature aggregations)
├── joins/       # Join definitions (how sources and GroupBys are combined)
├── sources/     # Chronon Source definitions (event tables)
├── compiled/    # Generated configs and outputs
├── teams.py     # Chronon Team configurations
└── README.md
```

---

## 🚀 Quick Start

### 🛠️ Requirements

To get started, make sure you have the following set up:

- ✅ **Python** 3.11 or higher
- ✅ **Zipline CLI** — only needed for **upgrades or downgrades**; install via:

  ```bash
  ./zipline-cli-install.sh
  ```

- ✅ gcloud CLI — authenticated and configured with the correct GCP project (see the example after this list)
- ✅ Google Cloud credentials — either:
  - Application Default Credentials (ADC)
  - A service account with access to BigQuery and GCS
- ✅ Add this to your shell config (e.g., `.bashrc`, `.zshrc`):

  ```bash
  # From the same directory as this README
  export PYTHONPATH="$(pwd):$PYTHONPATH"
  ```
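
If you go the Application Default Credentials route, the following gcloud commands are one common way to set it up (a sketch, not the only option; replace `<project-id>` with your own project):

```bash
# Log in and create Application Default Credentials (ADC)
gcloud auth login
gcloud auth application-default login

# Point the gcloud CLI at the GCP project used by this repo
gcloud config set project <project-id>
```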

---

## ⚙️ Team Configuration

Teams define metadata, Spark config, and environment variables.

In [teams.py](teams.py), fill in the appropriate values in the TODO section.

Make sure to replace placeholders like `<project-id>` and `<gcs-prefix>` with real values.

### Partition format and column

Chronon expects tables to be date partitioned. Specify the partition format and the partition column in `teams.py` here:

```python
"spark.chronon.partition.format": "<date-format>",  # ex: "yyyy-MM-dd"
"spark.chronon.partition.column": "<partition-column-name>",  # ex: "ds"
```

---

## 🧪 Compiling

To generate the configs used by the CLI from the Python Chronon objects, run:

```bash
zipline compile
```

This will create a `compiled` directory.

---

## 🧪 Running a GroupBy or Join Backfill

Run a GroupBy backfill from the CLI:

```bash
zipline run \
    --mode backfill \
    --conf compiled/group_bys/<TEAM_NAME>/<GROUPBY_NAME>
```

Run a Join backfill from the CLI:

```bash
zipline run \
    --mode backfill \
    --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME>
```

Results are written to the configured BigQuery + Iceberg tables under the `outputNamespace` (e.g. `default.group_by_v1` or `default.v1`).

---

## 🧪 Running a GroupBy Upload (GBU) Job

```bash
zipline run \
    --mode upload \
    --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
    --ds <DATE>
```

Results are written to the configured BigQuery + Iceberg tables under the `outputNamespace` (e.g. `default.group_by_v1` or `default.v1`).

---

## 🧪 Uploading GBU Values to the Online KV Store

```bash
zipline run \
    --mode upload-to-kv \
    --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
    --ds <DATE>
```

---

## 🧪 Uploading GroupBy or Join Metadata to the Online KV Store for Serving

GroupBy metadata upload:

```bash
zipline run \
    --mode metadata-upload \
    --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME>
```

Join metadata upload:

```bash
zipline run \
    --mode metadata-upload \
    --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME>
```

---

## 🧪 Fetching Feature Values from a GroupBy or Join

**Note:** This is only for debugging purposes. Not for production use.

Fetching from a GroupBy:

```bash
zipline run \
    --mode fetch \
    --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
    --name <GROUP_BY_NAME> \
    -k '{"<ENTITY_KEY>": "<VALUE>"}'
```

Fetching from a Join:

```bash
zipline run \
    --mode fetch \
    --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME> \
    --name <JOIN_NAME> \
    -k '{"<ENTITY_KEY>": "<VALUE>"}'
```

---

## 📚 Resources

- [Chronon Docs](https://chronon.ai)
- [GitHub](https://github.com/airbnb/chronon)
- [Community Slack](https://join.slack.com/t/chrononworkspace/shared_invite/zt-33zbnzwac-ghPZXpYNZJsArXZ5WdBy9g)

---

## 👋 About

This project is a reference scaffold for building scalable feature pipelines using Chronon on GCP. It provides end-to-end visibility from source to production features.
ai/chronon/resources/gcp/group_bys/test/__init__.py
File without changes
ai/chronon/resources/gcp/group_bys/test/data.py
ADDED
@@ -0,0 +1,34 @@

from sources.test.data import source_v1

from ai.chronon.group_by import Aggregation, GroupBy, Operation, TimeUnit, Window

# Define some window sizes to use below
window_sizes = [Window(length=day, time_unit=TimeUnit.DAYS) for day in [3, 14, 30]]

group_by_v1 = GroupBy(
    backfill_start_date="2023-11-01",
    sources=[source_v1],
    keys=["user_id"],  # We are aggregating by user
    online=True,
    aggregations=[
        Aggregation(
            input_column="purchase_price",
            operation=Operation.SUM,
            windows=window_sizes,
        ),  # The sum of purchase prices in various windows
        Aggregation(
            input_column="purchase_price",
            operation=Operation.COUNT,
            windows=window_sizes,
        ),  # The count of purchases in various windows
        Aggregation(
            input_column="purchase_price",
            operation=Operation.AVERAGE,
            windows=window_sizes,
        ),  # The average purchase price by user in various windows
        Aggregation(
            input_column="purchase_price",
            operation=Operation.LAST_K(10),
        ),  # The last 10 purchase prices, collected into a list
    ],
    version=0,
)
ai/chronon/resources/gcp/joins/test/__init__.py
File without changes
ai/chronon/resources/gcp/joins/test/data.py
ADDED
@@ -0,0 +1,30 @@

from group_bys.test.data import group_by_v1

from ai.chronon.api.ttypes import EventSource, Source
from ai.chronon.join import Join, JoinPart
from ai.chronon.query import Query, selects

"""
This is the "left side" of the join that will comprise our training set. It is responsible for providing the
primary keys and timestamps for which features will be computed.
"""
source = Source(
    events=EventSource(
        table="data.checkouts",
        query=Query(
            selects=selects("user_id"),  # The primary key used to join various GroupBys together
            time_column="ts",  # The event time used to compute feature values as-of
        ),
    )
)

v1 = Join(
    left=source,
    right_parts=[JoinPart(group_by=group_by_v1)],
    row_ids="user_id",
    version=0,
)
ai/chronon/resources/gcp/sources/test/__init__.py
File without changes
ai/chronon/resources/gcp/sources/test/data.py
ADDED
@@ -0,0 +1,23 @@

from ai.chronon.api.ttypes import EventSource, Source
from ai.chronon.query import Query, selects

"""
Example: Defining a Chronon Source from a Batch Table

This example demonstrates how to configure a Chronon `Source` from a BigQuery or Hive table,
with a clear event time column and selected fields for downstream feature computation.
"""

# Define the EventSource using the batch table and query,
# then wrap the EventSource in a Source object.

source_v1 = Source(
    events=EventSource(
        # This points to the log table in the warehouse with historical purchase events, updated in batch daily
        table="data.purchases",
        # See the 'returns' GroupBy for an example that has a streaming source configured. In this case, this
        # would be the streaming source topic that can be listened to for realtime events
        topic=None,
        query=Query(
            selects=selects("user_id", "purchase_price"),  # Select the fields we care about
            time_column="ts",  # The event time
        ),
    )
)

# The `source_v1` object can now be used in a Chronon join or pipeline definition
ai/chronon/resources/gcp/teams.py
ADDED
@@ -0,0 +1,70 @@
from ai.chronon.api.ttypes import Team
from ai.chronon.repo.constants import RunMode
from ai.chronon.types import ConfigProperties, EnvironmentVariables

default = Team(
    description="Default team",
    email="<responsible-team-email>",
    outputNamespace="default",
    conf=ConfigProperties(
        common={
            "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider",
            "spark.chronon.table_write.format": "iceberg",

            "spark.sql.defaultCatalog": "bigquery_catalog",

            "spark.sql.catalog.bigquery_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog",
            "spark.sql.catalog.bigquery_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
            "spark.sql.catalog.bigquery_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO",

            "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false",
            "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator",

            "spark.chronon.coalesce.factor": "10",
            "spark.default.parallelism": "10",
            "spark.sql.shuffle.partitions": "10",

            # TODO: Please fill in the following values
            "spark.sql.catalog.bigquery_catalog.warehouse": "gs://zipline-warehouse-<customer_id>/data/tables/",
            "spark.sql.catalog.bigquery_catalog.gcp_location": "<region>",
            "spark.sql.catalog.bigquery_catalog.gcp_project": "<project-id>",
            "spark.chronon.partition.format": "<date-format>",  # ex: "yyyy-MM-dd"
            "spark.chronon.partition.column": "<partition-column-name>",  # ex: "ds"
        },
    ),
    env=EnvironmentVariables(
        common={
            "JOB_MODE": "local[*]",
            "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",
            "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>",

            # TODO: Please fill in the following values
            "CUSTOMER_ID": "<customer_id>",
            "GCP_PROJECT_ID": "<project-id>",
            "GCP_REGION": "<region>",
            "GCP_DATAPROC_CLUSTER_NAME": "<dataproc-cluster-name>",
            "GCP_BIGTABLE_INSTANCE_ID": "<bigtable-instance-id>",
            "ARTIFACT_PREFIX": "<customer-artifact-bucket>",
            "CLOUD_PROVIDER": "<gcp | aws>",
        },
    ),
)


test = Team(
    outputNamespace="data",
    env=EnvironmentVariables(
        common={},
        modeEnvironments={
            RunMode.BACKFILL: {},
            RunMode.UPLOAD: {},
        },
    ),
)

team_conf = Team(
    outputNamespace="test",
    env=EnvironmentVariables(
        common={},
    ),
)
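
As a sketch of how `modeEnvironments` in the `test` team above can be used, the snippet below varies environment variables per run mode. The team name `ml` and the variable `EXECUTOR_MEMORY` are illustrative assumptions, not settings defined by this package:

```python
from ai.chronon.api.ttypes import Team
from ai.chronon.repo.constants import RunMode
from ai.chronon.types import EnvironmentVariables

# Hypothetical team showing per-mode environment overrides
ml = Team(
    outputNamespace="ml",
    env=EnvironmentVariables(
        common={"GCP_REGION": "<region>"},  # applied to every run mode
        modeEnvironments={
            RunMode.BACKFILL: {"EXECUTOR_MEMORY": "8G"},  # illustrative: heavier backfills
            RunMode.UPLOAD: {"EXECUTOR_MEMORY": "4G"},    # illustrative: lighter uploads
        },
    ),
)
```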
ai/chronon/resources/gcp/zipline-cli-install.sh
ADDED
@@ -0,0 +1,54 @@
#!/bin/bash

function print_usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo "  --artifact_prefix <gcs_bucket>  Specify the gcs bucket to upload artifacts to e.g. \"gs://ck-zipline-artifacts\""
  echo "  --version <version>             Specify the version you want to run"
  echo "  -h, --help                      Show this help message"
}

if [ $# -ne 4 ]; then
  print_usage
  exit 1
fi

while [[ $# -gt 0 ]]; do
  case $1 in
    --artifact_prefix)
      if [[ -z $2 ]]; then
        echo "Error: --artifact_prefix requires a value"
        print_usage
        exit 1
      fi
      ARTIFACT_PREFIX="$2"
      shift 2
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    --version)
      if [[ -z $2 ]]; then
        echo "Error: --version requires a value"
        print_usage
        exit 1
      fi
      VERSION="$2"
      shift 2
      ;;
    *)
      echo "Unknown option: $1"
      print_usage
      exit 1
      ;;
  esac
done

gcloud storage cp "${ARTIFACT_PREFIX%/}/release/$VERSION/wheels/zipline_ai-$VERSION-py3-none-any.whl" .

trap 'rm -f ./zipline_ai-$VERSION-py3-none-any.whl' EXIT

pip3 uninstall zipline-ai

pip3 install ./zipline_ai-$VERSION-py3-none-any.whl
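
Given the options the script parses, a typical invocation would look something like this (the bucket is a placeholder; the version shown matches this package release):

```bash
# Download the requested wheel from the artifact bucket and (re)install it
./zipline-cli-install.sh \
  --artifact_prefix gs://<customer-artifact-bucket> \
  --version 0.2.0
```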
ai/chronon/source.py
ADDED
@@ -0,0 +1,88 @@
"""
Wrappers to directly create Source objects.
"""

import ai.chronon.api.ttypes as ttypes


def EventSource(
    table: str,
    query: ttypes.Query,
    topic: str = None,
    is_cumulative: bool = None,
) -> ttypes.Source:
    """
    Event Sources represent data that gets generated over time.
    Typically, but not necessarily, logged to message buses like kafka, kinesis or google pub/sub.
    Fact tables are also valid event sources.

    Attributes:

    - table: Table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
             Table names can contain subpartition specs, example db.table/system=mobile/currency=USD
    - topic: Topic is a kafka topic. The table contains all the events that historically came through this topic.
    - query: The logic used to scan both the table and the topic. Contains row level transformations
             and filtering expressed as Spark SQL statements.
    - isCumulative: Whether each new hive partition contains not just the current day's events but the entire set
                    of events since the beginning. The key property is that the events are not mutated
                    across partitions.
    """
    return ttypes.Source(
        events=ttypes.EventSource(
            table=table, topic=topic, query=query, isCumulative=is_cumulative
        )
    )


def EntitySource(
    snapshot_table: str,
    query: ttypes.Query,
    mutation_table: str = None,
    mutation_topic: str = None,
) -> ttypes.Source:
    """
    Entity Sources represent data that gets mutated over time - at row-level. This is a group of three data elements:
    snapshotTable, mutationTable and mutationTopic. mutationTable and mutationTopic are only necessary if we are trying
    to create realtime or point-in-time aggregations over these sources. Entity sources usually map 1:1 with database
    tables in your OLTP store that typically serve live application traffic. When mutation data is absent they map 1:1
    to `dim` tables in star schema.

    Attributes:
    - snapshotTable: Snapshot table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
    - mutationTable: A table containing all the mutations that historically came through the mutation topic.
                     We need all the fields present in the snapshot table, PLUS two additional fields,
                     `mutation_time` - milliseconds since epoch of type Long that represents the time of the mutation
                     `is_before` - a boolean flag that represents whether
                     this row contains values before or after the mutation.
    - mutationTopic: The kafka topic that carries the mutations in real time.
    - query: The logic used to scan the snapshot table, mutation table and topic. Contains row level transformations
             and filtering expressed as Spark SQL statements.
    """
    return ttypes.Source(
        entities=ttypes.EntitySource(
            snapshotTable=snapshot_table,
            mutationTable=mutation_table,
            mutationTopic=mutation_topic,
            query=query,
        )
    )


def JoinSource(join: ttypes.Join, query: ttypes.Query) -> ttypes.Source:
    """
    The output of a join can be used as a source for `GroupBy`.
    Useful for expressing complex computation in chronon.

    Offline this simply means that we will compute the necessary date ranges of the join
    before we start computing the `GroupBy`.

    Online we will:
    1. enrich the stream/topic of `join.left` with all the columns defined by the join
    2. apply the selects & wheres defined in the `query`
    3. perform aggregations defined in the *downstream* `GroupBy`
    4. write the result to the kv store.
    """
    return ttypes.Source(joinSource=ttypes.JoinSource(join=join, query=query))
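
For reference, a minimal usage sketch of these wrappers, reusing the query shape from the sample configs above. The entity table and its columns are hypothetical examples, not tables this package ships:

```python
from ai.chronon.query import Query, selects
from ai.chronon.source import EntitySource, EventSource

# Batch-only event source, in the same spirit as sources/test/data.py
purchases = EventSource(
    table="data.purchases",
    query=Query(selects=selects("user_id", "purchase_price"), time_column="ts"),
    topic=None,  # no streaming topic: batch-only
)

# Entity source over a daily snapshot of a hypothetical dimension table
users = EntitySource(
    snapshot_table="data.users_snapshot",  # hypothetical snapshot table
    query=Query(selects=selects("user_id", "account_age_days")),  # hypothetical columns
)
```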
ai/chronon/staging_query.py
ADDED
@@ -0,0 +1,185 @@

import inspect
import json
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

import ai.chronon.airflow_helpers as airflow_helpers
import ai.chronon.api.common.ttypes as common
import ai.chronon.api.ttypes as ttypes
from ai.chronon.constants import AIRFLOW_DEPENDENCIES_KEY


# Wrapper for EngineType
class EngineType:
    SPARK = ttypes.EngineType.SPARK
    BIGQUERY = ttypes.EngineType.BIGQUERY


@dataclass
class TableDependency:
    table: str
    partition_column: Optional[str] = None
    partition_format: Optional[str] = None
    additional_partitions: Optional[List[str]] = None
    offset: Optional[int] = None

    def to_thrift(self):
        if self.offset is None:
            raise ValueError(f"Dependency offset for table {self.table} must be specified.")
        offset_window = common.Window(length=self.offset, timeUnit=common.TimeUnit.DAYS)
        return common.TableDependency(
            tableInfo=common.TableInfo(
                table=self.table,
                partitionColumn=self.partition_column,
                partitionFormat=self.partition_format,
                partitionInterval=common.Window(1, common.TimeUnit.DAYS),
            ),
            startOffset=offset_window,
            endOffset=offset_window,
            startCutOff=None,
            endCutOff=None,
        )


def StagingQuery(
    name: str,
    query: str,
    version: int,
    output_namespace: Optional[str] = None,
    start_partition: Optional[str] = None,
    table_properties: Optional[Dict[str, str]] = None,
    setups: Optional[List[str]] = None,
    engine_type: Optional[EngineType] = None,
    dependencies: Optional[List[Union[TableDependency, Dict]]] = None,
    tags: Optional[Dict[str, str]] = None,
    # execution params
    offline_schedule: str = "@daily",
    conf: Optional[common.ConfigProperties] = None,
    env_vars: Optional[common.EnvironmentVariables] = None,
    cluster_conf: Optional[common.ClusterConfigProperties] = None,
    step_days: Optional[int] = None,
    recompute_days: Optional[int] = None,
) -> ttypes.StagingQuery:
    """
    Creates a StagingQuery object for executing arbitrary SQL queries with templated date parameters.

    :param query:
        Arbitrary spark query that should be written with template parameters:
        - `{{ start_date }}`: Initial run uses start_partition, future runs use latest partition + 1 day
        - `{{ end_date }}`: The end partition of the computing range
        - `{{ latest_date }}`: End partition independent of the computing range (for cumulative sources)
        - `{{ max_date(table=namespace.my_table) }}`: Max partition available for a given table
        These parameters can be modified with offset and bounds:
        - `{{ start_date(offset=-10, lower_bound='2023-01-01', upper_bound='2024-01-01') }}`
    :type query: str
    :param start_partition:
        On the first run, `{{ start_date }}` will be set to this user provided start date,
        future incremental runs will set it to the latest existing partition + 1 day.
    :type start_partition: str
    :param setups:
        Spark SQL setup statements. Used typically to register UDFs.
    :type setups: List[str]
    :param engine_type:
        By default, spark is the compute engine. You can specify an override (eg. bigquery, etc.)
        Use the EngineType class constants: EngineType.SPARK, EngineType.BIGQUERY, etc.
    :type engine_type: int
    :param tags:
        Additional metadata that does not directly affect computation, but is useful for management.
    :type tags: Dict[str, str]
    :param offline_schedule:
        The offline schedule interval for batch jobs. Format examples:
        '@hourly': '0 * * * *',
        '@daily': '0 0 * * *',
        '@weekly': '0 0 * * 0',
        '@monthly': '0 0 1 * *',
        '@yearly': '0 0 1 1 *'
    :type offline_schedule: str
    :param conf:
        Configuration properties for the StagingQuery.
    :type conf: common.ConfigProperties
    :param env_vars:
        Environment variables for the StagingQuery.
    :type env_vars: common.EnvironmentVariables
    :param cluster_conf:
        Cluster configuration properties for the StagingQuery.
    :param step_days:
        The maximum number of days to process at once.
    :type step_days: int
    :param dependencies:
        List of dependencies for the StagingQuery. Each dependency can be either a TableDependency object
        or a dictionary with 'name' and 'spec' keys.
    :type dependencies: List[Union[TableDependency, Dict]]
    :param recompute_days:
        Used by orchestrator to determine how many days are recomputed on each incremental scheduled run. Should be
        set when the source data is changed in-place (i.e. existing partitions overwritten with new data each day up to
        X days later) or when you want partially mature aggregations (i.e. a 7 day window, but start computing it from
        day 1, and refresh it for the next 6 days).
    :type recompute_days: int
    :return:
        A StagingQuery object
    """
    # Get caller's filename to assign team
    team = inspect.stack()[1].filename.split("/")[-2]

    # Create execution info
    exec_info = common.ExecutionInfo(
        scheduleCron=offline_schedule,
        conf=conf,
        env=env_vars,
        stepDays=step_days,
        clusterConf=cluster_conf,
    )

    airflow_dependencies = []

    if dependencies:
        for d in dependencies:
            if isinstance(d, TableDependency):
                # Create an Airflow dependency object for the table
                airflow_dependency = airflow_helpers.create_airflow_dependency(
                    d.table,
                    d.partition_column,
                    d.additional_partitions,
                    d.offset,
                )
                airflow_dependencies.append(airflow_dependency)
            elif isinstance(d, dict):
                # If it's already a dictionary, just append it
                airflow_dependencies.append(d)
            else:
                raise ValueError(
                    "Dependencies must be either TableDependency instances or dictionaries."
                )

    custom_json = json.dumps({AIRFLOW_DEPENDENCIES_KEY: airflow_dependencies})

    # Create metadata
    meta_data = ttypes.MetaData(
        name=name,
        outputNamespace=output_namespace,
        team=team,
        executionInfo=exec_info,
        tags=tags,
        customJson=custom_json,
        tableProperties=table_properties,
        version=str(version),
    )

    thrift_deps = []
    if dependencies and len(dependencies) > 0:
        for d in dependencies:
            if d and isinstance(d, TableDependency):
                thrift_deps.append(d.to_thrift())

    # Create and return the StagingQuery object with camelCase parameter names
    staging_query = ttypes.StagingQuery(
        metaData=meta_data,
        query=query,
        startPartition=start_partition,
        setups=setups,
        engineType=engine_type,
        tableDependencies=thrift_deps,
        recomputeDays=recompute_days,
    )

    return staging_query
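
A hedged usage sketch of `StagingQuery`, combining the templated date parameters described in the docstring with a `TableDependency`. The table names are hypothetical, and the team would be inferred from the directory the definition lives in:

```python
from ai.chronon.staging_query import StagingQuery, TableDependency

# Hypothetical: de-duplicate raw checkout events into a clean staging table
checkouts_cleaned = StagingQuery(
    name="checkouts_cleaned",
    version=0,
    output_namespace="data",
    start_partition="2023-11-01",
    query="""
        SELECT user_id, purchase_price, ts, ds
        FROM data.checkouts_raw
        WHERE ds BETWEEN '{{ start_date }}' AND '{{ end_date }}'
    """,
    dependencies=[
        # Wait for the upstream table's partition from one day back before running
        TableDependency(table="data.checkouts_raw", partition_column="ds", offset=1),
    ],
    step_days=30,
)
```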