awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/resources/gcp/README.md ADDED
@@ -0,0 +1,174 @@
+
+ # 🧠 Zipline AI: Sample Chronon Project
+
+ This repository demonstrates how to author and run [Chronon](https://chronon.ai) pipelines, including GroupBy and Join definitions, using GCP (BigQuery + Iceberg) as the storage backend.
+
+ Chronon is a unified platform for **feature engineering**, enabling **online and offline consistency**, **real-time feature generation**, and **historical backfills** from a single codebase.
+
+ ---
+
+ ## 📦 Project Structure
+
+ ```bash
+ .
+ ├── group_bys/    # GroupBy definitions (feature aggregations)
+ ├── joins/        # Join definitions (how sources and GroupBys are combined)
+ ├── sources/      # Chronon Source definitions (event tables)
+ ├── compiled/     # Generated configs and outputs
+ ├── teams.py      # Chronon Team configurations
+ └── README.md
+ ```
+
+ ---
+
+ ## 🚀 Quick Start
+
+ ### 🛠️ Requirements
+
+ To get started, make sure you have the following set up:
+
+ - ✅ **Python** 3.11 or higher
+ - ✅ **Zipline CLI** — only needed for **upgrades or downgrades**; install via:
+   ```bash
+   ./zipline-cli-install.sh
+   ```
+ - ✅ **gcloud CLI** — authenticated and configured with the correct GCP project
+ - ✅ **Google Cloud credentials** — either:
+   - Application Default Credentials (ADC)
+   - A service account with access to BigQuery and GCS
+ - ✅ Add this to your shell config (e.g., `.bashrc`, `.zshrc`):
+
+ ```bash
+ # From the same directory as this README
+ export PYTHONPATH="$(pwd):$PYTHONPATH"
+ ```
+
+ ---
+ ## ⚙️ Team Configuration
+
+ Teams define metadata, Spark config, and environment variables.
+
+ In [teams.py](teams.py), fill in the appropriate values in the TODO section.
+
+ Make sure to replace placeholders like `<project-id>` and `<gcs-prefix>` with real values.
+
+ ### Partition format and column
+ Chronon expects tables to be date-partitioned. Specify the partition format and column in [teams.py](teams.py):
+
+ ```python
+ "spark.chronon.partition.format": "<date-format>", # ex: "yyyy-MM-dd",
+ "spark.chronon.partition.column": "<partition-column-name>", # ex: "ds",
+ ```
+
+ ---
+
+ ## 🧪 Compiling
+
+ To generate the configs that the CLI consumes from the Python Chronon objects, run:
+
+ ```bash
+ zipline compile
+ ```
+
+ This will create a `compiled` directory.
+
+ ---
+
+ ## 🧪 Running a GroupBy or Join Backfill
+
+ Run a GroupBy backfill from the CLI:
+
+ ```bash
+ zipline run \
+   --mode backfill \
+   --conf compiled/group_bys/<TEAM_NAME>/<GROUPBY_NAME>
+ ```
+
+ Run a Join backfill from the CLI:
+
+ ```bash
+ zipline run \
+   --mode backfill \
+   --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME>
+ ```
+
+ Results are written to the configured BigQuery + Iceberg tables under the `outputNamespace` (e.g. `default.group_by_v1` or `default.v1`).
+
+ ---
+
+ ## 🧪 Running a GroupBy Upload (GBU) Job
+
+ ```bash
+ zipline run \
+   --mode upload \
+   --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
+   --ds <DATE>
+ ```
+
+ Results are written to the configured BigQuery + Iceberg tables under the `outputNamespace` (e.g. `default.group_by_v1` or `default.v1`).
+
+ ---
+
+ ## 🧪 Uploading GBU Values to the Online KV Store
+
+ ```bash
+ zipline run \
+   --mode upload-to-kv \
+   --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
+   --ds <DATE>
+ ```
+
+ ---
+
+ ## 🧪 Uploading GroupBy or Join Metadata to the Online KV Store for Serving
+
+ GroupBy metadata upload:
+ ```bash
+ zipline run \
+   --mode metadata-upload \
+   --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME>
+ ```
+
+ Join metadata upload:
+ ```bash
+ zipline run \
+   --mode metadata-upload \
+   --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME>
+ ```
+
+ ---
+
+ ## 🧪 Fetching Feature Values from a GroupBy or Join
+
+ **Note:** This is for debugging only, not for production use.
+
+ Fetching from a GroupBy:
+ ```bash
+ zipline run \
+   --mode fetch \
+   --conf compiled/group_bys/<TEAM_NAME>/<GROUP_BY_NAME> \
+   --name <GROUP_BY_NAME> \
+   -k '{"<ENTITY_KEY>": "<VALUE>"}'
+ ```
+
+ Fetching from a Join:
+ ```bash
+ zipline run \
+   --mode fetch \
+   --conf compiled/joins/<TEAM_NAME>/<JOIN_NAME> \
+   --name <JOIN_NAME> \
+   -k '{"<ENTITY_KEY>": "<VALUE>"}'
+ ```
+
+ ---
+
+ ## 📚 Resources
+
+ - [Chronon Docs](https://chronon.ai)
+ - [GitHub](https://github.com/airbnb/chronon)
+ - [Community Slack](https://join.slack.com/t/chrononworkspace/shared_invite/zt-33zbnzwac-ghPZXpYNZJsArXZ5WdBy9g)
+
+ ---
+
+ ## 👋 About
+
+ This project is a reference scaffold for building scalable feature pipelines using Chronon on GCP. It provides end-to-end visibility from source to production features.
File without changes
ai/chronon/resources/gcp/group_bys/test/data.py ADDED
@@ -0,0 +1,34 @@
+
+ from sources.test.data import source_v1
+
+ from ai.chronon.group_by import Aggregation, GroupBy, Operation, TimeUnit, Window
+
+ # Define some window sizes to use below
+ window_sizes = [Window(length=day, time_unit=TimeUnit.DAYS) for day in [3, 14, 30]]
+
+ group_by_v1 = GroupBy(
+     backfill_start_date="2023-11-01",
+     sources=[source_v1],
+     keys=["user_id"],  # We are aggregating by user
+     online=True,
+     aggregations=[
+         Aggregation(
+             input_column="purchase_price",
+             operation=Operation.SUM,
+             windows=window_sizes,
+         ),  # The sum of purchase prices in various windows
+         Aggregation(
+             input_column="purchase_price",
+             operation=Operation.COUNT,
+             windows=window_sizes,
+         ),  # The count of purchases in various windows
+         Aggregation(
+             input_column="purchase_price",
+             operation=Operation.AVERAGE,
+             windows=window_sizes,
+         ),  # The average purchase price per user in various windows
+         Aggregation(
+             input_column="purchase_price",
+             operation=Operation.LAST_K(10),
+         ),  # The last 10 purchase prices for each user
+     ],
+     version=0,
+ )
File without changes
ai/chronon/resources/gcp/joins/test/data.py ADDED
@@ -0,0 +1,30 @@
+ from group_bys.test.data import group_by_v1
+
+ from ai.chronon.api.ttypes import EventSource, Source
+ from ai.chronon.join import Join, JoinPart
+ from ai.chronon.query import Query, selects
+
+ """
+ This is the "left side" of the join that will comprise our training set. It is responsible for providing
+ the primary keys and timestamps for which features will be computed.
+ """
+ source = Source(
+     events=EventSource(
+         table="data.checkouts",
+         query=Query(
+             selects=selects(
+                 "user_id"
+             ),  # The primary key used to join various GroupBys together
+             time_column="ts",
+         ),  # The event time used to compute feature values as-of
+     )
+ )
+
+ v1 = Join(
+     left=source,
+     right_parts=[
+         JoinPart(group_by=group_by_v1)
+     ],
+     row_ids="user_id",
+     version=0,
+ )
File without changes
ai/chronon/resources/gcp/sources/test/data.py ADDED
@@ -0,0 +1,23 @@
+ from ai.chronon.api.ttypes import EventSource, Source
+ from ai.chronon.query import Query, selects
+
+ """
+ Example: Defining a Chronon Source from a Batch Table
+
+ This example demonstrates how to configure a Chronon `Source` from a BigQuery or Hive table,
+ with a clear event time column and selected fields for downstream feature computation.
+ """
+
+ # Define the EventSource using the batch table and query
+ # Wrap the EventSource in a Source object
+
+ source_v1 = Source(
+     events=EventSource(
+         table="data.purchases",  # This points to the log table in the warehouse with historical purchase events, updated in batch daily
+         topic=None,  # See the 'returns' GroupBy for an example that has a streaming source configured. In this case, this would be the streaming source topic that can be listened to for realtime events
+         query=Query(
+             selects=selects("user_id", "purchase_price"),  # Select the fields we care about
+             time_column="ts",  # The event time
+         ),
+     )
+ )
+
+ # The `source_v1` object can now be used in a Chronon join or pipeline definition
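For comparison, the same source can also be written with the thin `ai.chronon.source.EventSource` wrapper (shown further down in this diff), which builds the surrounding `Source` object for you. A minimal sketch, assuming the same table and columns as `source_v1` above:

```python
from ai.chronon.query import Query, selects
from ai.chronon.source import EventSource  # wrapper that returns a ttypes.Source

# Equivalent to source_v1 above, without constructing the ttypes objects by hand
source_v1_alt = EventSource(
    table="data.purchases",
    query=Query(
        selects=selects("user_id", "purchase_price"),
        time_column="ts",
    ),
    topic=None,  # batch-only source; set a topic to enable streaming
)
```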
ai/chronon/resources/gcp/teams.py ADDED
@@ -0,0 +1,70 @@
+ from ai.chronon.api.ttypes import Team
+ from ai.chronon.repo.constants import RunMode
+ from ai.chronon.types import ConfigProperties, EnvironmentVariables
+
+ default = Team(
+     description="Default team",
+     email="<responsible-team-email>",
+     outputNamespace="default",
+     conf=ConfigProperties(
+         common={
+             "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider",
+             "spark.chronon.table_write.format": "iceberg",
+
+             "spark.sql.defaultCatalog": "bigquery_catalog",
+
+             "spark.sql.catalog.bigquery_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog",
+             "spark.sql.catalog.bigquery_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
+             "spark.sql.catalog.bigquery_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO",
+
+             "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false",
+             "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator",
+
+             "spark.chronon.coalesce.factor": "10",
+             "spark.default.parallelism": "10",
+             "spark.sql.shuffle.partitions": "10",
+
+             # TODO: Please fill in the following values
+             "spark.sql.catalog.bigquery_catalog.warehouse": "gs://zipline-warehouse-<customer_id>/data/tables/",
+             "spark.sql.catalog.bigquery_catalog.gcp_location": "<region>",
+             "spark.sql.catalog.bigquery_catalog.gcp_project": "<project-id>",
+             "spark.chronon.partition.format": "<date-format>",  # ex: "yyyy-MM-dd"
+             "spark.chronon.partition.column": "<partition-column-name>",  # ex: "ds"
+         },
+     ),
+     env=EnvironmentVariables(
+         common={
+             "JOB_MODE": "local[*]",
+             "CHRONON_ONLINE_CLASS": "[ONLINE-TODO]your.online.class",
+             "CHRONON_ONLINE_ARGS": "[ONLINE-TODO]args prefixed with -Z become constructor map for your implementation of ai.chronon.online.Api, -Zkv-host=<YOUR_HOST> -Zkv-port=<YOUR_PORT>",
+
+             # TODO: Please fill in the following values
+             "CUSTOMER_ID": "<customer_id>",
+             "GCP_PROJECT_ID": "<project-id>",
+             "GCP_REGION": "<region>",
+             "GCP_DATAPROC_CLUSTER_NAME": "<dataproc-cluster-name>",
+             "GCP_BIGTABLE_INSTANCE_ID": "<bigtable-instance-id>",
+             "ARTIFACT_PREFIX": "<customer-artifact-bucket>",
+             "CLOUD_PROVIDER": "<gcp | aws>",
+         },
+     ),
+ )
+
+
+ test = Team(
+     outputNamespace="data",
+     env=EnvironmentVariables(
+         common={},
+         modeEnvironments={
+             RunMode.BACKFILL: {},
+             RunMode.UPLOAD: {},
+         },
+     ),
+ )
+
+ team_conf = Team(
+     outputNamespace="test",
+     env=EnvironmentVariables(
+         common={},
+     ),
+ )
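The `test` team above shows the shape of `modeEnvironments` with empty overrides. A minimal sketch of how per-mode overrides might be filled in, reusing an environment variable that already appears in the `default` team; the team name and cluster values are hypothetical placeholders, and which variables are honored per mode depends on your deployment:

```python
from ai.chronon.api.ttypes import Team
from ai.chronon.repo.constants import RunMode
from ai.chronon.types import EnvironmentVariables

# Hypothetical team that points backfills at a larger Dataproc cluster,
# while uploads fall back to the common settings.
batch_heavy = Team(
    outputNamespace="data",
    env=EnvironmentVariables(
        common={"GCP_DATAPROC_CLUSTER_NAME": "<default-cluster-name>"},
        modeEnvironments={
            RunMode.BACKFILL: {"GCP_DATAPROC_CLUSTER_NAME": "<large-backfill-cluster-name>"},
            RunMode.UPLOAD: {},
        },
    ),
)
```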
ai/chronon/resources/gcp/zipline-cli-install.sh ADDED
@@ -0,0 +1,54 @@
+ #!/bin/bash
+ # Downloads the zipline_ai wheel for the requested version from the artifact bucket and reinstalls it.
+ # Example: ./zipline-cli-install.sh --artifact_prefix gs://<customer-artifact-bucket> --version <version>
+
+ function print_usage() {
+     echo "Usage: $0 [OPTIONS]"
+     echo "Options:"
+     echo "  --artifact_prefix <gcs_bucket>  Specify the GCS bucket to fetch artifacts from, e.g. \"gs://ck-zipline-artifacts\""
+     echo "  --version <version>             Specify the version you want to run"
+     echo "  -h, --help                      Show this help message"
+ }
+
+ if [ $# -ne 4 ]; then
+     print_usage
+     exit 1
+ fi
+
+ while [[ $# -gt 0 ]]; do
+     case $1 in
+         --artifact_prefix)
+             if [[ -z $2 ]]; then
+                 echo "Error: --artifact_prefix requires a value"
+                 print_usage
+                 exit 1
+             fi
+             ARTIFACT_PREFIX="$2"
+             shift 2
+             ;;
+         -h|--help)
+             print_usage
+             exit 0
+             ;;
+         --version)
+             if [[ -z $2 ]]; then
+                 echo "Error: --version requires a value"
+                 print_usage
+                 exit 1
+             fi
+             VERSION="$2"
+             shift 2
+             ;;
+         *)
+             echo "Unknown option: $1"
+             print_usage
+             exit 1
+             ;;
+     esac
+ done
+
+ gcloud storage cp "${ARTIFACT_PREFIX%/}/release/$VERSION/wheels/zipline_ai-$VERSION-py3-none-any.whl" .
+
+ trap 'rm -f ./zipline_ai-$VERSION-py3-none-any.whl' EXIT
+
+ pip3 uninstall zipline-ai
+
+ pip3 install ./zipline_ai-$VERSION-py3-none-any.whl
ai/chronon/source.py ADDED
@@ -0,0 +1,88 @@
+ """
+ Wrappers to directly create Source objects.
+ """
+
+ import ai.chronon.api.ttypes as ttypes
+
+
+ def EventSource(
+     table: str,
+     query: ttypes.Query,
+     topic: str = None,
+     is_cumulative: bool = None,
+ ) -> ttypes.Source:
+     """
+     Event Sources represent data that gets generated over time.
+     Typically, but not necessarily, it is logged to message buses like Kafka, Kinesis or Google Pub/Sub.
+     Fact (fct) tables also make good event sources.
+
+     Attributes:
+
+     - table: Table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
+              Table names can contain subpartition specs, example db.table/system=mobile/currency=USD
+     - topic: The Kafka topic that carries these events in realtime. The table above contains all the
+              events that historically came through this topic.
+     - query: The logic used to scan both the table and the topic. Contains row-level transformations
+              and filtering expressed as Spark SQL statements.
+     - is_cumulative: Whether each new hive partition contains not just the current day's events but the
+                      entire set of events since the beginning. The key property is that the events are
+                      not mutated across partitions.
+     """
+     return ttypes.Source(
+         events=ttypes.EventSource(
+             table=table, topic=topic, query=query, isCumulative=is_cumulative
+         )
+     )
+
+
+ def EntitySource(
+     snapshot_table: str,
+     query: ttypes.Query,
+     mutation_table: str = None,
+     mutation_topic: str = None,
+ ) -> ttypes.Source:
+     """
+     Entity Sources represent data that gets mutated over time - at the row level. This is a group of three
+     data elements: snapshot_table, mutation_table and mutation_topic. mutation_table and mutation_topic are
+     only necessary if we are trying to create realtime or point-in-time aggregations over these sources.
+     Entity sources usually map 1:1 to database tables in your OLTP store that typically serve live application
+     traffic. When mutation data is absent they map 1:1 to `dim` tables in a star schema.
+
+     Attributes:
+     - snapshot_table: Snapshot table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
+     - mutation_table: A table of row-level mutations to the snapshot table. It needs all the fields present
+                       in the snapshot table, PLUS two additional fields:
+                       `mutation_time` - milliseconds since epoch of type Long that represents the time of the mutation
+                       `is_before` - a boolean flag that represents whether this row contains values before or after the mutation.
+     - mutation_topic: The topic that the mutations are streamed through, required for realtime aggregations.
+     - query: The logic used to scan the snapshot table and the mutation data. Contains row-level transformations
+              and filtering expressed as Spark SQL statements.
+     """
+     return ttypes.Source(
+         entities=ttypes.EntitySource(
+             snapshotTable=snapshot_table,
+             mutationTable=mutation_table,
+             mutationTopic=mutation_topic,
+             query=query,
+         )
+     )
+
+
+ def JoinSource(join: ttypes.Join, query: ttypes.Query) -> ttypes.Source:
+     """
+     The output of a join can be used as a source for a `GroupBy`.
+     Useful for expressing complex computation in Chronon.
+
+     Offline, this simply means that we will compute the necessary date ranges of the join
+     before we start computing the `GroupBy`.
+
+     Online we will:
+         1. enrich the stream/topic of `join.left` with all the columns defined by the join
+         2. apply the selects & wheres defined in the `query`
+         3. perform aggregations defined in the *downstream* `GroupBy`
+         4. write the result to the kv store.
+     """
+     return ttypes.Source(joinSource=ttypes.JoinSource(join=join, query=query))
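A minimal usage sketch for these wrappers; the table, topic and column names below are hypothetical and only illustrate the call shapes defined above:

```python
from ai.chronon.query import Query, selects
from ai.chronon.source import EntitySource, EventSource

# Event source: an append-only log of checkout events.
checkouts = EventSource(
    table="data.checkouts",
    query=Query(selects=selects("user_id", "cart_total"), time_column="ts"),
    topic=None,  # set a Kafka topic here to enable realtime processing
)

# Entity source: a dimension-style users table plus its mutation feed.
users = EntitySource(
    snapshot_table="data.users_snapshot",
    mutation_table="data.users_mutations",   # must carry mutation_time and is_before columns
    mutation_topic="users_mutations_topic",  # only needed for realtime aggregations
    query=Query(selects=selects("user_id", "account_tier")),
)
```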
ai/chronon/staging_query.py ADDED
@@ -0,0 +1,185 @@
+ import inspect
+ import json
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Union
+
+ import ai.chronon.airflow_helpers as airflow_helpers
+ import ai.chronon.api.common.ttypes as common
+ import ai.chronon.api.ttypes as ttypes
+ from ai.chronon.constants import AIRFLOW_DEPENDENCIES_KEY
+
+
+ # Wrapper for EngineType
+ class EngineType:
+     SPARK = ttypes.EngineType.SPARK
+     BIGQUERY = ttypes.EngineType.BIGQUERY
+
+
+ @dataclass
+ class TableDependency:
+     table: str
+     partition_column: Optional[str] = None
+     partition_format: Optional[str] = None
+     additional_partitions: Optional[List[str]] = None
+     offset: Optional[int] = None
+
+     def to_thrift(self):
+         if self.offset is None:
+             raise ValueError(f"Dependency offset for table {self.table} must be specified.")
+         offset_window = common.Window(length=self.offset, timeUnit=common.TimeUnit.DAYS)
+         return common.TableDependency(
+             tableInfo=common.TableInfo(
+                 table=self.table,
+                 partitionColumn=self.partition_column,
+                 partitionFormat=self.partition_format,
+                 partitionInterval=common.Window(1, common.TimeUnit.DAYS),
+             ),
+             startOffset=offset_window,
+             endOffset=offset_window,
+             startCutOff=None,
+             endCutOff=None,
+         )
+
+
+ def StagingQuery(
+     name: str,
+     query: str,
+     version: int,
+     output_namespace: Optional[str] = None,
+     start_partition: Optional[str] = None,
+     table_properties: Optional[Dict[str, str]] = None,
+     setups: Optional[List[str]] = None,
+     engine_type: Optional[EngineType] = None,
+     dependencies: Optional[List[Union[TableDependency, Dict]]] = None,
+     tags: Optional[Dict[str, str]] = None,
+     # execution params
+     offline_schedule: str = "@daily",
+     conf: Optional[common.ConfigProperties] = None,
+     env_vars: Optional[common.EnvironmentVariables] = None,
+     cluster_conf: Optional[common.ClusterConfigProperties] = None,
+     step_days: Optional[int] = None,
+     recompute_days: Optional[int] = None,
+ ) -> ttypes.StagingQuery:
+     """
+     Creates a StagingQuery object for executing arbitrary SQL queries with templated date parameters.
+
+     :param query:
+         Arbitrary Spark query that should be written with template parameters:
+         - `{{ start_date }}`: Initial run uses start_partition, future runs use latest partition + 1 day
+         - `{{ end_date }}`: The end partition of the computing range
+         - `{{ latest_date }}`: End partition independent of the computing range (for cumulative sources)
+         - `{{ max_date(table=namespace.my_table) }}`: Max partition available for a given table
+         These parameters can be modified with offset and bounds:
+         - `{{ start_date(offset=-10, lower_bound='2023-01-01', upper_bound='2024-01-01') }}`
+     :type query: str
+     :param start_partition:
+         On the first run, `{{ start_date }}` will be set to this user-provided start date;
+         future incremental runs will set it to the latest existing partition + 1 day.
+     :type start_partition: str
+     :param setups:
+         Spark SQL setup statements. Used typically to register UDFs.
+     :type setups: List[str]
+     :param engine_type:
+         By default, Spark is the compute engine. You can specify an override (e.g. BigQuery).
+         Use the EngineType class constants: EngineType.SPARK, EngineType.BIGQUERY, etc.
+     :type engine_type: int
+     :param tags:
+         Additional metadata that does not directly affect computation, but is useful for management.
+     :type tags: Dict[str, str]
+     :param offline_schedule:
+         The offline schedule interval for batch jobs. Format examples:
+         '@hourly': '0 * * * *',
+         '@daily': '0 0 * * *',
+         '@weekly': '0 0 * * 0',
+         '@monthly': '0 0 1 * *',
+         '@yearly': '0 0 1 1 *'
+     :type offline_schedule: str
+     :param conf:
+         Configuration properties for the StagingQuery.
+     :type conf: common.ConfigProperties
+     :param env_vars:
+         Environment variables for the StagingQuery.
+     :type env_vars: common.EnvironmentVariables
+     :param cluster_conf:
+         Cluster configuration properties for the StagingQuery.
+     :param step_days:
+         The maximum number of days to process at once.
+     :type step_days: int
+     :param dependencies:
+         List of dependencies for the StagingQuery. Each dependency can be either a TableDependency object
+         or a dictionary with 'name' and 'spec' keys.
+     :type dependencies: List[Union[TableDependency, Dict]]
+     :param recompute_days:
+         Used by the orchestrator to determine how many days are recomputed on each incremental scheduled run.
+         Should be set when the source data is changed in-place (i.e. existing partitions overwritten with new
+         data each day up to X days later) or when you want partially mature aggregations (i.e. a 7 day window,
+         but start computing it from day 1, and refresh it for the next 6 days).
+     :type recompute_days: int
+     :return:
+         A StagingQuery object
+     """
+     # Get caller's filename to assign team
+     team = inspect.stack()[1].filename.split("/")[-2]
+
+     # Create execution info
+     exec_info = common.ExecutionInfo(
+         scheduleCron=offline_schedule,
+         conf=conf,
+         env=env_vars,
+         stepDays=step_days,
+         clusterConf=cluster_conf,
+     )
+
+     airflow_dependencies = []
+
+     if dependencies:
+         for d in dependencies:
+             if isinstance(d, TableDependency):
+                 # Create an Airflow dependency object for the table
+                 airflow_dependency = airflow_helpers.create_airflow_dependency(
+                     d.table,
+                     d.partition_column,
+                     d.additional_partitions,
+                     d.offset,
+                 )
+                 airflow_dependencies.append(airflow_dependency)
+             elif isinstance(d, dict):
+                 # If it's already a dictionary, just append it
+                 airflow_dependencies.append(d)
+             else:
+                 raise ValueError(
+                     "Dependencies must be either TableDependency instances or dictionaries."
+                 )
+
+     custom_json = json.dumps({AIRFLOW_DEPENDENCIES_KEY: airflow_dependencies})
+
+     # Create metadata
+     meta_data = ttypes.MetaData(
+         name=name,
+         outputNamespace=output_namespace,
+         team=team,
+         executionInfo=exec_info,
+         tags=tags,
+         customJson=custom_json,
+         tableProperties=table_properties,
+         version=str(version),
+     )
+
+     # Convert TableDependency instances to their thrift representation
+     thrift_deps = []
+     if dependencies:
+         for d in dependencies:
+             if d and isinstance(d, TableDependency):
+                 thrift_deps.append(d.to_thrift())
+
+     # Create and return the StagingQuery object with camelCase parameter names
+     staging_query = ttypes.StagingQuery(
+         metaData=meta_data,
+         query=query,
+         startPartition=start_partition,
+         setups=setups,
+         engineType=engine_type,
+         tableDependencies=thrift_deps,
+         recomputeDays=recompute_days,
+     )
+
+     return staging_query
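A minimal usage sketch for `StagingQuery` and `TableDependency`; the SQL, table names and schedule below are hypothetical, and the team is inferred from the name of the directory the defining file lives in (per the `inspect.stack()` lookup above):

```python
from ai.chronon.staging_query import StagingQuery, TableDependency

# Hypothetical staging query that denormalizes two warehouse tables into a single
# date-partitioned table that a GroupBy can later read from.
purchases_enriched = StagingQuery(
    name="purchases_enriched",
    version=0,
    output_namespace="data",
    start_partition="2023-11-01",
    query="""
        SELECT p.user_id, p.purchase_price, u.account_tier, p.ds
        FROM data.purchases p
        JOIN data.users u ON p.user_id = u.user_id
        WHERE p.ds BETWEEN '{{ start_date }}' AND '{{ end_date }}'
    """,
    dependencies=[
        # Wait for the upstream partitions, offset by one day
        TableDependency(table="data.purchases", partition_column="ds", offset=1),
        TableDependency(table="data.users", partition_column="ds", offset=1),
    ],
    step_days=30,  # process at most 30 days per run
)
```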