awx-zipline-ai 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/repo/zipline_hub.py ADDED
@@ -0,0 +1,277 @@
+ import json
+ import os
+ from datetime import date, datetime, timedelta, timezone
+ from typing import Optional
+
+ import google.auth
+ import requests
+ from google.auth.transport.requests import Request
+ from google.cloud import iam_credentials_v1
+
+
+ class ZiplineHub:
+     def __init__(self, base_url, sa_name=None):
+         if not base_url:
+             raise ValueError("Base URL for ZiplineHub cannot be empty.")
+         self.base_url = base_url
+         # Define both auth attributes up front so the 401 handlers below can test
+         # them no matter which authentication path was taken.
+         self.sa = None
+         self.id_token = None
+         if self.base_url.startswith("https"):
+             print("\n 🔐 Using Google Cloud authentication for ZiplineHub.")
+
+             # First, try to get an ID token from the environment (GitHub Actions)
+             self.id_token = os.getenv("GCP_ID_TOKEN")
+             if self.id_token:
+                 print(" 🔑 Using ID token from environment")
+             elif sa_name is not None:
+                 # Fall back to signing a JWT as the named service account
+                 print(" 🔑 Generating ID token from service account credentials")
+                 credentials, project_id = google.auth.default()
+                 self.project_id = project_id
+                 credentials.refresh(Request())
+
+                 self.sa = f"{sa_name}@{project_id}.iam.gserviceaccount.com"
+             else:
+                 print(" 🔑 Generating ID token from default credentials")
+                 credentials, project_id = google.auth.default()
+                 credentials.refresh(Request())
+                 self.id_token = credentials.id_token
+
39
+     def _generate_jwt_payload(self, service_account_email: str, resource_url: str) -> str:
+         """Generates the JWT payload for a service account.
+
+         Creates a properly formatted JWT payload with the standard claims (iss, sub,
+         aud, iat, exp) needed for IAP authentication.
+
+         Args:
+             service_account_email (str): The service account the JWT is created for.
+             resource_url (str): The scope of the JWT: the URL that the JWT will be
+                 allowed to access.
+
+         Returns:
+             str: JSON string containing the JWT payload with properly formatted claims.
+         """
+         # Current time and expiration time (1 hour later) in UTC
+         iat = datetime.now(tz=timezone.utc)
+         exp = iat + timedelta(seconds=3600)
+
+         # Convert datetime objects to numeric timestamps (seconds since epoch)
+         # as required by the JWT standard (RFC 7519)
+         payload = {
+             "iss": service_account_email,
+             "sub": service_account_email,
+             "aud": resource_url,
+             "iat": int(iat.timestamp()),
+             "exp": int(exp.timestamp()),
+         }
+
+         return json.dumps(payload)
+
69
+     def _sign_jwt(self, target_sa: str, resource_url: str) -> str:
+         """Signs a JWT payload using ADC and the IAM Credentials API.
+
+         Uses Google Cloud's IAM Credentials API to sign a JWT. The caller needs the
+         `iam.serviceAccounts.signJwt` permission (roles/iam.serviceAccountTokenCreator)
+         on the target service account, and the service account itself must be allowed
+         to access the IAP-protected application.
+
+         Args:
+             target_sa (str): The service account the JWT is created for. It must be
+                 authorized to access the IAP-protected application.
+             resource_url (str): The audience and scope of the JWT. This is the URL
+                 of the IAP-protected application.
+
+         Returns:
+             str: A signed JWT that can be used to access IAP-protected apps.
+                 Use it in the Authorization header as: 'Bearer <signed_jwt>'
+         """
+         # Get default credentials from the environment or application credentials
+         source_credentials, project_id = google.auth.default()
+
+         # Initialize the IAM credentials client with the source credentials
+         iam_client = iam_credentials_v1.IAMCredentialsClient(credentials=source_credentials)
+
+         # Generate the service account resource name.
+         # Use '-' as the project placeholder, as required by the API.
+         name = iam_client.service_account_path("-", target_sa)
+
+         # Create and sign the JWT payload
+         payload = self._generate_jwt_payload(target_sa, resource_url)
+
+         request = iam_credentials_v1.SignJwtRequest(
+             name=name,
+             payload=payload,
+         )
+         # Sign the JWT using the IAM Credentials API
+         response = iam_client.sign_jwt(request=request)
+
+         return response.signed_jwt
+
108
+     def call_diff_api(self, names_to_hashes: dict[str, str]) -> Optional[list[str]]:
+         url = f"{self.base_url}/upload/v2/diff"
+
+         diff_request = {"namesToHashes": names_to_hashes}
+         headers = {"Content-Type": "application/json"}
+         if self.base_url.startswith("https") and self.sa is not None:
+             headers["Authorization"] = f"Bearer {self._sign_jwt(self.sa, url)}"
+         elif self.base_url.startswith("https"):
+             headers["Authorization"] = f"Bearer {self.id_token}"
+         try:
+             response = requests.post(url, json=diff_request, headers=headers)
+             response.raise_for_status()
+             diff_response = response.json()
+             return diff_response["diff"]
+         except requests.RequestException as e:
+             if e.response is not None and e.response.status_code == 401 and self.sa is None:
+                 print(
+                     " ❌ Error calling diff API. Unauthorized and no service account provided. Make sure the environment has default credentials set up or provide a service account name as SA_NAME in teams.py."
+                 )
+             elif e.response is not None and e.response.status_code == 401 and self.sa is not None:
+                 print(
+                     f" ❌ Error calling diff API. Unauthorized with provided service account: {self.sa}. Make sure the service account has the 'iap.webServiceVersions.accessViaIap' permission."
+                 )
+             else:
+                 print(f" ❌ Error calling diff API: {e}")
+             raise e
+
135
+     def call_upload_api(self, diff_confs, branch: str):
+         url = f"{self.base_url}/upload/v2/confs"
+
+         upload_request = {
+             "diffConfs": diff_confs,
+             "branch": branch,
+         }
+         headers = {"Content-Type": "application/json"}
+         if self.base_url.startswith("https") and self.sa is not None:
+             headers["Authorization"] = f"Bearer {self._sign_jwt(self.sa, url)}"
+         elif self.base_url.startswith("https"):
+             headers["Authorization"] = f"Bearer {self.id_token}"
+
+         try:
+             response = requests.post(url, json=upload_request, headers=headers)
+             response.raise_for_status()
+             return response.json()
+         except requests.RequestException as e:
+             if e.response is not None and e.response.status_code == 401 and self.sa is None:
+                 print(
+                     " ❌ Error calling upload API. Unauthorized and no service account provided. Make sure the environment has default credentials set up or provide a service account name as SA_NAME in teams.py."
+                 )
+             elif e.response is not None and e.response.status_code == 401 and self.sa is not None:
+                 print(
+                     f" ❌ Error calling upload API. Unauthorized with provided service account: {self.sa}. Make sure the service account has the 'iap.webServiceVersions.accessViaIap' permission."
+                 )
+             else:
+                 print(f" ❌ Error calling upload API: {e}")
+             raise e
+
165
+     def call_schedule_api(self, modes, branch, conf_name, conf_hash):
+         url = f"{self.base_url}/schedule/v2/schedules"
+
+         schedule_request = {
+             "modeSchedules": modes,
+             "branch": branch,
+             "confName": conf_name,
+             "confHash": conf_hash,
+         }
+
+         headers = {"Content-Type": "application/json"}
+         if self.base_url.startswith("https") and self.sa is not None:
+             headers["Authorization"] = f"Bearer {self._sign_jwt(self.sa, url)}"
+         elif self.base_url.startswith("https"):
+             headers["Authorization"] = f"Bearer {self.id_token}"
+
+         try:
+             response = requests.post(url, json=schedule_request, headers=headers)
+             response.raise_for_status()
+             return response.json()
+         except requests.RequestException as e:
+             if e.response is not None and e.response.status_code == 401 and self.sa is None:
+                 print(
+                     " ❌ Error deploying schedule. Unauthorized and no service account provided. Make sure the environment has default credentials set up or provide a service account name as SA_NAME in teams.py."
+                 )
+             elif e.response is not None and e.response.status_code == 401 and self.sa is not None:
+                 print(
+                     f" ❌ Error deploying schedule. Unauthorized with provided service account: {self.sa}. Make sure the service account has the 'iap.webServiceVersions.accessViaIap' permission."
+                 )
+             else:
+                 print(f" ❌ Error deploying schedule: {e}")
+             raise e
+
198
+     def call_sync_api(self, branch: str, names_to_hashes: dict[str, str]) -> Optional[list[str]]:
+         url = f"{self.base_url}/upload/v2/sync"
+
+         sync_request = {
+             "namesToHashes": names_to_hashes,
+             "branch": branch,
+         }
+         headers = {"Content-Type": "application/json"}
+         if self.base_url.startswith("https") and self.sa is not None:
+             headers["Authorization"] = f"Bearer {self._sign_jwt(self.sa, url)}"
+         elif self.base_url.startswith("https"):
+             headers["Authorization"] = f"Bearer {self.id_token}"
+
+         try:
+             response = requests.post(url, json=sync_request, headers=headers)
+             response.raise_for_status()
+             return response.json()
+         except requests.RequestException as e:
+             if e.response is not None and e.response.status_code == 401 and self.sa is None:
+                 print(
+                     " ❌ Error calling sync API. Unauthorized and no service account provided. Make sure the environment has default credentials set up or provide a service account name as SA_NAME in teams.py."
+                 )
+             elif e.response is not None and e.response.status_code == 401 and self.sa is not None:
+                 print(
+                     f" ❌ Error calling sync API. Unauthorized with provided service account: {self.sa}. Make sure the service account has the 'iap.webServiceVersions.accessViaIap' permission."
+                 )
+             else:
+                 print(f" ❌ Error calling sync API: {e}")
+             raise e
+
228
+     def call_workflow_start_api(
+         self,
+         conf_name,
+         mode,
+         branch,
+         user,
+         conf_hash,
+         start=None,
+         end=None,
+         skip_long_running=False,
+     ):
+         url = f"{self.base_url}/workflow/v2/start"
+         end_dt = end.strftime("%Y-%m-%d") if end else date.today().strftime("%Y-%m-%d")
+         start_dt = (
+             start.strftime("%Y-%m-%d")
+             if start
+             else (date.today() - timedelta(days=14)).strftime("%Y-%m-%d")
+         )
+         workflow_request = {
+             "confName": conf_name,
+             "confHash": conf_hash,
+             "mode": mode,
+             "branch": branch,
+             "user": user,
+             "start": start_dt,
+             "end": end_dt,
+             "skipLongRunningNodes": skip_long_running,
+         }
+         headers = {"Content-Type": "application/json"}
+         if self.base_url.startswith("https") and self.sa is not None:
+             headers["Authorization"] = f"Bearer {self._sign_jwt(self.sa, url)}"
+         elif self.base_url.startswith("https"):
+             headers["Authorization"] = f"Bearer {self.id_token}"
+
+         try:
+             response = requests.post(url, json=workflow_request, headers=headers)
+             response.raise_for_status()
+             return response.json()
+         except requests.RequestException as e:
+             if e.response is not None and e.response.status_code == 401 and self.sa is None:
+                 print(
+                     " ❌ Error calling workflow start API. Unauthorized and no service account provided. Make sure the environment has default credentials set up or provide a service account name as SA_NAME in teams.py."
+                 )
+             elif e.response is not None and e.response.status_code == 401 and self.sa is not None:
+                 print(
+                     f" ❌ Error calling workflow start API. Unauthorized with provided service account: {self.sa}. Make sure the service account has the 'iap.webServiceVersions.accessViaIap' permission."
+                 )
+             else:
+                 print(f" ❌ Error calling workflow start API: {e}")
+             raise e
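
A minimal usage sketch of the client above, assuming a hypothetical hub URL, service account name, branch, and conf hashes (only ZiplineHub and its methods come from this file; the payload shape passed to call_upload_api is illustrative):

    from ai.chronon.repo.zipline_hub import ZiplineHub

    # Hypothetical IAP-protected hub; sa_name is the short service account name.
    hub = ZiplineHub("https://hub.example-customer.dev", sa_name="zipline-deploy")

    # Ask the hub which confs differ from what it already has...
    names_to_hashes = {"test.data.group_by_v1": "abc123", "test.data.v1": "def456"}
    changed = hub.call_diff_api(names_to_hashes)

    # ...and upload only those confs for the current branch.
    if changed:
        diff_confs = [{"name": n, "hash": names_to_hashes[n]} for n in changed]  # illustrative shape
        hub.call_upload_api(diff_confs, branch="my-feature-branch")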
ai/chronon/resources/gcp/group_bys/test/data.py ADDED
@@ -0,0 +1,30 @@
+ from sources.test.data import source_v1
+
+ from ai.chronon.group_by import Aggregation, GroupBy, Operation, TimeUnit, Window
+
+ # Define some window sizes to use below
+ window_sizes = [Window(length=day, time_unit=TimeUnit.DAYS) for day in [3, 14, 30]]
+
+ group_by_v1 = GroupBy(
+     backfill_start_date="2023-11-01",
+     sources=[source_v1],
+     keys=["user_id"],  # We are aggregating by user
+     online=True,
+     aggregations=[
+         Aggregation(
+             input_column="purchase_price", operation=Operation.SUM, windows=window_sizes
+         ),  # The sum of purchase prices in various windows
+         Aggregation(
+             input_column="purchase_price", operation=Operation.COUNT, windows=window_sizes
+         ),  # The count of purchases in various windows
+         Aggregation(
+             input_column="purchase_price", operation=Operation.AVERAGE, windows=window_sizes
+         ),  # The average purchase price by user in various windows
+         Aggregation(
+             input_column="purchase_price",
+             operation=Operation.LAST_K(10),
+         ),
+     ],
+     version=0,
+ )
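
The three windowed aggregations above expand into one feature per (operation, window) pair, nine windowed columns in all, plus the unwindowed LAST_K. A sketch of the expected output column names, assuming Chronon's usual {input_column}_{operation}_{window} naming (the compiled GroupBy output is the source of truth):

    # Assumed naming convention; verify against the compiled output.
    expected_columns = [
        f"purchase_price_{op}_{days}d"
        for op in ("sum", "count", "average")
        for days in (3, 14, 30)
    ] + ["purchase_price_last10"]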
ai/chronon/resources/gcp/joins/test/data.py ADDED
@@ -0,0 +1,26 @@
+ from gen_thrift.api.ttypes import EventSource, Source
+ from group_bys.test.data import group_by_v1
+
+ from ai.chronon.join import Join, JoinPart
+ from ai.chronon.query import Query, selects
+
+ """
+ This is the "left side" of the join that will comprise our training set. It is
+ responsible for providing the primary keys and timestamps for which features
+ will be computed.
+ """
+ source = Source(
+     events=EventSource(
+         table="data.checkouts",
+         query=Query(
+             selects=selects("user_id"),  # The primary key used to join various GroupBys together
+             time_column="ts",  # The event time used to compute feature values as-of
+         ),
+     )
+ )
+
+ v1 = Join(
+     left=source,
+     right_parts=[JoinPart(group_by=group_by_v1)],
+     row_ids="user_id",
+     version=0,
+ )
ai/chronon/resources/gcp/sources/test/data.py ADDED
@@ -0,0 +1,26 @@
+ from gen_thrift.api.ttypes import EventSource, Source
+
+ from ai.chronon.query import Query, selects
+
+ """
+ Example: Defining a Chronon Source from a Batch Table
+
+ This example demonstrates how to configure a Chronon `Source` from a BigQuery or
+ Hive table, with a clear event time column and selected fields for downstream
+ feature computation.
+ """
+
+ # Define the EventSource using the batch table and query,
+ # then wrap the EventSource in a Source object.
+ source_v1 = Source(
+     events=EventSource(
+         # The log table in the warehouse with historical purchase events, updated in batch daily
+         table="data.purchases",
+         # The streaming source topic that could be listened to for realtime events;
+         # see the 'returns' GroupBy for an example with a streaming source configured
+         topic=None,
+         query=Query(
+             selects=selects("user_id", "purchase_price"),  # Select the fields we care about
+             time_column="ts",  # The event time
+         ),
+     )
+ )
+
+ # The `source_v1` object can now be used in a Chronon join or pipeline definition
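
Since `topic=None` keeps this source batch-only, here is a hedged sketch of the realtime variant the comment above alludes to, with an illustrative topic name (the query logic stays identical):

    from gen_thrift.api.ttypes import EventSource, Source
    from ai.chronon.query import Query, selects

    source_streaming = Source(
        events=EventSource(
            table="data.purchases",   # batch history of the same events
            topic="purchase-events",  # illustrative topic carrying the realtime feed
            query=Query(
                selects=selects("user_id", "purchase_price"),
                time_column="ts",
            ),
        )
    )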
ai/chronon/resources/gcp/teams.py ADDED
@@ -0,0 +1,58 @@
+ from gen_thrift.api.ttypes import Team
+
+ from ai.chronon.repo.constants import RunMode
+ from ai.chronon.types import ConfigProperties, EnvironmentVariables
+
+ default = Team(
+     description="Default team",
+     email="<responsible-team-email>",
+     outputNamespace="default",
+     conf=ConfigProperties(
+         common={
+             "spark.chronon.table.format_provider.class": "ai.chronon.integrations.cloud_gcp.GcpFormatProvider",
+             "spark.chronon.table_write.format": "iceberg",
+             "spark.sql.defaultCatalog": "bigquery_catalog",
+             "spark.sql.catalog.bigquery_catalog": "ai.chronon.integrations.cloud_gcp.DelegatingBigQueryMetastoreCatalog",
+             "spark.sql.catalog.bigquery_catalog.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
+             "spark.sql.catalog.bigquery_catalog.io-impl": "org.apache.iceberg.io.ResolvingFileIO",
+             "spark.sql.defaultUrlStreamHandlerFactory.enabled": "false",
+             "spark.kryo.registrator": "ai.chronon.integrations.cloud_gcp.ChrononIcebergKryoRegistrator",
+             "spark.chronon.coalesce.factor": "10",
+             "spark.default.parallelism": "10",
+             "spark.sql.shuffle.partitions": "10",
+             # TODO: Please fill in the following values
+             "spark.sql.catalog.bigquery_catalog.warehouse": "gs://zipline-warehouse-<customer_id>/data/tables/",
+             "spark.sql.catalog.bigquery_catalog.gcp.bigquery.location": "<region>",
+             "spark.sql.catalog.bigquery_catalog.gcp.bigquery.project-id": "<project-id>",
+             "spark.chronon.partition.format": "<date-format>",  # ex: "yyyy-MM-dd"
+             "spark.chronon.partition.column": "<partition-column-name>",  # ex: "ds"
+         },
+     ),
+     env=EnvironmentVariables(
+         common={
+             # TODO: Please fill in the following values
+             "CUSTOMER_ID": "<customer_id>",
+             "GCP_PROJECT_ID": "<project-id>",
+             "GCP_REGION": "<region>",
+             "GCP_DATAPROC_CLUSTER_NAME": "<dataproc-cluster-name>",
+             "GCP_BIGTABLE_INSTANCE_ID": "<bigtable-instance-id>",
+             "ARTIFACT_PREFIX": "<customer-artifact-bucket>",
+             "CLOUD_PROVIDER": "<gcp | aws>",
+         },
+     ),
+ )
+
+
+ test = Team(
+     outputNamespace="data",
+     env=EnvironmentVariables(
+         common={}, modeEnvironments={RunMode.BACKFILL: {}, RunMode.UPLOAD: {}}
+     ),
+ )
+
+ team_conf = Team(
+     outputNamespace="test",
+     env=EnvironmentVariables(
+         common={},
+     ),
+ )
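
The empty `modeEnvironments` on the `test` team shows where per-mode overrides go. A hypothetical team that points backfills and uploads at different clusters, using the same placeholder convention as this file (all values illustrative):

    from gen_thrift.api.ttypes import Team
    from ai.chronon.repo.constants import RunMode
    from ai.chronon.types import EnvironmentVariables

    batch_team = Team(
        outputNamespace="batch",
        env=EnvironmentVariables(
            common={"GCP_REGION": "<region>"},
            modeEnvironments={
                RunMode.BACKFILL: {"GCP_DATAPROC_CLUSTER_NAME": "<backfill-cluster>"},
                RunMode.UPLOAD: {"GCP_DATAPROC_CLUSTER_NAME": "<upload-cluster>"},
            },
        ),
    )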
ai/chronon/source.py ADDED
@@ -0,0 +1,86 @@
+ """
+ Wrappers to directly create Source objects.
+ """
+
+ import gen_thrift.api.ttypes as ttypes
+
+
+ def EventSource(
+     table: str,
+     query: ttypes.Query,
+     topic: str = None,
+     is_cumulative: bool = None,
+ ) -> ttypes.Source:
+     """
+     Event Sources represent data that gets generated over time.
+     Typically, but not necessarily, events are logged to message buses like kafka,
+     kinesis or google pub/sub. Fact (fct) tables also make good event sources.
+
+     Attributes:
+
+     - table: Table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
+       Table names can contain subpartition specs, example db.table/system=mobile/currency=USD
+     - topic: The kafka topic the events flow through. The table should contain all the events
+       that historically came through this topic.
+     - query: The logic used to scan both the table and the topic. Contains row level
+       transformations and filtering expressed as Spark SQL statements.
+     - isCumulative: Whether each new hive partition contains not just the current day's events
+       but the entire set of events since the beginning. The key property is that the events
+       are not mutated across partitions.
+     """
+     return ttypes.Source(
+         events=ttypes.EventSource(table=table, topic=topic, query=query, isCumulative=is_cumulative)
+     )
+
+
+ def EntitySource(
+     snapshot_table: str,
+     query: ttypes.Query,
+     mutation_table: str = None,
+     mutation_topic: str = None,
+ ) -> ttypes.Source:
+     """
+     Entity Sources represent data that gets mutated over time - at row level. This is a group
+     of three data elements: snapshotTable, mutationTable and mutationTopic. mutationTable and
+     mutationTopic are only necessary if we are trying to create realtime or point-in-time
+     aggregations over these sources. Entity sources usually map 1:1 with database tables in
+     your OLTP store that typically serve live application traffic. When mutation data is
+     absent they map 1:1 to `dim` tables in a star schema.
+
+     Attributes:
+     - snapshotTable: Snapshot table currently needs to be a 'ds' (date string - yyyy-MM-dd) partitioned hive table.
+     - mutationTable: A table containing all the mutations that historically came through the
+       mutation topic. It needs all the fields present in the snapshot table, PLUS two
+       additional fields:
+       `mutation_time` - milliseconds since epoch of type Long that represents the time of the mutation
+       `is_before` - a boolean flag that represents whether this row contains values before or after the mutation.
+     - mutationTopic: The topic the realtime mutations flow through, used for streaming and
+       point-in-time aggregations.
+     - query: The logic used to scan the snapshot table, mutation table and topic. Contains
+       row level transformations and filtering expressed as Spark SQL statements.
+     """
+     return ttypes.Source(
+         entities=ttypes.EntitySource(
+             snapshotTable=snapshot_table,
+             mutationTable=mutation_table,
+             mutationTopic=mutation_topic,
+             query=query,
+         )
+     )
+
+
+ def JoinSource(join: ttypes.Join, query: ttypes.Query) -> ttypes.Source:
+     """
+     The output of a join can be used as a source for a `GroupBy`.
+     Useful for expressing complex computation in chronon.
+
+     Offline, this simply means that we will compute the necessary date ranges of the join
+     before we start computing the `GroupBy`.
+
+     Online, we will:
+     1. enrich the stream/topic of `join.left` with all the columns defined by the join
+     2. apply the selects & wheres defined in the `query`
+     3. perform aggregations defined in the *downstream* `GroupBy`
+     4. write the result to the kv store.
+     """
+     return ttypes.Source(joinSource=ttypes.JoinSource(join=join, query=query))
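
A short sketch of the wrappers above in use, mirroring the resources/gcp examples (table, topic, and column names are illustrative):

    from ai.chronon.query import Query, selects
    from ai.chronon.source import EntitySource, EventSource

    # Append-only events: a batch table plus an optional realtime topic.
    purchases = EventSource(
        table="data.purchases",   # 'ds'-partitioned table of historical events
        topic="purchase-events",  # illustrative realtime topic
        query=Query(selects=selects("user_id", "purchase_price"), time_column="ts"),
    )

    # Row-level mutating data: daily snapshots plus an optional mutation log.
    users = EntitySource(
        snapshot_table="data.users_snapshot",    # illustrative snapshot table
        mutation_table="data.users_mutations",   # snapshot fields + mutation_time + is_before
        query=Query(selects=selects("user_id", "account_tier")),
    )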