awx-zipline-ai 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/join.py ADDED
@@ -0,0 +1,580 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import copy
+ import gc
+ import importlib
+ import logging
+ from collections import Counter
+ from typing import Dict, List, Tuple, Union
+
+ import gen_thrift.api.ttypes as api
+ import gen_thrift.common.ttypes as common
+
+ import ai.chronon.repo.extract_objects as eo
+ import ai.chronon.utils as utils
+ from ai.chronon.cli.compile import parse_teams
+
+ logging.basicConfig(level=logging.INFO)
+
+
+ def _get_output_table_name(join: api.Join, full_name: bool = False):
+     """Generate the output table name for a join backfill job."""
+     # join sources could also be created inline alongside a groupBy file,
+     # so we specify group_bys as the fallback module
+     if isinstance(join, api.Join):
+         utils.__set_name(join, api.Join, "joins")
+     # set the output namespace from teams config if the join doesn't define one
+     if not join.metaData.outputNamespace:
+         team_name = join.metaData.name.split(".")[0]
+         namespace = (
+             parse_teams.load_teams(utils.chronon_root_path, print=False)
+             .get(team_name)
+             .outputNamespace
+         )
+         join.metaData.outputNamespace = namespace
+     return utils.output_table_name(join, full_name=full_name)
+
+
+ def JoinPart(
+     group_by: api.GroupBy,
+     key_mapping: Dict[str, str] = None,
+     prefix: str = None,
+     tags: Dict[str, str] = None,
+ ) -> api.JoinPart:
+     """
+     Specifies HOW to join the `left` side of a Join with GroupBy's.
+
+     :param group_by:
+         The GroupBy object to join with. Keys on the left are used to equi-join with keys on the right.
+         When left is entities, all GroupBy's are computed as of midnight.
+         When left is events, we do a point-in-time join when right.accuracy == TEMPORAL OR right.source.topic != null.
+     :type group_by: ai.chronon.api.GroupBy
+     :param key_mapping:
+         Names of keys don't always match on the left and right; this mapping tells us how to map them when they don't.
+     :type key_mapping: Dict[str, str]
+     :param prefix:
+         All output columns of the groupBy will be prefixed with this string. This is used when you need to join
+         the same groupBy more than once with `left`. Say on the left you have seller and buyer, on the groupBy you
+         have a user's avg_price, and you want to join the left (seller, buyer) to get (seller_avg_price,
+         buyer_avg_price); you would use the key_mapping and prefix parameters.
+     :param tags:
+         Additional metadata about the JoinPart that you wish to track. Does not affect computation.
+     :type tags: Dict[str, str]
+     :return:
+         JoinPart specifies how the left side of a join, or the query in an online setting, joins with right-side
+         components like GroupBys.
+     """
+
+     assert isinstance(group_by, api.GroupBy), (
+         f"Expecting GroupBy. But found {type(group_by).__name__}"
+     )
+
+     # saved so it can be restored for the next run
+     import_copy = __builtins__["__import__"]
+     # get group_by's module info from the garbage collector
+     gc.collect()
+
+     group_by_module_name = None
+     for ref in gc.get_referrers(group_by):
+         if (
+             isinstance(
+                 ref, dict
+             )  # Attaching methods to GroupBy adds references in GC; need to filter those out
+             and "__name__" in ref
+             and ref["__name__"].startswith("group_bys")
+         ):
+             group_by_module_name = ref["__name__"]
+             break
+
+     if group_by_module_name:
+         logging.debug(
+             "group_by's module info from garbage collector {}".format(group_by_module_name)
+         )
+         group_by_module = importlib.import_module(group_by_module_name)
+         __builtins__["__import__"] = eo.import_module_set_name(group_by_module, api.GroupBy)
+     else:
+         if not group_by.metaData.name:
+             logging.error("No group_by file or custom group_by name found")
+             raise ValueError(
+                 "[GroupBy] Must specify a group_by name if group_by is not defined in a separate file. "
+                 "You may pass it in via GroupBy.name. \n"
+             )
+
+     if key_mapping:
+         utils.check_contains(
+             key_mapping.values(), group_by.keyColumns, "key", group_by.metaData.name
+         )
+
+     join_part = api.JoinPart(groupBy=group_by, keyMapping=key_mapping, prefix=prefix)
+     join_part.tags = tags
+     # reset before the next run
+     __builtins__["__import__"] = import_copy
+     return join_part
+
+
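# Example (editor's annotation, not part of the wheel): a minimal sketch of joining the
# same GroupBy twice under different prefixes, per the docstring above. A real GroupBy
# would normally live under group_bys/; the bare thrift object is a hypothetical stand-in.
import gen_thrift.api.ttypes as api
from ai.chronon.join import JoinPart

payments_gb = api.GroupBy(
    metaData=api.MetaData(name="payments.user_payments"),
    keyColumns=["user"],
)
buyer_part = JoinPart(
    group_by=payments_gb,
    key_mapping={"buyer": "user"},   # map the left's "buyer" column onto the GroupBy key
    prefix="buyer",
)
seller_part = JoinPart(
    group_by=payments_gb,
    key_mapping={"seller": "user"},  # same GroupBy, joined via the "seller" column
    prefix="seller",
)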
+ FieldsType = List[Tuple[str, api.TDataType]]
+
+
+ class DataType:
+     """
+     Helper class to generate data types for declaring schemas.
+     This supports primitive types like numerics, string etc., and complex
+     types like Map, List, Struct etc.
+     """
+
+     BOOLEAN = api.TDataType(api.DataKind.BOOLEAN)
+     SHORT = api.TDataType(api.DataKind.SHORT)
+     INT = api.TDataType(api.DataKind.INT)
+     LONG = api.TDataType(api.DataKind.LONG)
+     FLOAT = api.TDataType(api.DataKind.FLOAT)
+     DOUBLE = api.TDataType(api.DataKind.DOUBLE)
+     STRING = api.TDataType(api.DataKind.STRING)
+     BINARY = api.TDataType(api.DataKind.BINARY)
+
+     # Types unsupported by Avro. See AvroConversions.scala#fromChrononSchema
+     # BYTE = api.TDataType(api.DataKind.BYTE)
+     # DATE = api.TDataType(api.DataKind.DATE)
+     # TIMESTAMP = api.TDataType(api.DataKind.TIMESTAMP)
+
+     def MAP(key_type: api.TDataType, value_type: api.TDataType) -> api.TDataType:
+         assert key_type == api.TDataType(api.DataKind.STRING), (
+             "key_type has to be STRING for MAP types"
+         )
+
+         return api.TDataType(
+             api.DataKind.MAP,
+             params=[api.DataField("key", key_type), api.DataField("value", value_type)],
+         )
+
+     def LIST(elem_type: api.TDataType) -> api.TDataType:
+         return api.TDataType(api.DataKind.LIST, params=[api.DataField("elem", elem_type)])
+
+     def STRUCT(name: str, *fields: FieldsType) -> api.TDataType:
+         return api.TDataType(
+             api.DataKind.STRUCT,
+             params=[api.DataField(name, data_type) for (name, data_type) in fields],
+             name=name,
+         )
+
+
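# Example (editor's annotation): declaring a nested value schema with the helpers above.
# Note the MAP assertion: key_type must be STRING.
from ai.chronon.join import DataType

profile_struct = DataType.STRUCT(
    "Profile",
    ("age", DataType.INT),
    ("scores", DataType.LIST(DataType.DOUBLE)),
    ("attributes", DataType.MAP(DataType.STRING, DataType.STRING)),
)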
+ def ExternalSource(
+     name: str,
+     team: str,
+     key_fields: FieldsType,
+     value_fields: FieldsType,
+ ) -> api.ExternalSource:
+     """
+     External sources are online-only data sources. During fetching, using the
+     chronon java client, they consume a Request containing a key map
+     (name string to value) and produce a Response containing a value map.
+
+     This is primarily used in Joins. We also expose a fetchExternal method in the
+     java client library that can be used to fetch a batch of ExternalSource
+     requests efficiently.
+
+     Internally, Chronon will batch these requests to the service and parallelize
+     fetching from different services, while de-duplicating within a batch of
+     join requests.
+
+     The implementation of how to fetch is an `ExternalSourceHandler` in the
+     scala/java api that needs to be registered, while implementing
+     ai.chronon.online.Api, under the name used in the ExternalSource. This is
+     meant for re-usability of external source definitions.
+
+     :param name: name of the external source to fetch from. Should match
+         the name in the registry.
+     :param team: team that owns the external source.
+     :param key_fields: List of tuples of string and DataType. This is what
+         will be given to the ExternalSource handler registered in the Java API.
+         Eg., `[('key1', DataType.INT), ('key2', DataType.STRING)]`
+     :param value_fields: List of tuples of string and DataType. This is what
+         the ExternalSource handler will respond with::
+
+             [
+                 ('value0', DataType.INT),
+                 ('value1', DataType.MAP(DataType.STRING, DataType.LONG)),
+                 ('value2', DataType.STRUCT(
+                     'Context',
+                     ('field1', DataType.INT),
+                     ('field2', DataType.DOUBLE)
+                 ))
+             ]
+
+     """
+     assert name != "contextual", "Please use `ContextualSource`"
+     return api.ExternalSource(
+         metadata=api.MetaData(name=name, team=team),
+         keySchema=DataType.STRUCT(f"ext_{name}_keys", *key_fields),
+         valueSchema=DataType.STRUCT(f"ext_{name}_values", *value_fields),
+     )
+
+
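# Example (editor's annotation): a hypothetical external source. The name must match the
# ExternalSourceHandler registered in the Java/Scala ai.chronon.online.Api implementation.
from ai.chronon.join import DataType, ExternalSource

user_info = ExternalSource(
    name="user_info",
    team="profiles",
    key_fields=[("user", DataType.STRING)],
    value_fields=[
        ("age", DataType.INT),
        ("engagement_scores", DataType.MAP(DataType.STRING, DataType.LONG)),
    ],
)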
+ def ContextualSource(fields: FieldsType, team="default") -> api.ExternalSource:
+     """
+     Contextual source values are passed along for logging. No external request is
+     actually made.
+     """
+     return api.ExternalSource(
+         metadata=api.MetaData(name="contextual", team=team),
+         keySchema=DataType.STRUCT("contextual_keys", *fields),
+         valueSchema=DataType.STRUCT("contextual_values", *fields),
+     )
+
+
+ def ExternalPart(
+     source: api.ExternalSource, key_mapping: Dict[str, str] = None, prefix: str = None
+ ) -> api.ExternalPart:
+     """
+     Used to describe which ExternalSources to pull features from while fetching
+     online. This data also goes into logs based on the sample percent.
+
+     Just as in JoinPart, key_mapping is used to map the join left's keys to the
+     external source's keys. "vendor" and "buyer" on the left side (query map)
+     could both map to a "user" in an account-data external source. You would
+     create one ExternalPart for vendor with params
+     `(key_mapping={vendor: user}, prefix=vendor)`
+     and another for buyer.
+
+     This doesn't have any implications offline besides logging. "right_parts"
+     can be both backfilled and logged, whereas "external_parts" can only be
+     logged. If you need the ability to backfill an external source, look into
+     creating an EntitySource with mutation data for point-in-time correctness.
+
+     :param source: External source to join with
+     :param key_mapping: How to map the keys from the query/left side to the
+         source
+     :param prefix: Sometimes you want to use the same source to fetch data for
+         different entities in the query. Eg., a transaction
+         between a buyer and a seller might query a "user information"
+         service/source that has information about both buyer &
+         seller
+     """
+     return api.ExternalPart(source=source, keyMapping=key_mapping, prefix=prefix)
+
+
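# Example (editor's annotation): fetching the hypothetical `user_info` source once per
# left-side entity, as the docstring describes, plus a contextual passthrough field.
from ai.chronon.join import ContextualSource, DataType, ExternalPart

vendor_part = ExternalPart(user_info, key_mapping={"vendor": "user"}, prefix="vendor")
buyer_info_part = ExternalPart(user_info, key_mapping={"buyer": "user"}, prefix="buyer")
# contextual fields are echoed from the request for logging; nothing is fetched
request_ctx = ExternalPart(ContextualSource(fields=[("request_id", DataType.STRING)]))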
+ def LabelParts(
+     labels: List[api.JoinPart],
+     left_start_offset: int,
+     left_end_offset: int,
+     label_offline_schedule: str = "@daily",
+ ) -> api.LabelParts:
+     """
+     Used to describe labels in a join. A label part can be viewed as a regular join part, but it represents
+     label data instead of regular feature data. Once labels are mature, the label join job joins
+     labels with features in the training window the user specified within the label GroupBy-s.
+
+     Since the label join job runs continuously based on the schedule, multiple labels could be generated, each with
+     a different label_ds or label version. The label join job keeps all computed label versions available, as well
+     as a view of the latest version for easy label retrieval.
+
+     The LabelParts definition can be updated along the way, but the label join job can only accommodate these
+     changes going forward, unless a backfill is manually triggered.
+
+     Label aggregation is also supported, with conditions applied. A single aggregation with one window is allowed
+     for now. If an aggregation is present, we infer left_start_offset and left_end_offset to be the same as the
+     window size, and the param input is ignored.
+
+     :param labels: List of labels
+     :param left_start_offset: Offset in days, relative to the label ds, of the earliest left partition (inclusive)
+         to refresh labels for
+     :param left_end_offset: Offset in days, relative to the label ds, of the most recent left partition (inclusive)
+         to refresh labels for
+     :param label_offline_schedule: Cron expression for Airflow to schedule a DAG for offline
+         label join compute tasks
+     """
+
+     exec_info = common.ExecutionInfo(
+         scheduleCron=label_offline_schedule,
+     )
+     label_metadata = api.MetaData(executionInfo=exec_info)
+
+     return api.LabelParts(
+         labels=labels,
+         # propagate the offsets so that Join() can read them off the LabelParts object
+         leftStartOffset=left_start_offset,
+         leftEndOffset=left_end_offset,
+         metaData=label_metadata,
+     )
+
+
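# Example (editor's annotation): labels that mature between 10 and 30 days after the left
# event, refreshed daily. The label GroupBy is a hypothetical stand-in, and the offset
# values are a sketch following the docstring, not authoritative settings.
import gen_thrift.api.ttypes as api
from ai.chronon.join import JoinPart, LabelParts

churn_gb = api.GroupBy(
    metaData=api.MetaData(name="labels.user_churn_30d"),
    keyColumns=["user"],
)
label_part = LabelParts(
    labels=[JoinPart(group_by=churn_gb)],
    left_start_offset=30,  # earliest left partition (days back from label ds) to refresh
    left_end_offset=10,    # most recent left partition (days back) to refresh
    label_offline_schedule="@daily",
)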
+ def Derivation(name: str, expression: str) -> api.Derivation:
+     """
+     Derivation allows arbitrary SQL select clauses to be computed using columns from joinParts and externalParts,
+     and saves the result as derived columns. The results are available both in the online fetching response map
+     and in the offline Hive table.
+
+     joinPart column names are automatically constructed according to the convention
+     `{join_part_prefix}_{group_by_name}_{input_column_name}_{aggregation_operation}_{window}_{by_bucket}`,
+     where prefix, window and bucket are optional. You can find the type information of columns using the analyzer
+     tool.
+
+     externalPart column names are automatically constructed according to the convention
+     `ext_{external_source_name}_{value_column}`.
+     Types are defined along with the schema by users for external sources.
+
+     Note that only values can be used in derivations, not keys. If you want to use a key in a derivation, you must
+     define it as a contextual field, and you must refer to a contextual field with its prefix included, for example:
+     `ext_contextual_request_id`.
+
+     If both name and expression are set to "*", then every raw column will be included along with the derived
+     columns.
+
+     :param name: output column name of the SQL expression
+     :param expression: any valid Spark SQL select clause based on joinPart or externalPart columns
+     :return: a Derivation object representing a single derived column or a wildcard ("*") selection.
+     """
+     return api.Derivation(name=name, expression=expression)
+
+
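# Example (editor's annotation): keep every raw column and add one derived column. The
# column names inside the expression are hypothetical, following the convention above.
from ai.chronon.join import Derivation

derivations = [
    Derivation(name="*", expression="*"),  # wildcard keeps all raw columns
    Derivation(
        name="txn_approval_rate",
        expression="user_payments_txn_approved_sum_30d / user_payments_txn_count_30d",
    ),
]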
+ def BootstrapPart(
+     table: str, key_columns: List[str] = None, query: api.Query = None
+ ) -> api.BootstrapPart:
+     """
+     Bootstrap is the concept of using pre-computed feature values to skip backfill computation during the
+     training data generation phase. Bootstrap can be used for many purposes:
+
+     - Generating ongoing feature values from logs
+     - Backfilling feature values for external features (for which Chronon is unable to run backfill)
+     - Initializing a new Join by migrating old data from an older Join and reusing data
+
+     One can bootstrap against any of these:
+
+     - join part fields:
+         Bootstrap can happen at the individual field level within a join part.
+         If all fields within a group by are bootstrapped, then we skip computation for the group by. Otherwise, the
+         whole thing will be re-run, but only the values for the non-bootstrapped fields will be retained in the
+         final table.
+     - external part fields:
+         Bootstrap can happen at the individual field level within an external part.
+         Since there is no backfill logic in chronon for external parts, all non-bootstrapped fields in external
+         parts are left as NULLs.
+     - derivation fields:
+         Derived fields can also be bootstrapped. Since derived fields depend on "base" fields (either join part or
+         external part), chronon will try to trigger the least amount of computation possible. For example,
+         if there is a join part where all derived fields that depend on the join part have been bootstrapped,
+         then we skip the computation for this join part.
+     - keys:
+         Keys of both join parts and external parts can be bootstrapped. During offline table generation, we will
+         first try to utilize the key's data from the left table; if it's not there, then we utilize the bootstrap.
+         For contextual features, we also support propagating the key bootstrap to the values.
+
+     :param table: Name of the hive table that contains feature values, where rows are 1:1 mapped to the left table
+     :param key_columns: Keys to join the bootstrap table to the left table
+     :param query: Selected columns (features & keys) and filtering conditions of the bootstrap tables.
+     """
+     return api.BootstrapPart(table=table, query=query, keyColumns=key_columns)
+
+
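# Example (editor's annotation): reuse feature values already materialized in a
# hypothetical hive table rather than recomputing them during backfill.
from ai.chronon.join import BootstrapPart

bootstrap = BootstrapPart(
    table="ml_features.legacy_join_output_v1",
    key_columns=["user_id", "ds"],  # how bootstrap rows align with the left table
)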
+ def Join(
+     left: api.Source,
+     right_parts: List[api.JoinPart],
+     version: int,
+     row_ids: Union[str, List[str]],
+     online_external_parts: List[api.ExternalPart] = None,
+     bootstrap_parts: List[api.BootstrapPart] = None,
+     bootstrap_from_log: bool = False,
+     skew_keys: Dict[str, List[str]] = None,
+     derivations: List[api.Derivation] = None,
+     label_part: api.LabelParts = None,
+     output_namespace: str = None,
+     table_properties: Dict[str, str] = None,
+     online: bool = False,
+     production: bool = False,
+     sample_percent: float = 100.0,
+     check_consistency: bool = None,
+     consistency_sample_percent: float = 5.0,
+     use_long_names: bool = False,
+     # execution params
+     offline_schedule: str = "@daily",
+     historical_backfill: bool = None,
+     conf: common.ConfigProperties = None,
+     env_vars: common.EnvironmentVariables = None,
+     cluster_conf: common.ClusterConfigProperties = None,
+     step_days: int = None,
+ ) -> api.Join:
+     """
+     Construct a join object. A join can pull together data from various GroupBy's, both offline and online. This is
+     also the focal point for logging, data quality computation and monitoring. A join maps 1:1 to models in ML
+     usage.
+
+     :param left:
+         The source on the left side. When it is entities, all GroupBys are joined with SNAPSHOT accuracy (midnight
+         values). When left is events, and on the right either the GroupBy is TEMPORAL or a topic is specified, we
+         perform a TEMPORAL / point-in-time join.
+     :type left: ai.chronon.api.Source
+     :param right_parts:
+         The list of groupBy's to join with. GroupBy's are wrapped in a JoinPart, which contains additional
+         information on how to join the left side with the GroupBy.
+     :type right_parts: List[ai.chronon.api.JoinPart]
+     :param version:
+         Integer version of the join. Stored on the conf as metaData.version.
+     :type version: int
+     :param check_consistency:
+         Whether online serving data should be compared with backfill data, producing online-offline-consistency
+         metrics. The metrics go into hive and your configured kv store for further visualization and monitoring.
+     :type check_consistency: bool
+     :param online:
+         Whether we should upload this conf into the kv store so that we can fetch/serve this join online.
+         Once online is set to True, you ideally should not change the conf.
+     :type online: bool
+     :param production:
+         When set, this can be integrated to trigger alerts. You will have to integrate this flag into your alerting
+         system yourself.
+     :type production: bool
+     :param output_namespace:
+         In backfill mode, we will produce data into hive. This represents the hive namespace that the data will be
+         written into. You can set this at the teams.json level.
+     :type output_namespace: str
+     :param table_properties:
+         Specifies the properties on output hive tables. Can be specified in teams.json.
+     :param skew_keys:
+         While back-filling, if there are known irrelevant keys - like user_id = 0 / NULL etc. - you can specify
+         them here. This is used to blacklist crawlers etc.
+     :param sample_percent:
+         Online-only parameter. What percent of online serving requests to this join should be logged into the
+         warehouse.
+     :param consistency_sample_percent:
+         Online-only parameter. What percent of online serving requests to this join should be sampled to compute
+         online-offline consistency metrics.
+         If sample_percent=50.0 and consistency_sample_percent=10.0, then the consistency job effectively runs on
+         5% of total traffic.
+     :param online_external_parts:
+         Users can register external sources into the Api implementation, which the Chronon fetcher can invoke.
+         This is applicable only for online fetching. Offline, this will not produce any values.
+     :param offline_schedule:
+         Cron expression for Airflow to schedule a DAG for offline join compute tasks
+     :param row_ids:
+         Columns of the left table that uniquely define a training record. Used as default keys during bootstrap
+     :param bootstrap_parts:
+         A list of BootstrapParts used for the Join. See the BootstrapPart doc for more details
+     :param bootstrap_from_log:
+         If set to True, will use the logging table to generate training data by default and skip continuous
+         backfill. Logging will be treated as another bootstrap source, but other bootstrap_parts will take
+         precedence.
+     :param label_part:
+         Label part, which contains a list of labels and the label refresh window boundary used for the Join
+     :param derivations:
+         List of derived column definitions. See the Derivation doc for more details
+     :param historical_backfill:
+         Flag to indicate whether the join backfill should backfill previous holes.
+         Setting it to false will only backfill the latest single partition
+     :type historical_backfill: bool
+     :param conf:
+         Configuration properties for the join. Depending on the mode, we layer confs with the following priority:
+         1. conf set in the join.conf.<mode>
+         2. conf set in the join.conf.common
+         3. conf set in the team.conf.<mode>
+         4. conf set in the team.conf.common
+         5. conf set in the default.conf.<mode>
+         6. conf set in the default.conf.common
+     :param env_vars:
+         Environment variables for the join. Depending on the mode, we layer envs with the following priority:
+         1. env vars set in the join.env.<mode>
+         2. env vars set in the join.env.common
+         3. env vars set in the team.env.<mode>
+         4. env vars set in the team.env.common
+         5. env vars set in the default.env.<mode>
+         6. env vars set in the default.env.common
+     :param cluster_conf:
+         Cluster configuration properties for the join.
+     :param step_days:
+         The maximum number of days to output at once
+     :return:
+         A join object that can be used to backfill or serve data. For ML use-cases this should map 1:1 to a model.
+     """
+     # Normalize row_ids
+     if isinstance(row_ids, str):
+         row_ids = [row_ids]
+
+     assert isinstance(version, int), (
+         f"Version must be an integer, but found {type(version).__name__}"
+     )
+
+     # create a deep copy for the case where multiple joins use the same left;
+     # otherwise validation would fail after the first iteration
+     updated_left = copy.deepcopy(left)
+     if left.events and left.events.query.selects:
+         assert "ts" not in left.events.query.selects.keys(), (
+             "'ts' is a reserved keyword for Chronon, please specify the expression in timeColumn"
+         )
+         # mapping ts to query.timeColumn applies to events only
+         updated_left.events.query.selects.update({"ts": updated_left.events.query.timeColumn})
+
+     if label_part:
+         label_metadata = api.MetaData(
+             executionInfo=label_part.metaData.executionInfo,
+         )
+         label_part = api.LabelParts(
+             labels=label_part.labels,
+             leftStartOffset=label_part.leftStartOffset,
+             leftEndOffset=label_part.leftEndOffset,
+             metaData=label_metadata,
+         )
+
+     consistency_sample_percent = consistency_sample_percent if check_consistency else None
+
+     # external parts need to be unique on (prefix, part.source.metadata.name)
+     if online_external_parts:
+         count_map = Counter(
+             [(part.prefix, part.source.metadata.name) for part in online_external_parts]
+         )
+         has_duplicates = False
+         for key, count in count_map.items():
+             if count > 1:
+                 has_duplicates = True
+                 print(f"Found {count - 1} duplicate(s) for external part {key}")
+         assert has_duplicates is False, "Please address all the above-mentioned duplicates."
+
+     if bootstrap_from_log:
+         has_logging = sample_percent > 0 and online
+         assert has_logging, (
+             "Join must be online with sample_percent set in order to use the bootstrap_from_log option"
+         )
+         bootstrap_parts = (bootstrap_parts or []) + [
+             api.BootstrapPart(
+                 # templated values will be replaced when metaData.name is set at the end
+                 table="{{ logged_table }}"
+             )
+         ]
+
+     exec_info = common.ExecutionInfo(
+         scheduleCron=offline_schedule,
+         conf=conf,
+         env=env_vars,
+         stepDays=step_days,
+         historicalBackfill=historical_backfill,
+         clusterConf=cluster_conf,
+     )
+
+     metadata = api.MetaData(
+         online=online,
+         production=production,
+         outputNamespace=output_namespace,
+         tableProperties=table_properties,
+         samplePercent=sample_percent,
+         consistencyCheck=check_consistency,
+         consistencySamplePercent=consistency_sample_percent,
+         executionInfo=exec_info,
+         version=str(version),
+     )
+
+     join = api.Join(
+         left=updated_left,
+         joinParts=right_parts,
+         metaData=metadata,
+         skewKeys=skew_keys,
+         onlineExternalParts=online_external_parts,
+         bootstrapParts=bootstrap_parts,
+         rowIds=row_ids,
+         labelParts=label_part,
+         derivations=derivations,
+         useLongNames=use_long_names,
+     )
+
+     # Expose a `table` property that resolves the output table name via the private helper
+     join.__class__.table = property(lambda self: _get_output_table_name(self, full_name=True))
+
+     return join
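# Example (editor's annotation): a minimal Join wired together from the sketches above.
# The left source is built from raw thrift types purely for brevity; in a real repo you
# would normally use the ai.chronon.source and ai.chronon.query helpers, and all names
# here are hypothetical.
import gen_thrift.api.ttypes as api
from ai.chronon.join import Join

checkouts = api.Source(
    events=api.EventSource(
        table="events.checkouts",
        query=api.Query(
            selects={"user_id": "user_id", "event_id": "event_id"},
            timeColumn="created_at_millis",  # surfaced as the reserved "ts" column
        ),
    )
)
v1 = Join(
    left=checkouts,
    right_parts=[buyer_part, seller_part],  # JoinParts from the earlier sketch
    version=0,
    row_ids="event_id",     # a lone string is normalized to ["event_id"]
    online=True,
    sample_percent=10.0,    # log 10% of online requests
    offline_schedule="@daily",
)
# v1.table lazily resolves the fully qualified output table name via
# _get_output_table_name, consulting teams config when no output namespace is set.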
ai/chronon/logger.py ADDED
@@ -0,0 +1,23 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import logging
+
+ LOG_FORMAT = "[%(asctime)-11s] %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+
+ def get_logger(log_level=logging.INFO):
+     logger = logging.getLogger(__name__)
+     logger.setLevel(log_level)
+     return logger
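# Example (editor's annotation): get_logger sets a level but attaches no handler or
# formatter; LOG_FORMAT is only applied if the caller opts in, e.g. via basicConfig.
import logging
from ai.chronon.logger import LOG_FORMAT, get_logger

logging.basicConfig(format=LOG_FORMAT)
log = get_logger(logging.DEBUG)
log.debug("formatting and handlers are left to the caller")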
ai/chronon/model.py ADDED
@@ -0,0 +1,40 @@
+ from typing import Optional
+
+ import gen_thrift.api.ttypes as ttypes
+
+
+ class ModelType:
+     XGBoost = ttypes.ModelType.XGBoost
+     PyTorch = ttypes.ModelType.PyTorch
+
+
+ # Open question: does the name need to match the S3 path that we expose for uploading trained models?
+ def Model(
+     source: ttypes.Source,
+     outputSchema: ttypes.TDataType,
+     modelType: ModelType,
+     name: str = None,
+     modelParams: Optional[dict[str, str]] = None,
+ ) -> ttypes.Model:
+     if not isinstance(source, ttypes.Source):
+         raise ValueError("Invalid source type")
+     if not (isinstance(outputSchema, ttypes.TDataType) or isinstance(outputSchema, int)):
+         raise ValueError("outputSchema must be a TDataType or DataKind")
+     if isinstance(outputSchema, int):
+         # Convert a bare DataKind enum value into a TDataType
+         outputSchema = ttypes.TDataType(outputSchema)
+
+     if modelParams is None:
+         modelParams = {}
+
+     metaData = ttypes.MetaData(
+         name=name,
+     )
+
+     return ttypes.Model(
+         modelType=modelType,
+         outputSchema=outputSchema,
+         source=source,
+         modelParams=modelParams,
+         metaData=metaData,
+     )
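# Example (editor's annotation): declaring a hypothetical XGBoost model. The source is
# built from raw thrift types for brevity; outputSchema also accepts a bare DataKind,
# which Model() wraps into a TDataType.
import gen_thrift.api.ttypes as ttypes
from ai.chronon.model import Model, ModelType

scorer = Model(
    source=ttypes.Source(events=ttypes.EventSource(table="ml.training_events")),
    outputSchema=ttypes.DataKind.DOUBLE,  # int DataKind, converted internally
    modelType=ModelType.XGBoost,
    name="fraud_scorer",
    modelParams={"max_depth": "6"},
)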