awx_zipline_ai-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/group_by.py ADDED
@@ -0,0 +1,692 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ import json
+ import logging
+ from copy import deepcopy
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+
+ import ai.chronon.api.common.ttypes as common
+ import ai.chronon.api.ttypes as ttypes
+ import ai.chronon.utils as utils
+ import ai.chronon.windows as window_utils
+
+ OperationType = int  # type(zthrift.Operation.FIRST)
+
+ # The GroupBy's default online/production status is None and it will inherit
+ # online/production status from the Joins it is included in.
+ # If it is included in multiple joins, it is considered online/production
+ # if any of the joins are online/production. Otherwise it is not online/production
+ # unless it is explicitly marked as online/production on the GroupBy itself.
+ DEFAULT_ONLINE = None
+ DEFAULT_PRODUCTION = None
+ LOGGER = logging.getLogger()
+
+
+ def collector(
+     op: ttypes.Operation,
+ ) -> Callable[[ttypes.Operation], Tuple[ttypes.Operation, Dict[str, str]]]:
+     return lambda k: (op, {"k": str(k)})
+
+
+ def generic_collector(op: ttypes.Operation, required, **kwargs):
+     def _collector(*args, **other_args):
+         arguments = kwargs.copy() if kwargs else {}
+         for idx, arg in enumerate(required):
+             arguments[arg] = args[idx]
+         arguments.update(other_args)
+         return (op, {k: str(v) for k, v in arguments.items()})
+
+     return _collector
+
+
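Illustrative sketch, not part of the packaged file: the two collector helpers above turn a parameterized operation into an (operation, argument-map) tuple, which is the form the Aggregation helper later unpacks. Using only the functions defined above, the tuples look roughly like this:

    get_last_k = collector(ttypes.Operation.LAST_K)
    get_last_k(50)
    # -> (ttypes.Operation.LAST_K, {"k": "50"})

    get_percentiles = generic_collector(ttypes.Operation.APPROX_PERCENTILE, ["percentiles"], k=20)
    get_percentiles([0.5, 0.95])
    # -> (ttypes.Operation.APPROX_PERCENTILE, {"k": "20", "percentiles": "[0.5, 0.95]"})
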
+ # To simplify imports
+ class Accuracy(ttypes.Accuracy):
+     pass
+
+
+ class Operation:
+
+     MIN = ttypes.Operation.MIN
+     """Minimum value in the column"""
+
+     MAX = ttypes.Operation.MAX
+     """Maximum value in the column"""
+
+     FIRST = ttypes.Operation.FIRST
+     """First non-null value of input column by time column"""
+
+     LAST = ttypes.Operation.LAST
+     """Last non-null value of input column by time column"""
+
+     APPROX_UNIQUE_COUNT = ttypes.Operation.APPROX_UNIQUE_COUNT
+     """Approximate count of unique values using CPC (Compressed Probability Counting) sketch"""
+
+     APPROX_UNIQUE_COUNT_LGK = collector(ttypes.Operation.APPROX_UNIQUE_COUNT)
+     """Configurable approximate unique count with lgK parameter for sketch size tuning.
+     Default lgK is 8. See CpcSketch.java for accuracy vs size tradeoffs:
+     https://github.com/apache/incubator-datasketches-java/blob/master/src/main/java/org/apache/datasketches/cpc/CpcSketch.java#L180
+     """
+
+     UNIQUE_COUNT = ttypes.Operation.UNIQUE_COUNT
+     """
+     Exact count of unique values of the input column.
+     Will store the set of items and can be expensive if the cardinality of the column is high.
+     """
+
+     COUNT = ttypes.Operation.COUNT
+     """Total count of non-null values of the input column"""
+
+     SUM = ttypes.Operation.SUM
+     """Sum of values in the input column"""
+
+     AVERAGE = ttypes.Operation.AVERAGE
+     """Arithmetic mean of values in the input column"""
+
+     VARIANCE = ttypes.Operation.VARIANCE
+     """Statistical variance of values in the input column"""
+
+     SKEW = ttypes.Operation.SKEW
+     """Skewness (third standardized moment) of the distribution of values in input column"""
+
+     KURTOSIS = ttypes.Operation.KURTOSIS
+     """Kurtosis (fourth standardized moment) of the distribution of values in input column"""
+
+     HISTOGRAM = ttypes.Operation.HISTOGRAM
+     """Full frequency distribution of values"""
+
+     FREQUENT_K = collector(ttypes.Operation.HISTOGRAM)
+     """
+     !! Could be expensive if the cardinality of the column is high !!
+     Computes column values that are frequent in the input column, exactly.
+     Produces a map of items as keys and counts as values.
+     """
+
+     APPROX_FREQUENT_K = collector(ttypes.Operation.APPROX_FREQUENT_K)
+     """
+     Computes column values that are frequent in the input column approximately.
+     Produces a map of items as keys and counts as values approximately.
+     """
+
+     APPROX_HEAVY_HITTERS_K = collector(ttypes.Operation.APPROX_HEAVY_HITTERS_K)
+     """
+     Computes column values that are skewed in the input column.
+     Produces a map of items as keys and counts as values approximately.
+     Different from APPROX_FREQUENT_K in that it only retains a value if it is abnormally
+     more frequent.
+     """
+
+     FIRST_K = collector(ttypes.Operation.FIRST_K)
+     """Returns first k input column values by time column"""
+
+     LAST_K = collector(ttypes.Operation.LAST_K)
+     """Returns last k input column values by time column"""
+
+     TOP_K = collector(ttypes.Operation.TOP_K)
+     """Returns k largest values of the input column. Input needs to be sortable."""
+
+     BOTTOM_K = collector(ttypes.Operation.BOTTOM_K)
+     """Returns k smallest values of the input column"""
+
+     UNIQUE_TOP_K = collector(ttypes.Operation.UNIQUE_TOP_K)
+     """Returns top k unique elements ranked by their values. Automatically deduplicates inputs. For structs, requires sort_key (String) and unique_id (Long) fields."""
+
+     APPROX_PERCENTILE = generic_collector(
+         ttypes.Operation.APPROX_PERCENTILE, ["percentiles"], k=20
+     )
+     """Approximate percentile calculation with configurable accuracy parameter k=20"""
+
+
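As a rough usage sketch (not in the package): plain operations are passed as-is, while the collector-backed ones are called with their parameters first. Both forms feed into the Aggregation helper defined further below:

    Aggregation(input_column="price", operation=Operation.AVERAGE)
    Aggregation(input_column="price", operation=Operation.LAST_K(10))
    Aggregation(input_column="price", operation=Operation.APPROX_PERCENTILE([0.5, 0.95, 0.99]))

Here "price" is a hypothetical column name used only for illustration.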
+ def Aggregations(**agg_dict):
+     assert all(isinstance(agg, ttypes.Aggregation) for agg in agg_dict.values())
+     for key, agg in agg_dict.items():
+         if not agg.inputColumn:
+             agg.inputColumn = key
+     return agg_dict.values()
+
+
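A small usage sketch (not in the package): Aggregations is a dict-style convenience that fills in inputColumn from the keyword name when it is not set explicitly. The column names below are hypothetical:

    aggs = Aggregations(
        price=Aggregation(operation=Operation.AVERAGE),    # inputColumn becomes "price"
        quantity=Aggregation(operation=Operation.SUM),     # inputColumn becomes "quantity"
    )
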
+ def DefaultAggregation(keys, sources, operation=Operation.LAST, tags=None):
+     aggregate_columns = []
+     for source in sources:
+         query = utils.get_query(source)
+         columns = utils.get_columns(source)
+         non_aggregate_columns = keys + [
+             "ts",
+             "is_before",
+             "mutation_ts",
+             "ds",
+             query.timeColumn,
+         ]
+         aggregate_columns += [
+             column for column in columns if column not in non_aggregate_columns
+         ]
+     return [
+         Aggregation(operation=operation, input_column=column, tags=tags)
+         for column in aggregate_columns
+     ]
+
+
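Hedged sketch, not part of the file: DefaultAggregation builds one aggregation per selected column that is neither a key nor one of the reserved time/partition columns. Assuming a hypothetical source whose query selects user_id, price and quantity:

    default_aggs = DefaultAggregation(keys=["user_id"], sources=[purchases_source])
    # roughly equivalent to:
    #   [Aggregation(operation=Operation.LAST, input_column="price"),
    #    Aggregation(operation=Operation.LAST, input_column="quantity")]

purchases_source is hypothetical; the real column set comes from utils.get_columns.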
+ class TimeUnit:
+     HOURS = common.TimeUnit.HOURS
+     DAYS = common.TimeUnit.DAYS
+
+
+ def window_to_str_pretty(window: common.Window):
+     unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()
+     return f"{window.length} {unit}"
+
+
+ def op_to_str(operation: OperationType):
+     return ttypes.Operation._VALUES_TO_NAMES[operation].lower()
+
+
+ # See docs/Aggregations.md
+ def Aggregation(
+     input_column: str = None,
+     operation: Union[ttypes.Operation, Tuple[ttypes.Operation, Dict[str, str]]] = None,
+     windows: Union[List[common.Window], List[str]] = None,
+     buckets: List[str] = None,
+     tags: Dict[str, str] = None,
+ ) -> ttypes.Aggregation:
+     """
+     :param input_column:
+         Column on which the aggregation needs to be performed.
+         This should be one of the input columns specified on the keys of the `select` in the `Query`'s `Source`.
+     :type input_column: str
+     :param operation:
+         Operation to use to aggregate the input columns. For example, MAX, MIN, COUNT.
+         Some operations take arguments, like last_k, approx_percentile etc.
+         Defaults to "LAST".
+     :type operation: ttypes.Operation
+     :param windows:
+         Lengths of the windows to calculate the aggregates over. Strings like "1h", "30d" are also accepted.
+         Minimum window size is 1hr. Maximum can be arbitrary. When not defined, the computation is un-windowed.
+     :type windows: List[common.Window]
+     :param buckets:
+         Besides the GroupBy.keys, this is another level of keys for use under this aggregation.
+         Using this would create an output as a map of string to aggregate.
+     :type buckets: List[str]
+     :return: An aggregate defined with the specified operation.
+     """
+     # Default to last
+     operation = operation if operation is not None else Operation.LAST
+     arg_map = {}
+     if isinstance(operation, tuple):
+         operation, arg_map = operation[0], operation[1]
+
+     def normalize(w: Union[common.Window, str]) -> common.Window:
+         if isinstance(w, str):
+             return window_utils._from_str(w)
+         elif isinstance(w, common.Window):
+             return w
+         else:
+             raise Exception(
+                 "window should be either a string like '7d', '24h', or a Window type"
+             )
+
+     norm_windows = [normalize(w) for w in windows] if windows else None
+
+     agg = ttypes.Aggregation(input_column, operation, arg_map, norm_windows, buckets)
+
+     agg.tags = tags
+     return agg
+
+
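Illustrative only: a windowed, bucketed aggregation built with the helper above. Window strings are normalized through ai.chronon.windows, and a bucket adds a second level of keying under the GroupBy keys. Column names here are hypothetical:

    Aggregation(
        input_column="purchase_price",
        operation=Operation.SUM,
        windows=["7d", "30d"],          # strings are converted to common.Window objects
        buckets=["merchant_category"],  # output becomes a map keyed by merchant_category
    )
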
+ def Window(length: int, time_unit: common.TimeUnit) -> common.Window:
+     return common.Window(length, time_unit)
+
+
+ def Derivation(name: str, expression: str) -> ttypes.Derivation:
+     """
+     Derivation allows arbitrary SQL select clauses to be computed using columns from the output schema of the
+     group by backfill. It is supported for offline computations for now.
+
+     If both name and expression are set to "*", then every raw column will be included along with the derived columns.
+
+     :param name: output column name of the SQL expression
+     :param expression: any valid Spark SQL select clause based on joinPart or externalPart columns
+     :return: a Derivation object representing a single derived column or a wildcard ("*") selection.
+     """
+     return ttypes.Derivation(name=name, expression=expression)
+
+
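Sketch with hypothetical column names (not in the package): derivations reference output columns of the GroupBy by name, and the "*"/"*" pair keeps all raw columns alongside the derived ones:

    derivations = [
        Derivation(name="*", expression="*"),  # keep every raw output column
        Derivation(
            name="purchase_price_avg_ratio",
            expression="purchase_price_sum_7d / purchase_price_count_7d",
        ),
    ]
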
+ def contains_windowed_aggregation(aggregations: Optional[List[ttypes.Aggregation]]):
+     if not aggregations:
+         return False
+     for agg in aggregations:
+         if agg.windows:
+             return True
+     return False
+
+
+ def validate_group_by(group_by: ttypes.GroupBy):
+     sources = group_by.sources
+     keys = group_by.keyColumns
+     aggregations = group_by.aggregations
+     # check ts is not included in query.select
+     first_source_columns = set(utils.get_columns(sources[0]))
+     # TODO undo this check after ml_models CI passes
+     assert "ts" not in first_source_columns, (
+         "'ts' is a reserved key word for Chronon,"
+         " please specify the expression in timeColumn"
+     )
+     for src in sources:
+         query = utils.get_query(src)
+         if src.events:
+             assert query.mutationTimeColumn is None, (
+                 "ingestionTimeColumn should not be specified for "
+                 "event source as it should be the same with timeColumn"
+             )
+             assert query.reversalColumn is None, (
+                 "reversalColumn should not be specified for event source "
+                 "as it won't have mutations"
+             )
+             if group_by.accuracy != Accuracy.SNAPSHOT:
+                 assert query.timeColumn is not None, (
+                     "please specify query.timeColumn for non-snapshot accurate "
+                     "group by with event source"
+                 )
+         else:
+             if contains_windowed_aggregation(aggregations):
+                 assert (
+                     query.timeColumn
+                 ), "Please specify timeColumn for entity source with windowed aggregations"
+
+     column_set = None
+     # all sources should select the same columns
+     for i, source in enumerate(sources[1:]):
+         column_set = set(utils.get_columns(source))
+         column_diff = column_set ^ first_source_columns
+         assert not column_diff, f"""
+             Mismatched columns among sources [1, {i+2}], Difference: {column_diff}
+         """
+
+     # all keys should be present in the selected columns
+     unselected_keys = set(keys) - first_source_columns
+     assert not unselected_keys, f"""
+         Keys {unselected_keys}, are unselected in source
+     """
+
+     # Aggregations=None is only valid if group_by is Entities
+     if aggregations is None:
+         is_events = any([s.events for s in sources])
+         has_mutations = (
+             any(
+                 [
+                     (
+                         s.entities.mutationTable is not None
+                         or s.entities.mutationTopic is not None
+                     )
+                     for s in sources
+                     if s.entities is not None
+                 ]
+             )
+             if not is_events
+             else False
+         )
+         assert not (
+             is_events or has_mutations
+         ), "You can only set aggregations=None in an EntitySource without mutations"
+     else:
+         columns = set([c for src in sources for c in utils.get_columns(src)])
+         for agg in aggregations:
+             assert agg.inputColumn, (
+                 f"input_column is required for all operations, found: input_column = {agg.inputColumn} "
+                 f"and operation {op_to_str(agg.operation)}"
+             )
+             assert (agg.inputColumn in columns) or (agg.inputColumn == "ts"), (
+                 f"input_column: for aggregation is not part of the query. Available columns: {column_set} "
+                 f"input_column: {agg.inputColumn}"
+             )
+             if agg.operation == ttypes.Operation.APPROX_PERCENTILE:
+                 if agg.argMap is not None and agg.argMap.get("percentiles") is not None:
+                     try:
+                         percentile_array = json.loads(agg.argMap["percentiles"])
+                         assert isinstance(percentile_array, list)
+                         assert all(
+                             [float(p) >= 0 and float(p) <= 1 for p in percentile_array]
+                         )
+                     except Exception as e:
+                         LOGGER.exception(e)
+                         raise ValueError(
+                             "[Percentiles] Unable to decode percentiles value, expected json array with values between"
+                             f" 0 and 1 inclusive (ex: [0.6, 0.1]), received: {agg.argMap['percentiles']}"
+                         ) from e
+                 else:
+                     raise ValueError(
+                         f"[Percentiles] Unsupported arguments for {op_to_str(agg.operation)}, "
+                         "example required: {'k': '128', 'percentiles': '[0.4,0.5,0.95]'},"
+                         f" received: {agg.argMap}\n"
+                     )
+             if agg.windows:
+                 assert not (
+                     # Snapshot accuracy.
+                     (
+                         (group_by.accuracy and group_by.accuracy == Accuracy.SNAPSHOT)
+                         or group_by.backfillStartDate
+                     )
+                     and
+                     # Hourly aggregation.
+                     any([window.timeUnit == TimeUnit.HOURS for window in agg.windows])
+                 ), (
+                     "Detected a snapshot accuracy group by with an hourly aggregation. Resolution with snapshot "
+                     "accuracy is not fine enough to allow hourly group bys. Consider removing the `backfill start "
+                     "date` param if set or adjusting the aggregation window. "
+                     f"input_column: {agg.inputColumn}, windows: {agg.windows}"
+                 )
+
+
+ _ANY_SOURCE_TYPE = Union[
+     ttypes.Source, ttypes.EventSource, ttypes.EntitySource, ttypes.JoinSource
+ ]
+
+
+ def _get_op_suffix(operation, argmap):
+     op_str = op_to_str(operation)
+     if operation in [
+         ttypes.Operation.LAST_K,
+         ttypes.Operation.TOP_K,
+         ttypes.Operation.FIRST_K,
+         ttypes.Operation.BOTTOM_K,
+     ]:
+         op_name_suffix = op_str[:-2]
+         arg_suffix = argmap.get("k")
+         return "{}{}".format(op_name_suffix, arg_suffix)
+     else:
+         return op_str
+
+
+ def get_output_col_names(aggregation):
+     base_name = f"{aggregation.inputColumn}_{_get_op_suffix(aggregation.operation, aggregation.argMap)}"
+     windowed_names = []
+     if aggregation.windows:
+         for window in aggregation.windows:
+             unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()[0]
+             window_suffix = f"{window.length}{unit}"
+             windowed_names.append(f"{base_name}_{window_suffix}")
+     else:
+         windowed_names = [base_name]
+
+     bucketed_names = []
+     if aggregation.buckets:
+         for bucket in aggregation.buckets:
+             bucketed_names.extend([f"{name}_by_{bucket}" for name in windowed_names])
+     else:
+         bucketed_names = windowed_names
+
+     return bucketed_names
+
+
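Worked example (not in the package), derived from the naming logic above: input column, operation (with k folded in for the *_K operations), window and bucket are concatenated in that order. With hypothetical names:

    agg = Aggregation(
        input_column="price",
        operation=Operation.LAST_K(5),
        windows=["7d"],
        buckets=["category"],
    )
    get_output_col_names(agg)
    # -> ["price_last5_7d_by_category"]
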
+ def GroupBy(
+     version: int,
+     sources: Union[List[_ANY_SOURCE_TYPE], _ANY_SOURCE_TYPE],
+     keys: List[str],
+     aggregations: Optional[List[ttypes.Aggregation]],
+     derivations: List[ttypes.Derivation] = None,
+     accuracy: ttypes.Accuracy = None,
+     backfill_start_date: str = None,
+     output_namespace: str = None,
+     table_properties: Dict[str, str] = None,
+     tags: Dict[str, str] = None,
+     online: bool = DEFAULT_ONLINE,
+     production: bool = DEFAULT_PRODUCTION,
+     # execution params
+     offline_schedule: str = "@daily",
+     conf: common.ConfigProperties = None,
+     env_vars: common.EnvironmentVariables = None,
+     cluster_conf: common.ClusterConfigProperties = None,
+     step_days: int = None,
+     disable_historical_backfill: bool = False,
+ ) -> ttypes.GroupBy:
+     """
+
+     :param version: TODO
+     :param sources:
+         can be constructed as entities or events or joinSource::
+
+             import ai.chronon.api.ttypes as chronon
+             events = chronon.Source(events=chronon.Events(
+                 table=YOUR_TABLE,
+                 topic=YOUR_TOPIC,  # <- OPTIONAL for serving
+                 query=chronon.Query(...),
+                 isCumulative=False  # <- defaults to false.
+             ))
+             Or
+             entities = chronon.Source(entities=chronon.Entities(
+                 snapshotTable=YOUR_TABLE,
+                 mutationTopic=YOUR_TOPIC,
+                 mutationTable=YOUR_MUTATION_TABLE,
+                 query=chronon.Query(...)
+             ))
+             or
+             joinSource = chronon.Source(joinSource=chronon.JoinSource(
+                 join=YOUR_CHRONON_PARENT_JOIN,
+                 query=chronon.Query(...)
+             ))
+
+         Multiple sources can be supplied to backfill the historical values with their respective start and end
+         partitions. However, only one source is allowed to be a streaming one.
+     :type sources: List[ai.chronon.api.ttypes.Events|ai.chronon.api.ttypes.Entities]
+     :param keys:
+         List of primary keys that defines the data that needs to be collected in the result table. Similar to the
+         GroupBy in the SQL context.
+     :type keys: List[String]
+     :param aggregations:
+         List of aggregations that needs to be computed for the data following the grouping defined by the keys::
+
+             import ai.chronon.api.ttypes as chronon
+             aggregations = [
+                 chronon.Aggregation(input_column="entity", operation=Operation.LAST),
+                 chronon.Aggregation(input_column="entity", operation=Operation.LAST, windows=['7d'])
+             ],
+     :type aggregations: List[ai.chronon.api.ttypes.Aggregation]
+     :param online:
+         Should we upload the result data of this conf into the KV store so that we can fetch/serve this GroupBy online.
+         Once online is set to True, you ideally should not change the conf.
+     :type online: bool
+     :param production:
+         When set, this can be integrated to trigger alerts. You will have to integrate this flag into your alerting
+         system yourself.
+     :type production: bool
+     :param backfill_start_date:
+         Start date from which GroupBy data should be computed. This determines how far back in time Chronon will
+         go to compute the resultant table and its aggregations.
+     :type backfill_start_date: str
+     :param env:
+         This is a dictionary of "mode name" to dictionary of "env var name" to "env var value"::
+
+             {
+                 'backfill' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' },
+                 'upload' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' },
+                 'streaming' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' }
+             }
+
+         These vars then flow into run.py and the underlying spark_submit.sh.
+         These vars can be set in other places as well. The priority order (descending) is as below:
+
+             1. env vars set while using run.py "VAR=VAL run.py --mode=backfill <name>"
+             2. env vars set here in the GroupBy's env param
+             3. env vars set in `team.json['team.production.<MODE NAME>']`
+             4. env vars set in `team.json['default.production.<MODE NAME>']`
+
+     :type env: Dict[str, Dict[str, str]]
+     :param table_properties:
+         Specifies the properties on output hive tables. Can be specified in teams.json.
+     :type table_properties: Dict[str, str]
+     :param output_namespace:
+         In backfill mode, we will produce data into hive. This represents the hive namespace that the data will be
+         written into. You can set this at the teams.json level.
+     :type output_namespace: str
+     :param accuracy:
+         Defines the computing accuracy of the GroupBy.
+         If "Snapshot" is selected, the aggregations are computed based on the partition identifier - the "ds" time column.
+         If "Temporal" is selected, the aggregations are computed based on the event time - the "ts" time column.
+     :type accuracy: ai.chronon.api.ttypes.SNAPSHOT or ai.chronon.api.ttypes.TEMPORAL
+     :param lag:
+         Param that goes into customJson. You can pull this out of the json at path "metaData.customJson.lag".
+         This is used by the airflow integration to pick an older hive partition to wait on.
+     :type lag: int
+     :param offline_schedule:
+         The offline schedule interval for batch jobs. Below are the equivalent cron tab entries::
+
+             '@hourly': '0 * * * *',
+             '@daily': '0 0 * * *',
+             '@weekly': '0 0 * * 0',
+             '@monthly': '0 0 1 * *',
+             '@yearly': '0 0 1 1 *',
+
+     :type offline_schedule: str
+     :param tags:
+         Additional metadata that does not directly affect feature computation, but is useful to
+         track for management purposes.
+     :type tags: Dict[str, str]
+     :param derivations:
+         Derivation allows arbitrary SQL select clauses to be computed using columns from the output schema of the
+         group by backfill. It is supported for offline computations for now.
+     :type derivations: List[ai.chronon.api.ttypes.Derivation]
+     :param kwargs:
+         Additional properties that would be passed to run.py if specified under the additional_args property,
+         and provides an option to pass custom values to the processing logic.
+     :type kwargs: Dict[str, str]
+     :param conf:
+         Configuration properties for the GroupBy. Depending on the mode we layer confs with the following priority:
+             1. conf set in the GroupBy.conf.<mode>
+             2. conf set in the GroupBy.conf.common
+             3. conf set in the team.conf.<mode>
+             4. conf set in the team.conf.common
+             5. conf set in the default.conf.<mode>
+             6. conf set in the default.conf.common
+     :param env_vars:
+         Environment variables for the GroupBy. Depending on the mode we layer envs with the following priority:
+             1. env vars set in the GroupBy.env.<mode>
+             2. env vars set in the GroupBy.env.common
+             3. env vars set in the team.env.<mode>
+             4. env vars set in the team.env.common
+             5. env vars set in the default.env.<mode>
+             6. env vars set in the default.env.common
+     :param cluster_conf:
+         Cluster configuration properties for the GroupBy.
+     :param step_days:
+         The maximum number of days to output at once.
+     :return:
+         A GroupBy object containing the specified aggregations.
+     """
+     assert sources, "Sources are not specified"
+
+     agg_inputs = []
+     if aggregations is not None:
+         agg_inputs = [agg.inputColumn for agg in aggregations]
+
+     required_columns = keys + agg_inputs
+
+     def _sanitize_columns(src: ttypes.Source):
+         source = deepcopy(src)
+         query = (
+             source.entities.query
+             if source.entities is not None
+             else (
+                 source.events.query
+                 if source.events is not None
+                 else source.joinSource.query
+             )
+         )
+
+         if query.selects is None:
+             query.selects = {}
+         for col in required_columns:
+             if col not in query.selects:
+                 query.selects[col] = col
+         if "ts" in query.selects:  # ts cannot be in selects.
+             ts = query.selects["ts"]
+             del query.selects["ts"]
+             if query.timeColumn is None:
+                 query.timeColumn = ts
+             assert query.timeColumn == ts, (
+                 f"mismatched `ts`: {ts} and `timeColumn`: {query.timeColumn} "
+                 f"in source {source}. Please specify only the `timeColumn`"
+             )
+         return source
+
+     def _normalize_source(source):
+         if isinstance(source, ttypes.EventSource):
+             return ttypes.Source(events=source)
+         elif isinstance(source, ttypes.EntitySource):
+             return ttypes.Source(entities=source)
+         elif isinstance(source, ttypes.JoinSource):
+             utils.__set_name(source.join, ttypes.Join, "joins")
+             if not source.join.metaData.outputNamespace:
+                 source.join.metaData.outputNamespace = output_namespace
+             return ttypes.Source(joinSource=source)
+         elif isinstance(source, ttypes.Source):
+             if source.entities:
+                 return _normalize_source(source.entities)
+             elif source.events:
+                 return _normalize_source(source.events)
+             elif source.joinSource:
+                 return _normalize_source(source.joinSource)
+             else:
+                 return source
+         else:
+             print("unrecognized " + str(source))
+
+     if not isinstance(sources, list):
+         sources = [sources]
+
+     sources = [_sanitize_columns(_normalize_source(source)) for source in sources]
+
+     # get caller's filename to assign team
+     team = inspect.stack()[1].filename.split("/")[-2]
+
+     exec_info = common.ExecutionInfo(
+         scheduleCron=offline_schedule,
+         conf=conf,
+         env=env_vars,
+         stepDays=step_days,
+         historicalBackfill=disable_historical_backfill,
+         clusterConf=cluster_conf,
+     )
+
+     column_tags = {}
+     if aggregations:
+         for agg in aggregations:
+             if hasattr(agg, "tags") and agg.tags:
+                 for output_col in get_output_col_names(agg):
+                     column_tags[output_col] = agg.tags
+
+
+     metadata = ttypes.MetaData(
+         online=online,
+         production=production,
+         outputNamespace=output_namespace,
+         tableProperties=table_properties,
+         team=team,
+         executionInfo=exec_info,
+         tags=tags if tags else None,
+         columnTags=column_tags if column_tags else None,
+         version=str(version),
+     )
+
+     group_by = ttypes.GroupBy(
+         sources=sources,
+         keyColumns=keys,
+         aggregations=aggregations,
+         metaData=metadata,
+         backfillStartDate=backfill_start_date,
+         accuracy=accuracy,
+         derivations=derivations,
+     )
+     validate_group_by(group_by)
+
+     return group_by
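A minimal end-to-end sketch (not part of the packaged file), in the style of the docstring above. The table, topic and column names are hypothetical, and it assumes the thrift-generated EventSource and Query constructors accept the fields referenced elsewhere in this module:

    import ai.chronon.api.ttypes as chronon

    purchases = chronon.Source(events=chronon.EventSource(
        table="shop.purchases",                              # hypothetical hive table
        query=chronon.Query(
            selects={"user_id": "user_id", "price": "price"},
            timeColumn="purchase_ts",                        # event-time expression
        ),
    ))

    purchases_by_user = GroupBy(
        version=1,
        sources=purchases,
        keys=["user_id"],
        aggregations=[
            Aggregation(input_column="price", operation=Operation.SUM, windows=["7d", "30d"]),
            Aggregation(input_column="price", operation=Operation.APPROX_PERCENTILE([0.5, 0.95])),
        ],
        online=True,
        output_namespace="features",
    )
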
@@ -0,0 +1 @@
+ __all__ = ['ttypes', 'constants']