awx-zipline-ai 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/group_by.py ADDED
@@ -0,0 +1,692 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import inspect
+ import json
+ import logging
+ from copy import deepcopy
+ from typing import Callable, Dict, List, Optional, Tuple, Union
+
+ import gen_thrift.api.ttypes as ttypes
+ import gen_thrift.common.ttypes as common
+
+ import ai.chronon.utils as utils
+ import ai.chronon.windows as window_utils
+
+ OperationType = int  # type(zthrift.Operation.FIRST)
+
+
+ def _get_output_table_name(obj, full_name: bool = False):
+     """
+     Group by backfill output table name
+     To be synced with api.Extensions.scala
+     """
+     if not obj.metaData.name:
+         utils.__set_name(obj, ttypes.GroupBy, "group_bys")
+     return utils.output_table_name(obj, full_name)
+
+
+ # The GroupBy's default online/production status is None, and it will inherit its
+ # online/production status from the Joins it is included in.
+ # If it is included in multiple joins, it is considered online/production
+ # if any of those joins are online/production. Otherwise it is not online/production
+ # unless it is explicitly marked as online/production on the GroupBy itself.
+ DEFAULT_ONLINE = None
+ DEFAULT_PRODUCTION = None
+ LOGGER = logging.getLogger()
+
+
+ def collector(
+     op: ttypes.Operation,
+ ) -> Callable[[ttypes.Operation], Tuple[ttypes.Operation, Dict[str, str]]]:
+     return lambda k: (op, {"k": str(k)})
+
+
+ def generic_collector(op: ttypes.Operation, required, **kwargs):
+     def _collector(*args, **other_args):
+         arguments = kwargs.copy() if kwargs else {}
+         for idx, arg in enumerate(required):
+             arguments[arg] = args[idx]
+         arguments.update(other_args)
+         return (op, {k: str(v) for k, v in arguments.items()})
+
+     return _collector
+
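For illustration, this is what the collector factories produce when the returned callables are invoked; the values below are a sketch inferred from the code above, not output captured from the package::

    last_k = collector(ttypes.Operation.LAST_K)
    last_k(10)
    # -> (Operation.LAST_K, {"k": "10"})

    approx_percentile = generic_collector(ttypes.Operation.APPROX_PERCENTILE, ["percentiles"], k=20)
    approx_percentile("[0.5, 0.95]", k=128)
    # -> (Operation.APPROX_PERCENTILE, {"k": "128", "percentiles": "[0.5, 0.95]"})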
+
+ # To simplify imports
+ class Accuracy(ttypes.Accuracy):
+     pass
+
+
+ class Operation:
+     MIN = ttypes.Operation.MIN
+     """Minimum value in the column"""
+
+     MAX = ttypes.Operation.MAX
+     """Maximum value in the column"""
+
+     FIRST = ttypes.Operation.FIRST
+     """First non-null value of input column by time column"""
+
+     LAST = ttypes.Operation.LAST
+     """Last non-null value of input column by time column"""
+
+     APPROX_UNIQUE_COUNT = ttypes.Operation.APPROX_UNIQUE_COUNT
+     """Approximate count of unique values using a CPC (Compressed Probability Counting) sketch"""
+
+     APPROX_UNIQUE_COUNT_LGK = collector(ttypes.Operation.APPROX_UNIQUE_COUNT)
+     """Configurable approximate unique count with lgK parameter for sketch size tuning.
+     Default lgK is 8. See CpcSketch.java for accuracy vs size tradeoffs:
+     https://github.com/apache/incubator-datasketches-java/blob/master/src/main/java/org/apache/datasketches/cpc/CpcSketch.java#L180
+     """
+
+     UNIQUE_COUNT = ttypes.Operation.UNIQUE_COUNT
+     """
+     Exact count of unique values of the input column.
+     Will store the set of items and can be expensive if the cardinality of the column is high.
+     """
+
+     COUNT = ttypes.Operation.COUNT
+     """Total count of non-null values of the input column"""
+
+     SUM = ttypes.Operation.SUM
+     """Sum of values in the input column"""
+
+     AVERAGE = ttypes.Operation.AVERAGE
+     """Arithmetic mean of values in the input column"""
+
+     VARIANCE = ttypes.Operation.VARIANCE
+     """Statistical variance of values in the input column"""
+
+     SKEW = ttypes.Operation.SKEW
+     """Skewness (third standardized moment) of the distribution of values in the input column"""
+
+     KURTOSIS = ttypes.Operation.KURTOSIS
+     """Kurtosis (fourth standardized moment) of the distribution of values in the input column"""
+
+     HISTOGRAM = ttypes.Operation.HISTOGRAM
+     """Full frequency distribution of values"""
+
+     FREQUENT_K = collector(ttypes.Operation.HISTOGRAM)
+     """
+     !! Could be expensive if the cardinality of the column is high !!
+     Computes column values that are frequent in the input column, exactly.
+     Produces a map of items as keys and counts as values.
+     """
+
+     APPROX_FREQUENT_K = collector(ttypes.Operation.APPROX_FREQUENT_K)
+     """
+     Computes column values that are frequent in the input column, approximately.
+     Produces a map of items as keys and approximate counts as values.
+     """
+
+     APPROX_HEAVY_HITTERS_K = collector(ttypes.Operation.APPROX_HEAVY_HITTERS_K)
+     """
+     Computes column values that are skewed in the input column.
+     Produces a map of items as keys and approximate counts as values.
+     Different from APPROX_FREQUENT_K in that it only retains values that are abnormally
+     more frequent than the rest.
+     """
+
+     FIRST_K = collector(ttypes.Operation.FIRST_K)
+     """Returns first k input column values by time column"""
+
+     LAST_K = collector(ttypes.Operation.LAST_K)
+     """Returns last k input column values by time column"""
+
+     TOP_K = collector(ttypes.Operation.TOP_K)
+     """Returns k largest values of the input column. Input needs to be sortable."""
+
+     BOTTOM_K = collector(ttypes.Operation.BOTTOM_K)
+     """Returns k smallest values of the input column"""
+
+     UNIQUE_TOP_K = collector(ttypes.Operation.UNIQUE_TOP_K)
+     """Returns top k unique elements ranked by their values. Automatically deduplicates inputs. For structs, requires sort_key (String) and unique_id (Long) fields."""
+
+     APPROX_PERCENTILE = generic_collector(ttypes.Operation.APPROX_PERCENTILE, ["percentiles"], k=20)
+     """Approximate percentile calculation with configurable accuracy parameter k (default k=20)"""
+
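Members of Operation that wrap `collector` or `generic_collector` are called with their argument(s) before being handed to an aggregation, while the plain members are used as-is. A sketch of the distinction, derived from the definitions above and shown only for illustration::

    Operation.SUM                          # plain operation, no arguments
    Operation.LAST_K(50)                   # -> (Operation.LAST_K, {"k": "50"})
    Operation.APPROX_UNIQUE_COUNT_LGK(12)  # -> (Operation.APPROX_UNIQUE_COUNT, {"k": "12"})
    Operation.APPROX_PERCENTILE([0.25, 0.5, 0.95])
    # -> (Operation.APPROX_PERCENTILE, {"k": "20", "percentiles": "[0.25, 0.5, 0.95]"})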
+
+ def Aggregations(**agg_dict):
+     assert all(isinstance(agg, ttypes.Aggregation) for agg in agg_dict.values())
+     for key, agg in agg_dict.items():
+         if not agg.inputColumn:
+             agg.inputColumn = key
+     return agg_dict.values()
+
+
+ def DefaultAggregation(keys, sources, operation=Operation.LAST, tags=None):
+     aggregate_columns = []
+     for source in sources:
+         query = utils.get_query(source)
+         columns = utils.get_columns(source)
+         non_aggregate_columns = keys + [
+             "ts",
+             "is_before",
+             "mutation_ts",
+             "ds",
+             query.timeColumn,
+         ]
+         aggregate_columns += [column for column in columns if column not in non_aggregate_columns]
+     return [
+         Aggregation(operation=operation, input_column=column, tags=tags)
+         for column in aggregate_columns
+     ]
+
+
+ class TimeUnit:
+     HOURS = common.TimeUnit.HOURS
+     DAYS = common.TimeUnit.DAYS
+
+
+ def window_to_str_pretty(window: common.Window):
+     unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()
+     return f"{window.length} {unit}"
+
+
+ def op_to_str(operation: OperationType):
+     return ttypes.Operation._VALUES_TO_NAMES[operation].lower()
+
+
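These helpers render thrift enum values as readable strings for names and messages. An illustrative expectation, assuming the generated `_VALUES_TO_NAMES` maps contain the usual upper-case names::

    window_to_str_pretty(common.Window(7, TimeUnit.DAYS))  # -> "7 days"
    op_to_str(Operation.SUM)                               # -> "sum"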
+ # See docs/Aggregations.md
+ def Aggregation(
+     input_column: str = None,
+     operation: Union[ttypes.Operation, Tuple[ttypes.Operation, Dict[str, str]]] = None,
+     windows: Union[List[common.Window], List[str]] = None,
+     buckets: List[str] = None,
+     tags: Dict[str, str] = None,
+ ) -> ttypes.Aggregation:
+     """
+     :param input_column:
+         Column on which the aggregation needs to be performed.
+         This should be one of the columns specified as a key of the `select` in the `Source`'s `Query`.
+     :type input_column: str
+     :param operation:
+         Operation to use to aggregate the input column. For example, MAX, MIN, COUNT.
+         Some operations take arguments, like last_k, approx_percentiles etc.
+         Defaults to "LAST".
+     :type operation: ttypes.Operation
+     :param windows:
+         Lengths of the windows to calculate the aggregates over. Strings like "1h", "30d" are also accepted.
+         Minimum window size is 1hr. Maximum can be arbitrary. When not defined, the computation is un-windowed.
+     :type windows: List[common.Window]
+     :param buckets:
+         Besides the GroupBy.keys, this is another level of keys for use under this aggregation.
+         Using this would create an output as a map of string to aggregate.
+     :type buckets: List[str]
+     :return: An aggregate defined with the specified operation.
+     """
+     # Default to last
+     operation = operation if operation is not None else Operation.LAST
+     arg_map = {}
+     if isinstance(operation, tuple):
+         operation, arg_map = operation[0], operation[1]
+
+     def normalize(w: Union[common.Window, str]) -> common.Window:
+         if isinstance(w, str):
+             return window_utils._from_str(w)
+         elif isinstance(w, common.Window):
+             return w
+         else:
+             raise Exception("window should be either a string like '7d', '24h', or a Window type")
+
+     norm_windows = [normalize(w) for w in windows] if windows else None
+
+     agg = ttypes.Aggregation(input_column, operation, arg_map, norm_windows, buckets)
+
+     agg.tags = tags
+     return agg
+
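A hedged usage sketch of `Aggregation` and the `Aggregations` helper above; the column names are hypothetical and only illustrate the calling convention::

    aggs = [
        Aggregation(input_column="price", operation=Operation.AVERAGE, windows=["7d", "30d"]),
        Aggregation(input_column="item_id", operation=Operation.LAST_K(10), buckets=["category"]),
    ]

    # Aggregations() fills in input_column from the keyword name when it is omitted.
    named_aggs = Aggregations(
        price=Aggregation(operation=Operation.SUM, windows=["1d"]),
    )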
+
+ def Window(length: int, time_unit: common.TimeUnit) -> common.Window:
+     return common.Window(length, time_unit)
+
+
+ def Derivation(name: str, expression: str) -> ttypes.Derivation:
+     """
+     Derivation allows arbitrary SQL select clauses to be computed using columns from the group by
+     backfill's output schema. It is currently supported for offline computations only.
+
+     If both name and expression are set to "*", then every raw column will be included along with the derived columns.
+
+     :param name: output column name of the SQL expression
+     :param expression: any valid Spark SQL select clause based on joinPart or externalPart columns
+     :return: a Derivation object representing a single derived column or a wildcard ("*") selection.
+     """
+     return ttypes.Derivation(name=name, expression=expression)
+
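An illustrative derivations list, with hypothetical column names, including the wildcard pass-through described in the docstring::

    derivations = [
        Derivation(name="*", expression="*"),  # keep every raw output column as well
        Derivation(name="price_avg_7d_doubled", expression="price_average_7d * 2"),
    ]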
+
+ def contains_windowed_aggregation(aggregations: Optional[List[ttypes.Aggregation]]):
+     if not aggregations:
+         return False
+     for agg in aggregations:
+         if agg.windows:
+             return True
+     return False
+
+
+ def validate_group_by(group_by: ttypes.GroupBy):
+     sources = group_by.sources
+     keys = group_by.keyColumns
+     aggregations = group_by.aggregations
+     # check ts is not included in query.select
+     first_source_columns = set(utils.get_columns(sources[0]))
+     # TODO undo this check after ml_models CI passes
+     assert "ts" not in first_source_columns, (
+         "'ts' is a reserved key word for Chronon, please specify the expression in timeColumn"
+     )
+     for src in sources:
+         query = utils.get_query(src)
+         if src.events:
+             assert query.mutationTimeColumn is None, (
+                 "mutationTimeColumn should not be specified for an "
+                 "event source as it should be the same as timeColumn"
+             )
+             assert query.reversalColumn is None, (
+                 "reversalColumn should not be specified for event source as it won't have mutations"
+             )
+             if group_by.accuracy != Accuracy.SNAPSHOT:
+                 assert query.timeColumn is not None, (
+                     "please specify query.timeColumn for non-snapshot accurate "
+                     "group by with event source"
+                 )
+         else:
+             if contains_windowed_aggregation(aggregations):
+                 assert query.timeColumn, (
+                     "Please specify timeColumn for entity source with windowed aggregations"
+                 )
+
+     column_set = None
+     # all sources should select the same columns
+     for i, source in enumerate(sources[1:]):
+         column_set = set(utils.get_columns(source))
+         column_diff = column_set ^ first_source_columns
+         assert not column_diff, f"""
+             Mismatched columns among sources [1, {i + 2}], Difference: {column_diff}
+         """
+
+     # all keys should be present in the selected columns
+     unselected_keys = set(keys) - first_source_columns
+     assert not unselected_keys, f"""
+         Keys {unselected_keys}, are unselected in source
+     """
+
+     # Aggregations=None is only valid if group_by is Entities
+     if aggregations is None:
+         is_events = any([s.events for s in sources])
+         has_mutations = (
+             any(
+                 [
+                     (s.entities.mutationTable is not None or s.entities.mutationTopic is not None)
+                     for s in sources
+                     if s.entities is not None
+                 ]
+             )
+             if not is_events
+             else False
+         )
+         assert not (is_events or has_mutations), (
+             "You can only set aggregations=None in an EntitySource without mutations"
+         )
+     else:
+         columns = set([c for src in sources for c in utils.get_columns(src)])
+         for agg in aggregations:
+             assert agg.inputColumn, (
+                 f"input_column is required for all operations, found: input_column = {agg.inputColumn} "
+                 f"and operation {op_to_str(agg.operation)}"
+             )
+             assert (agg.inputColumn in columns) or (agg.inputColumn == "ts"), (
+                 f"input_column for aggregation is not part of the query. Available columns: {columns} "
+                 f"input_column: {agg.inputColumn}"
+             )
+             if agg.operation == ttypes.Operation.APPROX_PERCENTILE:
+                 if agg.argMap is not None and agg.argMap.get("percentiles") is not None:
+                     try:
+                         percentile_array = json.loads(agg.argMap["percentiles"])
+                         assert isinstance(percentile_array, list)
+                         assert all([float(p) >= 0 and float(p) <= 1 for p in percentile_array])
+                     except Exception as e:
+                         LOGGER.exception(e)
+                         raise ValueError(
+                             "[Percentiles] Unable to decode percentiles value, expected json array with values between"
+                             f" 0 and 1 inclusive (ex: [0.6, 0.1]), received: {agg.argMap['percentiles']}"
+                         ) from e
+                 else:
+                     raise ValueError(
+                         f"[Percentiles] Unsupported arguments for {op_to_str(agg.operation)}, "
+                         "example required: {'k': '128', 'percentiles': '[0.4,0.5,0.95]'},"
+                         f" received: {agg.argMap}\n"
+                     )
+             if agg.windows:
+                 assert not (
+                     # Snapshot accuracy.
+                     (
+                         (group_by.accuracy and group_by.accuracy == Accuracy.SNAPSHOT)
+                         or group_by.backfillStartDate
+                     )
+                     and
+                     # Hourly aggregation.
+                     any([window.timeUnit == TimeUnit.HOURS for window in agg.windows])
+                 ), (
+                     "Detected a snapshot accuracy group by with an hourly aggregation. Resolution with snapshot "
+                     "accuracy is not fine enough to allow hourly group bys. Consider removing the `backfill start "
+                     "date` param if set or adjusting the aggregation window. "
+                     f"input_column: {agg.inputColumn}, windows: {agg.windows}"
+                 )
+
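For APPROX_PERCENTILE, the validation above requires `argMap['percentiles']` to decode to a JSON array of values between 0 and 1 inclusive. A spec that should satisfy it, sketched with a hypothetical column name::

    Aggregation(
        input_column="latency_ms",
        operation=Operation.APPROX_PERCENTILE([0.5, 0.95, 0.99], k=128),
    )
    # argMap becomes {"k": "128", "percentiles": "[0.5, 0.95, 0.99]"}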
+
+ _ANY_SOURCE_TYPE = Union[ttypes.Source, ttypes.EventSource, ttypes.EntitySource, ttypes.JoinSource]
+
+
+ def _get_op_suffix(operation, argmap):
+     op_str = op_to_str(operation)
+     if operation in [
+         ttypes.Operation.LAST_K,
+         ttypes.Operation.TOP_K,
+         ttypes.Operation.FIRST_K,
+         ttypes.Operation.BOTTOM_K,
+     ]:
+         op_name_suffix = op_str[:-2]
+         arg_suffix = argmap.get("k")
+         return "{}{}".format(op_name_suffix, arg_suffix)
+     else:
+         return op_str
+
+
+ def get_output_col_names(aggregation):
+     base_name = (
+         f"{aggregation.inputColumn}_{_get_op_suffix(aggregation.operation, aggregation.argMap)}"
+     )
+     windowed_names = []
+     if aggregation.windows:
+         for window in aggregation.windows:
+             unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()[0]
+             window_suffix = f"{window.length}{unit}"
+             windowed_names.append(f"{base_name}_{window_suffix}")
+     else:
+         windowed_names = [base_name]
+
+     bucketed_names = []
+     if aggregation.buckets:
+         for bucket in aggregation.buckets:
+             bucketed_names.extend([f"{name}_by_{bucket}" for name in windowed_names])
+     else:
+         bucketed_names = windowed_names
+
+     return bucketed_names
+
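Output column names are assembled as input column + operation suffix + window suffix + optional bucket suffix. An illustrative call, using hypothetical column names and assuming "7d" parses to a 7-day window::

    agg = Aggregation(
        input_column="price",
        operation=Operation.AVERAGE,
        windows=["7d"],
        buckets=["category"],
    )
    get_output_col_names(agg)
    # -> ["price_average_7d_by_category"]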
+
+ def GroupBy(
+     version: int,
+     sources: Union[List[_ANY_SOURCE_TYPE], _ANY_SOURCE_TYPE],
+     keys: List[str],
+     aggregations: Optional[List[ttypes.Aggregation]],
+     derivations: List[ttypes.Derivation] = None,
+     accuracy: ttypes.Accuracy = None,
+     backfill_start_date: str = None,
+     output_namespace: str = None,
+     table_properties: Dict[str, str] = None,
+     tags: Dict[str, str] = None,
+     online: bool = DEFAULT_ONLINE,
+     production: bool = DEFAULT_PRODUCTION,
+     # execution params
+     offline_schedule: str = "@daily",
+     conf: common.ConfigProperties = None,
+     env_vars: common.EnvironmentVariables = None,
+     cluster_conf: common.ClusterConfigProperties = None,
+     step_days: int = None,
+     disable_historical_backfill: bool = False,
+ ) -> ttypes.GroupBy:
+     """
+
+     :param version: TODO
+     :param sources:
+         can be constructed as entities or events or joinSource::
+
+             import gen_thrift.api.ttypes as chronon
+             events = chronon.Source(events=chronon.EventSource(
+                 table=YOUR_TABLE,
+                 topic=YOUR_TOPIC,  # <- OPTIONAL for serving
+                 query=chronon.Query(...),
+                 isCumulative=False  # <- defaults to false.
+             ))
+             Or
+             entities = chronon.Source(entities=chronon.EntitySource(
+                 snapshotTable=YOUR_TABLE,
+                 mutationTopic=YOUR_TOPIC,
+                 mutationTable=YOUR_MUTATION_TABLE,
+                 query=chronon.Query(...)
+             ))
+             or
+             joinSource = chronon.Source(joinSource=chronon.JoinSource(
+                 join=YOUR_CHRONON_PARENT_JOIN,
+                 query=chronon.Query(...)
+             ))
+
+         Multiple sources can be supplied to backfill the historical values with their respective start and end
+         partitions. However, only one source is allowed to be a streaming one.
+     :type sources: List[gen_thrift.api.ttypes.EventSource|gen_thrift.api.ttypes.EntitySource]
+     :param keys:
+         List of primary keys that define the data that needs to be collected in the result table. Similar to the
+         GroupBy in the SQL context.
+     :type keys: List[String]
+     :param aggregations:
+         List of aggregations that need to be computed for the data following the grouping defined by the keys::
+
+             import gen_thrift.api.ttypes as chronon
+             aggregations = [
+                 chronon.Aggregation(input_column="entity", operation=Operation.LAST),
+                 chronon.Aggregation(input_column="entity", operation=Operation.LAST, windows=['7d'])
+             ]
+     :type aggregations: List[gen_thrift.api.ttypes.Aggregation]
+     :param online:
+         Should we upload the result data of this conf into the KV store so that we can fetch/serve this GroupBy online.
+         Once online is set to True, you ideally should not change the conf.
+     :type online: bool
+     :param production:
+         When set, this can be used to trigger alerts. You will have to integrate this flag into your alerting
+         system yourself.
+     :type production: bool
+     :param backfill_start_date:
+         Start date from which GroupBy data should be computed. This determines how far back in time Chronon
+         will go to compute the resultant table and its aggregations.
+     :type backfill_start_date: str
+     :param env:
+         This is a dictionary of "mode name" to dictionary of "env var name" to "env var value"::
+
+             {
+                 'backfill' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' },
+                 'upload' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' },
+                 'streaming' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' }
+             }
+
+         These vars then flow into run.py and the underlying spark_submit.sh.
+         These vars can be set in other places as well. The priority order (descending) is as below:
+
+             1. env vars set while using run.py "VAR=VAL run.py --mode=backfill <name>"
+             2. env vars set here in the GroupBy's env param
+             3. env vars set in `team.json['team.production.<MODE NAME>']`
+             4. env vars set in `team.json['default.production.<MODE NAME>']`
+
+     :type env: Dict[str, Dict[str, str]]
+     :param table_properties:
+         Specifies the properties on output hive tables. Can be specified in teams.json.
+     :type table_properties: Dict[str, str]
+     :param output_namespace:
+         In backfill mode, we will produce data into hive. This represents the hive namespace that the data will be
+         written into. You can set this at the teams.json level.
+     :type output_namespace: str
+     :param accuracy:
+         Defines the computing accuracy of the GroupBy.
+         If "Snapshot" is selected, the aggregations are computed based on the partition identifier - the "ds" time column.
+         If "Temporal" is selected, the aggregations are computed based on the event time - the "ts" time column.
+     :type accuracy: gen_thrift.api.ttypes.SNAPSHOT or gen_thrift.api.ttypes.TEMPORAL
+     :param lag:
+         Param that goes into customJson. You can pull this out of the json at path "metaData.customJson.lag".
+         This is used by the airflow integration to pick an older hive partition to wait on.
+     :type lag: int
+     :param offline_schedule:
+         The offline schedule interval for batch jobs. Below are the equivalent cron tab expressions::
+
+             '@hourly': '0 * * * *',
+             '@daily': '0 0 * * *',
+             '@weekly': '0 0 * * 0',
+             '@monthly': '0 0 1 * *',
+             '@yearly': '0 0 1 1 *',
+
+     :type offline_schedule: str
+     :param tags:
+         Additional metadata that does not directly affect feature computation, but is useful to
+         track for management purposes.
+     :type tags: Dict[str, str]
+     :param derivations:
+         Derivation allows arbitrary SQL select clauses to be computed using columns from the group by
+         backfill's output schema. It is currently supported for offline computations only.
+     :type derivations: List[gen_thrift.api.ttypes.Derivation]
+     :param kwargs:
+         Additional properties that would be passed to run.py if specified under the additional_args property.
+         Also provides an option to pass custom values to the processing logic.
+     :type kwargs: Dict[str, str]
+     :param conf:
+         Configuration properties for the GroupBy. Depending on the mode we layer confs with the following priority:
+             1. conf set in the GroupBy.conf.<mode>
+             2. conf set in the GroupBy.conf.common
+             3. conf set in the team.conf.<mode>
+             4. conf set in the team.conf.common
+             5. conf set in the default.conf.<mode>
+             6. conf set in the default.conf.common
+     :param env_vars:
+         Environment variables for the GroupBy. Depending on the mode we layer envs with the following priority:
+             1. env vars set in the GroupBy.env.<mode>
+             2. env vars set in the GroupBy.env.common
+             3. env vars set in the team.env.<mode>
+             4. env vars set in the team.env.common
+             5. env vars set in the default.env.<mode>
+             6. env vars set in the default.env.common
+     :param cluster_conf:
+         Cluster configuration properties for the GroupBy.
+     :param step_days:
+         The maximum number of days to output at once.
+     :return:
+         A GroupBy object containing specified aggregations.
+     """
+     assert sources, "Sources are not specified"
+
+     assert isinstance(version, int), (
+         f"Version must be an integer, but found {type(version).__name__}"
+     )
+
+     agg_inputs = []
+     if aggregations is not None:
+         agg_inputs = [agg.inputColumn for agg in aggregations]
+
+     required_columns = keys + agg_inputs
+
+     def _sanitize_columns(src: ttypes.Source):
+         source = deepcopy(src)
+         query = (
+             source.entities.query
+             if source.entities is not None
+             else (source.events.query if source.events is not None else source.joinSource.query)
+         )
+
+         if query.selects is None:
+             query.selects = {}
+         for col in required_columns:
+             if col not in query.selects:
+                 query.selects[col] = col
+         if "ts" in query.selects:  # ts cannot be in selects.
+             ts = query.selects["ts"]
+             del query.selects["ts"]
+             if query.timeColumn is None:
+                 query.timeColumn = ts
+             assert query.timeColumn == ts, (
+                 f"mismatched `ts`: {ts} and `timeColumn`: {query.timeColumn} "
+                 f"in source {source}. Please specify only the `timeColumn`"
+             )
+         return source
+
+     def _normalize_source(source):
+         if isinstance(source, ttypes.EventSource):
+             return ttypes.Source(events=source)
+         elif isinstance(source, ttypes.EntitySource):
+             return ttypes.Source(entities=source)
+         elif isinstance(source, ttypes.JoinSource):
+             utils.__set_name(source.join, ttypes.Join, "joins")
+             if not source.join.metaData.outputNamespace:
+                 source.join.metaData.outputNamespace = output_namespace
+             return ttypes.Source(joinSource=source)
+         elif isinstance(source, ttypes.Source):
+             if source.entities:
+                 return _normalize_source(source.entities)
+             elif source.events:
+                 return _normalize_source(source.events)
+             elif source.joinSource:
+                 return _normalize_source(source.joinSource)
+             else:
+                 return source
+         else:
+             print("unrecognized " + str(source))
+
+     if not isinstance(sources, list):
+         sources = [sources]
+
+     sources = [_sanitize_columns(_normalize_source(source)) for source in sources]
+
+     # get caller's filename to assign team
+     team = inspect.stack()[1].filename.split("/")[-2]
+
+     exec_info = common.ExecutionInfo(
+         scheduleCron=offline_schedule,
+         conf=conf,
+         env=env_vars,
+         stepDays=step_days,
+         historicalBackfill=disable_historical_backfill,
+         clusterConf=cluster_conf,
+     )
+
+     column_tags = {}
+     if aggregations:
+         for agg in aggregations:
+             if hasattr(agg, "tags") and agg.tags:
+                 for output_col in get_output_col_names(agg):
+                     column_tags[output_col] = agg.tags
+
+     metadata = ttypes.MetaData(
+         online=online,
+         production=production,
+         outputNamespace=output_namespace,
+         tableProperties=table_properties,
+         team=team,
+         executionInfo=exec_info,
+         tags=tags if tags else None,
+         columnTags=column_tags if column_tags else None,
+         version=str(version),
+     )
+
+     group_by = ttypes.GroupBy(
+         sources=sources,
+         keyColumns=keys,
+         aggregations=aggregations,
+         metaData=metadata,
+         backfillStartDate=backfill_start_date,
+         accuracy=accuracy,
+         derivations=derivations,
+     )
+     validate_group_by(group_by)
+
+     # Add the table property that calls the private function
+     group_by.__class__.table = property(lambda self: _get_output_table_name(self, full_name=True))
+
+     return group_by
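Putting the pieces together, a minimal end-to-end sketch of how this module is typically used. Table, topic, and column names are hypothetical, and the Source/Query structs are built directly from gen_thrift.api.ttypes, mirroring the docstring examples above::

    import gen_thrift.api.ttypes as chronon

    purchases_source = chronon.Source(events=chronon.EventSource(
        table="shop.purchases",  # hypothetical hive table
        query=chronon.Query(
            selects={"user_id": "user_id", "price": "price"},
            timeColumn="purchase_ts",
        ),
    ))

    purchases_v1 = GroupBy(
        version=1,
        sources=purchases_source,
        keys=["user_id"],
        aggregations=[
            Aggregation(input_column="price", operation=Operation.SUM, windows=["7d", "30d"]),
            Aggregation(input_column="price", operation=Operation.APPROX_PERCENTILE([0.5, 0.95])),
        ],
        online=True,
    )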