awx-zipline-ai 0.0.32 (awx_zipline_ai-0.0.32-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/group_by.py
ADDED
@@ -0,0 +1,692 @@
# Copyright (C) 2023 The Chronon Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import json
import logging
from copy import deepcopy
from typing import Callable, Dict, List, Optional, Tuple, Union

import gen_thrift.api.ttypes as ttypes
import gen_thrift.common.ttypes as common

import ai.chronon.utils as utils
import ai.chronon.windows as window_utils

OperationType = int  # type(zthrift.Operation.FIRST)


def _get_output_table_name(obj, full_name: bool = False):
    """
    Group by backfill output table name
    To be synced with api.Extensions.scala
    """
    if not obj.metaData.name:
        utils.__set_name(obj, ttypes.GroupBy, "group_bys")
    return utils.output_table_name(obj, full_name)


# The GroupBy's default online/production status is None and it will inherit
# online/production status from the Joins it is included.
# If it is included in multiple joins, it is considered online/production
# if any of the joins are online/production. Otherwise it is not online/production
# unless it is explicitly marked as online/production on the GroupBy itself.
DEFAULT_ONLINE = None
DEFAULT_PRODUCTION = None
LOGGER = logging.getLogger()


def collector(
    op: ttypes.Operation,
) -> Callable[[ttypes.Operation], Tuple[ttypes.Operation, Dict[str, str]]]:
    return lambda k: (op, {"k": str(k)})


def generic_collector(op: ttypes.Operation, required, **kwargs):
    def _collector(*args, **other_args):
        arguments = kwargs.copy() if kwargs else {}
        for idx, arg in enumerate(required):
            arguments[arg] = args[idx]
        arguments.update(other_args)
        return (op, {k: str(v) for k, v in arguments.items()})

    return _collector


# To simplify imports
class Accuracy(ttypes.Accuracy):
    pass


class Operation:
    MIN = ttypes.Operation.MIN
    """Minimum value in the column"""

    MAX = ttypes.Operation.MAX
    """Maximum value in the column"""

    FIRST = ttypes.Operation.FIRST
    """First non-null value of input column by time column"""

    LAST = ttypes.Operation.LAST
    """Last non-null value of input column by time column"""

    APPROX_UNIQUE_COUNT = ttypes.Operation.APPROX_UNIQUE_COUNT
    """Approximate count of unique values using CPC (Compressed Probability Counting) sketch"""

    APPROX_UNIQUE_COUNT_LGK = collector(ttypes.Operation.APPROX_UNIQUE_COUNT)
    """Configurable approximate unique count with lgK parameter for sketch size tuning.
    Default lgK is 8. See CpcSketch.java for accuracy vs size tradeoffs:
    https://github.com/apache/incubator-datasketches-java/blob/master/src/main/java/org/apache/datasketches/cpc/CpcSketch.java#L180
    """

    UNIQUE_COUNT = ttypes.Operation.UNIQUE_COUNT
    """
    Exact count of unique values of the input column.
    Will store the set of items and can be expensive if the cardinality of the column is high.
    """

    COUNT = ttypes.Operation.COUNT
    """Total count of non-null values of the input column"""

    SUM = ttypes.Operation.SUM
    """Sum of values in the input column"""

    AVERAGE = ttypes.Operation.AVERAGE
    """Arithmetic mean of values in the input column"""

    VARIANCE = ttypes.Operation.VARIANCE
    """Statistical variance of values in the input column"""

    SKEW = ttypes.Operation.SKEW
    """Skewness (third standardized moment) of the distribution of values in input column"""

    KURTOSIS = ttypes.Operation.KURTOSIS
    """Kurtosis (fourth standardized moment) of the distribution of values in input column"""

    HISTOGRAM = ttypes.Operation.HISTOGRAM
    """Full frequency distribution of values"""

    FREQUENT_K = collector(ttypes.Operation.HISTOGRAM)
    """
    !! Could be expensive if the cardinality of the column is high !!
    Computes columns values that are frequent in the input column exactly.
    Produces a map of items as keys and counts as values.
    """

    APPROX_FREQUENT_K = collector(ttypes.Operation.APPROX_FREQUENT_K)
    """
    Computes columns values that are frequent in the input column approximately.
    Produces a map of items as keys and counts as values approximately.
    """

    APPROX_HEAVY_HITTERS_K = collector(ttypes.Operation.APPROX_HEAVY_HITTERS_K)
    """
    Computes column values that are skewed in the input column.
    Produces a map of items as keys and counts as values approximately.
    Different from APPROX_FREQUENT_K in that it only retains if a value is abnormally
    more frequent.
    """

    FIRST_K = collector(ttypes.Operation.FIRST_K)
    """Returns first k input column values by time column"""

    LAST_K = collector(ttypes.Operation.LAST_K)
    """Returns last k input column values by time column"""

    TOP_K = collector(ttypes.Operation.TOP_K)
    """Returns k largest values of the input column. Input needs to be sortable."""

    BOTTOM_K = collector(ttypes.Operation.BOTTOM_K)
    """Returns k smallest values of the input column"""

    UNIQUE_TOP_K = collector(ttypes.Operation.UNIQUE_TOP_K)
    """Returns top k unique elements ranked by their values. Automatically deduplicates inputs. For structs, requires sort_key (String) and unique_id (Long) fields."""

    APPROX_PERCENTILE = generic_collector(ttypes.Operation.APPROX_PERCENTILE, ["percentiles"], k=20)
    """Approximate percentile calculation with configurable accuracy parameter k=20"""


def Aggregations(**agg_dict):
    assert all(isinstance(agg, ttypes.Aggregation) for agg in agg_dict.values())
    for key, agg in agg_dict.items():
        if not agg.inputColumn:
            agg.inputColumn = key
    return agg_dict.values()


def DefaultAggregation(keys, sources, operation=Operation.LAST, tags=None):
    aggregate_columns = []
    for source in sources:
        query = utils.get_query(source)
        columns = utils.get_columns(source)
        non_aggregate_columns = keys + [
            "ts",
            "is_before",
            "mutation_ts",
            "ds",
            query.timeColumn,
        ]
        aggregate_columns += [column for column in columns if column not in non_aggregate_columns]
    return [
        Aggregation(operation=operation, input_column=column, tags=tags)
        for column in aggregate_columns
    ]


class TimeUnit:
    HOURS = common.TimeUnit.HOURS
    DAYS = common.TimeUnit.DAYS


def window_to_str_pretty(window: common.Window):
    unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()
    return f"{window.length} {unit}"


def op_to_str(operation: OperationType):
    return ttypes.Operation._VALUES_TO_NAMES[operation].lower()


# See docs/Aggregations.md
def Aggregation(
    input_column: str = None,
    operation: Union[ttypes.Operation, Tuple[ttypes.Operation, Dict[str, str]]] = None,
    windows: Union[List[common.Window], List[str]] = None,
    buckets: List[str] = None,
    tags: Dict[str, str] = None,
) -> ttypes.Aggregation:
    """
    :param input_column:
        Column on which the aggregation needs to be performed.
        This should be one of the input columns specified on the keys of the `select` in the `Query`'s `Source`
    :type input_column: str
    :param operation:
        Operation to use to aggregate the input columns. For example, MAX, MIN, COUNT
        Some operations have arguments, like last_k, approx_percentiles etc.,
        Defaults to "LAST".
    :type operation: ttypes.Operation
    :param windows:
        Length to window to calculate the aggregates on. Strings like "1h", "30d" are also accepted.
        Minimum window size is 1hr. Maximum can be arbitrary. When not defined, the computation is un-windowed.
    :type windows: List[common.Window]
    :param buckets:
        Besides the GroupBy.keys, this is another level of keys for use under this aggregation.
        Using this would create an output as a map of string to aggregate.
    :type buckets: List[str]
    :return: An aggregate defined with the specified operation.
    """
    # Default to last
    operation = operation if operation is not None else Operation.LAST
    arg_map = {}
    if isinstance(operation, tuple):
        operation, arg_map = operation[0], operation[1]

    def normalize(w: Union[common.Window, str]) -> common.Window:
        if isinstance(w, str):
            return window_utils._from_str(w)
        elif isinstance(w, common.Window):
            return w
        else:
            raise Exception("window should be either a string like '7d', '24h', or a Window type")

    norm_windows = [normalize(w) for w in windows] if windows else None

    agg = ttypes.Aggregation(input_column, operation, arg_map, norm_windows, buckets)

    agg.tags = tags
    return agg


def Window(length: int, time_unit: common.TimeUnit) -> common.Window:
    return common.Window(length, time_unit)


def Derivation(name: str, expression: str) -> ttypes.Derivation:
    """
    Derivation allows arbitrary SQL select clauses to be computed using columns from the output of group by backfill
    output schema. It is supported for offline computations for now.

    If both name and expression are set to "*", then every raw column will be included along with the derived columns.

    :param name: output column name of the SQL expression
    :param expression: any valid Spark SQL select clause based on joinPart or externalPart columns
    :return: a Derivation object representing a single derived column or a wildcard ("*") selection.
    """
    return ttypes.Derivation(name=name, expression=expression)


def contains_windowed_aggregation(aggregations: Optional[List[ttypes.Aggregation]]):
    if not aggregations:
        return False
    for agg in aggregations:
        if agg.windows:
            return True
    return False


def validate_group_by(group_by: ttypes.GroupBy):
    sources = group_by.sources
    keys = group_by.keyColumns
    aggregations = group_by.aggregations
    # check ts is not included in query.select
    first_source_columns = set(utils.get_columns(sources[0]))
    # TODO undo this check after ml_models CI passes
    assert "ts" not in first_source_columns, (
        "'ts' is a reserved key word for Chronon, please specify the expression in timeColumn"
    )
    for src in sources:
        query = utils.get_query(src)
        if src.events:
            assert query.mutationTimeColumn is None, (
                "ingestionTimeColumn should not be specified for "
                "event source as it should be the same with timeColumn"
            )
            assert query.reversalColumn is None, (
                "reversalColumn should not be specified for event source as it won't have mutations"
            )
            if group_by.accuracy != Accuracy.SNAPSHOT:
                assert query.timeColumn is not None, (
                    "please specify query.timeColumn for non-snapshot accurate "
                    "group by with event source"
                )
        else:
            if contains_windowed_aggregation(aggregations):
                assert query.timeColumn, (
                    "Please specify timeColumn for entity source with windowed aggregations"
                )

    column_set = None
    # all sources should select the same columns
    for i, source in enumerate(sources[1:]):
        column_set = set(utils.get_columns(source))
        column_diff = column_set ^ first_source_columns
        assert not column_diff, f"""
            Mismatched columns among sources [1, {i + 2}], Difference: {column_diff}
        """

    # all keys should be present in the selected columns
    unselected_keys = set(keys) - first_source_columns
    assert not unselected_keys, f"""
        Keys {unselected_keys}, are unselected in source
    """

    # Aggregations=None is only valid if group_by is Entities
    if aggregations is None:
        is_events = any([s.events for s in sources])
        has_mutations = (
            any(
                [
                    (s.entities.mutationTable is not None or s.entities.mutationTopic is not None)
                    for s in sources
                    if s.entities is not None
                ]
            )
            if not is_events
            else False
        )
        assert not (is_events or has_mutations), (
            "You can only set aggregations=None in an EntitySource without mutations"
        )
    else:
        columns = set([c for src in sources for c in utils.get_columns(src)])
        for agg in aggregations:
            assert agg.inputColumn, (
                f"input_column is required for all operations, found: input_column = {agg.inputColumn} "
                f"and operation {op_to_str(agg.operation)}"
            )
            assert (agg.inputColumn in columns) or (agg.inputColumn == "ts"), (
                f"input_column: for aggregation is not part of the query. Available columns: {column_set} "
                f"input_column: {agg.inputColumn}"
            )
            if agg.operation == ttypes.Operation.APPROX_PERCENTILE:
                if agg.argMap is not None and agg.argMap.get("percentiles") is not None:
                    try:
                        percentile_array = json.loads(agg.argMap["percentiles"])
                        assert isinstance(percentile_array, list)
                        assert all([float(p) >= 0 and float(p) <= 1 for p in percentile_array])
                    except Exception as e:
                        LOGGER.exception(e)
                        raise ValueError(
                            "[Percentiles] Unable to decode percentiles value, expected json array with values between"
                            f" 0 and 1 inclusive (ex: [0.6, 0.1]), received: {agg.argMap['percentiles']}"
                        ) from e
                else:
                    raise ValueError(
                        f"[Percentiles] Unsupported arguments for {op_to_str(agg.operation)}, "
                        "example required: {'k': '128', 'percentiles': '[0.4,0.5,0.95]'},"
                        f" received: {agg.argMap}\n"
                    )
            if agg.windows:
                assert not (
                    # Snapshot accuracy.
                    (
                        (group_by.accuracy and group_by.accuracy == Accuracy.SNAPSHOT)
                        or group_by.backfillStartDate
                    )
                    and
                    # Hourly aggregation.
                    any([window.timeUnit == TimeUnit.HOURS for window in agg.windows])
                ), (
                    "Detected a snapshot accuracy group by with an hourly aggregation. Resolution with snapshot "
                    "accuracy is not fine enough to allow hourly group bys. Consider removing the `backfill start "
                    "date` param if set or adjusting the aggregation window. "
                    f"input_column: {agg.inputColumn}, windows: {agg.windows}"
                )


_ANY_SOURCE_TYPE = Union[ttypes.Source, ttypes.EventSource, ttypes.EntitySource, ttypes.JoinSource]


def _get_op_suffix(operation, argmap):
    op_str = op_to_str(operation)
    if operation in [
        ttypes.Operation.LAST_K,
        ttypes.Operation.TOP_K,
        ttypes.Operation.FIRST_K,
        ttypes.Operation.BOTTOM_K,
    ]:
        op_name_suffix = op_str[:-2]
        arg_suffix = argmap.get("k")
        return "{}{}".format(op_name_suffix, arg_suffix)
    else:
        return op_str


def get_output_col_names(aggregation):
    base_name = (
        f"{aggregation.inputColumn}_{_get_op_suffix(aggregation.operation, aggregation.argMap)}"
    )
    windowed_names = []
    if aggregation.windows:
        for window in aggregation.windows:
            unit = common.TimeUnit._VALUES_TO_NAMES[window.timeUnit].lower()[0]
            window_suffix = f"{window.length}{unit}"
            windowed_names.append(f"{base_name}_{window_suffix}")
    else:
        windowed_names = [base_name]

    bucketed_names = []
    if aggregation.buckets:
        for bucket in aggregation.buckets:
            bucketed_names.extend([f"{name}_by_{bucket}" for name in windowed_names])
    else:
        bucketed_names = windowed_names

    return bucketed_names


def GroupBy(
    version: int,
    sources: Union[List[_ANY_SOURCE_TYPE], _ANY_SOURCE_TYPE],
    keys: List[str],
    aggregations: Optional[List[ttypes.Aggregation]],
    derivations: List[ttypes.Derivation] = None,
    accuracy: ttypes.Accuracy = None,
    backfill_start_date: str = None,
    output_namespace: str = None,
    table_properties: Dict[str, str] = None,
    tags: Dict[str, str] = None,
    online: bool = DEFAULT_ONLINE,
    production: bool = DEFAULT_PRODUCTION,
    # execution params
    offline_schedule: str = "@daily",
    conf: common.ConfigProperties = None,
    env_vars: common.EnvironmentVariables = None,
    cluster_conf: common.ClusterConfigProperties = None,
    step_days: int = None,
    disable_historical_backfill: bool = False,
) -> ttypes.GroupBy:
    """

    :param version: TODO
    :param sources:
        can be constructed as entities or events or joinSource::

            import gen_thrift.api.ttypes as chronon
            events = chronon.Source(events=chronon.Events(
                table=YOUR_TABLE,
                topic=YOUR_TOPIC # <- OPTIONAL for serving
                query=chronon.Query(...)
                isCumulative=False # <- defaults to false.
            ))
            Or
            entities = chronon.Source(entities=chronon.Entities(
                snapshotTable=YOUR_TABLE,
                mutationTopic=YOUR_TOPIC,
                mutationTable=YOUR_MUTATION_TABLE
                query=chronon.Query(...)
            ))
            or
            joinSource = chronon.Source(joinSource=chronon.JoinSource(
                join = YOUR_CHRONON_PARENT_JOIN,
                query = chronon.Query(...)
            ))

        Multiple sources can be supplied to backfill the historical values with their respective start and end
        partitions. However, only one source is allowed to be a streaming one.
    :type sources: List[gen_thrift.api.ttypes.Events|gen_thrift.api.ttypes.Entities]
    :param keys:
        List of primary keys that defines the data that needs to be collected in the result table. Similar to the
        GroupBy in the SQL context.
    :type keys: List[String]
    :param aggregations:
        List of aggregations that needs to be computed for the data following the grouping defined by the keys::

            import gen_thrift.api.ttypes as chronon
            aggregations = [
                chronon.Aggregation(input_column="entity", operation=Operation.LAST),
                chronon.Aggregation(input_column="entity", operation=Operation.LAST, windows=['7d'])
            ],
    :type aggregations: List[gen_thrift.api.ttypes.Aggregation]
    :param online:
        Should we upload the result data of this conf into the KV store so that we can fetch/serve this GroupBy online.
        Once Online is set to True, you ideally should not change the conf.
    :type online: bool
    :param production:
        This when set can be integrated to trigger alerts. You will have to integrate this flag into your alerting
        system yourself.
    :type production: bool
    :param backfill_start_date:
        Start date from which GroupBy data should be computed. This will determine how back of a time that Chronon would
        goto to compute the resultant table and its aggregations.
    :type backfill_start_date: str
    :param env:
        This is a dictionary of "mode name" to dictionary of "env var name" to "env var value"::

            {
                'backfill' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' },
                'upload' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' }
                'streaming' : { 'VAR1' : 'VAL1', 'VAR2' : 'VAL2' }
            }

        These vars then flow into run.py and the underlying spark_submit.sh.
        These vars can be set in other places as well. The priority order (descending) is as below

            1. env vars set while using run.py "VAR=VAL run.py --mode=backfill <name>"
            2. env vars set here in Join's env param
            3. env vars set in `team.json['team.production.<MODE NAME>']`
            4. env vars set in `team.json['default.production.<MODE NAME>']`

    :type env: Dict[str, Dict[str, str]]
    :param table_properties:
        Specifies the properties on output hive tables. Can be specified in teams.json.
    :type table_properties: Dict[str, str]
    :param output_namespace:
        In backfill mode, we will produce data into hive. This represents the hive namespace that the data will be
        written into. You can set this at the teams.json level.
    :type output_namespace: str
    :param accuracy:
        Defines the computing accuracy of the GroupBy.
        If "Snapshot" is selected, the aggregations are computed based on the partition identifier - "ds" time column.
        If "Temporal" is selected, the aggregations are computed based on the event time - "ts" time column.
    :type accuracy: gen_thrift.api.ttypes.SNAPSHOT or gen_thrift.api.ttypes.TEMPORAL
    :param lag:
        Param that goes into customJson. You can pull this out of the json at path "metaData.customJson.lag"
        This is used by airflow integration to pick an older hive partition to wait on.
    :type lag: int
    :param offline_schedule:
        the offline schedule interval for batch jobs. Below is the equivalent of the cron tab commands::

            '@hourly': '0 * * * *',
            '@daily': '0 0 * * *',
            '@weekly': '0 0 * * 0',
            '@monthly': '0 0 1 * *',
            '@yearly': '0 0 1 1 *',

    :type offline_schedule: str
    :param tags:
        Additional metadata that does not directly affect feature computation, but is useful to
        track for management purposes.
    :type tags: Dict[str, str]
    :param derivations:
        Derivation allows arbitrary SQL select clauses to be computed using columns from the output of group by backfill
        output schema. It is supported for offline computations for now.
    :type derivations: List[gen_thrift.api.ttypes.Drivation]
    :param kwargs:
        Additional properties that would be passed to run.py if specified under additional_args property.
        And provides an option to pass custom values to the processing logic.
    :type kwargs: Dict[str, str]
    :param conf:
        Configuration properties for the GroupBy. Depending on the mode we layer confs with the following priority:
            1. conf set in the GroupBy.conf.<mode>
            2. conf set in the GroupBy.conf.common
            3. conf set in the team.conf.<mode>
            4. conf set in the team.conf.common
            5. conf set in the default.conf.<mode>
            6. conf set in the default.conf.common
    :param env_vars:
        Environment variables for the GroupBy. Depending on the mode we layer envs with the following priority:
            1. env vars set in the GroupBy.env.<mode>
            2. env vars set in the GroupBy.env.common
            3. env vars set in the team.env.<mode>
            4. env vars set in the team.env.common
            5. env vars set in the default.env.<mode>
            6. env vars set in the default.env.common
    :param cluster_conf:
        Cluster configuration properties for the join.
    :param step_days
        The maximum number of days to output at once
    :return:
        A GroupBy object containing specified aggregations.
    """
    assert sources, "Sources are not specified"

    assert isinstance(version, int), (
        f"Version must be an integer, but found {type(version).__name__}"
    )

    agg_inputs = []
    if aggregations is not None:
        agg_inputs = [agg.inputColumn for agg in aggregations]

    required_columns = keys + agg_inputs

    def _sanitize_columns(src: ttypes.Source):
        source = deepcopy(src)
        query = (
            source.entities.query
            if source.entities is not None
            else (source.events.query if source.events is not None else source.joinSource.query)
        )

        if query.selects is None:
            query.selects = {}
        for col in required_columns:
            if col not in query.selects:
                query.selects[col] = col
        if "ts" in query.selects:  # ts cannot be in selects.
            ts = query.selects["ts"]
            del query.selects["ts"]
            if query.timeColumn is None:
                query.timeColumn = ts
            assert query.timeColumn == ts, (
                f"mismatched `ts`: {ts} and `timeColumn`: {query.timeColumn} "
                "in source {source}. Please specify only the `timeColumn`"
            )
        return source

    def _normalize_source(source):
        if isinstance(source, ttypes.EventSource):
            return ttypes.Source(events=source)
        elif isinstance(source, ttypes.EntitySource):
            return ttypes.Source(entities=source)
        elif isinstance(source, ttypes.JoinSource):
            utils.__set_name(source.join, ttypes.Join, "joins")
            if not source.join.metaData.outputNamespace:
                source.join.metaData.outputNamespace = output_namespace
            return ttypes.Source(joinSource=source)
        elif isinstance(source, ttypes.Source):
            if source.entities:
                return _normalize_source(source.entities)
            elif source.events:
                return _normalize_source(source.events)
            elif source.joinSource:
                return _normalize_source(source.joinSource)
            else:
                return source
        else:
            print("unrecognized " + str(source))

    if not isinstance(sources, list):
        sources = [sources]

    sources = [_sanitize_columns(_normalize_source(source)) for source in sources]

    # get caller's filename to assign team
    team = inspect.stack()[1].filename.split("/")[-2]

    exec_info = common.ExecutionInfo(
        scheduleCron=offline_schedule,
        conf=conf,
        env=env_vars,
        stepDays=step_days,
        historicalBackfill=disable_historical_backfill,
        clusterConf=cluster_conf,
    )

    column_tags = {}
    if aggregations:
        for agg in aggregations:
            if hasattr(agg, "tags") and agg.tags:
                for output_col in get_output_col_names(agg):
                    column_tags[output_col] = agg.tags

    metadata = ttypes.MetaData(
        online=online,
        production=production,
        outputNamespace=output_namespace,
        tableProperties=table_properties,
        team=team,
        executionInfo=exec_info,
        tags=tags if tags else None,
        columnTags=column_tags if column_tags else None,
        version=str(version),
    )

    group_by = ttypes.GroupBy(
        sources=sources,
        keyColumns=keys,
        aggregations=aggregations,
        metaData=metadata,
        backfillStartDate=backfill_start_date,
        accuracy=accuracy,
        derivations=derivations,
    )
    validate_group_by(group_by)

    # Add the table property that calls the private function
    group_by.__class__.table = property(lambda self: _get_output_table_name(self, full_name=True))

    return group_by
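
For orientation, a minimal sketch of how the GroupBy, Aggregation, and Operation helpers above could be wired together. The table and column names (data.purchases, user_id, price, ts) are hypothetical placeholders, and the EventSource/Query field names are assumed from the docstring example and the attribute accesses in this file rather than from the generated thrift schema.

import gen_thrift.api.ttypes as ttypes
from ai.chronon.group_by import Aggregation, GroupBy, Operation

# Hypothetical event source: a purchases table keyed by user_id with an event-time column "ts".
purchases = ttypes.Source(
    events=ttypes.EventSource(
        table="data.purchases",  # placeholder table name
        query=ttypes.Query(
            selects={"user_id": "user_id", "price": "price"},
            timeColumn="ts",  # required for non-snapshot accuracy with an event source
        ),
    )
)

purchase_features = GroupBy(
    version=0,
    sources=purchases,
    keys=["user_id"],
    aggregations=[
        # "7d"/"30d" strings are normalized into Window objects by Aggregation().
        Aggregation(input_column="price", operation=Operation.AVERAGE, windows=["7d", "30d"]),
        # Parameterized operations are (operation, arg_map) tuples built by the collector helpers.
        Aggregation(input_column="price", operation=Operation.APPROX_PERCENTILE([0.5, 0.95])),
        Aggregation(input_column="price", operation=Operation.LAST_K(10)),
    ],
    online=True,
)

Per get_output_col_names above, the windowed averages would surface as price_average_7d and price_average_30d, and the LAST_K(10) aggregation as price_last10.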