awx-zipline-ai 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/join.py
ADDED
@@ -0,0 +1,580 @@
# Copyright (C) 2023 The Chronon Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import gc
import importlib
import logging
from collections import Counter
from typing import Dict, List, Tuple, Union

import gen_thrift.api.ttypes as api
import gen_thrift.common.ttypes as common

import ai.chronon.repo.extract_objects as eo
import ai.chronon.utils as utils
from ai.chronon.cli.compile import parse_teams

logging.basicConfig(level=logging.INFO)


def _get_output_table_name(join: api.Join, full_name: bool = False):
    """Generate the output table name for a join backfill job."""
    # join sources could also be created inline alongside the groupBy file,
    # so we specify group_bys as the fallback module
    if isinstance(join, api.Join):
        utils.__set_name(join, api.Join, "joins")
    # set output namespace
    if not join.metaData.outputNamespace:
        team_name = join.metaData.name.split(".")[0]
        namespace = (
            parse_teams.load_teams(utils.chronon_root_path, print=False)
            .get(team_name)
            .outputNamespace
        )
        join.metaData.outputNamespace = namespace
    return utils.output_table_name(join, full_name=full_name)


def JoinPart(
    group_by: api.GroupBy,
    key_mapping: Dict[str, str] = None,
    prefix: str = None,
    tags: Dict[str, str] = None,
) -> api.JoinPart:
    """
    Specifies HOW to join the `left` of a Join with GroupBy's.

    :param group_by:
        The GroupBy object to join with. Keys on left are used to equi join with keys on right.
        When left is entities, all GroupBy's are computed as of midnight.
        When left is events, we do a point-in-time join when right.accuracy == TEMPORAL OR right.source.topic != null.
    :type group_by: ai.chronon.api.GroupBy
    :param key_mapping:
        Names of keys don't always match on left and right; this mapping tells us how to map them when they don't.
    :type key_mapping: Dict[str, str]
    :param prefix:
        All the output columns of the groupBy will be prefixed with this string. This is used when you need to join
        the same groupBy more than once with `left`. Say on the left you have seller and buyer, on the groupBy you
        have a user's avg_price, and you want to join the left (seller, buyer) with (seller_avg_price,
        buyer_avg_price); you would use the key_mapping and prefix parameters.
    :param tags:
        Additional metadata about the JoinPart that you wish to track. Does not affect computation.
    :type tags: Dict[str, str]
    :return:
        JoinPart specifies how the left side of a join, or the query in the online setting, joins with right side
        components like GroupBys.
    """

    assert isinstance(group_by, api.GroupBy), (
        f"Expecting GroupBy. But found {type(group_by).__name__}"
    )

    # used for reset before the next run
    import_copy = __builtins__["__import__"]
    # get group_by's module info from the garbage collector
    gc.collect()

    group_by_module_name = None
    for ref in gc.get_referrers(group_by):
        if (
            isinstance(
                ref, dict
            )  # Attaching methods to GroupBy adds references in GC, need to filter those out
            and "__name__" in ref
            and ref["__name__"].startswith("group_bys")
        ):
            group_by_module_name = ref["__name__"]
            break

    if group_by_module_name:
        logging.debug(
            "group_by's module info from garbage collector {}".format(group_by_module_name)
        )
        group_by_module = importlib.import_module(group_by_module_name)
        __builtins__["__import__"] = eo.import_module_set_name(group_by_module, api.GroupBy)
    else:
        if not group_by.metaData.name:
            logging.error("No group_by file or custom group_by name found")
            raise ValueError(
                "[GroupBy] Must specify a group_by name if group_by is not defined in separate file. "
                "You may pass it in via GroupBy.name. \n"
            )

    if key_mapping:
        utils.check_contains(
            key_mapping.values(), group_by.keyColumns, "key", group_by.metaData.name
        )

    join_part = api.JoinPart(groupBy=group_by, keyMapping=key_mapping, prefix=prefix)
    join_part.tags = tags
    # reset before the next run
    __builtins__["__import__"] = import_copy
    return join_part


FieldsType = List[Tuple[str, api.TDataType]]


class DataType:
    """
    Helper class to generate data types for declaring schemas.
    This supports primitives like numerics, string etc., and complex
    types like Map, List, Struct etc.
    """

    BOOLEAN = api.TDataType(api.DataKind.BOOLEAN)
    SHORT = api.TDataType(api.DataKind.SHORT)
    INT = api.TDataType(api.DataKind.INT)
    LONG = api.TDataType(api.DataKind.LONG)
    FLOAT = api.TDataType(api.DataKind.FLOAT)
    DOUBLE = api.TDataType(api.DataKind.DOUBLE)
    STRING = api.TDataType(api.DataKind.STRING)
    BINARY = api.TDataType(api.DataKind.BINARY)

    # Types unsupported by Avro. See AvroConversions.scala#fromChrononSchema
    # BYTE = api.TDataType(api.DataKind.BYTE)
    # DATE = api.TDataType(api.DataKind.DATE)
    # TIMESTAMP = api.TDataType(api.DataKind.TIMESTAMP)

    def MAP(key_type: api.TDataType, value_type: api.TDataType) -> api.TDataType:
        assert key_type == api.TDataType(api.DataKind.STRING), (
            "key_type has to STRING for MAP types"
        )

        return api.TDataType(
            api.DataKind.MAP,
            params=[api.DataField("key", key_type), api.DataField("value", value_type)],
        )

    def LIST(elem_type: api.TDataType) -> api.TDataType:
        return api.TDataType(api.DataKind.LIST, params=[api.DataField("elem", elem_type)])

    def STRUCT(name: str, *fields: FieldsType) -> api.TDataType:
        return api.TDataType(
            api.DataKind.STRUCT,
            params=[api.DataField(name, data_type) for (name, data_type) in fields],
            name=name,
        )


def ExternalSource(
    name: str,
    team: str,
    key_fields: FieldsType,
    value_fields: FieldsType,
) -> api.ExternalSource:
    """
    External sources are online-only data sources. During fetching, using the
    chronon java client, they consume a Request containing a key map
    (name string to value) and produce a Response containing a value map.

    This is primarily used in Joins. We also expose a fetchExternal method in
    the java client library that can be used to fetch a batch of External source
    requests efficiently.

    Internally Chronon will batch these requests to the service and parallelize
    fetching from different services, while de-duplicating given a batch of
    join requests.

    The implementation of how to fetch is an `ExternalSourceHandler` in the
    scala/java api that needs to be registered while implementing
    ai.chronon.online.Api with the name used in the ExternalSource. This is
    meant for re-usability of external source definitions.

    :param name: name of the external source to fetch from. Should match
        the name in the registry.
    :param key_fields: List of tuples of string and DataType. This is what
        will be given to the ExternalSource handler registered in the Java API.
        Eg., `[('key1', DataType.INT), ('key2', DataType.STRING)]`
    :param value_fields: List of tuples of string and DataType. This is what
        the ExternalSource handler will respond with::

            [
                ('value0', DataType.INT),
                ('value1', DataType.MAP(DataType.STRING, DataType.LONG)),
                ('value2', DataType.STRUCT(
                    'Context',
                    ('field1', DataType.INT),
                    ('field2', DataType.DOUBLE)
                ))
            ]

    """
    assert name != "contextual", "Please use `ContextualSource`"
    return api.ExternalSource(
        metadata=api.MetaData(name=name, team=team),
        keySchema=DataType.STRUCT(f"ext_{name}_keys", *key_fields),
        valueSchema=DataType.STRUCT(f"ext_{name}_values", *value_fields),
    )


def ContextualSource(fields: FieldsType, team="default") -> api.ExternalSource:
    """
    Contextual source values are passed along for logging. No external request is
    actually made.
    """
    return api.ExternalSource(
        metadata=api.MetaData(name="contextual", team=team),
        keySchema=DataType.STRUCT("contextual_keys", *fields),
        valueSchema=DataType.STRUCT("contextual_values", *fields),
    )


def ExternalPart(
    source: api.ExternalSource, key_mapping: Dict[str, str] = None, prefix: str = None
) -> api.ExternalPart:
    """
    Used to describe which ExternalSources to pull features from while fetching
    online. This data also goes into logs based on the sample percent.

    Just as in JoinPart, key_mapping is used to map the join left's keys to the
    external source's keys. "vendor" and "buyer" on the left side (query map)
    could both map to a "user" in an account data external source. You would
    create one ExternalPart for vendor with params:
    `(key_mapping={vendor: user}, prefix=vendor)`
    and another for buyer.

    This doesn't have any implications offline besides logging. "right_parts"
    can be both backfilled and logged. Whereas, "external_parts" can only be
    logged. If you need the ability to backfill an external source, look into
    creating an EntitySource with mutation data for point-in-time correctness.

    :param source: External source to join with
    :param key_mapping: How to map the keys from the query/left side to the
        source
    :param prefix: Sometimes you want to use the same source to fetch data for
        different entities in the query. E.g., a transaction
        between a buyer and a seller might query a "user information"
        service/source that has information about both buyer &
        seller.
    """
    return api.ExternalPart(source=source, keyMapping=key_mapping, prefix=prefix)


def LabelParts(
    labels: List[api.JoinPart],
    left_start_offset: int,
    left_end_offset: int,
    label_offline_schedule: str = "@daily",
) -> api.LabelParts:
    """
    Used to describe labels in a join. A label part can be viewed as a regular join part but represents
    label data instead of regular feature data. Once labels are mature, the label join job joins
    labels with features in the training window the user specified within the label GroupBys.

    Since the label join job runs continuously based on the schedule, multiple labels could be generated but with
    different label_ds or label versions. The label join job keeps all computed label versions available, as well as
    a view of the latest version for easy label retrieval.

    The LabelParts definition can be updated along the way, but the label join job can only accommodate these changes
    going forward unless a backfill is manually triggered.

    Label aggregation is also supported, but with conditions applied. A single aggregation with one window is allowed
    for now. If aggregation is present, we infer left_start_offset and left_end_offset to be the same as the window
    size and the param input is ignored.

    :param labels: List of labels
    :param label_offline_schedule: Cron expression for Airflow to schedule a DAG for offline
        label join compute tasks
    """

    exec_info = common.ExecutionInfo(
        scheduleCron=label_offline_schedule,
    )
    label_metadata = api.MetaData(executionInfo=exec_info)

    return api.LabelParts(
        labels=labels,
        metaData=label_metadata,
    )


def Derivation(name: str, expression: str) -> api.Derivation:
    """
    Derivation allows arbitrary SQL select clauses to be computed using columns from joinParts and externalParts,
    and saves the results as derived columns. The results are available both in the online fetching response map
    and in the offline Hive table.

    joinPart column names are automatically constructed according to the convention
    `{join_part_prefix}_{group_by_name}_{input_column_name}_{aggregation_operation}_{window}_{by_bucket}`,
    where prefix, window and bucket are optional. You can find the type information of columns using the analyzer
    tool.

    externalPart column names are automatically constructed according to the convention
    `ext_{external_source_name}_{value_column}`.
    Types are defined along with the schema by users for external sources.

    Note that only values can be used in derivations, not keys. If you want to use a key in a derivation, you must
    define it as a contextual field. You also must refer to a contextual field with its prefix included, for example:
    `ext_contextual_request_id`.

    If both name and expression are set to "*", then every raw column will be included along with the derived columns.

    :param name: output column name of the SQL expression
    :param expression: any valid Spark SQL select clause based on joinPart or externalPart columns
    :return: a Derivation object representing a single derived column or a wildcard ("*") selection.
    """
    return api.Derivation(name=name, expression=expression)


def BootstrapPart(
    table: str, key_columns: List[str] = None, query: api.Query = None
) -> api.BootstrapPart:
    """
    Bootstrap is the concept of using pre-computed feature values and skipping backfill computation during the
    training data generation phase. Bootstrap can be used for many purposes:
    - Generating ongoing feature values from logs
    - Backfilling feature values for external features (in which case Chronon is unable to run backfill)
    - Initializing a new Join by migrating old data from an older Join and reusing data

    One can bootstrap against any of these:

    - join part fields:
      Bootstrap can happen at the individual field level within a join part.
      If all fields within a group by are bootstrapped, then we skip computation for the group by. Otherwise, the
      whole thing will be re-run, but only the values for the non-bootstrapped fields will be retained in the final
      table.
    - external part fields:
      Bootstrap can happen at the individual field level within an external part.
      Since there is no backfill logic in chronon for external parts, all non-bootstrapped fields in external parts
      are left as NULLs.
    - derivation fields:
      Derived fields can also be bootstrapped. Since derived fields depend on "base" fields (either join part or
      external part), chronon will try to trigger the least amount of computation possible. For example,
      if there is a join part where all derived fields that depend on the join part have been bootstrapped,
      then we skip the computation for this join part.
    - keys:
      Keys of both join parts and external parts can be bootstrapped. During offline table generation, we will first
      try to utilize the key's data from the left table; if it's not there, then we utilize the bootstrap.
      For contextual features, we also support propagating the key bootstrap to the values.

    :param table: Name of the hive table that contains feature values where rows are 1:1 mapped to the left table
    :param key_columns: Keys to join the bootstrap table to the left table
    :param query: Selected columns (features & keys) and filtering conditions of the bootstrap tables.
    """
    return api.BootstrapPart(table=table, query=query, keyColumns=key_columns)


def Join(
    left: api.Source,
    right_parts: List[api.JoinPart],
    version: int,
    row_ids: Union[str, List[str]],
    online_external_parts: List[api.ExternalPart] = None,
    bootstrap_parts: List[api.BootstrapPart] = None,
    bootstrap_from_log: bool = False,
    skew_keys: Dict[str, List[str]] = None,
    derivations: List[api.Derivation] = None,
    label_part: api.LabelParts = None,
    output_namespace: str = None,
    table_properties: Dict[str, str] = None,
    online: bool = False,
    production: bool = False,
    sample_percent: float = 100.0,
    check_consistency: bool = None,
    consistency_sample_percent: float = 5.0,
    use_long_names: bool = False,
    # execution params
    offline_schedule: str = "@daily",
    historical_backfill: bool = None,
    conf: common.ConfigProperties = None,
    env_vars: common.EnvironmentVariables = None,
    cluster_conf: common.ClusterConfigProperties = None,
    step_days: int = None,
) -> api.Join:
    """
    Construct a join object. A join can pull together data from various GroupBy's both offline and online. This is
    also the focal point for logging, data quality computation and monitoring. A join maps 1:1 to models in ML usage.

    :param left:
        The source on the left side. When Entities, all GroupBys are joined with SNAPSHOT accuracy (midnight values).
        When left is events, and on the right either the GroupBy's are TEMPORAL or a topic is specified, we perform
        a TEMPORAL / point-in-time join.
    :type left: ai.chronon.api.Source
    :param right_parts:
        The list of groupBy's to join with. GroupBy's are wrapped in a JoinPart, which contains additional information
        on how to join the left side with the GroupBy.
    :type right_parts: List[ai.chronon.api.JoinPart]
    :param check_consistency:
        If online serving data should be compared with backfill data - as online-offline-consistency metrics.
        The metrics go into hive and your configured kv store for further visualization and monitoring.
    :type check_consistency: bool
    :param additional_args:
        Additional args go into `customJson` of `ai.chronon.api.MetaData` within the `ai.chronon.api.Join` object.
        This is a place for arbitrary information you want to tag your conf with.
    :type additional_args: List[str]
    :param additional_env:
        Deprecated, see env
    :type additional_env: List[str]
    :param online:
        Should we upload this conf into the kv store so that we can fetch/serve this join online.
        Once online is set to True, you ideally should not change the conf.
    :type online: bool
    :param production:
        This when set can be integrated to trigger alerts. You will have to integrate this flag into your alerting
        system yourself.
    :type production: bool
    :param output_namespace:
        In backfill mode, we will produce data into hive. This represents the hive namespace that the data will be
        written into. You can set this at the teams.json level.
    :type output_namespace: str
    :param table_properties:
        Specifies the properties on output hive tables. Can be specified in teams.json.
    :param lag:
        Param that goes into customJson. You can pull this out of the json at path "metaData.customJson.lag".
        This is used by the airflow integration to pick an older hive partition to wait on.
    :param skew_keys:
        While back-filling, if there are known irrelevant keys - like user_id = 0 / NULL etc. - you can specify them
        here. This is used to blacklist crawlers etc.
    :param sample_percent:
        Online only parameter. What percent of online serving requests to this join should be logged into the
        warehouse.
    :param consistency_sample_percent:
        Online only parameter. What percent of online serving requests to this join should be sampled to compute
        online offline consistency metrics.
        If sample_percent=50.0 and consistency_sample_percent=10.0, then the consistency job effectively runs on
        5% of total traffic.
    :param online_external_parts:
        Users can register external sources into the Api implementation. The Chronon fetcher can invoke the
        implementation. This is applicable only for online fetching. Offline this will not produce any values.
    :param offline_schedule:
        Cron expression for Airflow to schedule a DAG for offline join compute tasks
    :param row_ids:
        Columns of the left table that uniquely define a training record. Used as default keys during bootstrap.
    :param bootstrap_parts:
        A list of BootstrapPart used for the Join. See the BootstrapPart doc for more details.
    :param bootstrap_from_log:
        If set to True, will use the logging table to generate training data by default and skip continuous backfill.
        Logging will be treated as another bootstrap source, but other bootstrap_parts will take precedence.
    :param label_part:
        Label part which contains a list of labels and the label refresh window boundary used for the Join
    :param historical_backfill:
        Flag to indicate whether join backfill should backfill previous holes.
        Setting to false will only backfill the latest single partition.
    :type historical_backfill: bool
    :return:
        A join object that can be used to backfill or serve data. For ML use-cases this should map 1:1 to a model.
    :param conf:
        Configuration properties for the join. Depending on the mode we layer confs with the following priority:
        1. conf set in the join.conf.<mode>
        2. conf set in the join.conf.common
        3. conf set in the team.conf.<mode>
        4. conf set in the team.conf.common
        5. conf set in the default.conf.<mode>
        6. conf set in the default.conf.common
    :param env_vars:
        Environment variables for the join. Depending on the mode we layer envs with the following priority:
        1. env vars set in the join.env.<mode>
        2. env vars set in the join.env.common
        3. env vars set in the team.env.<mode>
        4. env vars set in the team.env.common
        5. env vars set in the default.env.<mode>
        6. env vars set in the default.env.common
    :param cluster_conf:
        Cluster configuration properties for the join.
    :param step_days:
        The maximum number of days to output at once.
    """
    # Normalize row_ids
    if isinstance(row_ids, str):
        row_ids = [row_ids]

    assert isinstance(version, int), (
        f"Version must be an integer, but found {type(version).__name__}"
    )

    # create a deep copy for the case where multiple LeftOuterJoin use the same left;
    # validation would fail after the first iteration otherwise
    updated_left = copy.deepcopy(left)
    if left.events and left.events.query.selects:
        assert "ts" not in left.events.query.selects.keys(), (
            "'ts' is a reserved key word for Chronon, please specify the expression in timeColumn"
        )
        # mapping ts to query.timeColumn applies to events only
        updated_left.events.query.selects.update({"ts": updated_left.events.query.timeColumn})

    if label_part:
        label_metadata = api.MetaData(
            executionInfo=label_part.metaData.executionInfo,
        )
        label_part = api.LabelParts(
            labels=label_part.labels,
            leftStartOffset=label_part.leftStartOffset,
            leftEndOffset=label_part.leftEndOffset,
            metaData=label_metadata,
        )

    consistency_sample_percent = consistency_sample_percent if check_consistency else None

    # external parts need to be unique on (prefix, part.source.metaData.name)
    if online_external_parts:
        count_map = Counter(
            [(part.prefix, part.source.metadata.name) for part in online_external_parts]
        )
        has_duplicates = False
        for key, count in count_map.items():
            if count > 1:
                has_duplicates = True
                print(f"Found {count - 1} duplicate(s) for external part {key}")
        assert has_duplicates is False, "Please address all the above mentioned duplicates."

    if bootstrap_from_log:
        has_logging = sample_percent > 0 and online
        assert has_logging, (
            "Join must be online with sample_percent set in order to use bootstrap_from_log option"
        )
        bootstrap_parts = (bootstrap_parts or []) + [
            api.BootstrapPart(
                # templated values will be replaced when metaData.name is set at the end
                table="{{ logged_table }}"
            )
        ]

    exec_info = common.ExecutionInfo(
        scheduleCron=offline_schedule,
        conf=conf,
        env=env_vars,
        stepDays=step_days,
        historicalBackfill=historical_backfill,
        clusterConf=cluster_conf,
    )

    metadata = api.MetaData(
        online=online,
        production=production,
        outputNamespace=output_namespace,
        tableProperties=table_properties,
        samplePercent=sample_percent,
        consistencyCheck=check_consistency,
        consistencySamplePercent=consistency_sample_percent,
        executionInfo=exec_info,
        version=str(version),
    )

    join = api.Join(
        left=updated_left,
        joinParts=right_parts,
        metaData=metadata,
        skewKeys=skew_keys,
        onlineExternalParts=online_external_parts,
        bootstrapParts=bootstrap_parts,
        rowIds=row_ids,
        labelParts=label_part,
        derivations=derivations,
        useLongNames=use_long_names,
    )

    # Add the table property that calls the private function
    join.__class__.table = property(lambda self: _get_output_table_name(self, full_name=True))

    return join
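To see how these constructors compose, here is a minimal usage sketch of a join config as it might live under a repo's joins/ directory. The imported Source and GroupBy objects, module paths, keys, and the "risk_scores" service below are hypothetical placeholders, not part of this package:

# Illustrative sketch only -- object names and module paths are assumptions.
from ai.chronon.join import (
    DataType,
    Derivation,
    ExternalPart,
    ExternalSource,
    Join,
    JoinPart,
)

from sources.team.data import purchases_source      # hypothetical left Source (events)
from group_bys.team.data import user_purchases_gb   # hypothetical GroupBy keyed on user_id

# Online-only side input, served by an ExternalSourceHandler registered in the Java/Scala Api.
risk_scores = ExternalSource(
    name="risk_scores",                              # must match the handler's registered name
    team="team",
    key_fields=[("user_id", DataType.LONG)],
    value_fields=[("risk_score", DataType.DOUBLE)],
)

v1 = Join(
    left=purchases_source,
    right_parts=[
        JoinPart(group_by=user_purchases_gb, key_mapping={"buyer_id": "user_id"}, prefix="buyer"),
    ],
    version=0,                                       # must be an int; stored as metaData.version
    row_ids="purchase_id",                           # a single string is normalized to a list
    online_external_parts=[ExternalPart(risk_scores, key_mapping={"buyer_id": "user_id"})],
    derivations=[Derivation(name="*", expression="*")],  # keep raw columns alongside derived ones
    online=False,
)

The wildcard Derivation and the string-to-list normalization of row_ids follow the docstrings above; everything else (table names, keys, the external service) is made up for illustration.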
ai/chronon/logger.py
ADDED
@@ -0,0 +1,23 @@
# Copyright (C) 2023 The Chronon Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

LOG_FORMAT = "[%(asctime)-11s] %(levelname)s [%(filename)s:%(lineno)d] %(message)s"


def get_logger(log_level=logging.INFO):
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level)
    return logger
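A quick usage sketch for the helper above (the message text is illustrative):

from ai.chronon.logger import get_logger

log = get_logger()            # defaults to logging.INFO
log.info("compile started")   # illustrative message

Note that LOG_FORMAT is defined but is not attached to any handler by get_logger; callers that want that format would need to configure a handler themselves.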
ai/chronon/model.py
ADDED
@@ -0,0 +1,40 @@
from typing import Optional

import gen_thrift.api.ttypes as ttypes


class ModelType:
    XGBoost = ttypes.ModelType.XGBoost
    PyTorch = ttypes.ModelType.PyTorch


# Name must match S3 path that we expose if you're uploading trained models?
def Model(
    source: ttypes.Source,
    outputSchema: ttypes.TDataType,
    modelType: ModelType,
    name: str = None,
    modelParams: Optional[dict[str, str]] = None,
) -> ttypes.Model:
    if not isinstance(source, ttypes.Source):
        raise ValueError("Invalid source type")
    if not (isinstance(outputSchema, ttypes.TDataType) or isinstance(outputSchema, int)):
        raise ValueError("outputSchema must be a TDataType or DataKind")
    if isinstance(outputSchema, int):
        # Convert DataKind to TDataType
        outputSchema = ttypes.TDataType(outputSchema)

    if modelParams is None:
        modelParams = {}

    metaData = ttypes.MetaData(
        name=name,
    )

    return ttypes.Model(
        modelType=modelType,
        outputSchema=outputSchema,
        source=source,
        modelParams=modelParams,
        metaData=metaData,
    )
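A minimal usage sketch for Model. The Source import and all parameter values are hypothetical; DataType is the TDataType helper defined in ai/chronon/join.py:

# Illustrative sketch only -- the Source module and values are assumptions.
from ai.chronon.join import DataType
from ai.chronon.model import Model, ModelType

from sources.team.data import ratings_source        # hypothetical Source

ratings_model = Model(
    source=ratings_source,
    outputSchema=DataType.DOUBLE,      # a TDataType; a bare DataKind int is also accepted
    modelType=ModelType.XGBoost,
    name="team.ratings_model",         # illustrative name
    modelParams={"max_depth": "6"},    # string-valued params per the signature
)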