awx-zipline-ai 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/utils.py
ADDED
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
# Copyright (C) 2023 The Chronon Authors.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import gc
|
|
16
|
+
import importlib
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
import shutil
|
|
21
|
+
import subprocess
|
|
22
|
+
import tempfile
|
|
23
|
+
from collections.abc import Iterable
|
|
24
|
+
from typing import List, Optional, Union, cast
|
|
25
|
+
|
|
26
|
+
import gen_thrift.api.ttypes as api
|
|
27
|
+
|
|
28
|
+
import ai.chronon.repo.extract_objects as eo
|
|
29
|
+
from ai.chronon.repo import FOLDER_NAME_TO_CLASS
|
|
30
|
+
|
|
31
|
+
# Union of the top-level conf types a Chronon job can be defined from.
ChrononJobTypes = Union[api.GroupBy, api.Join, api.StagingQuery]

chronon_root_path = ""  # passed from compile.py
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def edit_distance(str1, str2):
    """Return the Levenshtein (edit) distance between two strings.

    Used to rank suggestions by similarity to a mistyped candidate.
    """
    # Rolling two-row dynamic program: prev[j] holds distances for the
    # previous character of str1, curr[j] for the current one.
    prev = list(range(len(str2) + 1))
    for row, ch1 in enumerate(str1, start=1):
        curr = [row]
        for col, ch2 in enumerate(str2, start=1):
            if ch1 == ch2:
                curr.append(prev[col - 1])
            else:
                # min of insert, delete, substitute.
                curr.append(1 + min(curr[col - 1], prev[col], prev[col - 1]))
        prev = curr
    return prev[-1]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class JsonDiffer:
    """Pretty-prints two JSON payloads into temp files and compares them
    with the system ``diff`` tool, returning the textual diff."""

    def __init__(self):
        # One scratch directory per differ instance; removed via clean().
        self.temp_dir = tempfile.mkdtemp()
        self.new_name = "new.json"
        self.old_name = "old.json"

    def diff(self, new_json_str: object, old_json_str: object, skipped_keys=None) -> str:
        """Diff the two JSON strings, ignoring top-level keys in skipped_keys."""
        skipped_keys = skipped_keys or []
        old_json = {k: v for k, v in json.loads(old_json_str).items() if k not in skipped_keys}
        new_json = {k: v for k, v in json.loads(new_json_str).items() if k not in skipped_keys}

        old_path = os.path.join(self.temp_dir, self.old_name)
        new_path = os.path.join(self.temp_dir, self.new_name)
        # sort_keys makes the serialization canonical so the diff is stable.
        with open(old_path, mode="w") as old_file:
            old_file.write(json.dumps(old_json, sort_keys=True, indent=2))
        with open(new_path, mode="w") as new_file:
            new_file.write(json.dumps(new_json, sort_keys=True, indent=2))

        completed = subprocess.run(["diff", old_path, new_path], stdout=subprocess.PIPE)
        return completed.stdout.decode("utf-8")

    def clean(self):
        """Remove the scratch directory and everything inside it."""
        shutil.rmtree(self.temp_dir)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def check_contains_single(candidate, valid_items, type_name, name, print_function=repr):
    """Assert that `candidate` is one of `valid_items`, failing with a helpful message.

    On failure the valid items are listed sorted by edit distance to the
    candidate so the closest match (a likely typo fix) appears first.

    Raises:
        AssertionError: when valid_items is empty or candidate is not in it.
    """
    name_suffix = f"for {name}" if name else ""
    candidate_str = print_function(candidate)
    if not valid_items:
        # BUG FIX: this used to be `assert f"..."`, which asserts a non-empty
        # string (always truthy) and therefore never failed. Raise explicitly.
        raise AssertionError(
            f"{candidate_str}, is not a valid {type_name} because no {type_name}s are specified {name_suffix}"
        )
    if candidate not in valid_items:
        # Closest matches first, to surface likely typos.
        sorted_items = sorted(
            map(print_function, valid_items),
            key=lambda item: edit_distance(candidate_str, item),
        )
        printed_items = "\n ".join(sorted_items)
        raise AssertionError(
            f"""{candidate_str}, is not a valid {type_name} {name_suffix}
Please pick one from:
    {printed_items}
"""
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def check_contains(candidates, *args):
    """Validate one candidate, or each element of a non-string iterable of
    candidates, against check_contains_single (strings count as a single item)."""
    is_collection = isinstance(candidates, Iterable) and not isinstance(candidates, str)
    items = candidates if is_collection else [candidates]
    for item in items:
        check_contains_single(item, *args)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_streaming_sources(group_by: api.GroupBy) -> List[api.Source]:
    """Return only those sources of the group by that have streaming enabled."""
    streaming = []
    for source in group_by.sources:
        if is_streaming(source):
            streaming.append(source)
    return streaming
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def is_streaming(source: api.Source) -> bool:
    """Checks if the source has streaming enabled (a topic is configured)."""
    entity_streaming = source.entities and source.entities.mutationTopic is not None
    event_streaming = source.events and source.events.topic is not None
    return entity_streaming or event_streaming
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _get_underlying_source(
    source: api.Source,
) -> Union[api.EventSource, api.EntitySource, api.JoinSource]:
    """Unwrap the concrete source variant (entities, events, or joinSource)."""
    # Exactly one of the three variants is expected to be set.
    return source.entities or source.events or source.joinSource
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_root_source(
    source: api.Source,
) -> Union[api.EventSource, api.EntitySource]:
    """Walk chained join sources down to the underlying entity/event source."""
    # A joinSource wraps another join; descend its left side until we reach
    # a plain entity or event source.
    while not (source.entities or source.events):
        source = source.joinSource.join.left
    return source.entities if source.entities else source.events
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_query(source: api.Source) -> api.Query:
    """Return the Query of whichever concrete source variant is set."""
    underlying = _get_underlying_source(source)
    return underlying.query
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def get_table(source: api.Source) -> str:
    """Return the physical table behind a source, dropping any sub-partition spec."""
    if source.entities:
        full = source.entities.snapshotTable
    elif source.events:
        full = source.events.table
    else:
        # Imported lazily to avoid a circular import with ai.chronon.join.
        from ai.chronon.join import _get_output_table_name

        full = _get_output_table_name(source.joinSource.join, True)
    # Tables may carry "table/sub=partition" suffixes; keep only the table part.
    return full.split("/", 1)[0]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_topic(source: api.Source) -> str:
    """Return the streaming topic: mutationTopic for entities, topic for events."""
    if source.entities:
        return source.entities.mutationTopic
    return source.events.topic
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def get_columns(source: api.Source):
    """Return the selected column names of the source's query.

    Raises:
        AssertionError: when the query has no explicit selects.
    """
    selects = get_query(source).selects
    assert selects is not None, "Please specify selects in your Source/Query"
    return selects.keys()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def get_mod_name_from_gc(obj, mod_prefix):
    """get an object's module information from garbage collector

    Finds the module that holds a reference to `obj` by scanning its gc
    referrers for module-dict-like mappings (those carrying "__name__").
    Returns the module name, preferring one starting with `mod_prefix`
    when several candidates exist; None when nothing matches.
    """
    mod_name = None
    # get obj's module info from garbage collector
    gc.collect()

    referrers = gc.get_referrers(obj)

    # A module's __dict__ contains "__name__"; filter referrers down to those.
    valid_referrers = [
        ref for ref in referrers if (isinstance(ref, Iterable) and "__name__" in ref)
    ]

    # Unambiguous case: a single candidate wins regardless of prefix.
    if len(valid_referrers) == 1:
        return valid_referrers[0]["__name__"]

    # Otherwise pick the first referrer whose module name matches the prefix
    # (e.g. "group_bys" / "joins" folders map to module path prefixes).
    for ref in valid_referrers:
        if ref["__name__"].startswith(mod_prefix):
            mod_name = ref["__name__"]
            break

    return mod_name
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def get_mod_and_var_name_from_gc(obj, mod_prefix):
    """Return (module_name, variable_name) locating `obj` in its defining module.

    Returns None (a single value, not a tuple) when the module cannot be
    determined, and (module_name, None) when the module is found but no
    variable in it is bound to `obj`.
    """
    mod_name = get_mod_name_from_gc(obj, mod_prefix)
    if not mod_name:
        return None

    # Scan the module's globals for a variable bound to this exact object.
    module = importlib.import_module(mod_name)
    for var_name, bound_value in vars(module).items():
        if bound_value is obj:
            return mod_name, var_name

    return mod_name, None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def __set_name(obj, cls, mod_prefix):
    """Backfill metadata names for `obj` by re-importing its defining module.

    Locates the module via the garbage collector (get_mod_name_from_gc) and
    delegates to extract_objects.import_module_set_name for instances of `cls`
    in that module — presumably to assign their metaData names; see
    ai.chronon.repo.extract_objects for the exact behavior.
    """
    module_qualifier = get_mod_name_from_gc(obj, mod_prefix)

    module = importlib.import_module(module_qualifier)
    eo.import_module_set_name(module, cls)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def sanitize(name):
    """
    From api.Extensions.scala
    Option(name).map(_.replaceAll("[^a-zA-Z0-9_]", "_")).orNull

    Replace every character outside [a-zA-Z0-9_] with '_'; None passes through.
    """
    if name is None:
        return None
    return re.sub("[^a-zA-Z0-9_]", "_", name)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def dict_to_bash_commands(d):
    """
    Convert a dict into a bash command substring:
    keys become --kebab-case flags, truthy values become =value suffixes.
    """
    if not d:
        return ""

    def render(key, value):
        # Underscores in keys map to dashes in CLI flags.
        flag = "--" + key.replace("_", "-")
        return f"{flag}={value}" if value else flag

    return " ".join(render(key, value) for key, value in d.items())
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def dict_to_exports(d):
    """Render a dict as chained shell exports, upper-casing keys:
    {"path": "/x"} -> "export PATH=/x"."""
    if not d:
        return ""
    return " && ".join(f"export {key.upper()}={value}" for key, value in d.items())
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def output_table_name(obj, full_name: bool):
    """Output table for a conf: sanitized metaData.name, optionally prefixed
    with its output namespace (falling back to the "{{ db }}" template)."""
    table_name = sanitize(obj.metaData.name)
    namespace = obj.metaData.outputNamespace or "{{ db }}"
    return f"{namespace}.{table_name}" if full_name else table_name
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def join_part_name(jp):
    """Name of a join part: optional prefix + sanitized group-by name, '_'-joined.

    Raises:
        NotImplementedError: when the join part has no group by.
    """
    if jp.groupBy is None:
        raise NotImplementedError("Join Part names for non group bys is not implemented.")
    # Lazily backfill the group by's name when it wasn't set at definition time.
    if isinstance(jp.groupBy, api.GroupBy) and not jp.groupBy.metaData.name:
        __set_name(jp.groupBy, api.GroupBy, "group_bys")
    pieces = [jp.prefix, sanitize(jp.groupBy.metaData.name)]
    return "_".join(piece for piece in pieces if piece is not None)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def join_part_output_table_name(join, jp, full_name: bool = False):
    """
    From api.Extensions.scala

    Join Part output table name.
    To be synced with Scala API.
    def partOutputTable(jp: JoinPart): String = (Seq(join.metaData.outputTable) ++ Option(jp.prefix) :+
      jp.groupBy.metaData.cleanName).mkString("_")
    """
    # Lazily backfill the join's name when it wasn't set at definition time.
    if isinstance(join, api.Join) and not join.metaData.name:
        __set_name(join, api.Join, "joins")
    pieces = [output_table_name(join, full_name), join_part_name(jp)]
    return "_".join(piece for piece in pieces if piece is not None)
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def log_table_name(obj, full_name: bool = False):
    """Logged-events table for a conf: its output table plus a "_logged" suffix."""
    base = output_table_name(obj, full_name=full_name)
    return base + "_logged"
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def get_team_conf_from_py(team, key):
    """Look up `key` in the team's python config module (teams.<team>).

    Raises ImportError when the team module is missing and AttributeError
    when the key is not defined in it.
    """
    team_module = importlib.import_module(f"teams.{team}")
    return getattr(team_module, key)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def wait_for_simple_schema(table, lag, start, end):
    """Build a wait-sensor spec dict for a (possibly sub-partitioned) table.

    `lag` shifts the awaited ds partition back by that many days using the
    Airflow macros template. Returns None when no table is given.
    """
    if not table:
        return None
    # "table/sub=partition" specs carry the table name before the first '/'.
    clean_name, _, subpartition_spec = table.partition("/")
    name_suffix = "" if lag == 0 else f"_minus_{lag}"
    # Quadruple braces render as literal {{ }} for Airflow templating.
    ds_expr = "{{ ds }}" if lag == 0 else f"{{{{ macros.ds_add(ds, -{lag}) }}}}"
    spec = f"{clean_name}/ds={ds_expr}"
    if subpartition_spec:
        spec += f"/{subpartition_spec}"
    return {
        "name": f"wait_for_{clean_name}_ds{name_suffix}",
        "spec": spec,
        "start": start,
        "end": end,
    }
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
def wait_for_name(dep):
    """Derive a sensor task name from a dependency string: non-alphanumerics
    become underscores, runs collapse, and trailing underscores are trimmed."""
    normalized = re.sub("[^a-zA-Z0-9]", "_", dep)
    collapsed = re.sub("_+", "_", f"wait_for_{normalized}")
    return collapsed.rstrip("_")
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def dedupe_in_order(seq):
    """Drop duplicates from seq, keeping first-occurrence order."""
    # dict preserves insertion order (py3.7+) and keys are unique.
    return list(dict.fromkeys(seq))
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def has_topic(group_by: api.GroupBy) -> bool:
    """Find if there's topic or mutationTopic for a source helps define streaming tasks"""
    for source in group_by.sources:
        entity_topic = source.entities and source.entities.mutationTopic
        event_topic = source.events and source.events.topic
        if entity_topic or event_topic:
            return True
    return False
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def get_offline_schedule(conf: ChrononJobTypes) -> Optional[str]:
    """Return the conf's offline cron schedule (default "@daily"), or None
    when scheduling is explicitly disabled with "@never"."""
    cron = conf.metaData.executionInfo.scheduleCron or "@daily"
    return None if cron == "@never" else cron
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def requires_log_flattening_task(conf: ChrononJobTypes) -> bool:
    """True when the conf samples logs (samplePercent > 0), which requires
    a log-flattening task downstream."""
    sample_percent = conf.metaData.samplePercent or 0
    return sample_percent > 0
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def get_applicable_modes(conf: ChrononJobTypes) -> List[str]:
    """Based on a conf and mode determine if a conf should define a task.

    Returns the list of run modes (e.g. "backfill", "upload", "streaming",
    "stats-summary", ...) applicable to the given GroupBy / Join /
    StagingQuery conf.

    Raises:
        ValueError: when conf is not one of the supported conf types.
    """
    modes = []  # type: List[str]

    if isinstance(conf, api.GroupBy):
        group_by = cast(api.GroupBy, conf)
        # A backfill start date is what makes an offline backfill well-defined.
        if group_by.backfillStartDate is not None:
            modes.append("backfill")

        online = group_by.metaData.online or False

        # Online group bys need KV-store uploads.
        if online:
            modes.append("upload")

        # NOTE(review): accuracy is presumably an enum from the thrift API; if
        # its TEMPORAL member happens to be 0, `accuracy or False` would be
        # falsy for it — confirm this is the intended truthiness check.
        temporal_accuracy = group_by.accuracy or False
        streaming = has_topic(group_by)
        if temporal_accuracy or streaming:
            modes.append("streaming")

    elif isinstance(conf, api.Join):
        join = cast(api.Join, conf)

        # A scheduled join gets both the backfill and its stats summary.
        if get_offline_schedule(conf) is not None:
            modes.append("backfill")
            modes.append("stats-summary")

        if join.metaData.consistencyCheck is True:
            modes.append("consistency-metrics-compute")

        if requires_log_flattening_task(join):
            modes.append("log-flattener")

        if join.labelParts is not None:
            modes.append("label-join")

    elif isinstance(conf, api.StagingQuery):
        modes.append("backfill")
    else:
        raise ValueError(f"Unsupported job type {type(conf).__name__}")

    return modes
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def get_related_table_names(conf: ChrononJobTypes) -> List[str]:
    """List auxiliary tables a conf materializes next to its main output table,
    derived from the run modes that apply to it."""
    table_name = output_table_name(conf, full_name=True)
    applicable_modes = set(get_applicable_modes(conf))

    # Suffixes produced per mode, in the order they should be listed.
    mode_suffixes = [
        ("upload", ["_upload"]),
        ("stats-summary", ["_daily_stats"]),
        ("label-join", ["_labels", "_labeled", "_labeled_latest"]),
        ("log-flattener", ["_logged"]),
        ("consistency-metrics-compute", ["_consistency"]),
    ]
    related_tables = [
        table_name + suffix
        for mode, suffixes in mode_suffixes
        if mode in applicable_modes
        for suffix in suffixes
    ]  # type: List[str]

    # Joins with bootstrap parts also write a bootstrap table.
    if isinstance(conf, api.Join) and conf.bootstrapParts:
        related_tables.append(f"{table_name}_bootstrap")

    return related_tables
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class DotDict(dict):
    """dict subclass that exposes keys as attributes; missing keys yield None
    and nested dicts are wrapped on access so chaining works."""

    def __getattr__(self, attr):
        # __getattr__ only fires for names not found via normal lookup,
        # so dict methods keep working untouched.
        if attr not in self:
            return None
        value = self[attr]
        if isinstance(value, dict):
            return DotDict(value)
        return value
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def convert_json_to_obj(d):
    """Recursively wrap parsed-JSON dicts in DotDict so fields are
    attribute-accessible; lists are converted element-wise, scalars pass through."""
    if isinstance(d, list):
        return [convert_json_to_obj(item) for item in d]
    if isinstance(d, dict):
        return DotDict({key: convert_json_to_obj(value) for key, value in d.items()})
    return d
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def chronon_path(file_path: str) -> str:
    """Return file_path relative to the first conf-type folder in it
    (e.g. ".../repo/group_bys/team/conf.py" -> "group_bys/team/conf.py").

    Raises:
        AssertionError: when no conf-type folder appears in the path.
    """
    conf_types = FOLDER_NAME_TO_CLASS.keys()
    splits = file_path.split("/")
    # Index of each conf-type folder present in the path (first occurrence).
    conf_occurences = [splits.index(typ) for typ in conf_types if typ in splits]
    assert len(conf_occurences) > 0, (
        f"Path: {file_path} doesn't contain folder with name among {conf_types}"
    )

    # Reuse the already-computed indices instead of rebuilding the same list.
    index = min(conf_occurences)
    return "/".join(splits[index:])
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def module_path(file_path: str) -> str:
    """Convert a .py file path into a dotted module path rooted at the conf folder.

    Raises:
        AssertionError: when the path does not end with '.py'.
    """
    rel_path = chronon_path(file_path)
    assert rel_path.endswith(".py"), f"Path: {file_path} doesn't end with '.py'"
    return rel_path[: -len(".py")].replace("/", ".")
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def compose(arg, *methods):
    """
    Allows composing deeply nested method calls - typically used in selects & derivations
    The first arg is what is threaded into methods, methods can have more than one arg.

    Example:

    .. code-block:: python
        compose(
            "user_id_approx_distinct_count_by_query",
            "map_entries",
            "array_sort (x, y) -> IF(y.value > x.value, -1, IF(y.value < x.value, 1, 0))",
            "transform entry -> entry.key"
        )

    would produce (without the new lines or indents):

    .. code-block:: text

        transform(
            array_sort(
                map_entries(
                    user_id_approx_distinct_count_by_query
                ),
                (x, y) -> IF(y.value > x.value, -1, IF(y.value < x.value, 1, 0))
            ),
            entry -> entry.key
        )
    """

    # Innermost expression starts at the deepest indent; each wrap peels one
    # indent level off. NOTE(review): the unit here is `" " * len(methods)`
    # while indent[:-4] below strips 4 chars per level — confirm the indent
    # string matches the 4-space levels shown in the docstring example.
    indent = " " * (len(methods))

    result = [indent + arg]

    for method in methods:
        # A method token may carry extra SQL args after the first space,
        # e.g. "array_sort (x, y) -> ..."; split the callable name off.
        method_parts = method.split(" ", 1)
        method = method_parts[0]

        if len(method_parts) > 1:
            remaining_args = method_parts[1]
            # The extra args become another argument line of the current call.
            last = result.pop()
            result = result + [last + ",", indent + remaining_args]

        indent = indent[:-4]
        # Wrap everything built so far in `method( ... )` at the shallower indent.
        result = [f"{indent}{method}("] + result + [f"{indent})"]

    return "\n".join(result)
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def clean_expression(expr):
    """
    Cleans up an expression by removing leading and trailing whitespace and newlines.
    Internal whitespace runs (spaces, tabs, newlines) collapse to single spaces.
    """
    collapsed = re.sub(r"\s+", " ", expr)
    return collapsed.strip()
|
ai/chronon/windows.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import gen_thrift.common.ttypes as common
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _days(length: int) -> common.Window:
    """Return a Window spanning `length` days."""
    return common.Window(length=length, timeUnit=common.TimeUnit.DAYS)
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _hours(length: int) -> common.Window:
    """Return a Window spanning `length` hours."""
    return common.Window(length=length, timeUnit=common.TimeUnit.HOURS)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _from_str(s: str) -> common.Window:
    """
    converts strings like "30d", "2h" etc into common.Window

    Args:
        s (str): Duration string in format "<number>(d|h)" where d=days, h=hours

    Returns:
        common.Window: Window object with specified duration

    Raises:
        ValueError: If string format is invalid
    """
    # Need at least one digit and one unit character.
    if not s or len(s) < 2:
        raise ValueError(f"Invalid duration format: {s}")

    value, unit = s[:-1], s[-1].lower()

    # Parse first, then validate — errors carry the same messages as before.
    try:
        length = int(value)
    except ValueError as e:
        raise ValueError(f"Invalid numeric value in duration: {value}") from e

    if length <= 0:
        raise ValueError(f"Duration must be positive: {s}")

    if unit == "d":
        return _days(length)
    if unit == "h":
        return _hours(length)
    raise ValueError(f"Invalid time unit '{unit}'. Must be 'd' for days or 'h' for hours")
|