awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
ai/chronon/repo/hub_runner.py
CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
-from
-from
+from dataclasses import dataclass
+from datetime import date, timedelta
+from typing import Optional
 
 import click
-from
+from gen_thrift.planner.ttypes import Mode
 
 from ai.chronon.cli.git_utils import get_current_branch
 from ai.chronon.repo import hub_uploader, utils
 from ai.chronon.repo.constants import RunMode
+from ai.chronon.repo.utils import handle_conf_not_found, print_possible_confs
 from ai.chronon.repo.zipline_hub import ZiplineHub
 
+ALLOWED_DATE_FORMATS = ["%Y-%m-%d"]
+
 
 @click.group()
 def hub():
@@ -21,60 +25,110 @@ def hub():
 def common_options(func):
     func = click.option("--repo", help="Path to chronon repo", default=".")(func)
     func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
+    func = click.option(
+        "--hub_url", help="Zipline Hub address, e.g. http://localhost:3903", default=None
+    )(func)
     return func
 
+
 def ds_option(func):
-    return click.option(
+    return click.option(
+        "--ds",
+        help="the end partition to backfill the data",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+    )(func)
+
 
 def start_ds_option(func):
     return click.option(
-
-
-
-
+        "--start-ds",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+        help="override the original start partition for a range backfill. "
+        "It only supports staging query, group by backfill and join jobs. "
+        "It could leave holes in your final output table due to the override date range.",
+    )(func)
 
 
 def end_ds_option(func):
-    return click.option(
-
+    return click.option(
+        "--end-ds",
+        help="the end ds for a range backfill",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+        default=str(date.today() - timedelta(days=2)),
+    )(func)
 
-def submit_workflow(repo,
-                    conf,
-                    mode,
-                    start_ds,
-                    end_ds):
 
-
-
-
+def submit_workflow(repo, conf, mode, start_ds, end_ds, hub_url=None):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
+    if hub_url is not None:
+        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+    else:
+        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
     branch = get_current_branch()
 
-    hub_uploader.compute_and_upload_diffs(
+    hub_uploader.compute_and_upload_diffs(
+        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict
+    )
 
     # get conf name
     conf_name = utils.get_metadata_name_from_conf(repo, conf)
 
-
     response_json = zipline_hub.call_workflow_start_api(
         conf_name=conf_name,
         mode=mode,
         branch=branch,  # Get the current branch
-        user=os.environ.get(
+        user=os.environ.get("USER"),
         start=start_ds,
         end=end_ds,
         conf_hash=conf_name_to_hash_dict[conf_name].hash,
+        skip_long_running=False,
    )
 
-
+    workflow_id = response_json.get("workflowId", "N/A")
+    print(" 🆔 Workflow Id:", workflow_id)
     print_wf_url(
         conf=conf,
         conf_name=conf_name,
-        mode=
-
-
-
+        mode=mode,
+        workflow_id=workflow_id,
+        repo=repo
+    )
+
+
+def submit_schedule(repo, conf, hub_url=None):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
+    if hub_url is not None:
+        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+    else:
+        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+    conf_name_to_obj_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
+    branch = get_current_branch()
+
+    hub_uploader.compute_and_upload_diffs(
+        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_obj_dict
+    )
+
+    # get conf name
+    conf_name = utils.get_metadata_name_from_conf(repo, conf)
+    schedule_modes = get_schedule_modes(os.path.join(repo, conf))
+    # create a dict for RunMode.BACKFILL.value and RunMode.DEPLOY.value to schedule_modes.offline_schedule and schedule_modes.online
+    modes = {
+        RunMode.BACKFILL.value.upper(): schedule_modes.offline_schedule,
+        RunMode.DEPLOY.value.upper(): schedule_modes.online,
+    }
+
+    response_json = zipline_hub.call_schedule_api(
+        modes=modes,
+        branch=branch,
+        conf_name=conf_name,
+        conf_hash=conf_name_to_obj_dict[conf_name].hash,
     )
 
+    schedules = response_json.get("schedules", "N/A")
+    readable_schedules = {Mode._VALUES_TO_NAMES[int(k)]: v for k, v in schedules.items()}
+    print(" 🗓️ Schedules Deployed:", readable_schedules)
+
 
 # zipline hub backfill --conf=compiled/joins/join
 # adhoc backfills
@@ -82,41 +136,58 @@ def submit_workflow(repo,
 @common_options
 @start_ds_option
 @end_ds_option
-
-
-             start_ds,
-             end_ds):
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def backfill(repo, conf, hub_url, start_ds, end_ds):
     """
     - Submit a backfill job to Zipline.
     Response should contain a list of confs that are different from what's on remote.
    - Call upload API to upload the conf contents for the list of confs that were different.
     - Call the actual run API with mode set to backfill.
     """
-    submit_workflow(
-
+    submit_workflow(
+        repo, conf, RunMode.BACKFILL.value, start_ds, end_ds, hub_url=hub_url
+    )
 
 
-# zipline hub
+# zipline hub run-adhoc --conf=compiled/joins/join
 # currently only supports one-off deploy node submission
 @hub.command()
 @common_options
 @end_ds_option
-
-
-             end_ds):
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def run_adhoc(repo, conf, hub_url, end_ds):
     """
-    - Submit a one-off deploy job to Zipline.
+    - Submit a one-off deploy job to Zipline. This submits the various jobs to allow your conf to be tested online.
     Response should contain a list of confs that are different from what's on remote.
     - Call upload API to upload the conf contents for the list of confs that were different.
     - Call the actual run API with mode set to deploy
     """
-    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds)
+    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds, hub_url=hub_url)
 
 
-
-
+# zipline hub schedule --conf=compiled/joins/join
+@hub.command()
+@common_options
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def schedule(repo, conf, hub_url):
+    """
+    - Deploys a schedule for the specified conf to Zipline. This allows your conf to have various associated jobs run on a schedule.
+    This verb will introspect your conf to determine which of its jobs need to be scheduled (or paused if turned off) based on the
+    'offline_schedule' and 'online' fields.
+    """
+    submit_schedule(repo, conf, hub_url=hub_url)
+
+
+def get_metadata_map(file_path):
+    with open(file_path, "r") as f:
         data = json.load(f)
-
+    metadata_map = data["metaData"]
+    return metadata_map
+
+
+def get_common_env_map(file_path):
+    metadata_map = get_metadata_map(file_path)
+    common_env_map = metadata_map["executionInfo"]["env"]["common"]
     return common_env_map
 
 
@@ -124,18 +195,42 @@ def get_common_env_map(file_path):
 class HubConfig:
     hub_url: str
     frontend_url: str
+    sa_name: Optional[str] = None
+
 
+@dataclass
+class ScheduleModes:
+    online: str
+    offline_schedule: str
 
-
-
+
+def get_hub_conf(conf_path, root_dir="."):
+    file_path = os.path.join(root_dir, conf_path)
+    common_env_map = get_common_env_map(file_path)
     hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
     frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
-
+    sa_name = common_env_map.get("SA_NAME", os.environ.get("SA_NAME"))
+    return HubConfig(hub_url=hub_url, frontend_url=frontend_url, sa_name=sa_name)
+
 
+def get_schedule_modes(conf_path):
+    metadata_map = get_metadata_map(conf_path)
+    online_value = metadata_map.get("online", False)
+    online = "true" if bool(online_value) else "false"
+    offline_schedule = metadata_map["executionInfo"].get("scheduleCron", None)
 
-
+    # check if offline_schedule is null or 'None' or '@daily' else throw an error
+    valid_schedules = {None, "None", "@daily"}
+    if offline_schedule not in valid_schedules:
+        raise ValueError(
+            f"Unsupported offline_schedule: {offline_schedule}. Only null, 'None', or '@daily' are supported."
+        )
+    offline_schedule = offline_schedule or "None"
+    return ScheduleModes(online=online, offline_schedule=offline_schedule)
 
-
+
+def print_wf_url(conf, conf_name, mode, workflow_id, repo="."):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
     frontend_url = hub_conf.frontend_url
 
     if "compiled/joins" in conf:
@@ -143,17 +238,13 @@ def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
     elif "compiled/staging_queries" in conf:
         hub_conf_type = "stagingqueries"
     elif "compiled/group_by" in conf:
-        hub_conf_type = "
+        hub_conf_type = "groupbys"
     elif "compiled/models" in conf:
         hub_conf_type = "models"
     else:
         raise ValueError(f"Unsupported conf type: {conf}")
 
-
-    def _millis(date_str):
-        return int(datetime.strptime(date_str, "%Y-%m-%d").timestamp() * 1000)
-
-    def _mode_string(mode):
+    def _mode_string():
         if mode == "backfill":
             return "offline"
         elif mode == "deploy":
@@ -161,11 +252,10 @@ def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
         else:
             raise ValueError(f"Unsupported mode: {mode}")
 
-    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string(
+    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string()}?workflowId={workflow_id}"
 
     print(" 🔗 Workflow : " + workflow_url + "\n")
 
+
 if __name__ == "__main__":
     hub()
-
-
ai/chronon/repo/hub_uploader.py
CHANGED
ai/chronon/repo/init.py
CHANGED
@@ -17,7 +17,7 @@ from ai.chronon.cli.compile.display.console import console
     envvar="CLOUD_PROVIDER",
     help="Cloud provider to use.",
     required=True,
-    type=click.Choice([
+    type=click.Choice(["aws", "gcp"], case_sensitive=False),
 )
 @click.option(
     "--chronon-root",
@@ -31,9 +31,11 @@ def main(ctx, chronon_root, cloud_provider):
     target_path = os.path.abspath(chronon_root)
 
     if os.path.exists(target_path) and os.listdir(target_path):
-        choice = Prompt.ask(
-
-
+        choice = Prompt.ask(
+            f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
+            choices=["y", "n"],
+            default="y",
+        )
         if choice == "n":
             return
 
@@ -42,7 +44,12 @@ def main(ctx, chronon_root, cloud_provider):
     try:
         shutil.copytree(template_path, target_path, dirs_exist_ok=True)
         console.print("[bold green] Project scaffolding created successfully! 🎉\n")
-        export_cmd = Syntax(
+        export_cmd = Syntax(
+            f"`export PYTHONPATH={target_path}:$PYTHONPATH`",
+            "bash",
+            theme="github-dark",
+            line_numbers=False,
+        )
         console.print("Please copy the following command to your shell config:")
         console.print(export_cmd)
     except Exception:
ai/chronon/repo/join_backfill.py
CHANGED
@@ -8,7 +8,6 @@ from ai.chronon.utils import (
     convert_json_to_obj,
     dict_to_bash_commands,
     dict_to_exports,
-    get_join_output_table_name,
     join_part_name,
     sanitize,
 )
@@ -34,7 +33,13 @@ class JoinBackfill:
     ):
         self.dag_id = "_".join(
             map(
-                sanitize,
+                sanitize,
+                [
+                    "chronon_joins_backfill",
+                    os.path.basename(config_path).split("/")[-1],
+                    start_date,
+                    end_date,
+                ],
             )
         )
         self.start_date = start_date
@@ -56,7 +61,8 @@ class JoinBackfill:
         """
         flow = Flow(self.join.metaData.name)
         final_node = Node(
-            f"{TASK_PREFIX}__{sanitize(
+            f"{TASK_PREFIX}__{sanitize(self.join.table)}",
+            self.run_final_join(),
         )
         left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
         flow.add_node(final_node)
@@ -89,11 +95,19 @@ class JoinBackfill:
 
     def run_left_table(self):
         settings = self.settings.get("left_table", self.settings["default"])
-        return
+        return (
+            self.export_template(settings)
+            + " && "
+            + self.command_template(extra_args={"mode": "backfill-left"})
+        )
 
     def run_final_join(self):
         settings = self.settings.get("final_join", self.settings["default"])
-        return
+        return (
+            self.export_template(settings)
+            + " && "
+            + self.command_template(extra_args={"mode": "backfill-final"})
+        )
 
     def run(self, orchestrator: str, overrides: Optional[dict] = None):
         from ai.chronon.constants import ADAPTERS
ai/chronon/repo/run.py
CHANGED
@@ -74,8 +74,7 @@ def set_defaults(ctx):
         # "online_jar_fetch": os.path.join(chronon_repo_path, "scripts/fetch_online_jar.py"),
         "online_args": os.environ.get("CHRONON_ONLINE_ARGS"),
         "chronon_jar": os.environ.get("CHRONON_DRIVER_JAR"),
-        "list_apps": "python3 "
-        + os.path.join(chronon_repo_path, "scripts/yarn_list.py"),
+        "list_apps": "python3 " + os.path.join(chronon_repo_path, "scripts/yarn_list.py"),
         "render_info": os.path.join(chronon_repo_path, RENDER_INFO_DEFAULT_SCRIPT),
         "project_conf": obj.get("project_conf"),
         "artifact_prefix": os.environ.get("ARTIFACT_PREFIX"),
@@ -85,24 +84,23 @@ def set_defaults(ctx):
         if ctx.params.get(key) is None and value is not None:
             ctx.params[key] = value
 
+
 def validate_flink_state(ctx, param, value):
     uri_schemes = ["gs://", "s3://"]
     if value and not any(value.startswith(scheme) for scheme in uri_schemes):
-        raise click.BadParameter(
-            f"Flink state uri must start with {uri_schemes}"
-        )
+        raise click.BadParameter(f"Flink state uri must start with {uri_schemes}")
     return value
 
+
 def validate_additional_jars(ctx, param, value):
     if value:
-        jars = value.split(
+        jars = value.split(",")
         for jar in jars:
-            if not jar.startswith((
-                raise click.BadParameter(
-                    f"Additional jars must start with gs://, s3://: {jar}"
-                )
+            if not jar.startswith(("gs://", "s3://")):
+                raise click.BadParameter(f"Additional jars must start with gs://, s3://: {jar}")
     return value
 
+
 @click.command(
     name="run",
     context_settings=dict(allow_extra_args=True, ignore_unknown_options=True),
@@ -116,7 +114,9 @@ def validate_additional_jars(ctx, param, value):
     default="dev",
     help="Running environment - default to be dev",
 )
-@click.option(
+@click.option(
+    "--mode", type=click.Choice([str(k) for k in MODE_ARGS.keys()]), default=str(RunMode.BACKFILL)
+)
 @click.option("--ds", help="the end partition to backfill the data")
 @click.option("--app-name", help="app name. Default to {}".format(APP_NAME_TEMPLATE))
 @click.option(
@@ -142,58 +142,61 @@ def validate_additional_jars(ctx, param, value):
     help="Class name of Online Impl. Used for streaming and metadata-upload mode.",
 )
 @click.option("--version", required=False, help="Chronon version to use.")
-@click.option(
-    "--spark-version", default="2.4.0", help="Spark version to use for downloading jar."
-)
+@click.option("--spark-version", default="2.4.0", help="Spark version to use for downloading jar.")
 @click.option("--spark-submit-path", help="Path to spark-submit")
-@click.option(
-    "--spark-streaming-submit-path", help="Path to spark-submit for streaming"
-)
+@click.option("--spark-streaming-submit-path", help="Path to spark-submit for streaming")
 @click.option(
     "--online-jar-fetch",
     help="Path to script that can pull online jar. This will run only "
     "when a file doesn't exist at location specified by online_jar",
 )
-@click.option(
-    "--sub-help", is_flag=True, help="print help command of the underlying jar and exit"
-)
+@click.option("--sub-help", is_flag=True, help="print help command of the underlying jar and exit")
 @click.option(
     "--conf-type",
     help="related to sub-help - no need to set unless you are not working with a conf",
 )
-@click.option(
-    "--online-args", help="Basic arguments that need to be supplied to all online modes"
-)
+@click.option("--online-args", help="Basic arguments that need to be supplied to all online modes")
 @click.option("--chronon-jar", help="Path to chronon OS jar")
 @click.option("--release-tag", help="Use the latest jar for a particular tag.")
-@click.option(
-    "--list-apps", help="command/script to list running jobs on the scheduler"
-)
+@click.option("--list-apps", help="command/script to list running jobs on the scheduler")
 @click.option(
     "--render-info",
     help="Path to script rendering additional information of the given config. "
     "Only applicable when mode is set to info",
 )
 @click.option("--kafka-bootstrap", help="Kafka bootstrap server in host:port format")
-@click.option(
+@click.option(
+    "--latest-savepoint",
+    is_flag=True,
+    default=False,
+    help="Deploys streaming job with latest savepoint",
+)
 @click.option("--custom-savepoint", help="Savepoint to deploy streaming job with.")
-@click.option("--no-savepoint", is_flag=True, default=False, help="Deploys streaming job without a savepoint")
-@click.option("--version-check", is_flag=True, default=False,
-              help="Checks if Zipline version of running streaming job is different from local version and deploys the job if they are different")
-@click.option("--flink-state-uri",
-              help="Bucket for storing flink state checkpoints/savepoints and other internal pieces for orchestration.",
-              callback=validate_flink_state)
-@click.option("--additional-jars",
-              help="Comma separated list of additional jar URIs to be included in the Flink job classpath (e.g. gs://bucket/jar1.jar,gs://bucket/jar2.jar).",
-              callback=validate_additional_jars)
 @click.option(
-    "--
+    "--no-savepoint", is_flag=True, default=False, help="Deploys streaming job without a savepoint"
+)
+@click.option(
+    "--version-check",
     is_flag=True,
-
+    default=False,
     help="Checks if Zipline version of running streaming job is different from local version and deploys the job if they are different",
 )
 @click.option(
-    "--
+    "--flink-state-uri",
+    help="Bucket for storing flink state checkpoints/savepoints and other internal pieces for orchestration.",
+    callback=validate_flink_state,
+)
+@click.option(
+    "--additional-jars",
+    help="Comma separated list of additional jar URIs to be included in the Flink job classpath (e.g. gs://bucket/jar1.jar,gs://bucket/jar2.jar).",
+    callback=validate_additional_jars,
+)
+@click.option(
+    "--validate",
+    is_flag=True,
+    help="Validate the catalyst util Spark expression evaluation logic",
 )
+@click.option("--validate-rows", default="10000", help="Number of rows to run the validation on")
 @click.option("--join-part-name", help="Name of the join part to use for join-part-job")
 @click.option(
     "--artifact-prefix",
ai/chronon/repo/serializer.py
CHANGED
@@ -50,18 +50,14 @@ class ThriftJSONDecoder(json.JSONDecoder):
                 (_, field_ttype, field_name, field_ttype_info, dummy) = field
                 if field_name not in val:
                     continue
-                converted_val = self._convert(
-                    val[field_name], field_ttype, field_ttype_info
-                )
+                converted_val = self._convert(val[field_name], field_ttype, field_ttype_info)
                 setattr(ret, field_name, converted_val)
         elif ttype == TType.LIST:
             (element_ttype, element_ttype_info, _) = ttype_info
             ret = [self._convert(x, element_ttype, element_ttype_info) for x in val]
         elif ttype == TType.SET:
             (element_ttype, element_ttype_info) = ttype_info
-            ret = set(
-                [self._convert(x, element_ttype, element_ttype_info) for x in val]
-            )
+            ret = set([self._convert(x, element_ttype, element_ttype_info) for x in val])
         elif ttype == TType.MAP:
             (key_ttype, key_ttype_info, val_ttype, val_ttype_info, _) = ttype_info
             ret = dict(
@@ -117,9 +113,7 @@ def thrift_json(obj):
 
 
 def thrift_simple_json(obj):
-    simple = TSerialization.serialize(
-        obj, protocol_factory=TSimpleJSONProtocolFactory()
-    )
+    simple = TSerialization.serialize(obj, protocol_factory=TSimpleJSONProtocolFactory())
     parsed = json.loads(simple)
     return json.dumps(parsed, indent=2, sort_keys=True)
 
@@ -131,9 +125,7 @@ def thrift_simple_json_protected(obj, obj_type) -> str:
     actual = thrift_simple_json(thrift_obj)
     differ = JsonDiffer()
     diff = differ.diff(serialized, actual)
-    assert (
-        len(diff) == 0
-    ), f"""Serialization can't be reversed
+    assert len(diff) == 0, f"""Serialization can't be reversed
     diff: \n{diff}
     original: \n{serialized}
     """