awx-zipline-ai 0.0.32__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +248 -0
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +336 -0
- ai/chronon/cli/compile/compile_context.py +173 -0
- ai/chronon/cli/compile/compiler.py +183 -0
- ai/chronon/cli/compile/conf_validator.py +742 -0
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +102 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +111 -0
- ai/chronon/cli/compile/fill_templates.py +35 -0
- ai/chronon/cli/compile/parse_configs.py +134 -0
- ai/chronon/cli/compile/parse_teams.py +242 -0
- ai/chronon/cli/compile/serializer.py +109 -0
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +145 -0
- ai/chronon/cli/logger.py +59 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/join.py +580 -0
- ai/chronon/logger.py +23 -0
- ai/chronon/model.py +40 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +39 -0
- ai/chronon/repo/aws.py +284 -0
- ai/chronon/repo/cluster.py +136 -0
- ai/chronon/repo/compile.py +62 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +269 -0
- ai/chronon/repo/explore.py +418 -0
- ai/chronon/repo/extract_objects.py +134 -0
- ai/chronon/repo/gcp.py +586 -0
- ai/chronon/repo/gitpython_utils.py +15 -0
- ai/chronon/repo/hub_runner.py +261 -0
- ai/chronon/repo/hub_uploader.py +109 -0
- ai/chronon/repo/init.py +60 -0
- ai/chronon/repo/join_backfill.py +119 -0
- ai/chronon/repo/run.py +296 -0
- ai/chronon/repo/serializer.py +133 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +481 -0
- ai/chronon/repo/zipline.py +35 -0
- ai/chronon/repo/zipline_hub.py +277 -0
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +26 -0
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +26 -0
- ai/chronon/resources/gcp/teams.py +58 -0
- ai/chronon/source.py +86 -0
- ai/chronon/staging_query.py +226 -0
- ai/chronon/types.py +58 -0
- ai/chronon/utils.py +510 -0
- ai/chronon/windows.py +48 -0
- awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
- awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
- awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- gen_thrift/api/__init__.py +1 -0
- gen_thrift/api/constants.py +15 -0
- gen_thrift/api/ttypes.py +3754 -0
- gen_thrift/common/__init__.py +1 -0
- gen_thrift/common/constants.py +15 -0
- gen_thrift/common/ttypes.py +1814 -0
- gen_thrift/eval/__init__.py +1 -0
- gen_thrift/eval/constants.py +15 -0
- gen_thrift/eval/ttypes.py +660 -0
- gen_thrift/fetcher/__init__.py +1 -0
- gen_thrift/fetcher/constants.py +15 -0
- gen_thrift/fetcher/ttypes.py +127 -0
- gen_thrift/hub/__init__.py +1 -0
- gen_thrift/hub/constants.py +15 -0
- gen_thrift/hub/ttypes.py +1109 -0
- gen_thrift/observability/__init__.py +1 -0
- gen_thrift/observability/constants.py +15 -0
- gen_thrift/observability/ttypes.py +2355 -0
- gen_thrift/planner/__init__.py +1 -0
- gen_thrift/planner/constants.py +15 -0
- gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/repo/hub_runner.py
ADDED

@@ -0,0 +1,261 @@
import json
import os
from dataclasses import dataclass
from datetime import date, timedelta
from typing import Optional

import click
from gen_thrift.planner.ttypes import Mode

from ai.chronon.cli.git_utils import get_current_branch
from ai.chronon.repo import hub_uploader, utils
from ai.chronon.repo.constants import RunMode
from ai.chronon.repo.utils import handle_conf_not_found, print_possible_confs
from ai.chronon.repo.zipline_hub import ZiplineHub

ALLOWED_DATE_FORMATS = ["%Y-%m-%d"]


@click.group()
def hub():
    pass


#### Common click options
def common_options(func):
    func = click.option("--repo", help="Path to chronon repo", default=".")(func)
    func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
    func = click.option(
        "--hub_url", help="Zipline Hub address, e.g. http://localhost:3903", default=None
    )(func)
    return func


def ds_option(func):
    return click.option(
        "--ds",
        help="the end partition to backfill the data",
        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
    )(func)


def start_ds_option(func):
    return click.option(
        "--start-ds",
        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
        help="override the original start partition for a range backfill. "
        "It only supports staging query, group by backfill and join jobs. "
        "It could leave holes in your final output table due to the override date range.",
    )(func)


def end_ds_option(func):
    return click.option(
        "--end-ds",
        help="the end ds for a range backfill",
        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
        default=str(date.today() - timedelta(days=2)),
    )(func)


def submit_workflow(repo, conf, mode, start_ds, end_ds, hub_url=None):
    hub_conf = get_hub_conf(conf, root_dir=repo)
    if hub_url is not None:
        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
    else:
        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
    branch = get_current_branch()

    hub_uploader.compute_and_upload_diffs(
        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict
    )

    # get conf name
    conf_name = utils.get_metadata_name_from_conf(repo, conf)

    response_json = zipline_hub.call_workflow_start_api(
        conf_name=conf_name,
        mode=mode,
        branch=branch,  # Get the current branch
        user=os.environ.get("USER"),
        start=start_ds,
        end=end_ds,
        conf_hash=conf_name_to_hash_dict[conf_name].hash,
        skip_long_running=False,
    )

    workflow_id = response_json.get("workflowId", "N/A")
    print(" 🆔 Workflow Id:", workflow_id)
    print_wf_url(
        conf=conf,
        conf_name=conf_name,
        mode=mode,
        workflow_id=workflow_id,
        repo=repo
    )


def submit_schedule(repo, conf, hub_url=None):
    hub_conf = get_hub_conf(conf, root_dir=repo)
    if hub_url is not None:
        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
    else:
        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
    conf_name_to_obj_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
    branch = get_current_branch()

    hub_uploader.compute_and_upload_diffs(
        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_obj_dict
    )

    # get conf name
    conf_name = utils.get_metadata_name_from_conf(repo, conf)
    schedule_modes = get_schedule_modes(os.path.join(repo, conf))
    # map RunMode.BACKFILL.value and RunMode.DEPLOY.value to schedule_modes.offline_schedule and schedule_modes.online
    modes = {
        RunMode.BACKFILL.value.upper(): schedule_modes.offline_schedule,
        RunMode.DEPLOY.value.upper(): schedule_modes.online,
    }

    response_json = zipline_hub.call_schedule_api(
        modes=modes,
        branch=branch,
        conf_name=conf_name,
        conf_hash=conf_name_to_obj_dict[conf_name].hash,
    )

    schedules = response_json.get("schedules", "N/A")
    readable_schedules = {Mode._VALUES_TO_NAMES[int(k)]: v for k, v in schedules.items()}
    print(" 🗓️ Schedules Deployed:", readable_schedules)


# zipline hub backfill --conf=compiled/joins/join
# adhoc backfills
@hub.command()
@common_options
@start_ds_option
@end_ds_option
@handle_conf_not_found(log_error=True, callback=print_possible_confs)
def backfill(repo, conf, hub_url, start_ds, end_ds):
    """
    - Submit a backfill job to Zipline.
      Response should contain a list of confs that are different from what's on remote.
    - Call upload API to upload the conf contents for the list of confs that were different.
    - Call the actual run API with mode set to backfill.
    """
    submit_workflow(
        repo, conf, RunMode.BACKFILL.value, start_ds, end_ds, hub_url=hub_url
    )


# zipline hub run-adhoc --conf=compiled/joins/join
# currently only supports one-off deploy node submission
@hub.command()
@common_options
@end_ds_option
@handle_conf_not_found(log_error=True, callback=print_possible_confs)
def run_adhoc(repo, conf, hub_url, end_ds):
    """
    - Submit a one-off deploy job to Zipline. This submits the various jobs to allow your conf to be tested online.
      Response should contain a list of confs that are different from what's on remote.
    - Call upload API to upload the conf contents for the list of confs that were different.
    - Call the actual run API with mode set to deploy.
    """
    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds, hub_url=hub_url)


# zipline hub schedule --conf=compiled/joins/join
@hub.command()
@common_options
@handle_conf_not_found(log_error=True, callback=print_possible_confs)
def schedule(repo, conf, hub_url):
    """
    - Deploys a schedule for the specified conf to Zipline. This allows your conf to have various associated jobs run on a schedule.
      This verb will introspect your conf to determine which of its jobs need to be scheduled (or paused if turned off) based on the
      'offline_schedule' and 'online' fields.
    """
    submit_schedule(repo, conf, hub_url=hub_url)


def get_metadata_map(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    metadata_map = data["metaData"]
    return metadata_map


def get_common_env_map(file_path):
    metadata_map = get_metadata_map(file_path)
    common_env_map = metadata_map["executionInfo"]["env"]["common"]
    return common_env_map


@dataclass
class HubConfig:
    hub_url: str
    frontend_url: str
    sa_name: Optional[str] = None


@dataclass
class ScheduleModes:
    online: str
    offline_schedule: str


def get_hub_conf(conf_path, root_dir="."):
    file_path = os.path.join(root_dir, conf_path)
    common_env_map = get_common_env_map(file_path)
    hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
    frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
    sa_name = common_env_map.get("SA_NAME", os.environ.get("SA_NAME"))
    return HubConfig(hub_url=hub_url, frontend_url=frontend_url, sa_name=sa_name)


def get_schedule_modes(conf_path):
    metadata_map = get_metadata_map(conf_path)
    online_value = metadata_map.get("online", False)
    online = "true" if bool(online_value) else "false"
    offline_schedule = metadata_map["executionInfo"].get("scheduleCron", None)

    # offline_schedule must be null, 'None', or '@daily'; anything else is an error
    valid_schedules = {None, "None", "@daily"}
    if offline_schedule not in valid_schedules:
        raise ValueError(
            f"Unsupported offline_schedule: {offline_schedule}. Only null, 'None', or '@daily' are supported."
        )
    offline_schedule = offline_schedule or "None"
    return ScheduleModes(online=online, offline_schedule=offline_schedule)


def print_wf_url(conf, conf_name, mode, workflow_id, repo="."):
    hub_conf = get_hub_conf(conf, root_dir=repo)
    frontend_url = hub_conf.frontend_url

    if "compiled/joins" in conf:
        hub_conf_type = "joins"
    elif "compiled/staging_queries" in conf:
        hub_conf_type = "stagingqueries"
    elif "compiled/group_by" in conf:
        hub_conf_type = "groupbys"
    elif "compiled/models" in conf:
        hub_conf_type = "models"
    else:
        raise ValueError(f"Unsupported conf type: {conf}")

    def _mode_string():
        if mode == "backfill":
            return "offline"
        elif mode == "deploy":
            return "online"
        else:
            raise ValueError(f"Unsupported mode: {mode}")

    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string()}?workflowId={workflow_id}"

    print(" 🔗 Workflow : " + workflow_url + "\n")


if __name__ == "__main__":
    hub()
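
For context, a minimal sketch of driving the workflow submission above programmatically; the repo path, conf path, dates, and hub URL are hypothetical placeholders, and in normal use this runs through the `zipline hub backfill` CLI command shown in the comments above (where the start/end values arrive as click.DateTime-parsed objects).

# Hypothetical sketch: submit a backfill for a compiled join conf.
from ai.chronon.repo.constants import RunMode
from ai.chronon.repo.hub_runner import submit_workflow

submit_workflow(
    repo=".",                            # chronon repo root containing compiled/
    conf="compiled/joins/test/data.v1",  # hypothetical compiled conf path
    mode=RunMode.BACKFILL.value,
    start_ds="2025-01-01",
    end_ds="2025-01-31",
    hub_url="http://localhost:3903",     # overrides HUB_URL from the conf's common env
)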
ai/chronon/repo/hub_uploader.py
ADDED

@@ -0,0 +1,109 @@
import glob
import hashlib
import json
import os

from gen_thrift.api.ttypes import Conf

from ai.chronon.repo import (
    FOLDER_NAME_TO_CLASS,
    FOLDER_NAME_TO_CONF_TYPE,
)
from ai.chronon.repo.zipline_hub import ZiplineHub


def build_local_repo_hashmap(root_dir: str):
    compiled_dir = os.path.join(root_dir, "compiled")
    # Returns a map of name -> (tbinary, file_hash)
    results = {}

    # Iterate through each object type folder (staging_queries, group_bys, joins etc)
    for folder_name, _ in FOLDER_NAME_TO_CLASS.items():
        folder_path = os.path.join(compiled_dir, folder_name)
        if not os.path.exists(folder_path):
            continue

        # Find all json files recursively in this folder
        json_files = [
            f
            for f in glob.glob(os.path.join(folder_path, "**/*"), recursive=True)
            if os.path.isfile(f)
        ]

        exceptions = []

        for json_file in json_files:
            try:
                # Read the json file
                with open(json_file, "r") as f:
                    thrift_json = f.read()

                # Extract name from metadata in json
                json_obj = json.loads(thrift_json)
                name = json_obj["metaData"]["name"]

                # Load the json into the appropriate object type based on folder
                # binary = json2binary(thrift_json, obj_class)

                md5_hash = hashlib.md5(thrift_json.encode()).hexdigest()
                # md5_hash = hashlib.md5(thrift_json.encode()).hexdigest() + "123"
                # results[name] = (binary, md5_hash, FOLDER_NAME_TO_CONF_TYPE[folder_name])
                results[name] = Conf(
                    name=name,
                    hash=md5_hash,
                    # contents=binary,
                    contents=thrift_json,
                    confType=FOLDER_NAME_TO_CONF_TYPE[folder_name],
                )

            except Exception as e:
                exceptions.append(f"{json_file} - {e}")

        if exceptions:
            error_msg = (
                "The following files had exceptions during upload: \n"
                + "\n".join(exceptions)
                + "\n\n Consider deleting the files (safe operation) and checking "
                + "your thrift version before rerunning your command."
            )
            raise RuntimeError(error_msg)

    return results


def compute_and_upload_diffs(
    branch: str, zipline_hub: ZiplineHub, local_repo_confs: dict[str, Conf]
):
    # Determine which confs are different from the ZiplineHub
    # Call Zipline hub with `names_and_hashes` as the argument to get back
    names_to_hashes = {name: local_conf.hash for name, local_conf in local_repo_confs.items()}
    print(f"\n 🧮 Computed hashes for {len(names_to_hashes)} local files.")

    changed_conf_names = zipline_hub.call_diff_api(names_to_hashes)

    if not changed_conf_names:
        print(f" ✅ Remote contains all local files. No need to upload '{branch}'.")
        diffed_confs = {}
    else:
        unchanged = len(names_to_hashes) - len(changed_conf_names)
        print(
            f" 🔍 Detected {len(changed_conf_names)} changes on local branch '{branch}'. {unchanged} unchanged."
        )

        # a list of names for diffed hashes on branch
        diffed_confs = {k: local_repo_confs[k] for k in changed_conf_names}
        conf_names_str = "\n - ".join(diffed_confs.keys())
        print(f" - {conf_names_str}")

        diff_confs = []
        for _, conf in diffed_confs.items():
            diff_confs.append(conf.__dict__)

        # Make PUT request to ZiplineHub
        zipline_hub.call_upload_api(branch=branch, diff_confs=diff_confs)
        print(f" ⬆️ Uploaded {len(diffed_confs)} changed confs to branch '{branch}'.")

    zipline_hub.call_sync_api(branch=branch, names_to_hashes=names_to_hashes)

    print(f" ✅ {len(names_to_hashes)} hashes updated on branch '{branch}'.\n")
    return diffed_confs
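
For context, a minimal sketch of the upload flow these two helpers implement; the hub URL and service-account name are hypothetical placeholders.

# Hypothetical sketch: hash every compiled conf locally, then upload only the
# confs whose hashes differ from what ZiplineHub already has for this branch.
from ai.chronon.cli.git_utils import get_current_branch
from ai.chronon.repo import hub_uploader
from ai.chronon.repo.zipline_hub import ZiplineHub

zipline_hub = ZiplineHub(base_url="http://localhost:3903", sa_name=None)  # placeholders
local_confs = hub_uploader.build_local_repo_hashmap(root_dir=".")  # name -> Conf(name, hash, contents, confType)
changed = hub_uploader.compute_and_upload_diffs(
    get_current_branch(), zipline_hub=zipline_hub, local_repo_confs=local_confs
)
print(f"uploaded {len(changed)} changed confs")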
ai/chronon/repo/init.py
ADDED

@@ -0,0 +1,60 @@
#!/usr/bin/env python

import os
import shutil

import click
from importlib_resources import files
from rich.prompt import Prompt
from rich.syntax import Syntax

from ai.chronon.cli.compile.display.console import console


@click.command(name="init")
@click.option(
    "--cloud-provider",
    envvar="CLOUD_PROVIDER",
    help="Cloud provider to use.",
    required=True,
    type=click.Choice(["aws", "gcp"], case_sensitive=False),
)
@click.option(
    "--chronon-root",
    help="Path to the root chronon folder.",
    default=os.path.join(os.getcwd(), "zipline"),
    type=click.Path(file_okay=False, writable=True),
)
@click.pass_context
def main(ctx, chronon_root, cloud_provider):
    template_path = files("ai.chronon").joinpath("resources", cloud_provider.lower())
    target_path = os.path.abspath(chronon_root)

    if os.path.exists(target_path) and os.listdir(target_path):
        choice = Prompt.ask(
            f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
            choices=["y", "n"],
            default="y",
        )
        if choice == "n":
            return

    console.print(f"Generating scaffolding at {target_path} ...")

    try:
        shutil.copytree(template_path, target_path, dirs_exist_ok=True)
        console.print("[bold green] Project scaffolding created successfully! 🎉\n")
        export_cmd = Syntax(
            f"`export PYTHONPATH={target_path}:$PYTHONPATH`",
            "bash",
            theme="github-dark",
            line_numbers=False,
        )
        console.print("Please copy the following command to your shell config:")
        console.print(export_cmd)
    except Exception:
        console.print_exception()


if __name__ == "__main__":
    main()
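
For context, a minimal sketch of exercising this scaffolding command in-process with click's test runner; the target directory is a hypothetical placeholder, and in normal use the command is presumably invoked through the package's `zipline` CLI entry point.

# Hypothetical sketch: drive the init command with click's CliRunner.
from click.testing import CliRunner

from ai.chronon.repo.init import main

runner = CliRunner()
result = runner.invoke(main, ["--cloud-provider", "gcp", "--chronon-root", "/tmp/zipline-demo"])
print(result.exit_code, result.output)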
ai/chronon/repo/join_backfill.py
ADDED

@@ -0,0 +1,119 @@
import json
import os
from typing import Optional

from ai.chronon.scheduler.interfaces.flow import Flow
from ai.chronon.scheduler.interfaces.node import Node
from ai.chronon.utils import (
    convert_json_to_obj,
    dict_to_bash_commands,
    dict_to_exports,
    join_part_name,
    sanitize,
)

TASK_PREFIX = "compute_join"
DEFAULT_SPARK_SETTINGS = {
    "default": {
        "spark_version": "3.1.1",
        "executor_memory": "4G",
        "driver_memory": "4G",
        "executor_cores": 2,
    }
}


class JoinBackfill:
    def __init__(
        self,
        start_date: str,
        end_date: str,
        config_path: str,
        settings: dict = DEFAULT_SPARK_SETTINGS,
    ):
        self.dag_id = "_".join(
            map(
                sanitize,
                [
                    "chronon_joins_backfill",
                    os.path.basename(config_path).split("/")[-1],
                    start_date,
                    end_date,
                ],
            )
        )
        self.start_date = start_date
        self.end_date = end_date
        self.config_path = config_path
        self.settings = settings
        with open(self.config_path, "r") as file:
            config = file.read()
        self.join = convert_json_to_obj(json.loads(config))

    def build_flow(self) -> Flow:
        """
        Build a flow from a Join object. Each join part is a node and will run in parallel.
        The next step is the final join, which is a node that depends on all join parts.
        The final join will run after all join parts are done.

        :param join: The Join object to build a flow from
        :return: A Flow object that represents the flow of the Join
        """
        flow = Flow(self.join.metaData.name)
        final_node = Node(
            f"{TASK_PREFIX}__{sanitize(self.join.table)}",
            self.run_final_join(),
        )
        left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
        flow.add_node(final_node)
        flow.add_node(left_node)
        for join_part in self.join.joinParts:
            jp_full_name = join_part_name(join_part)
            jp_node = Node(f"{TASK_PREFIX}__{jp_full_name}", self.run_join_part(jp_full_name))
            flow.add_node(jp_node)
            jp_node.add_dependency(left_node)
            final_node.add_dependency(jp_node)
        return flow

    def export_template(self, settings: dict):
        return f"{dict_to_exports(settings)}"

    def command_template(self, extra_args: dict):
        if self.start_date:
            extra_args.update({"start_ds": self.start_date})
        return f"""python3 /tmp/run.py --conf=/tmp/{self.config_path} --ds={self.end_date} \
{dict_to_bash_commands(extra_args)}"""

    def run_join_part(self, join_part: str):
        args = {
            "mode": "backfill",
            "selected_join_parts": join_part,
            "use_cached_left": None,
        }
        settings = self.settings.get(join_part, self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args=args)

    def run_left_table(self):
        settings = self.settings.get("left_table", self.settings["default"])
        return (
            self.export_template(settings)
            + " && "
            + self.command_template(extra_args={"mode": "backfill-left"})
        )

    def run_final_join(self):
        settings = self.settings.get("final_join", self.settings["default"])
        return (
            self.export_template(settings)
            + " && "
            + self.command_template(extra_args={"mode": "backfill-final"})
        )

    def run(self, orchestrator: str, overrides: Optional[dict] = None):
        from ai.chronon.constants import ADAPTERS

        ADAPTERS.update(overrides)
        orchestrator = ADAPTERS[orchestrator](dag_id=self.dag_id, start_date=self.start_date)
        orchestrator.setup()
        orchestrator.build_dag_from_flow(self.build_flow())
        orchestrator.trigger_run()
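
For context, a minimal sketch of building the backfill DAG from a compiled join; the config path and dates are hypothetical placeholders, and `run()` additionally requires an orchestrator adapter registered under a matching key in `ai.chronon.constants.ADAPTERS`.

# Hypothetical sketch: construct the flow for a compiled join.
from ai.chronon.repo.join_backfill import JoinBackfill

backfill = JoinBackfill(
    start_date="2025-01-01",
    end_date="2025-01-31",
    config_path="compiled/joins/test/data.v1",  # hypothetical compiled join conf
)
flow = backfill.build_flow()  # left table -> each join part -> final join
# backfill.run(orchestrator="airflow", overrides={})  # "airflow" assumes a matching ADAPTERS entry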