awx-zipline-ai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/__init__.py +1 -0
- agent/constants.py +15 -0
- agent/ttypes.py +1684 -0
- ai/__init__.py +0 -0
- ai/chronon/__init__.py +0 -0
- ai/chronon/airflow_helpers.py +251 -0
- ai/chronon/api/__init__.py +1 -0
- ai/chronon/api/common/__init__.py +1 -0
- ai/chronon/api/common/constants.py +15 -0
- ai/chronon/api/common/ttypes.py +1844 -0
- ai/chronon/api/constants.py +15 -0
- ai/chronon/api/ttypes.py +3624 -0
- ai/chronon/cli/compile/column_hashing.py +313 -0
- ai/chronon/cli/compile/compile_context.py +177 -0
- ai/chronon/cli/compile/compiler.py +160 -0
- ai/chronon/cli/compile/conf_validator.py +590 -0
- ai/chronon/cli/compile/display/class_tracker.py +112 -0
- ai/chronon/cli/compile/display/compile_status.py +95 -0
- ai/chronon/cli/compile/display/compiled_obj.py +12 -0
- ai/chronon/cli/compile/display/console.py +3 -0
- ai/chronon/cli/compile/display/diff_result.py +46 -0
- ai/chronon/cli/compile/fill_templates.py +40 -0
- ai/chronon/cli/compile/parse_configs.py +141 -0
- ai/chronon/cli/compile/parse_teams.py +238 -0
- ai/chronon/cli/compile/serializer.py +115 -0
- ai/chronon/cli/git_utils.py +156 -0
- ai/chronon/cli/logger.py +61 -0
- ai/chronon/constants.py +3 -0
- ai/chronon/eval/__init__.py +122 -0
- ai/chronon/eval/query_parsing.py +19 -0
- ai/chronon/eval/sample_tables.py +100 -0
- ai/chronon/eval/table_scan.py +186 -0
- ai/chronon/fetcher/__init__.py +1 -0
- ai/chronon/fetcher/constants.py +15 -0
- ai/chronon/fetcher/ttypes.py +127 -0
- ai/chronon/group_by.py +692 -0
- ai/chronon/hub/__init__.py +1 -0
- ai/chronon/hub/constants.py +15 -0
- ai/chronon/hub/ttypes.py +1228 -0
- ai/chronon/join.py +566 -0
- ai/chronon/logger.py +24 -0
- ai/chronon/model.py +35 -0
- ai/chronon/observability/__init__.py +1 -0
- ai/chronon/observability/constants.py +15 -0
- ai/chronon/observability/ttypes.py +2192 -0
- ai/chronon/orchestration/__init__.py +1 -0
- ai/chronon/orchestration/constants.py +15 -0
- ai/chronon/orchestration/ttypes.py +4406 -0
- ai/chronon/planner/__init__.py +1 -0
- ai/chronon/planner/constants.py +15 -0
- ai/chronon/planner/ttypes.py +1686 -0
- ai/chronon/query.py +126 -0
- ai/chronon/repo/__init__.py +40 -0
- ai/chronon/repo/aws.py +298 -0
- ai/chronon/repo/cluster.py +65 -0
- ai/chronon/repo/compile.py +56 -0
- ai/chronon/repo/constants.py +164 -0
- ai/chronon/repo/default_runner.py +291 -0
- ai/chronon/repo/explore.py +421 -0
- ai/chronon/repo/extract_objects.py +137 -0
- ai/chronon/repo/gcp.py +585 -0
- ai/chronon/repo/gitpython_utils.py +14 -0
- ai/chronon/repo/hub_runner.py +171 -0
- ai/chronon/repo/hub_uploader.py +108 -0
- ai/chronon/repo/init.py +53 -0
- ai/chronon/repo/join_backfill.py +105 -0
- ai/chronon/repo/run.py +293 -0
- ai/chronon/repo/serializer.py +141 -0
- ai/chronon/repo/team_json_utils.py +46 -0
- ai/chronon/repo/utils.py +472 -0
- ai/chronon/repo/zipline.py +51 -0
- ai/chronon/repo/zipline_hub.py +105 -0
- ai/chronon/resources/gcp/README.md +174 -0
- ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
- ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +30 -0
- ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +23 -0
- ai/chronon/resources/gcp/teams.py +70 -0
- ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
- ai/chronon/source.py +88 -0
- ai/chronon/staging_query.py +185 -0
- ai/chronon/types.py +57 -0
- ai/chronon/utils.py +557 -0
- ai/chronon/windows.py +50 -0
- awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
- awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
- awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
- awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
- awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
- awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
- jars/__init__.py +0 -0
ai/chronon/repo/hub_runner.py
ADDED
@@ -0,0 +1,171 @@

import json
import os
from datetime import datetime
from urllib.parse import quote_plus

import click
from attr import dataclass

from ai.chronon.cli.git_utils import get_current_branch
from ai.chronon.repo import hub_uploader, utils
from ai.chronon.repo.constants import RunMode
from ai.chronon.repo.zipline_hub import ZiplineHub


@click.group()
def hub():
    pass


#### Common click options
def common_options(func):
    func = click.option("--repo", help="Path to chronon repo", default=".")(func)
    func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
    return func


def ds_option(func):
    return click.option("--ds", help="the end partition to backfill the data")(func)


def start_ds_option(func):
    return click.option(
        "--start-ds",
        help="override the original start partition for a range backfill. "
        "It only supports staging query, group by backfill and join jobs. "
        "It could leave holes in your final output table due to the override date range.",
    )(func)


def end_ds_option(func):
    return click.option("--end-ds", help="the end ds for a range backfill")(func)


def submit_workflow(repo,
                    conf,
                    mode,
                    start_ds,
                    end_ds):

    hub_conf = get_hub_conf(conf)
    zipline_hub = ZiplineHub(base_url=hub_conf.hub_url)
    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
    branch = get_current_branch()

    hub_uploader.compute_and_upload_diffs(branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict)

    # get conf name
    conf_name = utils.get_metadata_name_from_conf(repo, conf)

    response_json = zipline_hub.call_workflow_start_api(
        conf_name=conf_name,
        mode=mode,
        branch=branch,  # Get the current branch
        user=os.environ.get('USER'),
        start=start_ds,
        end=end_ds,
        conf_hash=conf_name_to_hash_dict[conf_name].hash,
    )

    print(" 🆔 Workflow Id:", response_json.get("workflowId", "N/A"))
    print_wf_url(
        conf=conf,
        conf_name=conf_name,
        # NOTE: always links the backfill/offline view, even when mode is deploy
        mode=RunMode.BACKFILL.value,
        start_ds=start_ds,
        end_ds=end_ds,
        branch=branch,
    )


# zipline hub backfill --conf=compiled/joins/join
# adhoc backfills
@hub.command()
@common_options
@start_ds_option
@end_ds_option
def backfill(repo,
             conf,
             start_ds,
             end_ds):
    """
    - Submit a backfill job to Zipline.
      Response should contain a list of confs that are different from what's on remote.
    - Call upload API to upload the conf contents for the list of confs that were different.
    - Call the actual run API with mode set to backfill.
    """
    submit_workflow(repo, conf, RunMode.BACKFILL.value, start_ds, end_ds)


# zipline hub deploy --conf=compiled/joins/join
# currently only supports one-off deploy node submission
@hub.command()
@common_options
@end_ds_option
def deploy(repo,
           conf,
           end_ds):
    """
    - Submit a one-off deploy job to Zipline.
      Response should contain a list of confs that are different from what's on remote.
    - Call upload API to upload the conf contents for the list of confs that were different.
    - Call the actual run API with mode set to deploy.
    """
    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds)


def get_common_env_map(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        common_env_map = data['metaData']['executionInfo']['env']['common']
    return common_env_map


@dataclass
class HubConfig:
    hub_url: str
    frontend_url: str


def get_hub_conf(conf_path):
    common_env_map = get_common_env_map(conf_path)
    hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
    frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
    return HubConfig(hub_url=hub_url, frontend_url=frontend_url)


def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
    hub_conf = get_hub_conf(conf)
    frontend_url = hub_conf.frontend_url

    if "compiled/joins" in conf:
        hub_conf_type = "joins"
    elif "compiled/staging_queries" in conf:
        hub_conf_type = "stagingqueries"
    elif "compiled/group_by" in conf:
        hub_conf_type = "groupby"
    elif "compiled/models" in conf:
        hub_conf_type = "models"
    else:
        raise ValueError(f"Unsupported conf type: {conf}")

    # TODO: frontend uses localtime to create the millis, we should make it use UTC and make this align
    def _millis(date_str):
        return int(datetime.strptime(date_str, "%Y-%m-%d").timestamp() * 1000)

    def _mode_string(mode):
        if mode == "backfill":
            return "offline"
        elif mode == "deploy":
            return "online"
        else:
            raise ValueError(f"Unsupported mode: {mode}")

    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string(mode)}?start={_millis(start_ds)}&end={_millis(end_ds)}&branch={quote_plus(branch)}"

    print(" 🔗 Workflow : " + workflow_url + "\n")


if __name__ == "__main__":
    hub()
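A minimal sketch, not part of the package, of exercising the hub group above through click's CliRunner. The conf path and dates are placeholders, and the run assumes a repo with a compiled/ directory plus HUB_URL and FRONTEND_URL values reachable from the conf's common env:

from click.testing import CliRunner

from ai.chronon.repo.hub_runner import hub

runner = CliRunner()
result = runner.invoke(
    hub,
    [
        "backfill",
        "--repo", ".",                                  # repo root containing compiled/
        "--conf", "compiled/joins/test/example_join",   # hypothetical compiled conf path
        "--start-ds", "2025-01-01",
        "--end-ds", "2025-01-31",
    ],
)
print(result.output)

The same pattern works for the deploy command, which only takes --end-ds.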
ai/chronon/repo/hub_uploader.py
ADDED
@@ -0,0 +1,108 @@

import glob
import hashlib
import json
import os

from ai.chronon.orchestration.ttypes import Conf
from ai.chronon.repo import (
    FOLDER_NAME_TO_CLASS,
    FOLDER_NAME_TO_CONF_TYPE,
)
from ai.chronon.repo.zipline_hub import ZiplineHub


def build_local_repo_hashmap(root_dir: str):
    compiled_dir = os.path.join(root_dir, "compiled")
    # Returns a map of name -> Conf(name, md5 hash of the json text, contents, confType)
    results = {}

    # Iterate through each object type folder (staging_queries, group_bys, joins etc)
    for folder_name, _ in FOLDER_NAME_TO_CLASS.items():
        folder_path = os.path.join(compiled_dir, folder_name)
        if not os.path.exists(folder_path):
            continue

        # Find all json files recursively in this folder
        json_files = [
            f
            for f in glob.glob(os.path.join(folder_path, "**/*"), recursive=True)
            if os.path.isfile(f)
        ]

        exceptions = []

        for json_file in json_files:
            try:
                # Read the json file
                with open(json_file, "r") as f:
                    thrift_json = f.read()

                # Extract name from metadata in json
                json_obj = json.loads(thrift_json)
                name = json_obj["metaData"]["name"]

                # Load the json into the appropriate object type based on folder
                # binary = json2binary(thrift_json, obj_class)

                md5_hash = hashlib.md5(thrift_json.encode()).hexdigest()
                # md5_hash = hashlib.md5(thrift_json.encode()).hexdigest() + "123"
                # results[name] = (binary, md5_hash, FOLDER_NAME_TO_CONF_TYPE[folder_name])
                results[name] = Conf(
                    name=name,
                    hash=md5_hash,
                    # contents=binary,
                    contents=thrift_json,
                    confType=FOLDER_NAME_TO_CONF_TYPE[folder_name],
                )

            except Exception as e:
                exceptions.append(f"{json_file} - {e}")

        if exceptions:
            error_msg = (
                "The following files had exceptions during upload: \n"
                + "\n".join(exceptions)
                + "\n\n Consider deleting the files (safe operation) and checking "
                + "your thrift version before rerunning your command."
            )
            raise RuntimeError(error_msg)

    return results


def compute_and_upload_diffs(
    branch: str, zipline_hub: ZiplineHub, local_repo_confs: dict[str, Conf]
):
    # Determine which confs are different from the ZiplineHub
    # Call Zipline hub with the names and hashes to get back the list of changed conf names
    names_to_hashes = {name: local_conf.hash for name, local_conf in local_repo_confs.items()}
    print(f"\n 🧮 Computed hashes for {len(names_to_hashes)} local files.")

    changed_conf_names = zipline_hub.call_diff_api(names_to_hashes)

    if not changed_conf_names:
        print(f" ✅ Remote contains all local files. No need to upload '{branch}'.")
        diffed_confs = {}
    else:
        unchanged = len(names_to_hashes) - len(changed_conf_names)
        print(
            f" 🔍 Detected {len(changed_conf_names)} changes on local branch '{branch}'. {unchanged} unchanged."
        )

        # a list of names for diffed hashes on branch
        diffed_confs = {k: local_repo_confs[k] for k in changed_conf_names}
        conf_names_str = "\n - ".join(diffed_confs.keys())
        print(f" - {conf_names_str}")

        diff_confs = []
        for _, conf in diffed_confs.items():
            diff_confs.append(conf.__dict__)

        # Make PUT request to ZiplineHub
        zipline_hub.call_upload_api(branch=branch, diff_confs=diff_confs)
        print(f" ⬆️ Uploaded {len(diffed_confs)} changed confs to branch '{branch}'.")

    zipline_hub.call_sync_api(branch=branch, names_to_hashes=names_to_hashes)

    print(f" ✅ {len(names_to_hashes)} hashes updated on branch '{branch}'.\n")
    return diffed_confs
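build_local_repo_hashmap and compute_and_upload_diffs can also be driven directly, which is roughly what submit_workflow in hub_runner.py does. A hedged sketch, with a made-up hub endpoint and the current directory as the repo root:

from ai.chronon.cli.git_utils import get_current_branch
from ai.chronon.repo import hub_uploader
from ai.chronon.repo.zipline_hub import ZiplineHub

zipline_hub = ZiplineHub(base_url="http://localhost:3903")          # hypothetical hub endpoint
local_confs = hub_uploader.build_local_repo_hashmap(root_dir=".")   # expects ./compiled/ to exist
changed = hub_uploader.compute_and_upload_diffs(
    get_current_branch(),
    zipline_hub=zipline_hub,
    local_repo_confs=local_confs,
)
print(f"{len(changed)} confs were out of sync and uploaded")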
ai/chronon/repo/init.py
ADDED
@@ -0,0 +1,53 @@

#!/usr/bin/env python

import os
import shutil

import click
from importlib_resources import files
from rich.prompt import Prompt
from rich.syntax import Syntax

from ai.chronon.cli.compile.display.console import console


@click.command(name="init")
@click.option(
    "--cloud-provider",
    envvar="CLOUD_PROVIDER",
    help="Cloud provider to use.",
    required=True,
    type=click.Choice(['aws', 'gcp'], case_sensitive=False)
)
@click.option(
    "--chronon-root",
    help="Path to the root chronon folder.",
    default=os.path.join(os.getcwd(), "zipline"),
    type=click.Path(file_okay=False, writable=True),
)
@click.pass_context
def main(ctx, chronon_root, cloud_provider):
    template_path = files("ai.chronon").joinpath("resources", cloud_provider.lower())
    target_path = os.path.abspath(chronon_root)

    if os.path.exists(target_path) and os.listdir(target_path):
        choice = Prompt.ask(f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
                            choices=["y", "n"],
                            default="y")
        if choice == "n":
            return

    console.print(f"Generating scaffolding at {target_path} ...")

    try:
        shutil.copytree(template_path, target_path, dirs_exist_ok=True)
        console.print("[bold green] Project scaffolding created successfully! 🎉\n")
        export_cmd = Syntax(f"`export PYTHONPATH={target_path}:$PYTHONPATH`", "bash", theme="github-dark", line_numbers=False)
        console.print("Please copy the following command to your shell config:")
        console.print(export_cmd)
    except Exception:
        console.print_exception()


if __name__ == "__main__":
    main()
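A short sketch, not shipped with the package, of invoking the init command non-interactively through CliRunner. The target directory is a placeholder; pointing at an empty or nonexistent path avoids the Prompt.ask confirmation:

from click.testing import CliRunner

from ai.chronon.repo.init import main

runner = CliRunner()
result = runner.invoke(main, ["--cloud-provider", "gcp", "--chronon-root", "/tmp/zipline-scaffold"])
print(result.output)  # on success, prints the PYTHONPATH export hint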
ai/chronon/repo/join_backfill.py
ADDED
@@ -0,0 +1,105 @@

import json
import os
from typing import Optional

from ai.chronon.scheduler.interfaces.flow import Flow
from ai.chronon.scheduler.interfaces.node import Node
from ai.chronon.utils import (
    convert_json_to_obj,
    dict_to_bash_commands,
    dict_to_exports,
    get_join_output_table_name,
    join_part_name,
    sanitize,
)

TASK_PREFIX = "compute_join"
DEFAULT_SPARK_SETTINGS = {
    "default": {
        "spark_version": "3.1.1",
        "executor_memory": "4G",
        "driver_memory": "4G",
        "executor_cores": 2,
    }
}


class JoinBackfill:
    def __init__(
        self,
        start_date: str,
        end_date: str,
        config_path: str,
        settings: dict = DEFAULT_SPARK_SETTINGS,
    ):
        self.dag_id = "_".join(
            map(
                sanitize, ["chronon_joins_backfill", os.path.basename(config_path).split("/")[-1], start_date, end_date]
            )
        )
        self.start_date = start_date
        self.end_date = end_date
        self.config_path = config_path
        self.settings = settings
        with open(self.config_path, "r") as file:
            config = file.read()
        self.join = convert_json_to_obj(json.loads(config))

    def build_flow(self) -> Flow:
        """
        Build a flow from a Join object. Each join part is a node and will run in parallel.
        The next step is the final join, a node that depends on all join parts.
        The final join will run after all join parts are done.

        :return: A Flow object that represents the flow of the Join
        """
        flow = Flow(self.join.metaData.name)
        final_node = Node(
            f"{TASK_PREFIX}__{sanitize(get_join_output_table_name(self.join, full_name=True))}", self.run_final_join()
        )
        left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
        flow.add_node(final_node)
        flow.add_node(left_node)
        for join_part in self.join.joinParts:
            jp_full_name = join_part_name(join_part)
            jp_node = Node(f"{TASK_PREFIX}__{jp_full_name}", self.run_join_part(jp_full_name))
            flow.add_node(jp_node)
            jp_node.add_dependency(left_node)
            final_node.add_dependency(jp_node)
        return flow

    def export_template(self, settings: dict):
        return f"{dict_to_exports(settings)}"

    def command_template(self, extra_args: dict):
        if self.start_date:
            extra_args.update({"start_ds": self.start_date})
        return f"""python3 /tmp/run.py --conf=/tmp/{self.config_path} --ds={self.end_date} \
{dict_to_bash_commands(extra_args)}"""

    def run_join_part(self, join_part: str):
        args = {
            "mode": "backfill",
            "selected_join_parts": join_part,
            "use_cached_left": None,
        }
        settings = self.settings.get(join_part, self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args=args)

    def run_left_table(self):
        settings = self.settings.get("left_table", self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-left"})

    def run_final_join(self):
        settings = self.settings.get("final_join", self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-final"})

    def run(self, orchestrator: str, overrides: Optional[dict] = None):
        from ai.chronon.constants import ADAPTERS

        # NOTE: expects a dict; the default None would fail ADAPTERS.update
        ADAPTERS.update(overrides)
        orchestrator = ADAPTERS[orchestrator](dag_id=self.dag_id, start_date=self.start_date)
        orchestrator.setup()
        orchestrator.build_dag_from_flow(self.build_flow())
        orchestrator.trigger_run()