awx-zipline-ai 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/repo/hub_runner.py
@@ -0,0 +1,261 @@
+ import json
+ import os
+ from dataclasses import dataclass
+ from datetime import date, timedelta
+ from typing import Optional
+
+ import click
+ from gen_thrift.planner.ttypes import Mode
+
+ from ai.chronon.cli.git_utils import get_current_branch
+ from ai.chronon.repo import hub_uploader, utils
+ from ai.chronon.repo.constants import RunMode
+ from ai.chronon.repo.utils import handle_conf_not_found, print_possible_confs
+ from ai.chronon.repo.zipline_hub import ZiplineHub
+
+ ALLOWED_DATE_FORMATS = ["%Y-%m-%d"]
+
+
+ @click.group()
+ def hub():
+     pass
+
+
+ #### Common click options
+ def common_options(func):
+     func = click.option("--repo", help="Path to chronon repo", default=".")(func)
+     func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
+     func = click.option(
+         "--hub_url", help="Zipline Hub address, e.g. http://localhost:3903", default=None
+     )(func)
+     return func
+
+
+ def ds_option(func):
+     return click.option(
+         "--ds",
+         help="the end partition to backfill the data",
+         type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+     )(func)
+
+
+ def start_ds_option(func):
+     return click.option(
+         "--start-ds",
+         type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+         help="override the original start partition for a range backfill. "
+         "It only supports staging query, group by backfill and join jobs. "
+         "It could leave holes in your final output table due to the override date range.",
+     )(func)
+
+
+ def end_ds_option(func):
+     return click.option(
+         "--end-ds",
+         help="the end ds for a range backfill",
+         type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+         default=str(date.today() - timedelta(days=2)),
+     )(func)
+
+
+ def submit_workflow(repo, conf, mode, start_ds, end_ds, hub_url=None):
+     hub_conf = get_hub_conf(conf, root_dir=repo)
+     if hub_url is not None:
+         zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+     else:
+         zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+     conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
+     branch = get_current_branch()
+
+     hub_uploader.compute_and_upload_diffs(
+         branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict
+     )
+
+     # get conf name
+     conf_name = utils.get_metadata_name_from_conf(repo, conf)
+
+     response_json = zipline_hub.call_workflow_start_api(
+         conf_name=conf_name,
+         mode=mode,
+         branch=branch,  # Get the current branch
+         user=os.environ.get("USER"),
+         start=start_ds,
+         end=end_ds,
+         conf_hash=conf_name_to_hash_dict[conf_name].hash,
+         skip_long_running=False,
+     )
+
+     workflow_id = response_json.get("workflowId", "N/A")
+     print(" 🆔 Workflow Id:", workflow_id)
+     print_wf_url(
+         conf=conf,
+         conf_name=conf_name,
+         mode=mode,
+         workflow_id=workflow_id,
+         repo=repo
+     )
+
+
+ def submit_schedule(repo, conf, hub_url=None):
+     hub_conf = get_hub_conf(conf, root_dir=repo)
+     if hub_url is not None:
+         zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+     else:
+         zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+     conf_name_to_obj_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
+     branch = get_current_branch()
+
+     hub_uploader.compute_and_upload_diffs(
+         branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_obj_dict
+     )
+
+     # get conf name
+     conf_name = utils.get_metadata_name_from_conf(repo, conf)
+     schedule_modes = get_schedule_modes(os.path.join(repo, conf))
+     # create a dict for RunMode.BACKFILL.value and RunMode.DEPLOY.value to schedule_modes.offline_schedule and schedule_modes.online
+     modes = {
+         RunMode.BACKFILL.value.upper(): schedule_modes.offline_schedule,
+         RunMode.DEPLOY.value.upper(): schedule_modes.online,
+     }
+
+     response_json = zipline_hub.call_schedule_api(
+         modes=modes,
+         branch=branch,
+         conf_name=conf_name,
+         conf_hash=conf_name_to_obj_dict[conf_name].hash,
+     )
+
+     schedules = response_json.get("schedules", "N/A")
+     readable_schedules = {Mode._VALUES_TO_NAMES[int(k)]: v for k, v in schedules.items()}
+     print(" 🗓️ Schedules Deployed:", readable_schedules)
+
+
+ # zipline hub backfill --conf=compiled/joins/join
+ # adhoc backfills
+ @hub.command()
+ @common_options
+ @start_ds_option
+ @end_ds_option
+ @handle_conf_not_found(log_error=True, callback=print_possible_confs)
+ def backfill(repo, conf, hub_url, start_ds, end_ds):
+     """
+     - Submit a backfill job to Zipline.
+       Response should contain a list of confs that are different from what's on remote.
+     - Call upload API to upload the conf contents for the list of confs that were different.
+     - Call the actual run API with mode set to backfill.
+     """
+     submit_workflow(
+         repo, conf, RunMode.BACKFILL.value, start_ds, end_ds, hub_url=hub_url
+     )
+
+
+ # zipline hub run-adhoc --conf=compiled/joins/join
+ # currently only supports one-off deploy node submission
+ @hub.command()
+ @common_options
+ @end_ds_option
+ @handle_conf_not_found(log_error=True, callback=print_possible_confs)
+ def run_adhoc(repo, conf, hub_url, end_ds):
+     """
+     - Submit a one-off deploy job to Zipline. This submits the various jobs to allow your conf to be tested online.
+       Response should contain a list of confs that are different from what's on remote.
+     - Call upload API to upload the conf contents for the list of confs that were different.
+     - Call the actual run API with mode set to deploy
+     """
+     submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds, hub_url=hub_url)
+
+
+ # zipline hub schedule --conf=compiled/joins/join
+ @hub.command()
+ @common_options
+ @handle_conf_not_found(log_error=True, callback=print_possible_confs)
+ def schedule(repo, conf, hub_url):
+     """
+     - Deploys a schedule for the specified conf to Zipline. This allows your conf to have various associated jobs run on a schedule.
+       This verb will introspect your conf to determine which of its jobs need to be scheduled (or paused if turned off) based on the
+       'offline_schedule' and 'online' fields.
+     """
+     submit_schedule(repo, conf, hub_url=hub_url)
+
+
+ def get_metadata_map(file_path):
+     with open(file_path, "r") as f:
+         data = json.load(f)
+     metadata_map = data["metaData"]
+     return metadata_map
+
+
+ def get_common_env_map(file_path):
+     metadata_map = get_metadata_map(file_path)
+     common_env_map = metadata_map["executionInfo"]["env"]["common"]
+     return common_env_map
+
+
+ @dataclass
+ class HubConfig:
+     hub_url: str
+     frontend_url: str
+     sa_name: Optional[str] = None
+
+
+ @dataclass
+ class ScheduleModes:
+     online: str
+     offline_schedule: str
+
+
+ def get_hub_conf(conf_path, root_dir="."):
+     file_path = os.path.join(root_dir, conf_path)
+     common_env_map = get_common_env_map(file_path)
+     hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
+     frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
+     sa_name = common_env_map.get("SA_NAME", os.environ.get("SA_NAME"))
+     return HubConfig(hub_url=hub_url, frontend_url=frontend_url, sa_name=sa_name)
+
+
+ def get_schedule_modes(conf_path):
+     metadata_map = get_metadata_map(conf_path)
+     online_value = metadata_map.get("online", False)
+     online = "true" if bool(online_value) else "false"
+     offline_schedule = metadata_map["executionInfo"].get("scheduleCron", None)
+
+     # check if offline_schedule is null or 'None' or '@daily' else throw an error
+     valid_schedules = {None, "None", "@daily"}
+     if offline_schedule not in valid_schedules:
+         raise ValueError(
+             f"Unsupported offline_schedule: {offline_schedule}. Only null, 'None', or '@daily' are supported."
+         )
+     offline_schedule = offline_schedule or "None"
+     return ScheduleModes(online=online, offline_schedule=offline_schedule)
+
+
+ def print_wf_url(conf, conf_name, mode, workflow_id, repo="."):
+     hub_conf = get_hub_conf(conf, root_dir=repo)
+     frontend_url = hub_conf.frontend_url
+
+     if "compiled/joins" in conf:
+         hub_conf_type = "joins"
+     elif "compiled/staging_queries" in conf:
+         hub_conf_type = "stagingqueries"
+     elif "compiled/group_by" in conf:
+         hub_conf_type = "groupbys"
+     elif "compiled/models" in conf:
+         hub_conf_type = "models"
+     else:
+         raise ValueError(f"Unsupported conf type: {conf}")
+
+     def _mode_string():
+         if mode == "backfill":
+             return "offline"
+         elif mode == "deploy":
+             return "online"
+         else:
+             raise ValueError(f"Unsupported mode: {mode}")
+
+     workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string()}?workflowId={workflow_id}"
+
+     print(" 🔗 Workflow : " + workflow_url + "\n")
+
+
+ if __name__ == "__main__":
+     hub()
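
For reference, a minimal sketch of the compiled conf JSON shape that get_hub_conf and get_schedule_modes above parse. The nested field names come from hub_runner.py; the concrete values and the temporary-file setup are illustrative assumptions, not part of the package:

import json
import tempfile

from ai.chronon.repo.hub_runner import get_hub_conf, get_schedule_modes

# Illustrative compiled conf; only the nested field names are taken from hub_runner.py above.
conf = {
    "metaData": {
        "name": "test.data.v1",
        "online": True,
        "executionInfo": {
            "scheduleCron": "@daily",
            "env": {
                "common": {
                    "HUB_URL": "http://localhost:3903",
                    "FRONTEND_URL": "http://localhost:3000",
                    "SA_NAME": "zipline-sa",
                }
            },
        },
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(conf, f)

print(get_hub_conf(f.name))        # HubConfig(hub_url='http://localhost:3903', ...)
print(get_schedule_modes(f.name))  # ScheduleModes(online='true', offline_schedule='@daily')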
ai/chronon/repo/hub_uploader.py
@@ -0,0 +1,109 @@
+ import glob
+ import hashlib
+ import json
+ import os
+
+ from gen_thrift.api.ttypes import Conf
+
+ from ai.chronon.repo import (
+     FOLDER_NAME_TO_CLASS,
+     FOLDER_NAME_TO_CONF_TYPE,
+ )
+ from ai.chronon.repo.zipline_hub import ZiplineHub
+
+
+ def build_local_repo_hashmap(root_dir: str):
+     compiled_dir = os.path.join(root_dir, "compiled")
+     # Returns a map of name -> (tbinary, file_hash)
+     results = {}
+
+     # Iterate through each object type folder (staging_queries, group_bys, joins etc)
+     for folder_name, _ in FOLDER_NAME_TO_CLASS.items():
+         folder_path = os.path.join(compiled_dir, folder_name)
+         if not os.path.exists(folder_path):
+             continue
+
+         # Find all json files recursively in this folder
+         json_files = [
+             f
+             for f in glob.glob(os.path.join(folder_path, "**/*"), recursive=True)
+             if os.path.isfile(f)
+         ]
+
+         exceptions = []
+
+         for json_file in json_files:
+             try:
+                 # Read the json file
+                 with open(json_file, "r") as f:
+                     thrift_json = f.read()
+
+                 # Extract name from metadata in json
+                 json_obj = json.loads(thrift_json)
+                 name = json_obj["metaData"]["name"]
+
+                 # Load the json into the appropriate object type based on folder
+                 # binary = json2binary(thrift_json, obj_class)
+
+                 md5_hash = hashlib.md5(thrift_json.encode()).hexdigest()
+                 # md5_hash = hashlib.md5(thrift_json.encode()).hexdigest() + "123"
+                 # results[name] = (binary, md5_hash, FOLDER_NAME_TO_CONF_TYPE[folder_name])
+                 results[name] = Conf(
+                     name=name,
+                     hash=md5_hash,
+                     # contents=binary,
+                     contents=thrift_json,
+                     confType=FOLDER_NAME_TO_CONF_TYPE[folder_name],
+                 )
+
+             except Exception as e:
+                 exceptions.append(f"{json_file} - {e}")
+
+         if exceptions:
+             error_msg = (
+                 "The following files had exceptions during upload: \n"
+                 + "\n".join(exceptions)
+                 + "\n\n Consider deleting the files (safe operation) and checking "
+                 + "your thrift version before rerunning your command."
+             )
+             raise RuntimeError(error_msg)
+
+     return results
+
+
+ def compute_and_upload_diffs(
+     branch: str, zipline_hub: ZiplineHub, local_repo_confs: dict[str, Conf]
+ ):
+     # Determine which confs are different from the ZiplineHub
+     # Call Zipline hub with `names_and_hashes` as the argument to get back
+     names_to_hashes = {name: local_conf.hash for name, local_conf in local_repo_confs.items()}
+     print(f"\n 🧮 Computed hashes for {len(names_to_hashes)} local files.")
+
+     changed_conf_names = zipline_hub.call_diff_api(names_to_hashes)
+
+     if not changed_conf_names:
+         print(f" ✅ Remote contains all local files. No need to upload '{branch}'.")
+         diffed_confs = {}
+     else:
+         unchanged = len(names_to_hashes) - len(changed_conf_names)
+         print(
+             f" 🔍 Detected {len(changed_conf_names)} changes on local branch '{branch}'. {unchanged} unchanged."
+         )
+
+         # a list of names for diffed hashes on branch
+         diffed_confs = {k: local_repo_confs[k] for k in changed_conf_names}
+         conf_names_str = "\n - ".join(diffed_confs.keys())
+         print(f" - {conf_names_str}")
+
+         diff_confs = []
+         for _, conf in diffed_confs.items():
+             diff_confs.append(conf.__dict__)
+
+         # Make PUT request to ZiplineHub
+         zipline_hub.call_upload_api(branch=branch, diff_confs=diff_confs)
+         print(f" ⬆️ Uploaded {len(diffed_confs)} changed confs to branch '{branch}'.")
+
+     zipline_hub.call_sync_api(branch=branch, names_to_hashes=names_to_hashes)
+
+     print(f" ✅ {len(names_to_hashes)} hashes updated on branch '{branch}'.\n")
+     return diffed_confs
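
A hedged usage sketch of the two helpers above, assuming a repo with a compiled/ directory in the current working directory and a hub reachable at the assumed URL (the branch name and sa_name value are also assumptions):

from ai.chronon.repo import hub_uploader
from ai.chronon.repo.zipline_hub import ZiplineHub

# name -> Conf(name, hash, contents, confType), hashed from the json files under compiled/
local_confs = hub_uploader.build_local_repo_hashmap(root_dir=".")

# Upload only the confs whose hashes differ from what the hub already has for this branch.
hub = ZiplineHub(base_url="http://localhost:3903", sa_name=None)  # assumed local hub
changed = hub_uploader.compute_and_upload_diffs(
    "my-feature-branch", zipline_hub=hub, local_repo_confs=local_confs
)
print(f"{len(changed)} confs uploaded")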
ai/chronon/repo/init.py
@@ -0,0 +1,60 @@
+ #!/usr/bin/env python
+
+ import os
+ import shutil
+
+ import click
+ from importlib_resources import files
+ from rich.prompt import Prompt
+ from rich.syntax import Syntax
+
+ from ai.chronon.cli.compile.display.console import console
+
+
+ @click.command(name="init")
+ @click.option(
+     "--cloud-provider",
+     envvar="CLOUD_PROVIDER",
+     help="Cloud provider to use.",
+     required=True,
+     type=click.Choice(["aws", "gcp"], case_sensitive=False),
+ )
+ @click.option(
+     "--chronon-root",
+     help="Path to the root chronon folder.",
+     default=os.path.join(os.getcwd(), "zipline"),
+     type=click.Path(file_okay=False, writable=True),
+ )
+ @click.pass_context
+ def main(ctx, chronon_root, cloud_provider):
+     template_path = files("ai.chronon").joinpath("resources", cloud_provider.lower())
+     target_path = os.path.abspath(chronon_root)
+
+     if os.path.exists(target_path) and os.listdir(target_path):
+         choice = Prompt.ask(
+             f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
+             choices=["y", "n"],
+             default="y",
+         )
+         if choice == "n":
+             return
+
+     console.print(f"Generating scaffolding at {target_path} ...")
+
+     try:
+         shutil.copytree(template_path, target_path, dirs_exist_ok=True)
+         console.print("[bold green] Project scaffolding created successfully! 🎉\n")
+         export_cmd = Syntax(
+             f"`export PYTHONPATH={target_path}:$PYTHONPATH`",
+             "bash",
+             theme="github-dark",
+             line_numbers=False,
+         )
+         console.print("Please copy the following command to your shell config:")
+         console.print(export_cmd)
+     except Exception:
+         console.print_exception()
+
+
+ if __name__ == "__main__":
+     main()
ai/chronon/repo/join_backfill.py
@@ -0,0 +1,119 @@
+ import json
+ import os
+ from typing import Optional
+
+ from ai.chronon.scheduler.interfaces.flow import Flow
+ from ai.chronon.scheduler.interfaces.node import Node
+ from ai.chronon.utils import (
+     convert_json_to_obj,
+     dict_to_bash_commands,
+     dict_to_exports,
+     join_part_name,
+     sanitize,
+ )
+
+ TASK_PREFIX = "compute_join"
+ DEFAULT_SPARK_SETTINGS = {
+     "default": {
+         "spark_version": "3.1.1",
+         "executor_memory": "4G",
+         "driver_memory": "4G",
+         "executor_cores": 2,
+     }
+ }
+
+
+ class JoinBackfill:
+     def __init__(
+         self,
+         start_date: str,
+         end_date: str,
+         config_path: str,
+         settings: dict = DEFAULT_SPARK_SETTINGS,
+     ):
+         self.dag_id = "_".join(
+             map(
+                 sanitize,
+                 [
+                     "chronon_joins_backfill",
+                     os.path.basename(config_path).split("/")[-1],
+                     start_date,
+                     end_date,
+                 ],
+             )
+         )
+         self.start_date = start_date
+         self.end_date = end_date
+         self.config_path = config_path
+         self.settings = settings
+         with open(self.config_path, "r") as file:
+             config = file.read()
+         self.join = convert_json_to_obj(json.loads(config))
+
+     def build_flow(self) -> Flow:
+         """
+         Build a flow from a Join object. Each join part is a node and will run in parallel.
+         The next step is final join, which is a node that depends on all join parts.
+         The final join will run after all join parts are done.
+
+         :param join: The Join object to build a flow from
+         :return: A Flow object that represents the flow of the Join
+         """
+         flow = Flow(self.join.metaData.name)
+         final_node = Node(
+             f"{TASK_PREFIX}__{sanitize(self.join.table)}",
+             self.run_final_join(),
+         )
+         left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
+         flow.add_node(final_node)
+         flow.add_node(left_node)
+         for join_part in self.join.joinParts:
+             jp_full_name = join_part_name(join_part)
+             jp_node = Node(f"{TASK_PREFIX}__{jp_full_name}", self.run_join_part(jp_full_name))
+             flow.add_node(jp_node)
+             jp_node.add_dependency(left_node)
+             final_node.add_dependency(jp_node)
+         return flow
+
+     def export_template(self, settings: dict):
+         return f"{dict_to_exports(settings)}"
+
+     def command_template(self, extra_args: dict):
+         if self.start_date:
+             extra_args.update({"start_ds": self.start_date})
+         return f"""python3 /tmp/run.py --conf=/tmp/{self.config_path} --ds={self.end_date} \
+             {dict_to_bash_commands(extra_args)}"""
+
+     def run_join_part(self, join_part: str):
+         args = {
+             "mode": "backfill",
+             "selected_join_parts": join_part,
+             "use_cached_left": None,
+         }
+         settings = self.settings.get(join_part, self.settings["default"])
+         return self.export_template(settings) + " && " + self.command_template(extra_args=args)
+
+     def run_left_table(self):
+         settings = self.settings.get("left_table", self.settings["default"])
+         return (
+             self.export_template(settings)
+             + " && "
+             + self.command_template(extra_args={"mode": "backfill-left"})
+         )
+
+     def run_final_join(self):
+         settings = self.settings.get("final_join", self.settings["default"])
+         return (
+             self.export_template(settings)
+             + " && "
+             + self.command_template(extra_args={"mode": "backfill-final"})
+         )
+
+     def run(self, orchestrator: str, overrides: Optional[dict] = None):
+         from ai.chronon.constants import ADAPTERS
+
+         ADAPTERS.update(overrides or {})
+         orchestrator = ADAPTERS[orchestrator](dag_id=self.dag_id, start_date=self.start_date)
+         orchestrator.setup()
+         orchestrator.build_dag_from_flow(self.build_flow())
+         orchestrator.trigger_run()
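
A minimal sketch of driving JoinBackfill end to end, assuming the ai.chronon.scheduler interfaces it imports are available and that an "airflow" adapter is registered in ADAPTERS; the conf path is also an assumption:

from ai.chronon.repo.join_backfill import JoinBackfill

backfill = JoinBackfill(
    start_date="2025-01-01",
    end_date="2025-01-31",
    config_path="compiled/joins/test/data_join__0",  # assumed compiled join conf
)

# left table -> join parts (in parallel) -> final join
flow = backfill.build_flow()

# backfill.run(orchestrator="airflow")  # assumes an "airflow" key exists in ADAPTERS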