awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96)
  1. agent/ttypes.py +6 -6
  2. ai/chronon/airflow_helpers.py +20 -23
  3. ai/chronon/cli/__init__.py +0 -0
  4. ai/chronon/cli/compile/__init__.py +0 -0
  5. ai/chronon/cli/compile/column_hashing.py +40 -17
  6. ai/chronon/cli/compile/compile_context.py +13 -17
  7. ai/chronon/cli/compile/compiler.py +59 -36
  8. ai/chronon/cli/compile/conf_validator.py +251 -99
  9. ai/chronon/cli/compile/display/__init__.py +0 -0
  10. ai/chronon/cli/compile/display/class_tracker.py +6 -16
  11. ai/chronon/cli/compile/display/compile_status.py +10 -10
  12. ai/chronon/cli/compile/display/diff_result.py +79 -14
  13. ai/chronon/cli/compile/fill_templates.py +3 -8
  14. ai/chronon/cli/compile/parse_configs.py +10 -17
  15. ai/chronon/cli/compile/parse_teams.py +38 -34
  16. ai/chronon/cli/compile/serializer.py +3 -9
  17. ai/chronon/cli/compile/version_utils.py +42 -0
  18. ai/chronon/cli/git_utils.py +2 -13
  19. ai/chronon/cli/logger.py +0 -2
  20. ai/chronon/constants.py +1 -1
  21. ai/chronon/group_by.py +47 -47
  22. ai/chronon/join.py +46 -32
  23. ai/chronon/logger.py +1 -2
  24. ai/chronon/model.py +9 -4
  25. ai/chronon/query.py +2 -2
  26. ai/chronon/repo/__init__.py +1 -2
  27. ai/chronon/repo/aws.py +17 -31
  28. ai/chronon/repo/cluster.py +121 -50
  29. ai/chronon/repo/compile.py +14 -8
  30. ai/chronon/repo/constants.py +1 -1
  31. ai/chronon/repo/default_runner.py +32 -54
  32. ai/chronon/repo/explore.py +70 -73
  33. ai/chronon/repo/extract_objects.py +6 -9
  34. ai/chronon/repo/gcp.py +89 -88
  35. ai/chronon/repo/gitpython_utils.py +3 -2
  36. ai/chronon/repo/hub_runner.py +145 -55
  37. ai/chronon/repo/hub_uploader.py +2 -1
  38. ai/chronon/repo/init.py +12 -5
  39. ai/chronon/repo/join_backfill.py +19 -5
  40. ai/chronon/repo/run.py +42 -39
  41. ai/chronon/repo/serializer.py +4 -12
  42. ai/chronon/repo/utils.py +72 -63
  43. ai/chronon/repo/zipline.py +3 -19
  44. ai/chronon/repo/zipline_hub.py +211 -39
  45. ai/chronon/resources/__init__.py +0 -0
  46. ai/chronon/resources/gcp/__init__.py +0 -0
  47. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  48. ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
  49. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  50. ai/chronon/resources/gcp/joins/test/data.py +4 -8
  51. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  52. ai/chronon/resources/gcp/sources/test/data.py +9 -6
  53. ai/chronon/resources/gcp/teams.py +9 -21
  54. ai/chronon/source.py +2 -4
  55. ai/chronon/staging_query.py +60 -19
  56. ai/chronon/types.py +3 -2
  57. ai/chronon/utils.py +21 -68
  58. ai/chronon/windows.py +2 -4
  59. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.0.dist-info}/METADATA +47 -24
  60. awx_zipline_ai-0.3.0.dist-info/RECORD +96 -0
  61. awx_zipline_ai-0.3.0.dist-info/top_level.txt +4 -0
  62. gen_thrift/__init__.py +0 -0
  63. {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
  64. {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
  65. gen_thrift/eval/ttypes.py +660 -0
  66. {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
  67. {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
  68. {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
  69. ai/chronon/eval/__init__.py +0 -122
  70. ai/chronon/eval/query_parsing.py +0 -19
  71. ai/chronon/eval/sample_tables.py +0 -100
  72. ai/chronon/eval/table_scan.py +0 -186
  73. ai/chronon/orchestration/ttypes.py +0 -4406
  74. ai/chronon/resources/gcp/README.md +0 -174
  75. ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
  76. awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
  77. awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
  78. awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
  79. /jars/__init__.py → /__init__.py +0 -0
  80. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.0.dist-info}/WHEEL +0 -0
  81. {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.0.dist-info}/entry_points.txt +0 -0
  82. {ai/chronon → gen_thrift}/api/__init__.py +0 -0
  83. {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
  84. {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
  85. {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
  86. {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
  87. {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
  88. {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
  89. {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
  90. {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
  91. {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
  92. {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
  93. {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
  94. {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
  95. {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
  96. {ai/chronon → gen_thrift}/planner/constants.py +0 -0
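Note on the file moves above (items 62-68 and 82-96, with ai/chronon/orchestration/ttypes.py removed in item 73): the generated Thrift modules have been relocated from the ai.chronon.* namespace into a new top-level gen_thrift package. Downstream code that imported these generated types directly would need the new paths; a minimal before/after sketch, using only modules named in this diff:

# 0.2.1: generated Thrift types lived under the ai.chronon namespace
# from ai.chronon.orchestration.ttypes import Conf
# from ai.chronon.api.ttypes import ...

# 0.3.0: the same generated modules now live under the top-level gen_thrift package
from gen_thrift.api.ttypes import Conf
from gen_thrift.planner.ttypes import Mode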
ai/chronon/repo/hub_runner.py CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
-from datetime import datetime
-from urllib.parse import quote_plus
+from dataclasses import dataclass
+from datetime import date, timedelta
+from typing import Optional
 
 import click
-from attr import dataclass
+from gen_thrift.planner.ttypes import Mode
 
 from ai.chronon.cli.git_utils import get_current_branch
 from ai.chronon.repo import hub_uploader, utils
 from ai.chronon.repo.constants import RunMode
+from ai.chronon.repo.utils import handle_conf_not_found, print_possible_confs
 from ai.chronon.repo.zipline_hub import ZiplineHub
 
+ALLOWED_DATE_FORMATS = ["%Y-%m-%d"]
+
 
 @click.group()
 def hub():
@@ -21,60 +25,110 @@ def hub():
 def common_options(func):
     func = click.option("--repo", help="Path to chronon repo", default=".")(func)
     func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
+    func = click.option(
+        "--hub_url", help="Zipline Hub address, e.g. http://localhost:3903", default=None
+    )(func)
     return func
 
+
 def ds_option(func):
-    return click.option("--ds", help="the end partition to backfill the data")(func)
+    return click.option(
+        "--ds",
+        help="the end partition to backfill the data",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+    )(func)
+
 
 def start_ds_option(func):
     return click.option(
-        "--start-ds",
-        help="override the original start partition for a range backfill. "
-        "It only supports staging query, group by backfill and join jobs. "
-        "It could leave holes in your final output table due to the override date range.",)(func)
+        "--start-ds",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+        help="override the original start partition for a range backfill. "
+        "It only supports staging query, group by backfill and join jobs. "
+        "It could leave holes in your final output table due to the override date range.",
+    )(func)
 
 
 def end_ds_option(func):
-    return click.option("--end-ds", help="the end ds for a range backfill")(func)
-
+    return click.option(
+        "--end-ds",
+        help="the end ds for a range backfill",
+        type=click.DateTime(formats=ALLOWED_DATE_FORMATS),
+        default=str(date.today() - timedelta(days=2)),
+    )(func)
 
-def submit_workflow(repo,
-                    conf,
-                    mode,
-                    start_ds,
-                    end_ds):
 
-    hub_conf = get_hub_conf(conf)
-    zipline_hub = ZiplineHub(base_url=hub_conf.hub_url)
-    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir= repo)
+def submit_workflow(repo, conf, mode, start_ds, end_ds, hub_url=None):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
+    if hub_url is not None:
+        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+    else:
+        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
     branch = get_current_branch()
 
-    hub_uploader.compute_and_upload_diffs(branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict)
+    hub_uploader.compute_and_upload_diffs(
+        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict
+    )
 
     # get conf name
     conf_name = utils.get_metadata_name_from_conf(repo, conf)
 
-
     response_json = zipline_hub.call_workflow_start_api(
         conf_name=conf_name,
         mode=mode,
         branch=branch, # Get the current branch
-        user=os.environ.get('USER'),
+        user=os.environ.get("USER"),
         start=start_ds,
         end=end_ds,
         conf_hash=conf_name_to_hash_dict[conf_name].hash,
+        skip_long_running=False,
     )
 
-    print(" 🆔 Workflow Id:", response_json.get("workflowId", "N/A"))
+    workflow_id = response_json.get("workflowId", "N/A")
+    print(" 🆔 Workflow Id:", workflow_id)
     print_wf_url(
         conf=conf,
         conf_name=conf_name,
-        mode=RunMode.BACKFILL.value,
-        start_ds=start_ds,
-        end_ds=end_ds,
-        branch=branch
+        mode=mode,
+        workflow_id=workflow_id,
+        repo=repo
+    )
+
+
+def submit_schedule(repo, conf, hub_url=None):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
+    if hub_url is not None:
+        zipline_hub = ZiplineHub(base_url=hub_url, sa_name=hub_conf.sa_name)
+    else:
+        zipline_hub = ZiplineHub(base_url=hub_conf.hub_url, sa_name=hub_conf.sa_name)
+    conf_name_to_obj_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
+    branch = get_current_branch()
+
+    hub_uploader.compute_and_upload_diffs(
+        branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_obj_dict
+    )
+
+    # get conf name
+    conf_name = utils.get_metadata_name_from_conf(repo, conf)
+    schedule_modes = get_schedule_modes(os.path.join(repo, conf))
+    # create a dict for RunMode.BACKFILL.value and RunMode.DEPLOY.value to schedule_modes.offline_schedule and schedule_modes.online
+    modes = {
+        RunMode.BACKFILL.value.upper(): schedule_modes.offline_schedule,
+        RunMode.DEPLOY.value.upper(): schedule_modes.online,
+    }
+
+    response_json = zipline_hub.call_schedule_api(
+        modes=modes,
+        branch=branch,
+        conf_name=conf_name,
+        conf_hash=conf_name_to_obj_dict[conf_name].hash,
     )
 
+    schedules = response_json.get("schedules", "N/A")
+    readable_schedules = {Mode._VALUES_TO_NAMES[int(k)]: v for k, v in schedules.items()}
+    print(" 🗓️ Schedules Deployed:", readable_schedules)
+
 
 # zipline hub backfill --conf=compiled/joins/join
 # adhoc backfills
@@ -82,41 +136,58 @@ def submit_workflow(repo,
 @common_options
 @start_ds_option
 @end_ds_option
-def backfill(repo,
-             conf,
-             start_ds,
-             end_ds):
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def backfill(repo, conf, hub_url, start_ds, end_ds):
     """
     - Submit a backfill job to Zipline.
    Response should contain a list of confs that are different from what's on remote.
     - Call upload API to upload the conf contents for the list of confs that were different.
     - Call the actual run API with mode set to backfill.
     """
-    submit_workflow(repo, conf, RunMode.BACKFILL.value, start_ds, end_ds)
-
+    submit_workflow(
+        repo, conf, RunMode.BACKFILL.value, start_ds, end_ds, hub_url=hub_url
+    )
 
 
-# zipline hub deploy --conf=compiled/joins/join
+# zipline hub run-adhoc --conf=compiled/joins/join
 # currently only supports one-off deploy node submission
 @hub.command()
 @common_options
 @end_ds_option
-def deploy(repo,
-           conf,
-           end_ds):
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def run_adhoc(repo, conf, hub_url, end_ds):
     """
-    - Submit a one-off deploy job to Zipline.
+    - Submit a one-off deploy job to Zipline. This submits the various jobs to allow your conf to be tested online.
    Response should contain a list of confs that are different from what's on remote.
     - Call upload API to upload the conf contents for the list of confs that were different.
     - Call the actual run API with mode set to deploy
     """
-    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds)
+    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds, hub_url=hub_url)
 
 
-def get_common_env_map(file_path):
-    with open(file_path, 'r') as f:
+# zipline hub schedule --conf=compiled/joins/join
+@hub.command()
+@common_options
+@handle_conf_not_found(log_error=True, callback=print_possible_confs)
+def schedule(repo, conf, hub_url):
+    """
+    - Deploys a schedule for the specified conf to Zipline. This allows your conf to have various associated jobs run on a schedule.
    This verb will introspect your conf to determine which of its jobs need to be scheduled (or paused if turned off) based on the
    'offline_schedule' and 'online' fields.
+    """
+    submit_schedule(repo, conf, hub_url=hub_url)
+
+
+def get_metadata_map(file_path):
+    with open(file_path, "r") as f:
         data = json.load(f)
-    common_env_map = data['metaData']['executionInfo']['env']['common']
+    metadata_map = data["metaData"]
+    return metadata_map
+
+
+def get_common_env_map(file_path):
+    metadata_map = get_metadata_map(file_path)
+    common_env_map = metadata_map["executionInfo"]["env"]["common"]
     return common_env_map
 
 
@@ -124,18 +195,42 @@ def get_common_env_map(file_path):
 class HubConfig:
     hub_url: str
     frontend_url: str
+    sa_name: Optional[str] = None
+
 
+@dataclass
+class ScheduleModes:
+    online: str
+    offline_schedule: str
 
-def get_hub_conf(conf_path):
-    common_env_map = get_common_env_map(conf_path)
+
+def get_hub_conf(conf_path, root_dir="."):
+    file_path = os.path.join(root_dir, conf_path)
+    common_env_map = get_common_env_map(file_path)
     hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
     frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
-    return HubConfig(hub_url=hub_url, frontend_url=frontend_url)
+    sa_name = common_env_map.get("SA_NAME", os.environ.get("SA_NAME"))
+    return HubConfig(hub_url=hub_url, frontend_url=frontend_url, sa_name=sa_name)
+
 
+def get_schedule_modes(conf_path):
+    metadata_map = get_metadata_map(conf_path)
+    online_value = metadata_map.get("online", False)
+    online = "true" if bool(online_value) else "false"
+    offline_schedule = metadata_map["executionInfo"].get("scheduleCron", None)
 
-def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
+    # check if offline_schedule is null or 'None' or '@daily' else throw an error
+    valid_schedules = {None, "None", "@daily"}
+    if offline_schedule not in valid_schedules:
+        raise ValueError(
+            f"Unsupported offline_schedule: {offline_schedule}. Only null, 'None', or '@daily' are supported."
+        )
+    offline_schedule = offline_schedule or "None"
+    return ScheduleModes(online=online, offline_schedule=offline_schedule)
 
-    hub_conf = get_hub_conf(conf)
+
+def print_wf_url(conf, conf_name, mode, workflow_id, repo="."):
+    hub_conf = get_hub_conf(conf, root_dir=repo)
     frontend_url = hub_conf.frontend_url
 
     if "compiled/joins" in conf:
@@ -143,17 +238,13 @@ def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
     elif "compiled/staging_queries" in conf:
         hub_conf_type = "stagingqueries"
     elif "compiled/group_by" in conf:
-        hub_conf_type = "groupby"
+        hub_conf_type = "groupbys"
     elif "compiled/models" in conf:
         hub_conf_type = "models"
     else:
         raise ValueError(f"Unsupported conf type: {conf}")
 
-    # TODO: frontend uses localtime to create the millis, we should make it use UTC and make this align
-    def _millis(date_str):
-        return int(datetime.strptime(date_str, "%Y-%m-%d").timestamp() * 1000)
-
-    def _mode_string(mode):
+    def _mode_string():
         if mode == "backfill":
             return "offline"
         elif mode == "deploy":
@@ -161,11 +252,10 @@ def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
         else:
             raise ValueError(f"Unsupported mode: {mode}")
 
-    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string(mode)}?start={_millis(start_ds)}&end={_millis(end_ds)}&branch={quote_plus(branch)}"
+    workflow_url = f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string()}?workflowId={workflow_id}"
 
     print(" 🔗 Workflow : " + workflow_url + "\n")
 
+
 if __name__ == "__main__":
     hub()
-
-
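The new schedule verb introspects the compiled conf's metaData through the get_metadata_map / get_schedule_modes helpers added above. A minimal sketch of that behaviour, assuming the module path ai.chronon.repo.hub_runner matches the file listing; the conf dict below is a hypothetical stand-in containing only the fields these helpers actually read:

import json
import tempfile

from ai.chronon.repo.hub_runner import get_schedule_modes  # module path assumed from the file listing

# Hypothetical compiled conf: only the fields read by get_schedule_modes are present.
conf = {"metaData": {"online": True, "executionInfo": {"scheduleCron": "@daily"}}}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(conf, f)

modes = get_schedule_modes(f.name)
print(modes.online)            # "true"   -> submitted under RunMode.DEPLOY
print(modes.offline_schedule)  # "@daily" -> submitted under RunMode.BACKFILL
# Any scheduleCron other than null, 'None', or '@daily' raises ValueError.

# Equivalent CLI entry points, per the comments in the diff (dates illustrative):
#   zipline hub backfill  --conf=compiled/joins/join --start-ds=2025-01-01 --end-ds=2025-01-31
#   zipline hub run-adhoc --conf=compiled/joins/join
#   zipline hub schedule  --conf=compiled/joins/join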
ai/chronon/repo/hub_uploader.py CHANGED
@@ -3,7 +3,8 @@ import hashlib
 import json
 import os
 
-from ai.chronon.orchestration.ttypes import Conf
+from gen_thrift.api.ttypes import Conf
+
 from ai.chronon.repo import (
     FOLDER_NAME_TO_CLASS,
     FOLDER_NAME_TO_CONF_TYPE,
ai/chronon/repo/init.py CHANGED
@@ -17,7 +17,7 @@ from ai.chronon.cli.compile.display.console import console
     envvar="CLOUD_PROVIDER",
     help="Cloud provider to use.",
     required=True,
-    type=click.Choice(['aws', 'gcp'], case_sensitive=False)
+    type=click.Choice(["aws", "gcp"], case_sensitive=False),
 )
 @click.option(
     "--chronon-root",
@@ -31,9 +31,11 @@ def main(ctx, chronon_root, cloud_provider):
     target_path = os.path.abspath(chronon_root)
 
     if os.path.exists(target_path) and os.listdir(target_path):
-        choice = Prompt.ask(f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
-                            choices=["y", "n"],
-                            default="y")
+        choice = Prompt.ask(
+            f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
+            choices=["y", "n"],
+            default="y",
+        )
         if choice == "n":
             return
 
@@ -42,7 +44,12 @@ def main(ctx, chronon_root, cloud_provider):
     try:
         shutil.copytree(template_path, target_path, dirs_exist_ok=True)
         console.print("[bold green] Project scaffolding created successfully! 🎉\n")
-        export_cmd = Syntax(f"`export PYTHONPATH={target_path}:$PYTHONPATH`", "bash", theme="github-dark", line_numbers=False)
+        export_cmd = Syntax(
+            f"`export PYTHONPATH={target_path}:$PYTHONPATH`",
+            "bash",
+            theme="github-dark",
+            line_numbers=False,
+        )
         console.print("Please copy the following command to your shell config:")
         console.print(export_cmd)
     except Exception:
ai/chronon/repo/join_backfill.py CHANGED
@@ -8,7 +8,6 @@ from ai.chronon.utils import (
     convert_json_to_obj,
     dict_to_bash_commands,
     dict_to_exports,
-    get_join_output_table_name,
     join_part_name,
     sanitize,
 )
@@ -34,7 +33,13 @@ class JoinBackfill:
     ):
         self.dag_id = "_".join(
             map(
-                sanitize, ["chronon_joins_backfill", os.path.basename(config_path).split("/")[-1], start_date, end_date]
+                sanitize,
+                [
+                    "chronon_joins_backfill",
+                    os.path.basename(config_path).split("/")[-1],
+                    start_date,
+                    end_date,
+                ],
             )
         )
         self.start_date = start_date
@@ -56,7 +61,8 @@ class JoinBackfill:
         """
         flow = Flow(self.join.metaData.name)
         final_node = Node(
-            f"{TASK_PREFIX}__{sanitize(get_join_output_table_name(self.join, full_name=True))}", self.run_final_join()
+            f"{TASK_PREFIX}__{sanitize(self.join.table)}",
+            self.run_final_join(),
         )
         left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
         flow.add_node(final_node)
@@ -89,11 +95,19 @@
 
     def run_left_table(self):
         settings = self.settings.get("left_table", self.settings["default"])
-        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-left"})
+        return (
+            self.export_template(settings)
+            + " && "
+            + self.command_template(extra_args={"mode": "backfill-left"})
+        )
 
     def run_final_join(self):
         settings = self.settings.get("final_join", self.settings["default"])
-        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-final"})
+        return (
+            self.export_template(settings)
+            + " && "
+            + self.command_template(extra_args={"mode": "backfill-final"})
+        )
 
     def run(self, orchestrator: str, overrides: Optional[dict] = None):
         from ai.chronon.constants import ADAPTERS
ai/chronon/repo/run.py CHANGED
@@ -74,8 +74,7 @@ def set_defaults(ctx):
         # "online_jar_fetch": os.path.join(chronon_repo_path, "scripts/fetch_online_jar.py"),
         "online_args": os.environ.get("CHRONON_ONLINE_ARGS"),
         "chronon_jar": os.environ.get("CHRONON_DRIVER_JAR"),
-        "list_apps": "python3 "
-        + os.path.join(chronon_repo_path, "scripts/yarn_list.py"),
+        "list_apps": "python3 " + os.path.join(chronon_repo_path, "scripts/yarn_list.py"),
         "render_info": os.path.join(chronon_repo_path, RENDER_INFO_DEFAULT_SCRIPT),
         "project_conf": obj.get("project_conf"),
         "artifact_prefix": os.environ.get("ARTIFACT_PREFIX"),
@@ -85,24 +84,23 @@ def set_defaults(ctx):
         if ctx.params.get(key) is None and value is not None:
             ctx.params[key] = value
 
+
 def validate_flink_state(ctx, param, value):
     uri_schemes = ["gs://", "s3://"]
     if value and not any(value.startswith(scheme) for scheme in uri_schemes):
-        raise click.BadParameter(
-            f"Flink state uri must start with {uri_schemes}"
-        )
+        raise click.BadParameter(f"Flink state uri must start with {uri_schemes}")
     return value
 
+
 def validate_additional_jars(ctx, param, value):
     if value:
-        jars = value.split(',')
+        jars = value.split(",")
         for jar in jars:
-            if not jar.startswith(('gs://', 's3://')):
-                raise click.BadParameter(
-                    f"Additional jars must start with gs://, s3://: {jar}"
-                )
+            if not jar.startswith(("gs://", "s3://")):
+                raise click.BadParameter(f"Additional jars must start with gs://, s3://: {jar}")
     return value
 
+
 @click.command(
     name="run",
     context_settings=dict(allow_extra_args=True, ignore_unknown_options=True),
@@ -116,7 +114,9 @@ def validate_additional_jars(ctx, param, value):
     default="dev",
     help="Running environment - default to be dev",
 )
-@click.option("--mode", type=click.Choice([str(k) for k in MODE_ARGS.keys()]), default=str(RunMode.BACKFILL))
+@click.option(
+    "--mode", type=click.Choice([str(k) for k in MODE_ARGS.keys()]), default=str(RunMode.BACKFILL)
+)
 @click.option("--ds", help="the end partition to backfill the data")
 @click.option("--app-name", help="app name. Default to {}".format(APP_NAME_TEMPLATE))
 @click.option(
@@ -142,58 +142,61 @@ def validate_additional_jars(ctx, param, value):
     help="Class name of Online Impl. Used for streaming and metadata-upload mode.",
 )
 @click.option("--version", required=False, help="Chronon version to use.")
-@click.option(
-    "--spark-version", default="2.4.0", help="Spark version to use for downloading jar."
-)
+@click.option("--spark-version", default="2.4.0", help="Spark version to use for downloading jar.")
 @click.option("--spark-submit-path", help="Path to spark-submit")
-@click.option(
-    "--spark-streaming-submit-path", help="Path to spark-submit for streaming"
-)
+@click.option("--spark-streaming-submit-path", help="Path to spark-submit for streaming")
 @click.option(
     "--online-jar-fetch",
     help="Path to script that can pull online jar. This will run only "
     "when a file doesn't exist at location specified by online_jar",
 )
-@click.option(
-    "--sub-help", is_flag=True, help="print help command of the underlying jar and exit"
-)
+@click.option("--sub-help", is_flag=True, help="print help command of the underlying jar and exit")
 @click.option(
     "--conf-type",
     help="related to sub-help - no need to set unless you are not working with a conf",
 )
-@click.option(
-    "--online-args", help="Basic arguments that need to be supplied to all online modes"
-)
+@click.option("--online-args", help="Basic arguments that need to be supplied to all online modes")
 @click.option("--chronon-jar", help="Path to chronon OS jar")
 @click.option("--release-tag", help="Use the latest jar for a particular tag.")
-@click.option(
-    "--list-apps", help="command/script to list running jobs on the scheduler"
-)
+@click.option("--list-apps", help="command/script to list running jobs on the scheduler")
 @click.option(
     "--render-info",
     help="Path to script rendering additional information of the given config. "
     "Only applicable when mode is set to info",
 )
 @click.option("--kafka-bootstrap", help="Kafka bootstrap server in host:port format")
-@click.option("--latest-savepoint", is_flag=True, default=False, help="Deploys streaming job with latest savepoint")
+@click.option(
+    "--latest-savepoint",
+    is_flag=True,
+    default=False,
+    help="Deploys streaming job with latest savepoint",
+)
 @click.option("--custom-savepoint", help="Savepoint to deploy streaming job with.")
-@click.option("--no-savepoint", is_flag=True, default=False, help="Deploys streaming job without a savepoint")
-@click.option("--version-check", is_flag=True, default=False,
-              help="Checks if Zipline version of running streaming job is different from local version and deploys the job if they are different")
-@click.option("--flink-state-uri",
-              help="Bucket for storing flink state checkpoints/savepoints and other internal pieces for orchestration.",
-              callback=validate_flink_state)
-@click.option("--additional-jars",
-              help="Comma separated list of additional jar URIs to be included in the Flink job classpath (e.g. gs://bucket/jar1.jar,gs://bucket/jar2.jar).",
-              callback=validate_additional_jars)
 @click.option(
-    "--validate",
+    "--no-savepoint", is_flag=True, default=False, help="Deploys streaming job without a savepoint"
+)
+@click.option(
+    "--version-check",
     is_flag=True,
-    help="Validate the catalyst util Spark expression evaluation logic",
+    default=False,
+    help="Checks if Zipline version of running streaming job is different from local version and deploys the job if they are different",
 )
 @click.option(
-    "--validate-rows", default="10000", help="Number of rows to run the validation on"
+    "--flink-state-uri",
+    help="Bucket for storing flink state checkpoints/savepoints and other internal pieces for orchestration.",
+    callback=validate_flink_state,
+)
+@click.option(
+    "--additional-jars",
+    help="Comma separated list of additional jar URIs to be included in the Flink job classpath (e.g. gs://bucket/jar1.jar,gs://bucket/jar2.jar).",
+    callback=validate_additional_jars,
+)
+@click.option(
+    "--validate",
+    is_flag=True,
+    help="Validate the catalyst util Spark expression evaluation logic",
 )
+@click.option("--validate-rows", default="10000", help="Number of rows to run the validation on")
 @click.option("--join-part-name", help="Name of the join part to use for join-part-job")
 @click.option(
     "--artifact-prefix",
ai/chronon/repo/serializer.py CHANGED
@@ -50,18 +50,14 @@ class ThriftJSONDecoder(json.JSONDecoder):
                 (_, field_ttype, field_name, field_ttype_info, dummy) = field
                 if field_name not in val:
                     continue
-                converted_val = self._convert(
-                    val[field_name], field_ttype, field_ttype_info
-                )
+                converted_val = self._convert(val[field_name], field_ttype, field_ttype_info)
                 setattr(ret, field_name, converted_val)
         elif ttype == TType.LIST:
             (element_ttype, element_ttype_info, _) = ttype_info
             ret = [self._convert(x, element_ttype, element_ttype_info) for x in val]
         elif ttype == TType.SET:
             (element_ttype, element_ttype_info) = ttype_info
-            ret = set(
-                [self._convert(x, element_ttype, element_ttype_info) for x in val]
-            )
+            ret = set([self._convert(x, element_ttype, element_ttype_info) for x in val])
         elif ttype == TType.MAP:
             (key_ttype, key_ttype_info, val_ttype, val_ttype_info, _) = ttype_info
             ret = dict(
@@ -117,9 +113,7 @@ def thrift_json(obj):
 
 
 def thrift_simple_json(obj):
-    simple = TSerialization.serialize(
-        obj, protocol_factory=TSimpleJSONProtocolFactory()
-    )
+    simple = TSerialization.serialize(obj, protocol_factory=TSimpleJSONProtocolFactory())
     parsed = json.loads(simple)
     return json.dumps(parsed, indent=2, sort_keys=True)
 
@@ -131,9 +125,7 @@ def thrift_simple_json_protected(obj, obj_type) -> str:
     actual = thrift_simple_json(thrift_obj)
     differ = JsonDiffer()
     diff = differ.diff(serialized, actual)
-    assert (
-        len(diff) == 0
-    ), f"""Serialization can't be reversed
+    assert len(diff) == 0, f"""Serialization can't be reversed
     diff: \n{diff}
     original: \n{serialized}
     """