awx-zipline-ai 0.2.0 (awx_zipline_ai-0.2.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/repo/hub_runner.py
@@ -0,0 +1,171 @@
import json
import os
from datetime import datetime
from urllib.parse import quote_plus

import click
from attr import dataclass

from ai.chronon.cli.git_utils import get_current_branch
from ai.chronon.repo import hub_uploader, utils
from ai.chronon.repo.constants import RunMode
from ai.chronon.repo.zipline_hub import ZiplineHub


@click.group()
def hub():
    pass


# Common click options
def common_options(func):
    func = click.option("--repo", help="Path to chronon repo", default=".")(func)
    func = click.option("--conf", required=True, help="Conf param - required for every mode")(func)
    return func


def ds_option(func):
    return click.option("--ds", help="the end partition to backfill the data")(func)


def start_ds_option(func):
    return click.option(
        "--start-ds",
        help="Override the original start partition for a range backfill. "
        "Only supported for staging query, group-by backfill, and join jobs. "
        "It can leave holes in your final output table due to the overridden date range.",
    )(func)


def end_ds_option(func):
    return click.option("--end-ds", help="the end ds for a range backfill")(func)


def submit_workflow(repo, conf, mode, start_ds, end_ds):
    hub_conf = get_hub_conf(conf)
    zipline_hub = ZiplineHub(base_url=hub_conf.hub_url)
    conf_name_to_hash_dict = hub_uploader.build_local_repo_hashmap(root_dir=repo)
    branch = get_current_branch()

    hub_uploader.compute_and_upload_diffs(branch, zipline_hub=zipline_hub, local_repo_confs=conf_name_to_hash_dict)

    # Get the conf name from its metadata.
    conf_name = utils.get_metadata_name_from_conf(repo, conf)

    response_json = zipline_hub.call_workflow_start_api(
        conf_name=conf_name,
        mode=mode,
        branch=branch,  # the current branch
        user=os.environ.get("USER"),
        start=start_ds,
        end=end_ds,
        conf_hash=conf_name_to_hash_dict[conf_name].hash,
    )

    print(" 🆔 Workflow Id:", response_json.get("workflowId", "N/A"))
    print_wf_url(
        conf=conf,
        conf_name=conf_name,
        mode=mode,  # was hardcoded to RunMode.BACKFILL.value, which produced wrong URLs for deploy
        start_ds=start_ds,
        end_ds=end_ds,
        branch=branch,
    )


# zipline hub backfill --conf=compiled/joins/join
# adhoc backfills
@hub.command()
@common_options
@start_ds_option
@end_ds_option
def backfill(repo, conf, start_ds, end_ds):
    """
    - Submit a backfill job to Zipline. The response should contain a list of
      confs that differ from what's on the remote.
    - Call the upload API to upload the contents of the confs that differed.
    - Call the actual run API with mode set to backfill.
    """
    submit_workflow(repo, conf, RunMode.BACKFILL.value, start_ds, end_ds)


# zipline hub deploy --conf=compiled/joins/join
# currently only supports one-off deploy node submission
@hub.command()
@common_options
@end_ds_option
def deploy(repo, conf, end_ds):
    """
    - Submit a one-off deploy job to Zipline. The response should contain a list
      of confs that differ from what's on the remote.
    - Call the upload API to upload the contents of the confs that differed.
    - Call the actual run API with mode set to deploy.
    """
    submit_workflow(repo, conf, RunMode.DEPLOY.value, end_ds, end_ds)


def get_common_env_map(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    common_env_map = data["metaData"]["executionInfo"]["env"]["common"]
    return common_env_map


@dataclass
class HubConfig:
    hub_url: str
    frontend_url: str


def get_hub_conf(conf_path):
    common_env_map = get_common_env_map(conf_path)
    hub_url = common_env_map.get("HUB_URL", os.environ.get("HUB_URL"))
    frontend_url = common_env_map.get("FRONTEND_URL", os.environ.get("FRONTEND_URL"))
    return HubConfig(hub_url=hub_url, frontend_url=frontend_url)


def print_wf_url(conf, conf_name, mode, start_ds, end_ds, branch):
    hub_conf = get_hub_conf(conf)
    frontend_url = hub_conf.frontend_url

    if "compiled/joins" in conf:
        hub_conf_type = "joins"
    elif "compiled/staging_queries" in conf:
        hub_conf_type = "stagingqueries"
    elif "compiled/group_by" in conf:
        hub_conf_type = "groupby"
    elif "compiled/models" in conf:
        hub_conf_type = "models"
    else:
        raise ValueError(f"Unsupported conf type: {conf}")

    # TODO: the frontend uses local time to create the millis; we should make it
    # use UTC and align this helper with it.
    def _millis(date_str):
        return int(datetime.strptime(date_str, "%Y-%m-%d").timestamp() * 1000)

    def _mode_string(mode):
        if mode == "backfill":
            return "offline"
        elif mode == "deploy":
            return "online"
        else:
            raise ValueError(f"Unsupported mode: {mode}")

    workflow_url = (
        f"{frontend_url.rstrip('/')}/{hub_conf_type}/{conf_name}/{_mode_string(mode)}"
        f"?start={_millis(start_ds)}&end={_millis(end_ds)}&branch={quote_plus(branch)}"
    )

    print(" 🔗 Workflow : " + workflow_url + "\n")


if __name__ == "__main__":
    hub()
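Editor's note: the hub and frontend endpoints are resolved from the compiled conf itself, with environment variables as fallback. Below is a minimal sketch, not part of the package, of the JSON layout that get_common_env_map() expects; all names and URLs are hypothetical.

import json
import tempfile

# Hypothetical compiled conf: HUB_URL / FRONTEND_URL live under
# metaData.executionInfo.env.common; os.environ is the fallback.
sample_conf = {
    "metaData": {
        "name": "test.data.demo_join",  # hypothetical conf name
        "executionInfo": {
            "env": {
                "common": {
                    "HUB_URL": "https://hub.example.com",
                    "FRONTEND_URL": "https://ui.example.com",
                }
            }
        },
    }
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(sample_conf, f)

# get_hub_conf(f.name) would then yield
# HubConfig(hub_url="https://hub.example.com", frontend_url="https://ui.example.com"),
# and a backfill on branch "main" would print a URL shaped like
# https://ui.example.com/joins/test.data.demo_join/offline?start=...&end=...&branch=main

Note that _millis() goes through datetime.timestamp(), so the start/end millis depend on the local timezone; the TODO in print_wf_url tracks aligning this with the frontend.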
ai/chronon/repo/hub_uploader.py
@@ -0,0 +1,108 @@
import glob
import hashlib
import json
import os

from ai.chronon.orchestration.ttypes import Conf
from ai.chronon.repo import (
    FOLDER_NAME_TO_CLASS,
    FOLDER_NAME_TO_CONF_TYPE,
)
from ai.chronon.repo.zipline_hub import ZiplineHub


def build_local_repo_hashmap(root_dir: str):
    """Returns a map of conf name -> Conf(name, hash, contents, confType)."""
    compiled_dir = os.path.join(root_dir, "compiled")
    results = {}
    exceptions = []

    # Iterate through each object type folder (staging_queries, group_bys, joins, etc.)
    for folder_name in FOLDER_NAME_TO_CLASS:
        folder_path = os.path.join(compiled_dir, folder_name)
        if not os.path.exists(folder_path):
            continue

        # Find all JSON files recursively in this folder
        json_files = [
            f
            for f in glob.glob(os.path.join(folder_path, "**/*"), recursive=True)
            if os.path.isfile(f)
        ]

        for json_file in json_files:
            try:
                # Read the JSON file
                with open(json_file, "r") as f:
                    thrift_json = f.read()

                # Extract the name from the metadata in the JSON
                json_obj = json.loads(thrift_json)
                name = json_obj["metaData"]["name"]

                # Hash the raw JSON text; the hub diffs on this value
                md5_hash = hashlib.md5(thrift_json.encode()).hexdigest()
                results[name] = Conf(
                    name=name,
                    hash=md5_hash,
                    contents=thrift_json,
                    confType=FOLDER_NAME_TO_CONF_TYPE[folder_name],
                )

            except Exception as e:
                exceptions.append(f"{json_file} - {e}")

    if exceptions:
        error_msg = (
            "The following files had exceptions during upload: \n"
            + "\n".join(exceptions)
            + "\n\n Consider deleting the files (safe operation) and checking "
            + "your thrift version before rerunning your command."
        )
        raise RuntimeError(error_msg)

    return results


def compute_and_upload_diffs(
    branch: str, zipline_hub: ZiplineHub, local_repo_confs: dict[str, Conf]
):
    # Send the name -> hash map to ZiplineHub's diff API to determine which
    # confs differ from what the hub already has.
    names_to_hashes = {name: local_conf.hash for name, local_conf in local_repo_confs.items()}
    print(f"\n 🧮 Computed hashes for {len(names_to_hashes)} local files.")

    changed_conf_names = zipline_hub.call_diff_api(names_to_hashes)

    if not changed_conf_names:
        print(f" ✅ Remote contains all local files. No need to upload '{branch}'.")
        diffed_confs = {}
    else:
        unchanged = len(names_to_hashes) - len(changed_conf_names)
        print(
            f" 🔍 Detected {len(changed_conf_names)} changes on local branch '{branch}'. {unchanged} unchanged."
        )

        # The confs whose hashes differ on this branch
        diffed_confs = {k: local_repo_confs[k] for k in changed_conf_names}
        conf_names_str = "\n - ".join(diffed_confs.keys())
        print(f" - {conf_names_str}")

        diff_confs = [conf.__dict__ for conf in diffed_confs.values()]

        # Make a PUT request to ZiplineHub with the changed conf contents
        zipline_hub.call_upload_api(branch=branch, diff_confs=diff_confs)
        print(f" ⬆️ Uploaded {len(diffed_confs)} changed confs to branch '{branch}'.")

    zipline_hub.call_sync_api(branch=branch, names_to_hashes=names_to_hashes)

    print(f" ✅ {len(names_to_hashes)} hashes updated on branch '{branch}'.\n")
    return diffed_confs
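Editor's note: taken together, the upload flow is hash every compiled conf locally, ask the hub which hashes differ, upload only those, then sync the full hash map. A hedged sketch of driving it directly (the URL, branch, and repo path are hypothetical):

from ai.chronon.repo import hub_uploader
from ai.chronon.repo.zipline_hub import ZiplineHub

hub = ZiplineHub(base_url="https://hub.example.com")  # hypothetical hub URL

# Scans ./compiled/** and hashes each conf JSON by name.
local_confs = hub_uploader.build_local_repo_hashmap(root_dir=".")

# Uploads only the confs whose hashes the hub doesn't already have.
changed = hub_uploader.compute_and_upload_diffs(
    branch="my-feature-branch",
    zipline_hub=hub,
    local_repo_confs=local_confs,
)
print(f"uploaded: {sorted(changed)}")

Because the MD5 is computed over the raw JSON text rather than a canonical form, even a whitespace-only change to a compiled conf re-uploads it.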
ai/chronon/repo/init.py
@@ -0,0 +1,53 @@
#!/usr/bin/env python

import os
import shutil

import click
from importlib_resources import files
from rich.prompt import Prompt
from rich.syntax import Syntax

from ai.chronon.cli.compile.display.console import console


@click.command(name="init")
@click.option(
    "--cloud-provider",
    envvar="CLOUD_PROVIDER",
    help="Cloud provider to use.",
    required=True,
    type=click.Choice(["aws", "gcp"], case_sensitive=False),
)
@click.option(
    "--chronon-root",
    help="Path to the root chronon folder.",
    default=os.path.join(os.getcwd(), "zipline"),
    type=click.Path(file_okay=False, writable=True),
)
@click.pass_context
def main(ctx, chronon_root, cloud_provider):
    template_path = files("ai.chronon").joinpath("resources", cloud_provider.lower())
    target_path = os.path.abspath(chronon_root)

    if os.path.exists(target_path) and os.listdir(target_path):
        choice = Prompt.ask(
            f"[bold yellow] Warning: [/]{target_path} is not empty. Proceed?",
            choices=["y", "n"],
            default="y",
        )
        if choice == "n":
            return

    console.print(f"Generating scaffolding at {target_path} ...")

    try:
        shutil.copytree(template_path, target_path, dirs_exist_ok=True)
        console.print("[bold green] Project scaffolding created successfully! 🎉\n")
        export_cmd = Syntax(
            f"`export PYTHONPATH={target_path}:$PYTHONPATH`",
            "bash",
            theme="github-dark",
            line_numbers=False,
        )
        console.print("Please copy the following command to your shell config:")
        console.print(export_cmd)
    except Exception:
        console.print_exception()


if __name__ == "__main__":
    main()
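Editor's note: the init command can be exercised in-process with click's test runner rather than from a shell. A minimal sketch (the target path is hypothetical):

from click.testing import CliRunner

from ai.chronon.repo.init import main

runner = CliRunner()
# Copies the bundled ai/chronon/resources/gcp templates into the target
# directory (hypothetical path) and prints the PYTHONPATH export hint.
result = runner.invoke(main, ["--cloud-provider", "gcp", "--chronon-root", "/tmp/zipline-demo"])
print(result.output)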
ai/chronon/repo/join_backfill.py
@@ -0,0 +1,105 @@
import json
import os
from typing import Optional

from ai.chronon.scheduler.interfaces.flow import Flow
from ai.chronon.scheduler.interfaces.node import Node
from ai.chronon.utils import (
    convert_json_to_obj,
    dict_to_bash_commands,
    dict_to_exports,
    get_join_output_table_name,
    join_part_name,
    sanitize,
)

TASK_PREFIX = "compute_join"
DEFAULT_SPARK_SETTINGS = {
    "default": {
        "spark_version": "3.1.1",
        "executor_memory": "4G",
        "driver_memory": "4G",
        "executor_cores": 2,
    }
}


class JoinBackfill:
    def __init__(
        self,
        start_date: str,
        end_date: str,
        config_path: str,
        settings: dict = DEFAULT_SPARK_SETTINGS,
    ):
        self.dag_id = "_".join(
            map(
                sanitize,
                ["chronon_joins_backfill", os.path.basename(config_path), start_date, end_date],
            )
        )
        self.start_date = start_date
        self.end_date = end_date
        self.config_path = config_path
        self.settings = settings
        with open(self.config_path, "r") as file:
            config = file.read()
        self.join = convert_json_to_obj(json.loads(config))

    def build_flow(self) -> Flow:
        """
        Build a flow from the Join object. Each join part is a node, and all join
        parts run in parallel once the left table is computed. The final join is a
        node that depends on all join parts and runs after they are done.

        :return: A Flow object that represents the flow of the Join
        """
        flow = Flow(self.join.metaData.name)
        final_node = Node(
            f"{TASK_PREFIX}__{sanitize(get_join_output_table_name(self.join, full_name=True))}",
            self.run_final_join(),
        )
        left_node = Node(f"{TASK_PREFIX}__left_table", self.run_left_table())
        flow.add_node(final_node)
        flow.add_node(left_node)
        for join_part in self.join.joinParts:
            jp_full_name = join_part_name(join_part)
            jp_node = Node(f"{TASK_PREFIX}__{jp_full_name}", self.run_join_part(jp_full_name))
            flow.add_node(jp_node)
            jp_node.add_dependency(left_node)
            final_node.add_dependency(jp_node)
        return flow

    def export_template(self, settings: dict):
        return dict_to_exports(settings)

    def command_template(self, extra_args: dict):
        if self.start_date:
            extra_args.update({"start_ds": self.start_date})
        return f"""python3 /tmp/run.py --conf=/tmp/{self.config_path} --ds={self.end_date} \
{dict_to_bash_commands(extra_args)}"""

    def run_join_part(self, join_part: str):
        args = {
            "mode": "backfill",
            "selected_join_parts": join_part,
            "use_cached_left": None,
        }
        settings = self.settings.get(join_part, self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args=args)

    def run_left_table(self):
        settings = self.settings.get("left_table", self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-left"})

    def run_final_join(self):
        settings = self.settings.get("final_join", self.settings["default"])
        return self.export_template(settings) + " && " + self.command_template(extra_args={"mode": "backfill-final"})

    def run(self, orchestrator: str, overrides: Optional[dict] = None):
        from ai.chronon.constants import ADAPTERS

        if overrides:  # guard: the default is None, and dict.update(None) raises
            ADAPTERS.update(overrides)
        orchestrator = ADAPTERS[orchestrator](dag_id=self.dag_id, start_date=self.start_date)
        orchestrator.setup()
        orchestrator.build_dag_from_flow(self.build_flow())
        orchestrator.trigger_run()
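Editor's note: a hedged sketch of wiring a backfill DAG from a compiled join conf. The paths are hypothetical, and "airflow" assumes such an adapter key is registered in ai.chronon.constants.ADAPTERS.

from ai.chronon.repo.join_backfill import JoinBackfill

backfill = JoinBackfill(
    start_date="2024-01-01",
    end_date="2024-01-31",
    config_path="compiled/joins/test/demo_join",  # hypothetical compiled join conf
)

# Each node wraps one shell command (exports + run.py invocation): the left
# table runs first, the join parts fan out in parallel, and the final join
# fans back in.
flow = backfill.build_flow()

# Hand the flow to a registered orchestrator adapter, which builds and
# triggers the DAG.
backfill.run(orchestrator="airflow")

Per-step Spark settings come from the settings dict keyed by "left_table", "final_join", or the join part name, falling back to "default".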