awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/query.py ADDED
@@ -0,0 +1,126 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from collections import OrderedDict
+ from typing import Dict, List
+
+ import ai.chronon.api.ttypes as api
+
+
+ def Query(
+     selects: Dict[str, str] = None,
+     wheres: List[str] = None,
+     start_partition: str = None,
+     end_partition: str = None,
+     time_column: str = None,
+     setups: List[str] = None,
+     mutation_time_column: str = None,
+     reversal_column: str = None,
+     partition_column: str = None,
+     partition_format: str = None,
+     sub_partitions_to_wait_for: List[str] = None,
+ ) -> api.Query:
+     """
+     Create a query object that is used to scan data from various data sources.
+     It contains partition ranges, row-level transformations and filtering logic.
+     Additionally, a time_column is required for TEMPORAL events, and mutation_time_column &
+     reversal_column for TEMPORAL entities.
+
+     :param selects: Spark SQL expressions with only arithmetic, function application & inline lambdas.
+         You can also apply UDFs; see the setups param below.::
+
+             Example: {
+                 "alias": "built_in_function(col1) * my_udf(col2)",
+                 "alias1": "aggregate(array_col, 0, (acc, x) -> acc + x)"
+             }
+
+         See: https://spark.apache.org/docs/latest/api/sql/#built-in-functions
+         When None, we assume that no transformations are needed and pick the columns necessary for aggregations.
+     :type selects: Dict[str, str], optional
+     :param wheres: Used for filtering. Same as above, but each expression must return a boolean.
+         Expressions are joined using AND.
+     :type wheres: List[str], optional
+     :param start_partition: The partition (inclusive) from which the source data is valid.
+         When absent, all available data is considered usable.
+     :type start_partition: str, optional
+     :param end_partition: The partition (inclusive) until which the source data is valid.
+         Leave this unset unless you know for a fact that the source expired after a particular partition
+         and another source should be used beyond it.
+     :type end_partition: str, optional
+     :param time_column: A single expression that produces time as **milliseconds since epoch**.
+     :type time_column: str, optional
+     :param setups: You can register UDFs using setups, e.g.
+         ["ADD JAR YOUR_JAR", "create temporary function YOUR_UDF_NAME as YOUR_CLASS"]
+     :type setups: List[str], optional
+     :param mutation_time_column: For entities with real-time accuracy, specify an expression that
+         represents mutation time, as milliseconds since epoch.
+         This is not necessary for event sources. Defaults to "mutation_ts".
+     :type mutation_time_column: str, optional
+     :param reversal_column: (defaults to "is_before")
+         For entities with real-time accuracy, updates are represented as a reversal plus an addition:
+         an update produces two rows, one with is_before = True (the old value) and one with is_before = False (the new value);
+         inserts have only is_before = False (just the new value);
+         deletes have only is_before = True (just the old value).
+         This is not necessary for event sources.
+     :type reversal_column: str, optional
+     :param partition_column:
+         Specify this to override spark.chronon.partition.column set in teams.py for this particular query.
+     :type partition_column: str, optional
+     :param sub_partitions_to_wait_for:
+         Additional partitions to be used in sensing that the source data has landed.
+         Should be a full partition string, such as `hr=23:00`.
+     :type sub_partitions_to_wait_for: List[str], optional
+     :param partition_format:
+         Date format string that the partition values are expected to be in.
+     :type partition_format: str, optional
+     :return: A Query object that Chronon can use to scan just the necessary data efficiently.
+     """
+     return api.Query(
+         selects=selects,
+         wheres=wheres,
+         startPartition=start_partition,
+         endPartition=end_partition,
+         timeColumn=time_column,
+         setups=setups,
+         mutationTimeColumn=mutation_time_column,
+         reversalColumn=reversal_column,
+         partitionColumn=partition_column,
+         subPartitionsToWaitFor=sub_partitions_to_wait_for,
+         partitionFormat=partition_format,
+     )
+
+
+ def selects(*args, **kwargs):
+     """
+     Create the dictionary required for the selects parameter of Query.
+
+     .. code-block:: python
+
+         selects(
+             "event_id",
+             user_id="user_id",
+         )
+
+     creates the following dictionary:
+
+     .. code-block:: python
+
+         {
+             "event_id": "event_id",
+             "user_id": "user_id"
+         }
+     """
+     result = OrderedDict()
+     for x in args:
+         result[x] = x
+     for k, v in kwargs.items():
+         result[k] = v
+     return result
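For orientation, here is a minimal usage sketch of the Query and selects helpers above. It is illustrative only and not part of the package diff; the column names, expressions, and partition value are made up.

from ai.chronon.query import Query, selects

query = Query(
    selects=selects(
        "user_id",                               # passthrough column
        amount="purchase_price * quantity",      # aliased Spark SQL expression
    ),
    wheres=["purchase_price > 0"],               # filters are AND-ed together
    time_column="ts",                            # milliseconds since epoch
    start_partition="2023-01-01",
)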
@@ -0,0 +1,40 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from ai.chronon.api.ttypes import GroupBy, Join, Model, StagingQuery
+ from ai.chronon.orchestration.ttypes import ConfType
+
+ JOIN_FOLDER_NAME = "joins"
+ GROUP_BY_FOLDER_NAME = "group_bys"
+ STAGING_QUERY_FOLDER_NAME = "staging_queries"
+ MODEL_FOLDER_NAME = "models"
+ # TODO - make team part of thrift API?
+ TEAMS_FILE_PATH = "teams.json"
+ OUTPUT_ROOT = "production"
+
+ # This is set in the main function -
+ # from command line or from env variable during invocation
+ FOLDER_NAME_TO_CLASS = {
+     GROUP_BY_FOLDER_NAME: GroupBy,
+     JOIN_FOLDER_NAME: Join,
+     STAGING_QUERY_FOLDER_NAME: StagingQuery,
+     MODEL_FOLDER_NAME: Model,
+ }
+
+ FOLDER_NAME_TO_CONF_TYPE = {
+     GROUP_BY_FOLDER_NAME: ConfType.GROUP_BY,
+     JOIN_FOLDER_NAME: ConfType.JOIN,
+     STAGING_QUERY_FOLDER_NAME: ConfType.STAGING_QUERY,
+     MODEL_FOLDER_NAME: ConfType.MODEL,
+ }
ai/chronon/repo/aws.py ADDED
@@ -0,0 +1,298 @@
+ import json
+ import multiprocessing
+ import os
+ from typing import List
+
+ import boto3
+
+ from ai.chronon.logger import get_logger
+ from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
+ from ai.chronon.repo.default_runner import Runner
+ from ai.chronon.repo.utils import (
+     JobType,
+     check_call,
+     extract_filename_from_path,
+     get_customer_id,
+     split_date_range,
+ )
+
+ LOG = get_logger()
+
+ # AWS SPECIFIC CONSTANTS
+ EMR_ENTRY = "ai.chronon.integrations.aws.EmrSubmitter"
+ ZIPLINE_AWS_JAR_DEFAULT = "cloud_aws_lib_deploy.jar"
+ ZIPLINE_AWS_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.aws.AwsApiImpl"
+ ZIPLINE_AWS_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
+ ZIPLINE_AWS_SERVICE_JAR = "service_assembly_deploy.jar"
+
+ LOCAL_FILE_TO_ETAG_JSON = f"{ZIPLINE_DIRECTORY}/local_file_to_etag.json"
+
+ EMR_MOUNT_FILE_PREFIX = "/mnt/zipline/"
+
+
+ class AwsRunner(Runner):
+     def __init__(self, args):
+         aws_jar_path = AwsRunner.download_zipline_aws_jar(
+             ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_JAR_DEFAULT
+         )
+         service_jar_path = AwsRunner.download_zipline_aws_jar(
+             ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_SERVICE_JAR
+         )
+         jar_path = (
+             f"{service_jar_path}:{aws_jar_path}" if args["mode"] == "fetch" else aws_jar_path
+         )
+         self.version = args.get("version", "latest")
+
+         super().__init__(args, os.path.expanduser(jar_path))
+
+     @staticmethod
+     def upload_s3_file(
+         bucket_name: str, source_file_name: str, destination_blob_name: str
+     ):
+         """Uploads a file to the bucket."""
+         obj = boto3.client("s3")
+         try:
+             obj.upload_file(source_file_name, bucket_name, destination_blob_name)
+             print(
+                 f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
+             )
+             return f"s3://{bucket_name}/{destination_blob_name}"
+         except Exception as e:
+             raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e
+
+     @staticmethod
+     def download_zipline_aws_jar(destination_dir: str, customer_id: str, version: str, jar_name: str):
+         s3_client = boto3.client("s3")
+         destination_path = f"{destination_dir}/{jar_name}"
+         source_key_name = f"release/{version}/jars/{jar_name}"
+         bucket_name = f"zipline-artifacts-{customer_id}"
+
+         are_identical = (
+             AwsRunner.compare_s3_and_local_file_hashes(
+                 bucket_name, source_key_name, destination_path
+             )
+             if os.path.exists(destination_path)
+             else False
+         )
+
+         if are_identical:
+             print(f"{destination_path} matches S3 {bucket_name}/{source_key_name}")
+         else:
+             print(
+                 f"{destination_path} does NOT match S3 {bucket_name}/{source_key_name}"
+             )
+             print(f"Downloading {jar_name} from S3...")
+
+             s3_client.download_file(
+                 Filename=destination_path, Bucket=bucket_name, Key=source_key_name
+             )
+             # Persist ETag to prevent downloading the same file next time
+             etag = AwsRunner.get_s3_file_hash(bucket_name, source_key_name)
+             if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "r") as file:
+                     data = json.load(file)
+
+                 # Add the new entry
+                 data[destination_path] = etag
+
+                 # Write the updated dictionary back to the file
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                     json.dump(data, file)
+             else:
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                     data = {destination_path: etag}
+                     json.dump(data, file)
+
+         return destination_path
+
+     @staticmethod
+     def get_s3_file_hash(bucket_name: str, file_name: str):
+         s3_client = boto3.client("s3")
+         response = s3_client.head_object(Bucket=bucket_name, Key=file_name)
+         return response["ETag"].strip('"')
+
+     @staticmethod
+     def get_local_file_hash(file_name: str):
+         # read in the json file
+         if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+             with open(LOCAL_FILE_TO_ETAG_JSON, "r") as f:
+                 data = json.load(f)
+                 if file_name in data:
+                     return data[file_name]
+         return None
+
+     @staticmethod
+     def compare_s3_and_local_file_hashes(
+         bucket_name: str, s3_file_path: str, local_file_path: str
+     ):
+         try:
+             s3_hash = AwsRunner.get_s3_file_hash(bucket_name, s3_file_path)
+             local_hash = AwsRunner.get_local_file_hash(local_file_path)
+             print(f"Local hash: {local_hash}, S3 hash: {s3_hash}")
+             return s3_hash == local_hash
+         except Exception as e:
+             print(f"Error comparing files: {str(e)}")
+             return False
+
+     def generate_emr_submitter_args(
+         self,
+         user_args: str,
+         job_type: JobType = JobType.SPARK,
+         local_files_to_upload: List[str] = None,
+     ):
+         customer_warehouse_bucket_name = f"zipline-warehouse-{get_customer_id()}"
+         s3_files = []
+         for source_file in local_files_to_upload:
+             # upload to `metadata` folder
+             destination_file_path = (
+                 f"metadata/{extract_filename_from_path(source_file)}"
+             )
+             s3_files.append(
+                 AwsRunner.upload_s3_file(
+                     customer_warehouse_bucket_name, source_file, destination_file_path
+                 )
+             )
+
+         # we also want the additional-confs included here. it should already be in the bucket
+
+         zipline_artifacts_bucket_prefix = "s3://zipline-artifacts"
+
+         s3_files.append(
+             f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}/confs/additional-confs.yaml"
+         )
+
+         s3_file_args = ",".join(s3_files)
+
+         # include jar uri. should also already be in the bucket
+         jar_uri = (
+             f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+             + f"/release/{self.version}/jars/{ZIPLINE_AWS_JAR_DEFAULT}"
+         )
+
+         final_args = "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class}"
+
+         if job_type == JobType.FLINK:
+             main_class = "ai.chronon.flink.FlinkJob"
+             flink_jar_uri = (
+                 f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+                 + f"/jars/{ZIPLINE_AWS_FLINK_JAR_DEFAULT}"
+             )
+             return (
+                 final_args.format(
+                     user_args=user_args,
+                     jar_uri=jar_uri,
+                     job_type=job_type.value,
+                     main_class=main_class,
+                 )
+                 + f" --flink-main-jar-uri={flink_jar_uri}"
+             )
+
+         elif job_type == JobType.SPARK:
+             main_class = "ai.chronon.spark.Driver"
+             return (
+                 final_args.format(
+                     user_args=user_args,
+                     jar_uri=jar_uri,
+                     job_type=job_type.value,
+                     main_class=main_class,
+                 )
+                 + f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml"
+                 f" --files={s3_file_args}"
+             )
+         else:
+             raise ValueError(f"Invalid job type: {job_type}")
+
+     def run(self):
+         command_list = []
+         if self.mode == "info":
+             command_list.append(
+                 "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                     script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                 )
+             )
+         elif self.sub_help or self.mode == "fetch":
+             entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+             command_list.append(
+                 "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                     jar=self.jar_path,
+                     entrypoint=entrypoint,
+                     args="--help" if self.sub_help else self._gen_final_args(),
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                 )
+             )
+         elif self.mode in ["streaming", "streaming-client"]:
+             raise ValueError("Streaming is not supported for AWS yet.")
+         else:
+             local_files_to_upload_to_aws = []
+             if self.conf:
+                 local_files_to_upload_to_aws.append(os.path.join(self.repo, self.conf))
+             if self.parallelism > 1:
+                 assert self.start_ds is not None and self.ds is not None, (
+                     "To use parallelism, please specify --start-ds and --end-ds to "
+                     "break down into multiple backfill jobs"
+                 )
+                 date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
+                 for start_ds, end_ds in date_ranges:
+                     user_args = "{subcommand} {args} {additional_args}".format(
+                         subcommand=ROUTES[self.conf_type][self.mode],
+                         args=self._gen_final_args(
+                             start_ds=start_ds,
+                             end_ds=end_ds,
+                             # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                             override_conf_path=(
+                                 EMR_MOUNT_FILE_PREFIX
+                                 + extract_filename_from_path(self.conf)
+                                 if self.conf
+                                 else None
+                             ),
+                         ),
+                         additional_args=os.environ.get(
+                             "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                         ),
+                     )
+
+                     emr_args = self.generate_emr_submitter_args(
+                         local_files_to_upload=local_files_to_upload_to_aws,
+                         # for now, self.conf is the only local file that requires uploading to s3
+                         user_args=user_args,
+                     )
+                     command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                     command_list.append(command)
+             else:
+                 user_args = ("{subcommand} {args} {additional_args}").format(
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                     args=self._gen_final_args(
+                         start_ds=self.start_ds,
+                         # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                         override_conf_path=(
+                             EMR_MOUNT_FILE_PREFIX
+                             + extract_filename_from_path(self.conf)
+                             if self.conf
+                             else None
+                         ),
+                     ),
+                     additional_args=os.environ.get(
+                         "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                     ),
+                 )
+
+                 emr_args = self.generate_emr_submitter_args(
+                     # for now, self.conf is the only local file that requires uploading to s3
+                     local_files_to_upload=local_files_to_upload_to_aws,
+                     user_args=user_args,
+                 )
+                 command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                 command_list.append(command)
+
+         if len(command_list) > 1:
+             # parallel backfill mode
+             with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                 LOG.info(
+                     "Running args list {} with pool size {}".format(
+                         command_list, self.parallelism
+                     )
+                 )
+                 pool.map(check_call, command_list)
+         elif len(command_list) == 1:
+             # TODO: add log tailing
+             check_call(command_list[0])
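As a rough sketch of how the jar caching above behaves (the bucket, key, and local path below are hypothetical), the runner compares the S3 ETag against the one recorded in local_file_to_etag.json and only re-downloads on a mismatch:

from ai.chronon.repo.aws import AwsRunner

bucket = "zipline-artifacts-acme"                       # hypothetical customer bucket
key = "release/0.2.0/jars/cloud_aws_lib_deploy.jar"     # hypothetical release key
local_jar = "/tmp/zipline/cloud_aws_lib_deploy.jar"     # hypothetical local copy

if AwsRunner.compare_s3_and_local_file_hashes(bucket, key, local_jar):
    print("ETags match; reusing the cached jar")
else:
    print("ETag mismatch; download_zipline_aws_jar would fetch a fresh copy")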
ai/chronon/repo/cluster.py ADDED
@@ -0,0 +1,65 @@
+ import json
+
+
+ def generate_dataproc_cluster_config(num_workers, project_id, artifact_prefix, master_host_type="n2-highmem-64",
+                                      worker_host_type="n2-highmem-16",
+                                      subnetwork="default", idle_timeout="7200s", initialization_actions=None, tags=None):
+     """
+     Create a configuration for a Dataproc cluster.
+     :return: A json string representing the configuration.
+     """
+     if initialization_actions is None:
+         initialization_actions = []
+     return json.dumps({
+         "gceClusterConfig": {
+             "subnetworkUri": subnetwork,
+             "serviceAccount": "dataproc@" + project_id + ".iam.gserviceaccount.com",
+             "serviceAccountScopes": [
+                 "https://www.googleapis.com/auth/cloud-platform",
+                 "https://www.googleapis.com/auth/cloud.useraccounts.readonly",
+                 "https://www.googleapis.com/auth/devstorage.read_write",
+                 "https://www.googleapis.com/auth/logging.write"
+             ],
+             "metadata": {
+                 "hive-version": "3.1.2",
+                 "SPARK_BQ_CONNECTOR_URL": "gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar",
+                 "artifact_prefix": artifact_prefix.rstrip("/"),
+             },
+             "tags": tags or []
+         },
+         "masterConfig": {
+             "numInstances": 1,
+             "machineTypeUri": master_host_type,
+             "diskConfig": {
+                 "bootDiskType": "pd-standard",
+                 "bootDiskSizeGb": 1024
+             }
+         },
+         "workerConfig": {
+             "numInstances": num_workers,
+             "machineTypeUri": worker_host_type,
+             "diskConfig": {
+                 "bootDiskType": "pd-standard",
+                 "bootDiskSizeGb": 64,
+                 "numLocalSsds": 2
+             }
+         },
+         "softwareConfig": {
+             "imageVersion": "2.2.50-debian12",
+             "optionalComponents": [
+                 "FLINK",
+                 "JUPYTER",
+             ],
+             "properties": {}
+         },
+         "initializationActions": [{"executable_file": initialization_action} for initialization_action in (
+             (initialization_actions or []) + [artifact_prefix.rstrip("/") + "/scripts/copy_java_security.sh"])],
+         "endpointConfig": {
+             "enableHttpPortAccess": True,
+         },
+         "lifecycleConfig": {
+             "idleDeleteTtl": idle_timeout,
+         }
+     })
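A brief usage sketch of the helper above, assuming it lives at ai/chronon/repo/cluster.py as the file list suggests; the project and artifact bucket names are hypothetical:

from ai.chronon.repo.cluster import generate_dataproc_cluster_config

# Returns a JSON string describing the Dataproc cluster.
config_json = generate_dataproc_cluster_config(
    num_workers=4,
    project_id="my-gcp-project",
    artifact_prefix="gs://my-zipline-artifacts/",
    idle_timeout="3600s",
)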
ai/chronon/repo/compile.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import sys
+
+ import click
+
+ from ai.chronon.cli.compile.compile_context import CompileContext
+ from ai.chronon.cli.compile.compiler import Compiler
+ from ai.chronon.cli.compile.display.console import console
+
+
+ @click.command(name="compile")
+ @click.option(
+     "--chronon-root",
+     envvar="CHRONON_ROOT",
+     help="Path to the root chronon folder",
+     default=os.getcwd(),
+ )
+ def compile(chronon_root):
+
+     print()
+
+     if chronon_root not in sys.path:
+         console.print(
+             f"Adding [cyan italic]{chronon_root}[/cyan italic] to python path, during compile."
+         )
+         sys.path.append(chronon_root)
+     else:
+         console.print(
+             f"[cyan italic]{chronon_root}[/cyan italic] already on python path."
+         )
+
+     return __compile(chronon_root)
+
+
+ def __compile(chronon_root):
+     if chronon_root:
+         chronon_root_path = os.path.expanduser(chronon_root)
+         os.chdir(chronon_root_path)
+
+     # check that a "teams.py" file exists in the current directory
+     if not (os.path.exists("teams.py") or os.path.exists("teams.json")):
+         raise click.ClickException(
+             (
+                 "teams.py or teams.json file not found in current directory."
+                 " Please run from the top level of conf directory."
+             )
+         )
+
+     compile_context = CompileContext()
+     compiler = Compiler(compile_context)
+     results = compiler.compile()
+     return results
+
+
+ if __name__ == "__main__":
+     compile()
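Because compile is a plain click command, it can also be driven from Python, for example in tests. A sketch, assuming the module path ai.chronon.repo.compile from the file list and a hypothetical conf directory:

from click.testing import CliRunner

from ai.chronon.repo.compile import compile

runner = CliRunner()
# Point --chronon-root at a conf repo whose top level contains teams.py (or teams.json).
result = runner.invoke(compile, ["--chronon-root", "/path/to/chronon/conf"])
print(result.output)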