awx-zipline-ai 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/query.py ADDED
@@ -0,0 +1,126 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from collections import OrderedDict
+ from typing import Dict, List
+
+ import gen_thrift.api.ttypes as api
+
+
+ def Query(
+     selects: Dict[str, str] = None,
+     wheres: List[str] = None,
+     start_partition: str = None,
+     end_partition: str = None,
+     time_column: str = None,
+     setups: List[str] = None,
+     mutation_time_column: str = None,
+     reversal_column: str = None,
+     partition_column: str = None,
+     partition_format: str = None,
+     sub_partitions_to_wait_for: List[str] = None,
+ ) -> api.Query:
+     """
+     Create a query object that is used to scan data from various data sources.
+     This contains partition ranges, row-level transformations and filtering logic.
+     Additionally, we require a time_column for TEMPORAL events, and mutation_time_column & reversal_column
+     for TEMPORAL entities.
+
+     :param selects: Spark SQL expressions with only arithmetic, function application & inline lambdas.
+         You can also apply UDFs; see the setups param below.::
+
+             Example: {
+                 "alias": "built_in_function(col1) * my_udf(col2)",
+                 "alias1": "aggregate(array_col, 0, (acc, x) -> acc + x)"
+             }
+
+         See: https://spark.apache.org/docs/latest/api/sql/#built-in-functions
+         When None, we assume that no transformations are needed and pick the columns necessary for aggregations.
+     :type selects: Dict[str, str], optional
+     :param wheres: Used for filtering. Same as above, but each expression must return boolean.
+         Expressions are joined using AND.
+     :type wheres: List[str], optional
+     :param start_partition: The first partition of the source for which the data is valid - inclusive.
+         When absent, we consider all available data usable.
+     :type start_partition: str, optional
+     :param end_partition: The last partition of the source for which the data is valid - inclusive.
+         Leave unspecified unless you know for a fact that a particular source has expired after a partition and you
+         should instead use another source after this partition.
+     :type end_partition: str, optional
+     :param time_column: A single expression to produce time as **milliseconds since epoch**.
+     :type time_column: str, optional
+     :param setups: You can register UDFs using setups:
+         ["ADD JAR YOUR_JAR", "create temporary function YOUR_UDF_NAME as YOUR_CLASS"]
+     :type setups: List[str], optional
+     :param mutation_time_column: For entities with real-time accuracy, you need to specify an expression that
+         represents mutation time. Time should be milliseconds since epoch.
+         This is not necessary for event sources; defaults to "mutation_ts".
+     :type mutation_time_column: str, optional
+     :param reversal_column: (defaults to "is_before")
+         For entities with real-time accuracy, updates are split into two rows: an addition & a reversal.
+         Updates have two rows - one with is_before = True (the old value) & one with is_before = False (the new value).
+         Inserts only have is_before = False (just the new value).
+         Deletes only have is_before = True (just the old value).
+         This is not necessary for event sources.
+     :type reversal_column: str, optional
+     :param partition_column:
+         Specify this to override spark.chronon.partition.column set in teams.py for this particular query.
+     :type partition_column: str, optional
+     :param sub_partitions_to_wait_for:
+         Additional partitions to be used in sensing that the source data has landed. Should be a full partition string, such as `hr=23:00`.
+     :type sub_partitions_to_wait_for: List[str], optional
+     :param partition_format:
+         Date format string that the partition values are expected to be in.
+     :type partition_format: str, optional
+     :return: A Query object that Chronon can use to scan just the necessary data efficiently.
+     """
+     return api.Query(
+         selects=selects,
+         wheres=wheres,
+         startPartition=start_partition,
+         endPartition=end_partition,
+         timeColumn=time_column,
+         setups=setups,
+         mutationTimeColumn=mutation_time_column,
+         reversalColumn=reversal_column,
+         partitionColumn=partition_column,
+         subPartitionsToWaitFor=sub_partitions_to_wait_for,
+         partitionFormat=partition_format,
+     )
+
+
+ def selects(*args, **kwargs):
+     """
+     Create a dictionary required for the selects parameter of Query.
+
+     .. code-block:: python
+         selects(
+             "event_id",
+             user_id="user_id",
+         )
+
+     creates the following dictionary:
+
+     .. code-block:: python
+         {
+             "event_id": "event_id",
+             "user_id": "user_id"
+         }
+     """
+     result = OrderedDict()
+     for x in args:
+         result[x] = x
+     for k, v in kwargs.items():
+         result[k] = v
+     return result
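
For orientation, a minimal sketch of how Query and selects are typically combined when declaring a source. The column names, expressions and partition value below are invented for illustration and are not taken from the package:

    from ai.chronon.query import Query, selects

    checkout_query = Query(
        selects=selects(
            "user_id",                          # passthrough column
            amount_usd="price * quantity",      # derived via a Spark SQL expression
        ),
        wheres=["status = 'COMPLETED'"],
        time_column="unix_millis(event_time)",  # must evaluate to milliseconds since epoch
        start_partition="2023-01-01",
    )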
ai/chronon/repo/__init__.py ADDED
@@ -0,0 +1,39 @@
+ # Copyright (C) 2023 The Chronon Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from gen_thrift.api.ttypes import ConfType, GroupBy, Join, Model, StagingQuery
+
+ JOIN_FOLDER_NAME = "joins"
+ GROUP_BY_FOLDER_NAME = "group_bys"
+ STAGING_QUERY_FOLDER_NAME = "staging_queries"
+ MODEL_FOLDER_NAME = "models"
+ # TODO - make team part of thrift API?
+ TEAMS_FILE_PATH = "teams.json"
+ OUTPUT_ROOT = "production"
+
+ # This is set in the main function -
+ # from command line or from env variable during invocation
+ FOLDER_NAME_TO_CLASS = {
+     GROUP_BY_FOLDER_NAME: GroupBy,
+     JOIN_FOLDER_NAME: Join,
+     STAGING_QUERY_FOLDER_NAME: StagingQuery,
+     MODEL_FOLDER_NAME: Model,
+ }
+
+ FOLDER_NAME_TO_CONF_TYPE = {
+     GROUP_BY_FOLDER_NAME: ConfType.GROUP_BY,
+     JOIN_FOLDER_NAME: ConfType.JOIN,
+     STAGING_QUERY_FOLDER_NAME: ConfType.STAGING_QUERY,
+     MODEL_FOLDER_NAME: ConfType.MODEL,
+ }
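
Assuming (per the file list) that these constants live in ai/chronon/repo/__init__.py, a hypothetical helper sketches how the folder-to-class mapping can resolve which Thrift type a conf path corresponds to; the example path is made up:

    from ai.chronon.repo import FOLDER_NAME_TO_CLASS, FOLDER_NAME_TO_CONF_TYPE

    def conf_types_for_path(path: str):
        # e.g. "group_bys/test/data.py" -> (GroupBy, ConfType.GROUP_BY)
        folder = path.split("/")[0]
        return FOLDER_NAME_TO_CLASS.get(folder), FOLDER_NAME_TO_CONF_TYPE.get(folder)

    print(conf_types_for_path("group_bys/test/data.py"))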
ai/chronon/repo/aws.py ADDED
@@ -0,0 +1,284 @@
+ import json
+ import multiprocessing
+ import os
+ from typing import List
+
+ import boto3
+
+ from ai.chronon.logger import get_logger
+ from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
+ from ai.chronon.repo.default_runner import Runner
+ from ai.chronon.repo.utils import (
+     JobType,
+     check_call,
+     extract_filename_from_path,
+     get_customer_id,
+     split_date_range,
+ )
+
+ LOG = get_logger()
+
+ # AWS SPECIFIC CONSTANTS
+ EMR_ENTRY = "ai.chronon.integrations.aws.EmrSubmitter"
+ ZIPLINE_AWS_JAR_DEFAULT = "cloud_aws_lib_deploy.jar"
+ ZIPLINE_AWS_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.aws.AwsApiImpl"
+ ZIPLINE_AWS_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
+ ZIPLINE_AWS_SERVICE_JAR = "service_assembly_deploy.jar"
+
+ LOCAL_FILE_TO_ETAG_JSON = f"{ZIPLINE_DIRECTORY}/local_file_to_etag.json"
+
+ EMR_MOUNT_FILE_PREFIX = "/mnt/zipline/"
+
+
+ class AwsRunner(Runner):
+     def __init__(self, args):
+         aws_jar_path = AwsRunner.download_zipline_aws_jar(
+             ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_JAR_DEFAULT
+         )
+         service_jar_path = AwsRunner.download_zipline_aws_jar(
+             ZIPLINE_DIRECTORY, get_customer_id(), args["version"], ZIPLINE_AWS_SERVICE_JAR
+         )
+         jar_path = f"{service_jar_path}:{aws_jar_path}" if args["mode"] == "fetch" else aws_jar_path
+         self.version = args.get("version", "latest")
+
+         super().__init__(args, os.path.expanduser(jar_path))
+
+     @staticmethod
+     def upload_s3_file(bucket_name: str, source_file_name: str, destination_blob_name: str):
+         """Uploads a file to the bucket."""
+         obj = boto3.client("s3")
+         try:
+             obj.upload_file(source_file_name, bucket_name, destination_blob_name)
+             print(
+                 f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
+             )
+             return f"s3://{bucket_name}/{destination_blob_name}"
+         except Exception as e:
+             raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e
+
+     @staticmethod
+     def download_zipline_aws_jar(
+         destination_dir: str, customer_id: str, version: str, jar_name: str
+     ):
+         s3_client = boto3.client("s3")
+         destination_path = f"{destination_dir}/{jar_name}"
+         source_key_name = f"release/{version}/jars/{jar_name}"
+         bucket_name = f"zipline-artifacts-{customer_id}"
+
+         are_identical = (
+             AwsRunner.compare_s3_and_local_file_hashes(
+                 bucket_name, source_key_name, destination_path
+             )
+             if os.path.exists(destination_path)
+             else False
+         )
+
+         if are_identical:
+             print(f"{destination_path} matches S3 {bucket_name}/{source_key_name}")
+         else:
+             print(f"{destination_path} does NOT match S3 {bucket_name}/{source_key_name}")
+             print(f"Downloading {jar_name} from S3...")
+
+             s3_client.download_file(
+                 Filename=destination_path, Bucket=bucket_name, Key=source_key_name
+             )
+             # Persist ETag to prevent downloading the same file next time
+             etag = AwsRunner.get_s3_file_hash(bucket_name, source_key_name)
+             if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "r") as file:
+                     data = json.load(file)
+
+                 # Add the new entry
+                 data[destination_path] = etag
+
+                 # Write the updated dictionary back to the file
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                     json.dump(data, file)
+             else:
+                 with open(LOCAL_FILE_TO_ETAG_JSON, "w") as file:
+                     data = {destination_path: etag}
+                     json.dump(data, file)
+
+         return destination_path
+
+     @staticmethod
+     def get_s3_file_hash(bucket_name: str, file_name: str):
+         s3_client = boto3.client("s3")
+         response = s3_client.head_object(Bucket=bucket_name, Key=file_name)
+         return response["ETag"].strip('"')
+
+     @staticmethod
+     def get_local_file_hash(file_name: str):
+         # read in the json file
+         if os.path.exists(LOCAL_FILE_TO_ETAG_JSON):
+             with open(LOCAL_FILE_TO_ETAG_JSON, "r") as f:
+                 data = json.load(f)
+                 if file_name in data:
+                     return data[file_name]
+         return None
+
+     @staticmethod
+     def compare_s3_and_local_file_hashes(bucket_name: str, s3_file_path: str, local_file_path: str):
+         try:
+             s3_hash = AwsRunner.get_s3_file_hash(bucket_name, s3_file_path)
+             local_hash = AwsRunner.get_local_file_hash(local_file_path)
+             print(f"Local hash: {local_hash}, S3 hash: {s3_hash}")
+             return s3_hash == local_hash
+         except Exception as e:
+             print(f"Error comparing files: {str(e)}")
+             return False
+
+     def generate_emr_submitter_args(
+         self,
+         user_args: str,
+         job_type: JobType = JobType.SPARK,
+         local_files_to_upload: List[str] = None,
+     ):
+         customer_warehouse_bucket_name = f"zipline-warehouse-{get_customer_id()}"
+         s3_files = []
+         for source_file in local_files_to_upload:
+             # upload to `metadata` folder
+             destination_file_path = f"metadata/{extract_filename_from_path(source_file)}"
+             s3_files.append(
+                 AwsRunner.upload_s3_file(
+                     customer_warehouse_bucket_name, source_file, destination_file_path
+                 )
+             )
+
+         # we also want the additional-confs included here. it should already be in the bucket
+
+         zipline_artifacts_bucket_prefix = "s3://zipline-artifacts"
+
+         s3_files.append(
+             f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}/confs/additional-confs.yaml"
+         )
+
+         s3_file_args = ",".join(s3_files)
+
+         # include jar uri. should also already be in the bucket
+         jar_uri = (
+             f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+             + f"/release/{self.version}/jars/{ZIPLINE_AWS_JAR_DEFAULT}"
+         )
+
+         final_args = (
+             "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class}"
+         )
+
+         if job_type == JobType.FLINK:
+             main_class = "ai.chronon.flink.FlinkJob"
+             flink_jar_uri = (
+                 f"{zipline_artifacts_bucket_prefix}-{get_customer_id()}"
+                 + f"/jars/{ZIPLINE_AWS_FLINK_JAR_DEFAULT}"
+             )
+             return (
+                 final_args.format(
+                     user_args=user_args,
+                     jar_uri=jar_uri,
+                     job_type=job_type.value,
+                     main_class=main_class,
+                 )
+                 + f" --flink-main-jar-uri={flink_jar_uri}"
+             )
+
+         elif job_type == JobType.SPARK:
+             main_class = "ai.chronon.spark.Driver"
+             return (
+                 final_args.format(
+                     user_args=user_args,
+                     jar_uri=jar_uri,
+                     job_type=job_type.value,
+                     main_class=main_class,
+                 )
+                 + f" --additional-conf-path={EMR_MOUNT_FILE_PREFIX}additional-confs.yaml"
+                 f" --files={s3_file_args}"
+             )
+         else:
+             raise ValueError(f"Invalid job type: {job_type}")
+
+     def run(self):
+         command_list = []
+         if self.mode == "info":
+             command_list.append(
+                 "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                     script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                 )
+             )
+         elif self.sub_help or self.mode == "fetch":
+             entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+             command_list.append(
+                 "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                     jar=self.jar_path,
+                     entrypoint=entrypoint,
+                     args="--help" if self.sub_help else self._gen_final_args(),
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                 )
+             )
+         elif self.mode in ["streaming", "streaming-client"]:
+             raise ValueError("Streaming is not supported for AWS yet.")
+         else:
+             local_files_to_upload_to_aws = []
+             if self.conf:
+                 local_files_to_upload_to_aws.append(os.path.join(self.repo, self.conf))
+             if self.parallelism > 1:
+                 assert self.start_ds is not None and self.ds is not None, (
+                     "To use parallelism, please specify --start-ds and --end-ds to "
+                     "break down into multiple backfill jobs"
+                 )
+                 date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
+                 for start_ds, end_ds in date_ranges:
+                     user_args = "{subcommand} {args} {additional_args}".format(
+                         subcommand=ROUTES[self.conf_type][self.mode],
+                         args=self._gen_final_args(
+                             start_ds=start_ds,
+                             end_ds=end_ds,
+                             # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                             override_conf_path=(
+                                 EMR_MOUNT_FILE_PREFIX + extract_filename_from_path(self.conf)
+                                 if self.conf
+                                 else None
+                             ),
+                         ),
+                         additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                     )
+
+                     emr_args = self.generate_emr_submitter_args(
+                         local_files_to_upload=local_files_to_upload_to_aws,
+                         # for now, self.conf is the only local file that requires uploading to s3
+                         user_args=user_args,
+                     )
+                     command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                     command_list.append(command)
+             else:
+                 user_args = ("{subcommand} {args} {additional_args}").format(
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                     args=self._gen_final_args(
+                         start_ds=self.start_ds,
+                         # when we download files from s3 to emr, they'll be mounted at /mnt/zipline
+                         override_conf_path=(
+                             EMR_MOUNT_FILE_PREFIX + extract_filename_from_path(self.conf)
+                             if self.conf
+                             else None
+                         ),
+                     ),
+                     additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                 )
+
+                 emr_args = self.generate_emr_submitter_args(
+                     # for now, self.conf is the only local file that requires uploading
+                     local_files_to_upload=local_files_to_upload_to_aws,
+                     user_args=user_args,
+                 )
+                 command = f"java -cp {self.jar_path} {EMR_ENTRY} {emr_args}"
+                 command_list.append(command)
+
+         if len(command_list) > 1:
+             # parallel backfill mode
+             with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                 LOG.info(
+                     "Running args list {} with pool size {}".format(command_list, self.parallelism)
+                 )
+                 pool.map(check_call, command_list)
+         elif len(command_list) == 1:
+             # TODO: add log tailing
+             check_call(command_list[0])
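
The jar-caching logic in download_zipline_aws_jar reduces to: compare the S3 object's ETag against a locally recorded one, and re-download only on a mismatch. A condensed sketch of that pattern follows; the bucket, key and cache paths are placeholders (the real values are derived from get_customer_id(), the version argument and ZIPLINE_DIRECTORY):

    import json
    import os

    import boto3

    bucket = "zipline-artifacts-examplecustomer"           # placeholder
    key = "release/0.0.32/jars/cloud_aws_lib_deploy.jar"   # placeholder
    local_path = "/tmp/zipline/cloud_aws_lib_deploy.jar"   # placeholder for ZIPLINE_DIRECTORY
    etag_cache = "/tmp/zipline/local_file_to_etag.json"    # placeholder for LOCAL_FILE_TO_ETAG_JSON

    s3 = boto3.client("s3")
    remote_etag = s3.head_object(Bucket=bucket, Key=key)["ETag"].strip('"')

    cached = {}
    if os.path.exists(etag_cache):
        with open(etag_cache) as f:
            cached = json.load(f)

    if not os.path.exists(local_path) or cached.get(local_path) != remote_etag:
        # stale or missing: download again and record the new ETag
        s3.download_file(Bucket=bucket, Key=key, Filename=local_path)
        cached[local_path] = remote_etag
        with open(etag_cache, "w") as f:
            json.dump(cached, f)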
ai/chronon/repo/cluster.py ADDED
@@ -0,0 +1,136 @@
+ import json
+
+
+ def generate_dataproc_cluster_config(
+     num_workers,
+     project_id,
+     artifact_prefix,
+     master_host_type="n2-highmem-64",
+     worker_host_type="n2-highmem-16",
+     subnetwork="default",
+     idle_timeout="7200s",
+     initialization_actions=None,
+     tags=None,
+ ):
+     """
+     Create a configuration for a Dataproc cluster.
+     :return: A json string representing the configuration.
+     """
+     if initialization_actions is None:
+         initialization_actions = []
+     return json.dumps(
+         {
+             "gceClusterConfig": {
+                 "subnetworkUri": subnetwork,
+                 "serviceAccount": "dataproc@" + project_id + ".iam.gserviceaccount.com",
+                 "serviceAccountScopes": [
+                     "https://www.googleapis.com/auth/cloud-platform",
+                     "https://www.googleapis.com/auth/monitoring",
+                     "https://www.googleapis.com/auth/cloud.useraccounts.readonly",
+                     "https://www.googleapis.com/auth/devstorage.read_write",
+                     "https://www.googleapis.com/auth/logging.write",
+                 ],
+                 "metadata": {
+                     "hive-version": "3.1.2",
+                     "SPARK_BQ_CONNECTOR_URL": "gs://spark-lib/bigquery/spark-3.5-bigquery-0.42.1.jar",
+                     "artifact_prefix": artifact_prefix.rstrip("/"),
+                 },
+                 "tags": tags or [],
+             },
+             "masterConfig": {
+                 "numInstances": 1,
+                 "machineTypeUri": master_host_type,
+                 "diskConfig": {"bootDiskType": "pd-standard", "bootDiskSizeGb": 1024},
+             },
+             "workerConfig": {
+                 "numInstances": num_workers,
+                 "machineTypeUri": worker_host_type,
+                 "diskConfig": {
+                     "bootDiskType": "pd-standard",
+                     "bootDiskSizeGb": 64,
+                     "numLocalSsds": 2,
+                 },
+             },
+             "softwareConfig": {
+                 "imageVersion": "2.2.66-debian12",
+                 "optionalComponents": [
+                     "FLINK",
+                     "JUPYTER",
+                 ],
+                 "properties": {
+                     "dataproc:dataproc.logging.stackdriver.enable": "true",
+                     "dataproc:jobs.file-backed-output.enable": "true",
+                     "dataproc:dataproc.logging.stackdriver.job.driver.enable": "true",
+                     "dataproc:dataproc.logging.stackdriver.job.yarn.container.enable": "true",
+                 },
+             },
+             "initializationActions": [
+                 {"executable_file": initialization_action}
+                 for initialization_action in (
+                     (initialization_actions or [])
+                     + [artifact_prefix.rstrip("/") + "/scripts/copy_java_security.sh"]
+                 )
+             ],
+             "endpointConfig": {
+                 "enableHttpPortAccess": True,
+             },
+             "lifecycleConfig": {
+                 "idleDeleteTtl": idle_timeout,
+             },
+         }
+     )
+
+
+ def fixed_cluster(
+     size,
+     project_id,
+     artifact_prefix,
+     subnetwork="default",
+     initialization_actions=None,
+     tags=None,
+ ):
+     """
+     Create a Dataproc cluster configuration based on t-shirt sizes.
+
+     :param size: T-shirt size - 'small', 'medium', or 'large'
+     :param project_id: GCP project ID
+     :param artifact_prefix: Artifact prefix for initialization scripts
+     :param subnetwork: Subnetwork for the cluster
+     :param initialization_actions: List of initialization actions
+     :param tags: List of tags for the cluster
+     :return: A json string representing the cluster configuration
+     """
+     size_configs = {
+         "small": {
+             "num_workers": 20,
+             "worker_host_type": "n2-highmem-4",  # 4 vCPUs, 32 GB
+             "master_host_type": "n2-highmem-4",  # Same as worker for consistency
+         },
+         "medium": {
+             "num_workers": 50,
+             "worker_host_type": "n2-highmem-16",  # 16 vCPUs, 128 GB
+             "master_host_type": "n2-highmem-16",  # Same as worker for consistency
+         },
+         "large": {
+             "num_workers": 250,
+             "worker_host_type": "n2-highmem-16",  # 16 vCPUs, 128 GB
+             "master_host_type": "n2-highmem-16",  # Same as worker for consistency
+         },
+     }
+
+     if size not in size_configs:
+         raise ValueError(f"Invalid size '{size}'. Must be one of: {list(size_configs.keys())}")
+
+     config = size_configs[size]
+
+     return generate_dataproc_cluster_config(
+         num_workers=config["num_workers"],
+         project_id=project_id,
+         artifact_prefix=artifact_prefix,
+         master_host_type=config["master_host_type"],
+         worker_host_type=config["worker_host_type"],
+         subnetwork=subnetwork,
+         idle_timeout="3600s",  # 1 hour of inactivity
+         initialization_actions=initialization_actions,
+         tags=tags,
+     )
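
Assuming, per the file list, that this module is ai/chronon/repo/cluster.py, a sketch of how these helpers might be invoked; the project ID and artifact bucket are placeholders:

    from ai.chronon.repo.cluster import fixed_cluster, generate_dataproc_cluster_config

    # T-shirt-sized cluster spec (returns a JSON string).
    small_cluster = fixed_cluster(
        size="small",
        project_id="my-gcp-project",
        artifact_prefix="gs://zipline-artifacts-example",
    )

    # Fully custom sizing via the lower-level helper.
    custom_cluster = generate_dataproc_cluster_config(
        num_workers=10,
        project_id="my-gcp-project",
        artifact_prefix="gs://zipline-artifacts-example",
        worker_host_type="n2-highmem-8",
        idle_timeout="3600s",
    )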
ai/chronon/repo/compile.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import sys
+
+ import click
+
+ from ai.chronon.cli.compile.compile_context import CompileContext
+ from ai.chronon.cli.compile.compiler import Compiler
+ from ai.chronon.cli.compile.display.console import console
+
+
+ @click.command(name="compile")
+ @click.option(
+     "--chronon-root",
+     envvar="CHRONON_ROOT",
+     help="Path to the root chronon folder",
+     default=os.getcwd(),
+ )
+ @click.option(
+     "--ignore-python-errors",
+     is_flag=True,
+     default=False,
+     help="Allow compilation to proceed even with Python errors (useful for testing)",
+ )
+ def compile(chronon_root, ignore_python_errors):
+     print()
+
+     if chronon_root is None or chronon_root == "":
+         chronon_root = os.getcwd()
+
+     if chronon_root not in sys.path:
+         console.print(
+             f"Adding [cyan italic]{chronon_root}[/cyan italic] to python path, during compile."
+         )
+         sys.path.append(chronon_root)
+     else:
+         console.print(f"[cyan italic]{chronon_root}[/cyan italic] already on python path.")
+
+     return __compile(chronon_root, ignore_python_errors)
+
+
+ def __compile(chronon_root, ignore_python_errors=False):
+     if chronon_root:
+         chronon_root_path = os.path.expanduser(chronon_root)
+         os.chdir(chronon_root_path)
+
+     # check that a "teams.py" file exists in the current directory
+     if not (os.path.exists("teams.py") or os.path.exists("teams.json")):
+         raise click.ClickException(
+             (
+                 "teams.py or teams.json file not found in current directory."
+                 " Please run from the top level of conf directory."
+             )
+         )
+
+     compile_context = CompileContext(ignore_python_errors=ignore_python_errors)
+     compiler = Compiler(compile_context)
+     results = compiler.compile()
+     return results
+
+
+ if __name__ == "__main__":
+     compile()
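
Assuming, per the file list, that this command lives in ai/chronon/repo/compile.py, it can be exercised directly with click's test runner. The conf directory path below is a placeholder and must contain teams.py or teams.json at its root:

    from click.testing import CliRunner

    from ai.chronon.repo.compile import compile

    # Invoke the click command as the console script would.
    result = CliRunner().invoke(compile, ["--chronon-root", "/path/to/chronon/confs"])
    print(result.output)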