awx-zipline-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. agent/__init__.py +1 -0
  2. agent/constants.py +15 -0
  3. agent/ttypes.py +1684 -0
  4. ai/__init__.py +0 -0
  5. ai/chronon/__init__.py +0 -0
  6. ai/chronon/airflow_helpers.py +251 -0
  7. ai/chronon/api/__init__.py +1 -0
  8. ai/chronon/api/common/__init__.py +1 -0
  9. ai/chronon/api/common/constants.py +15 -0
  10. ai/chronon/api/common/ttypes.py +1844 -0
  11. ai/chronon/api/constants.py +15 -0
  12. ai/chronon/api/ttypes.py +3624 -0
  13. ai/chronon/cli/compile/column_hashing.py +313 -0
  14. ai/chronon/cli/compile/compile_context.py +177 -0
  15. ai/chronon/cli/compile/compiler.py +160 -0
  16. ai/chronon/cli/compile/conf_validator.py +590 -0
  17. ai/chronon/cli/compile/display/class_tracker.py +112 -0
  18. ai/chronon/cli/compile/display/compile_status.py +95 -0
  19. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  20. ai/chronon/cli/compile/display/console.py +3 -0
  21. ai/chronon/cli/compile/display/diff_result.py +46 -0
  22. ai/chronon/cli/compile/fill_templates.py +40 -0
  23. ai/chronon/cli/compile/parse_configs.py +141 -0
  24. ai/chronon/cli/compile/parse_teams.py +238 -0
  25. ai/chronon/cli/compile/serializer.py +115 -0
  26. ai/chronon/cli/git_utils.py +156 -0
  27. ai/chronon/cli/logger.py +61 -0
  28. ai/chronon/constants.py +3 -0
  29. ai/chronon/eval/__init__.py +122 -0
  30. ai/chronon/eval/query_parsing.py +19 -0
  31. ai/chronon/eval/sample_tables.py +100 -0
  32. ai/chronon/eval/table_scan.py +186 -0
  33. ai/chronon/fetcher/__init__.py +1 -0
  34. ai/chronon/fetcher/constants.py +15 -0
  35. ai/chronon/fetcher/ttypes.py +127 -0
  36. ai/chronon/group_by.py +692 -0
  37. ai/chronon/hub/__init__.py +1 -0
  38. ai/chronon/hub/constants.py +15 -0
  39. ai/chronon/hub/ttypes.py +1228 -0
  40. ai/chronon/join.py +566 -0
  41. ai/chronon/logger.py +24 -0
  42. ai/chronon/model.py +35 -0
  43. ai/chronon/observability/__init__.py +1 -0
  44. ai/chronon/observability/constants.py +15 -0
  45. ai/chronon/observability/ttypes.py +2192 -0
  46. ai/chronon/orchestration/__init__.py +1 -0
  47. ai/chronon/orchestration/constants.py +15 -0
  48. ai/chronon/orchestration/ttypes.py +4406 -0
  49. ai/chronon/planner/__init__.py +1 -0
  50. ai/chronon/planner/constants.py +15 -0
  51. ai/chronon/planner/ttypes.py +1686 -0
  52. ai/chronon/query.py +126 -0
  53. ai/chronon/repo/__init__.py +40 -0
  54. ai/chronon/repo/aws.py +298 -0
  55. ai/chronon/repo/cluster.py +65 -0
  56. ai/chronon/repo/compile.py +56 -0
  57. ai/chronon/repo/constants.py +164 -0
  58. ai/chronon/repo/default_runner.py +291 -0
  59. ai/chronon/repo/explore.py +421 -0
  60. ai/chronon/repo/extract_objects.py +137 -0
  61. ai/chronon/repo/gcp.py +585 -0
  62. ai/chronon/repo/gitpython_utils.py +14 -0
  63. ai/chronon/repo/hub_runner.py +171 -0
  64. ai/chronon/repo/hub_uploader.py +108 -0
  65. ai/chronon/repo/init.py +53 -0
  66. ai/chronon/repo/join_backfill.py +105 -0
  67. ai/chronon/repo/run.py +293 -0
  68. ai/chronon/repo/serializer.py +141 -0
  69. ai/chronon/repo/team_json_utils.py +46 -0
  70. ai/chronon/repo/utils.py +472 -0
  71. ai/chronon/repo/zipline.py +51 -0
  72. ai/chronon/repo/zipline_hub.py +105 -0
  73. ai/chronon/resources/gcp/README.md +174 -0
  74. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  75. ai/chronon/resources/gcp/group_bys/test/data.py +34 -0
  76. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  77. ai/chronon/resources/gcp/joins/test/data.py +30 -0
  78. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  79. ai/chronon/resources/gcp/sources/test/data.py +23 -0
  80. ai/chronon/resources/gcp/teams.py +70 -0
  81. ai/chronon/resources/gcp/zipline-cli-install.sh +54 -0
  82. ai/chronon/source.py +88 -0
  83. ai/chronon/staging_query.py +185 -0
  84. ai/chronon/types.py +57 -0
  85. ai/chronon/utils.py +557 -0
  86. ai/chronon/windows.py +50 -0
  87. awx_zipline_ai-0.2.0.dist-info/METADATA +173 -0
  88. awx_zipline_ai-0.2.0.dist-info/RECORD +93 -0
  89. awx_zipline_ai-0.2.0.dist-info/WHEEL +5 -0
  90. awx_zipline_ai-0.2.0.dist-info/entry_points.txt +2 -0
  91. awx_zipline_ai-0.2.0.dist-info/licenses/LICENSE +202 -0
  92. awx_zipline_ai-0.2.0.dist-info/top_level.txt +3 -0
  93. jars/__init__.py +0 -0
ai/chronon/repo/gcp.py ADDED
@@ -0,0 +1,585 @@
+ import base64
+ import json
+ import multiprocessing
+ import os
+ import time
+ import uuid
+ from urllib.parse import urlparse
+
+ import crcmod
+ from google.cloud import storage
+
+ from ai.chronon.logger import get_logger
+ from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
+ from ai.chronon.repo.default_runner import Runner
+ from ai.chronon.repo.utils import (
+     JobType,
+     check_call,
+     check_output,
+     extract_filename_from_path,
+     get_customer_warehouse_bucket,
+     get_environ_arg,
+     retry_decorator,
+     split_date_range,
+ )
+
+ LOG = get_logger()
+
+ # GCP DATAPROC SPECIFIC CONSTANTS
+ DATAPROC_ENTRY = "ai.chronon.integrations.cloud_gcp.DataprocSubmitter"
+ ZIPLINE_GCP_JAR_DEFAULT = "cloud_gcp_lib_deploy.jar"
+ ZIPLINE_GCP_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.cloud_gcp.GcpApiImpl"
+ ZIPLINE_GCP_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
+ ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT = "connectors_pubsub_deploy.jar"
+ ZIPLINE_GCP_SERVICE_JAR = "service_assembly_deploy.jar"
+
+
+ class GcpRunner(Runner):
+     def __init__(self, args):
+         self._remote_artifact_prefix = args.get("artifact_prefix")
+         if not self._remote_artifact_prefix:
+             raise ValueError(
+                 "GCP artifact prefix not set."
+             )
+
+         self._version = args.get("version")
+         gcp_jar_path = GcpRunner.download_zipline_dataproc_jar(
+             self._remote_artifact_prefix,
+             ZIPLINE_DIRECTORY,
+             self._version,
+             ZIPLINE_GCP_JAR_DEFAULT,
+         )
+         service_jar_path = GcpRunner.download_zipline_dataproc_jar(
+             self._remote_artifact_prefix,
+             ZIPLINE_DIRECTORY,
+             self._version,
+             ZIPLINE_GCP_SERVICE_JAR,
+         )
+         jar_path = (
+             f"{service_jar_path}:{gcp_jar_path}"
+             if args["mode"] == "fetch"
+             else gcp_jar_path
+         )
+
+         self._args = args
+         self.job_id = str(uuid.uuid4())
+
+         super().__init__(args, os.path.expanduser(jar_path))
+
+
+     @staticmethod
+     def get_gcp_project_id() -> str:
+         return get_environ_arg("GCP_PROJECT_ID")
+
+     @staticmethod
+     def get_gcp_bigtable_instance_id() -> str:
+         return get_environ_arg("GCP_BIGTABLE_INSTANCE_ID")
+
+     @staticmethod
+     def is_pubsub_enabled() -> bool:
+         pubsub_enabled_env = get_environ_arg("ENABLE_PUBSUB", ignoreError=True)
+         return pubsub_enabled_env and pubsub_enabled_env.lower() == "true"
+
+     @staticmethod
+     def get_gcp_region_id() -> str:
+         return get_environ_arg("GCP_REGION")
+
+     @staticmethod
+     @retry_decorator(retries=2, backoff=5)
+     def download_gcs_to_text(remote_file_name: str):
+         """Download from the bucket using path."""
+         parsed = urlparse(remote_file_name)
+         bucket_name = parsed.netloc
+         source_blob_name = parsed.path.lstrip("/")
+         try:
+             storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+             bucket = storage_client.bucket(bucket_name)
+             blob = bucket.blob(source_blob_name)
+
+             if not blob.exists():
+                 return blob.exists(), None
+             else:
+                 return blob.exists(), blob.download_as_text()
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to download {source_blob_name}: {str(e)}"
+             ) from e
+
+     @staticmethod
+     @retry_decorator(retries=2, backoff=5)
+     def download_gcs_file(remote_file_name: str, destination_file_name: str):
+         """Download from the bucket using path."""
+         parsed = urlparse(remote_file_name)
+         bucket_name = parsed.netloc
+         source_blob_name = parsed.path.lstrip("/")
+         try:
+             storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+             bucket = storage_client.bucket(bucket_name)
+             blob = bucket.blob(source_blob_name)
+
+             if not blob.exists():
+                 raise FileNotFoundError(
+                     f"Blob {source_blob_name} not found in bucket {bucket_name}"
+                 )
+             blob.download_to_filename(destination_file_name)
+             LOG.info(
+                 "Downloaded storage object {} from bucket {} to local file {}.".format(
+                     source_blob_name, bucket_name, destination_file_name
+                 )
+             )
+         except Exception as e:
+             raise RuntimeError(
+                 f"Failed to download {source_blob_name}: {str(e)}"
+             ) from e
+
+     @staticmethod
+     @retry_decorator(retries=2, backoff=5)
+     def upload_gcs_blob(bucket_name, source_file_name, destination_blob_name):
+         """Uploads a file to the bucket."""
+
+         try:
+             storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+             bucket = storage_client.bucket(bucket_name)
+             blob = bucket.blob(destination_blob_name)
+             blob.upload_from_filename(source_file_name)
+
+             LOG.info(
+                 f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
+             )
+             return f"gs://{bucket_name}/{destination_blob_name}"
+         except Exception as e:
+             raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e
+
+     @staticmethod
+     def get_gcs_file_hash(remote_file_path: str) -> str:
+         """
+         Get the hash of a file stored in Google Cloud Storage.
+         """
+         parsed = urlparse(remote_file_path)
+         storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+         bucket_name = parsed.netloc
+         blob_name = parsed.path.lstrip("/")
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.get_blob(blob_name)
+
+         if not blob:
+             raise FileNotFoundError(
+                 f"File {blob_name} not found in bucket {bucket_name}"
+             )
+
+         return blob.crc32c
+
+     @staticmethod
+     def get_local_file_hash(file_path: str) -> str:
+         """
+         Calculate CRC32C hash of a local file.
+
+         Args:
+             file_path: Path to the local file
+
+         Returns:
+             Base64-encoded string of the file's CRC32C hash
+         """
+         crc32c_hash = crcmod.predefined.Crc("crc-32c")
+
+         with open(file_path, "rb") as f:
+             # Read the file in chunks to handle large files efficiently
+             for chunk in iter(lambda: f.read(4096), b""):
+                 crc32c_hash.update(chunk)
+
+         # Convert to base64 to match GCS format
+         return base64.b64encode(crc32c_hash.digest()).decode("utf-8")
+
+     @staticmethod
+     def compare_gcs_and_local_file_hashes(
+         remote_file_path: str, local_file_path: str
+     ) -> bool:
+         """
+         Compare hashes of a GCS file and a local file to check if they're identical.
+
+         Args:
+             remote_file_path: URI of the remote object in GCS
+             local_file_path: Path to the local file to compare
+
+         Returns:
+             True if files are identical, False otherwise
+         """
+         try:
+             gcs_hash = GcpRunner.get_gcs_file_hash(remote_file_path)
+             local_hash = GcpRunner.get_local_file_hash(local_file_path)
+
+             LOG.info(
+                 f"Local hash of {local_file_path}: {local_hash}. GCS file {remote_file_path} hash: {gcs_hash}"
+             )
+
+             return gcs_hash == local_hash
+
+         except Exception as e:
+             LOG.info(f"Error comparing files: {str(e)}")
+             return False
+
+     @staticmethod
+     def download_zipline_dataproc_jar(remote_file_path: str, local_file_path: str, version: str, jar_name: str
+     ):
+         source_path = os.path.join(remote_file_path, "release", version, "jars", jar_name)
+         dest_path = os.path.join(local_file_path, jar_name)
+
+         are_identical = (
+             GcpRunner.compare_gcs_and_local_file_hashes(
+                 source_path, dest_path
+             )
+             if os.path.exists(dest_path)
+             else False
+         )
+
+         if are_identical:
+             LOG.info(f"{dest_path} matches GCS {source_path}")
+         else:
+             LOG.info(
+                 f"{dest_path} does NOT match GCS {source_path}"
+             )
+             LOG.info(f"Downloading {jar_name} from GCS...")
+
+             GcpRunner.download_gcs_file(source_path, dest_path)
+         return dest_path
+
+     def generate_dataproc_submitter_args(
+         self,
+         user_args: str,
+         version: str,
+         customer_artifact_prefix: str,
+         job_type: JobType = JobType.SPARK,
+         metadata_conf_path: str = None,
+     ):
+
+         parsed = urlparse(customer_artifact_prefix)
+         source_blob_name = parsed.path.lstrip("/")
+
+         gcs_files = []
+
+         # upload to `metadata` folder
+         if metadata_conf_path:
+             destination_file_path = os.path.join(
+                 source_blob_name,
+                 "metadata",
+                 self.job_id,
+                 f"{extract_filename_from_path(metadata_conf_path)}"
+             )
+             gcs_files.append(
+                 GcpRunner.upload_gcs_blob(
+                     get_customer_warehouse_bucket(), metadata_conf_path, destination_file_path
+                 )
+             )
+
+         gcs_file_args = ",".join(gcs_files)
+         release_prefix = os.path.join(customer_artifact_prefix, "release", version, "jars")
+
+         # include jar uri. should also already be in the bucket
+         jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_JAR_DEFAULT}")
+
+         final_args = "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class} --zipline-version={zipline_version} --job-id={job_id}"
+
+         if job_type == JobType.FLINK:
+             main_class = "ai.chronon.flink.FlinkJob"
+             flink_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_JAR_DEFAULT}")
+             enable_pubsub = GcpRunner.is_pubsub_enabled()
+             flink_pubsub_connector_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT}")
+             base_formatted_args = final_args.format(
+                 user_args=user_args,
+                 jar_uri=jar_uri,
+                 job_type=job_type.value,
+                 main_class=main_class,
+                 zipline_version=self._version,
+                 job_id=self.job_id,
+             ) + f" --flink-main-jar-uri={flink_jar_uri}"
+             if enable_pubsub:
+                 base_formatted_args += f" --flink-pubsub-jar-uri={flink_pubsub_connector_jar_uri}"
+             return base_formatted_args
+
+         elif job_type == JobType.SPARK:
+             main_class = "ai.chronon.spark.Driver"
+             return " ".join([
+                 final_args.format(
+                     user_args=user_args,
+                     jar_uri=jar_uri,
+                     job_type=job_type.value,
+                     main_class=main_class,
+                     zipline_version=self._version,
+                     job_id=self.job_id,
+                 ), "--is-gcp",
+                 f"--gcp-project-id={GcpRunner.get_gcp_project_id()}",
+                 f"--gcp-bigtable-instance-id={GcpRunner.get_gcp_bigtable_instance_id()}",
+                 f"--files={gcs_file_args}" if gcs_file_args else "",
+             ]
+             )
+         else:
+             raise ValueError(f"Invalid job type: {job_type}")
+
+     @staticmethod
+     def get_state_dataproc_job(job_id):
+         jobs_info_str = check_output(
+             f"gcloud dataproc jobs describe {job_id} --region={GcpRunner.get_gcp_region_id()} "
+             f"--project={GcpRunner.get_gcp_project_id()} --format=json"
+         ).decode("utf-8")
+         job_info = json.loads(jobs_info_str)
+         return job_info.get("status", {}).get("state", "")
+
+     def run_dataproc_flink_streaming(self):
+         user_args = {
+             "--groupby-name": self.conf_metadata_name,
+             "--kafka-bootstrap": self.kafka_bootstrap,
+             "--online-class": ZIPLINE_GCP_ONLINE_CLASS_DEFAULT,
+             "-ZGCP_PROJECT_ID": GcpRunner.get_gcp_project_id(),
+             "-ZGCP_BIGTABLE_INSTANCE_ID": GcpRunner.get_gcp_bigtable_instance_id(),
+             "--validate-rows": self.validate_rows,
+             "--streaming-manifest-path": self.streaming_manifest_path,
+             "--streaming-checkpoint-path": self.streaming_checkpoint_path,
+             "--local-zipline-version": self._version,
+
+             # Need these for extracting metadata name in submitter
+             "--local-conf-path": self.local_abs_conf_path,
+             "--original-mode": self.mode,
+             "--conf-type": self.conf_type,
+         }
+
+         args = self._args.get("args")
+         if "check-if-job-is-running" in args:
+             user_args["--streaming-mode"] = "check-if-job-is-running"
+         elif "deploy" in args:
+             user_args["--streaming-mode"] = "deploy"
+
+         flag_args = {"--validate": self.validate, "--enable-debug": self.enable_debug}
+
+         # Set the savepoint deploy strategy
+         if self.latest_savepoint:
+             flag_args["--latest-savepoint"] = self.latest_savepoint
+         elif self.custom_savepoint:
+             user_args["--custom-savepoint"] = self.custom_savepoint
+         else:
+             flag_args["--no-savepoint"] = self.no_savepoint
+
+         # Set version check deploy
+         if self.version_check:
+             flag_args["--version-check"] = self.version_check
+
+         # Set additional jars
+         if self.additional_jars:
+             user_args["--additional-jars"] = self.additional_jars
+
+         user_args_str = " ".join(f"{key}={value}" for key, value in user_args.items() if value)
+         # if online args are set we add them to the user_args_str
+         if self.online_args:
+             user_args_str += " " + self.online_args
+
+         flag_args_str = " ".join(key for key, value in flag_args.items() if value)
+         dataproc_args = self.generate_dataproc_submitter_args(
+             job_type=JobType.FLINK,
+             version=self._version,
+             customer_artifact_prefix=self._remote_artifact_prefix,
+             user_args=" ".join([user_args_str, flag_args_str]),
+         )
+         command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+
+         return command
+
+     def run(self):
+         command_list = []
+         if self.mode == "info":
+             command_list.append(
+                 "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                     script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                 )
+             )
+         elif self.sub_help or self.mode == "fetch":
+             entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+             command_list.append(
+                 "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                     jar=self.jar_path,
+                     entrypoint=entrypoint,
+                     args="--help" if self.sub_help else self._gen_final_args(),
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                 )
+             )
+         elif self.mode == "metastore":
+             # We could presumably support other metastore options but
+             # for now only poking for a particular partition is supported.
+             args = self._args.get("args")
+             supported_subcommands = ["check-partitions"]
+             assert (
+                 "check-partitions" in args
+             ), f"Must specify one of the following subcommands: {supported_subcommands}"
+             assert (
+                 "--partition-names" in args
+             ), "Must specify a list of `--partition-names=schema.table/pk1=pv1/pk2=pv2"
+
+             dataproc_args = self.generate_dataproc_submitter_args(
+                 # for now, self.conf is the only local file that requires uploading to gcs
+                 user_args=self._gen_final_args(),
+                 version=self._version,
+                 customer_artifact_prefix=self._remote_artifact_prefix,
+                 metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
+             )
+             command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+             command_list.append(command)
+         elif self.mode in ["streaming", "streaming-client"]:
+             args = self._args.get("args")
+             # streaming mode
+             command = self.run_dataproc_flink_streaming()
+             command_list.append(command)
+         else:
+             if self.parallelism > 1:
+                 assert self.start_ds is not None and self.ds is not None, (
+                     "To use parallelism, please specify --start-ds and --end-ds to "
+                     "break down into multiple backfill jobs"
+                 )
+                 date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
+                 for start_ds, end_ds in date_ranges:
+                     user_args = ("{subcommand} {args} {additional_args}").format(
+                         subcommand=ROUTES[self.conf_type][self.mode],
+                         args=self._gen_final_args(
+                             start_ds=start_ds,
+                             end_ds=end_ds,
+                             # overriding the conf here because we only want the
+                             # filename, not the full path. When we upload this to
+                             # GCS, the full path does get reflected on GCS. But
+                             # when we include the gcs file path as part of dataproc,
+                             # the file is copied to root and not the complete path
+                             # is copied.
+                             override_conf_path=(
+                                 extract_filename_from_path(self.conf)
+                                 if self.conf
+                                 else None
+                             ),
+                         ),
+                         additional_args=os.environ.get(
+                             "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                         ),
+                     )
+
+                     dataproc_args = self.generate_dataproc_submitter_args(
+                         # for now, self.conf is the only local file that requires uploading to gcs
+                         user_args=user_args,
+                         version=self._version,
+                         customer_artifact_prefix=self._remote_artifact_prefix,
+                         metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None,
+                     )
+                     command = (
+                         f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+                     )
+                     command_list.append(command)
+             else:
+                 user_args = ("{subcommand} {args} {additional_args}").format(
+                     subcommand=ROUTES[self.conf_type][self.mode],
+                     args=self._gen_final_args(
+                         start_ds=self.start_ds,
+                         # overriding the conf here because we only want the filename,
+                         # not the full path. When we upload this to GCS, the full path
+                         # does get reflected on GCS. But when we include the gcs file
+                         # path as part of dataproc, the file is copied to root and
+                         # not the complete path is copied.
+                         override_conf_path=(
+                             extract_filename_from_path(self.conf) if self.conf else None
+                         ),
+                     ),
+                     additional_args=os.environ.get(
+                         "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
+                     ),
+                 )
+                 dataproc_args = self.generate_dataproc_submitter_args(
+                     user_args=user_args,
+                     version=self._version,
+                     customer_artifact_prefix=self._remote_artifact_prefix,
+                     metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
+                 )
+                 command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+                 command_list.append(command)
+
+         if len(command_list) > 1:
+             # parallel backfill mode
+             with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                 LOG.info(
+                     "Running args list {} with pool size {}".format(
+                         command_list, self.parallelism
+                     )
+                 )
+                 pool.map(check_call, command_list)
+         elif len(command_list) == 1:
+             output = check_output(command_list[0]).decode("utf-8").split("\n")
+             print(*output, sep="\n")
+
+             dataproc_submitter_id_str = "Dataproc submitter job id"
+
+             dataproc_submitter_logs = [s for s in output if dataproc_submitter_id_str in s]
+
+             submitted_job_id = None
+             if dataproc_submitter_logs:
+                 log = dataproc_submitter_logs[0]
+                 submitted_job_id = (
+                     log[log.index(dataproc_submitter_id_str) + len(dataproc_submitter_id_str) + 1 :]
+                 ).strip()
+
+
+             if not self.disable_cloud_logging and submitted_job_id:
+                 LOG.info(
+                     """
+                     <-----------------------------------------------------------------------------------
+                     ------------------------------------------------------------------------------------
+                                                   DATAPROC LOGS
+                     ------------------------------------------------------------------------------------
+                     ------------------------------------------------------------------------------------>
+                     """
+                 )
+                 check_call(
+                     f"gcloud dataproc jobs wait {submitted_job_id} --region={GcpRunner.get_gcp_region_id()} "
+                     f"--project={GcpRunner.get_gcp_project_id()}"
+                 )
+
+                 # Fetch the final job state
+                 job_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
+
+                 LOG.info("<<<<<<<<<<<<<<<<-----------------JOB STATUS----------------->>>>>>>>>>>>>>>>>")
+                 if job_state != 'DONE':
+                     LOG.info(f"Job {submitted_job_id} is not in DONE state. Current state: {job_state}")
+                     raise RuntimeError(f"Job {submitted_job_id} failed.")
+                 else:
+                     LOG.info(f"Job {submitted_job_id} is in DONE state.")
+                 return
+
+             # If streaming deploy job, poll and check for final deployment confirmation
+             if (submitted_job_id and self.mode in ["streaming", "streaming-client"]
+                     and "deploy" in self._args.get("args")):
+                 # Poll the dataproc job id for 5 minutes until the Flink deployment is confirmed
+                 total_time_seconds = 5 * 60
+                 interval_seconds = 10
+                 start_time = time.time()
+                 while time.time() - start_time < total_time_seconds:
+                     current_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
+
+                     non_terminal_states = ['SETUP_DONE', 'RUNNING', 'PENDING', 'STATE_UNSPECIFIED']
+                     if current_state not in non_terminal_states:
+                         raise RuntimeError(f"Flink job is not in {non_terminal_states}. "
+                                            f"Current state: {current_state}")
+
+                     manifest_path = os.path.join(self.streaming_manifest_path, self.conf_metadata_name, "manifest.txt")
+                     manifest_exists, raw_manifest = self.download_gcs_to_text(str(manifest_path))
+
+                     if manifest_exists:
+                         manifest = raw_manifest.strip()
+                         LOG.info(f"Checking Flink manifest to confirm deployment. Manifest: [{manifest}]")
+                         manifest_tuples = manifest.split(",")
+
+                         flink_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")][0]
+                         parent_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")][0]
+
+                         if parent_job_id == submitted_job_id:
+                             LOG.info(f"Flink job has been deployed successfully. Flink job ID = [{flink_job_id}]."
+                                      f" Dataproc job ID = [{submitted_job_id}]")
+                             break
+                         else:
+                             LOG.info(f"Flink manifest not updated with new Dataproc job id {submitted_job_id}.")
+                     LOG.info(f"Sleeping for {interval_seconds} seconds...")
+                     time.sleep(interval_seconds)
+                 else:
+                     raise RuntimeError(
+                         f"Failed to confirm Flink manifest for new deployment with Dataproc job id {submitted_job_id}."
+                     )
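
A note on the hash check driving download_zipline_dataproc_jar above: a jar is only re-downloaded when the base64-encoded CRC32C of the local copy (computed in get_local_file_hash) differs from the crc32c property that Cloud Storage reports for the remote blob. Below is a minimal standalone sketch of that comparison, assuming the crcmod and google-cloud-storage packages are installed and GCS credentials are configured; the bucket, object, and local path are placeholders.

import base64

import crcmod
from google.cloud import storage


def local_crc32c(path: str) -> str:
    # Stream the file in chunks and return its CRC32C, base64-encoded to match blob.crc32c.
    crc = crcmod.predefined.Crc("crc-32c")
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            crc.update(chunk)
    return base64.b64encode(crc.digest()).decode("utf-8")


# Placeholder bucket/object/local path, for illustration only.
blob = storage.Client().bucket("example-bucket").get_blob("release/0.2.0/jars/example.jar")
if blob is not None and blob.crc32c == local_crc32c("/tmp/example.jar"):
    print("Local jar matches the GCS copy; a download would be skipped")
else:
    print("Hashes differ (or blob missing); the jar would be re-downloaded")
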
ai/chronon/repo/gitpython_utils.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Optional
+
+ from git import Repo
+
+
+ def get_default_origin_branch(path, repo: Optional[Repo] = None):
+     if not repo:
+         repo = Repo(path, search_parent_directories=True)
+     return repo.remotes.origin.refs.HEAD.reference.name.split('/')[-1]
+
+ def get_current_branch(path, repo: Optional[Repo] = None):
+     if not repo:
+         repo = Repo(path, search_parent_directories=True)
+     return repo.active_branch.name
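
For reference, a small usage sketch of the two helpers above, assuming GitPython is installed and the checkout has an origin remote with origin/HEAD set (for example via `git remote set-head origin -a`); the path is a placeholder.

from ai.chronon.repo.gitpython_utils import get_current_branch, get_default_origin_branch

repo_path = "."  # placeholder: any directory inside a git checkout
print(get_current_branch(repo_path))         # e.g. "feature/my-change"
print(get_default_origin_branch(repo_path))  # e.g. "main"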