awx_zipline_ai-0.0.32-py3-none-any.whl

This diff represents the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (96)
  1. __init__.py +0 -0
  2. agent/__init__.py +1 -0
  3. agent/constants.py +15 -0
  4. agent/ttypes.py +1684 -0
  5. ai/__init__.py +0 -0
  6. ai/chronon/__init__.py +0 -0
  7. ai/chronon/airflow_helpers.py +248 -0
  8. ai/chronon/cli/__init__.py +0 -0
  9. ai/chronon/cli/compile/__init__.py +0 -0
  10. ai/chronon/cli/compile/column_hashing.py +336 -0
  11. ai/chronon/cli/compile/compile_context.py +173 -0
  12. ai/chronon/cli/compile/compiler.py +183 -0
  13. ai/chronon/cli/compile/conf_validator.py +742 -0
  14. ai/chronon/cli/compile/display/__init__.py +0 -0
  15. ai/chronon/cli/compile/display/class_tracker.py +102 -0
  16. ai/chronon/cli/compile/display/compile_status.py +95 -0
  17. ai/chronon/cli/compile/display/compiled_obj.py +12 -0
  18. ai/chronon/cli/compile/display/console.py +3 -0
  19. ai/chronon/cli/compile/display/diff_result.py +111 -0
  20. ai/chronon/cli/compile/fill_templates.py +35 -0
  21. ai/chronon/cli/compile/parse_configs.py +134 -0
  22. ai/chronon/cli/compile/parse_teams.py +242 -0
  23. ai/chronon/cli/compile/serializer.py +109 -0
  24. ai/chronon/cli/compile/version_utils.py +42 -0
  25. ai/chronon/cli/git_utils.py +145 -0
  26. ai/chronon/cli/logger.py +59 -0
  27. ai/chronon/constants.py +3 -0
  28. ai/chronon/group_by.py +692 -0
  29. ai/chronon/join.py +580 -0
  30. ai/chronon/logger.py +23 -0
  31. ai/chronon/model.py +40 -0
  32. ai/chronon/query.py +126 -0
  33. ai/chronon/repo/__init__.py +39 -0
  34. ai/chronon/repo/aws.py +284 -0
  35. ai/chronon/repo/cluster.py +136 -0
  36. ai/chronon/repo/compile.py +62 -0
  37. ai/chronon/repo/constants.py +164 -0
  38. ai/chronon/repo/default_runner.py +269 -0
  39. ai/chronon/repo/explore.py +418 -0
  40. ai/chronon/repo/extract_objects.py +134 -0
  41. ai/chronon/repo/gcp.py +586 -0
  42. ai/chronon/repo/gitpython_utils.py +15 -0
  43. ai/chronon/repo/hub_runner.py +261 -0
  44. ai/chronon/repo/hub_uploader.py +109 -0
  45. ai/chronon/repo/init.py +60 -0
  46. ai/chronon/repo/join_backfill.py +119 -0
  47. ai/chronon/repo/run.py +296 -0
  48. ai/chronon/repo/serializer.py +133 -0
  49. ai/chronon/repo/team_json_utils.py +46 -0
  50. ai/chronon/repo/utils.py +481 -0
  51. ai/chronon/repo/zipline.py +35 -0
  52. ai/chronon/repo/zipline_hub.py +277 -0
  53. ai/chronon/resources/__init__.py +0 -0
  54. ai/chronon/resources/gcp/__init__.py +0 -0
  55. ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
  56. ai/chronon/resources/gcp/group_bys/test/__init__.py +0 -0
  57. ai/chronon/resources/gcp/group_bys/test/data.py +30 -0
  58. ai/chronon/resources/gcp/joins/__init__.py +0 -0
  59. ai/chronon/resources/gcp/joins/test/__init__.py +0 -0
  60. ai/chronon/resources/gcp/joins/test/data.py +26 -0
  61. ai/chronon/resources/gcp/sources/__init__.py +0 -0
  62. ai/chronon/resources/gcp/sources/test/__init__.py +0 -0
  63. ai/chronon/resources/gcp/sources/test/data.py +26 -0
  64. ai/chronon/resources/gcp/teams.py +58 -0
  65. ai/chronon/source.py +86 -0
  66. ai/chronon/staging_query.py +226 -0
  67. ai/chronon/types.py +58 -0
  68. ai/chronon/utils.py +510 -0
  69. ai/chronon/windows.py +48 -0
  70. awx_zipline_ai-0.0.32.dist-info/METADATA +197 -0
  71. awx_zipline_ai-0.0.32.dist-info/RECORD +96 -0
  72. awx_zipline_ai-0.0.32.dist-info/WHEEL +5 -0
  73. awx_zipline_ai-0.0.32.dist-info/entry_points.txt +2 -0
  74. awx_zipline_ai-0.0.32.dist-info/top_level.txt +4 -0
  75. gen_thrift/__init__.py +0 -0
  76. gen_thrift/api/__init__.py +1 -0
  77. gen_thrift/api/constants.py +15 -0
  78. gen_thrift/api/ttypes.py +3754 -0
  79. gen_thrift/common/__init__.py +1 -0
  80. gen_thrift/common/constants.py +15 -0
  81. gen_thrift/common/ttypes.py +1814 -0
  82. gen_thrift/eval/__init__.py +1 -0
  83. gen_thrift/eval/constants.py +15 -0
  84. gen_thrift/eval/ttypes.py +660 -0
  85. gen_thrift/fetcher/__init__.py +1 -0
  86. gen_thrift/fetcher/constants.py +15 -0
  87. gen_thrift/fetcher/ttypes.py +127 -0
  88. gen_thrift/hub/__init__.py +1 -0
  89. gen_thrift/hub/constants.py +15 -0
  90. gen_thrift/hub/ttypes.py +1109 -0
  91. gen_thrift/observability/__init__.py +1 -0
  92. gen_thrift/observability/constants.py +15 -0
  93. gen_thrift/observability/ttypes.py +2355 -0
  94. gen_thrift/planner/__init__.py +1 -0
  95. gen_thrift/planner/constants.py +15 -0
  96. gen_thrift/planner/ttypes.py +1967 -0
ai/chronon/repo/gcp.py ADDED
@@ -0,0 +1,586 @@
+import base64
+import json
+import multiprocessing
+import os
+import time
+import uuid
+from urllib.parse import urlparse
+
+import crcmod
+from google.cloud import storage
+
+from ai.chronon.logger import get_logger
+from ai.chronon.repo.constants import ROUTES, ZIPLINE_DIRECTORY
+from ai.chronon.repo.default_runner import Runner
+from ai.chronon.repo.utils import (
+    JobType,
+    check_call,
+    check_output,
+    extract_filename_from_path,
+    get_customer_warehouse_bucket,
+    get_environ_arg,
+    retry_decorator,
+    split_date_range,
+)
+
+LOG = get_logger()
+
+# GCP DATAPROC SPECIFIC CONSTANTS
+DATAPROC_ENTRY = "ai.chronon.integrations.cloud_gcp.DataprocSubmitter"
+ZIPLINE_GCP_JAR_DEFAULT = "cloud_gcp_lib_deploy.jar"
+ZIPLINE_GCP_ONLINE_CLASS_DEFAULT = "ai.chronon.integrations.cloud_gcp.GcpApiImpl"
+ZIPLINE_GCP_FLINK_JAR_DEFAULT = "flink_assembly_deploy.jar"
+ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT = "connectors_pubsub_deploy.jar"
+ZIPLINE_GCP_SERVICE_JAR = "service_assembly_deploy.jar"
+
+
+class GcpRunner(Runner):
+    def __init__(self, args):
+        self._remote_artifact_prefix = args.get("artifact_prefix")
+        if not self._remote_artifact_prefix:
+            raise ValueError("GCP artifact prefix not set.")
+
+        self._version = args.get("version")
+        gcp_jar_path = GcpRunner.download_zipline_dataproc_jar(
+            self._remote_artifact_prefix,
+            ZIPLINE_DIRECTORY,
+            self._version,
+            ZIPLINE_GCP_JAR_DEFAULT,
+        )
+        service_jar_path = GcpRunner.download_zipline_dataproc_jar(
+            self._remote_artifact_prefix,
+            ZIPLINE_DIRECTORY,
+            self._version,
+            ZIPLINE_GCP_SERVICE_JAR,
+        )
+        jar_path = f"{gcp_jar_path}:{service_jar_path}" if args["mode"] == "fetch" else gcp_jar_path
+
+        self._args = args
+        self.job_id = str(uuid.uuid4())
+
+        super().__init__(args, os.path.expanduser(jar_path))
+
+    @staticmethod
+    def get_gcp_project_id() -> str:
+        return get_environ_arg("GCP_PROJECT_ID")
+
+    @staticmethod
+    def get_gcp_bigtable_instance_id() -> str:
+        return get_environ_arg("GCP_BIGTABLE_INSTANCE_ID")
+
+    @staticmethod
+    def is_pubsub_enabled() -> bool:
+        pubsub_enabled_env = get_environ_arg("ENABLE_PUBSUB", ignoreError=True)
+        return pubsub_enabled_env and pubsub_enabled_env.lower() == "true"
+
+    @staticmethod
+    def get_gcp_region_id() -> str:
+        return get_environ_arg("GCP_REGION")
+
+    @staticmethod
+    @retry_decorator(retries=2, backoff=5)
+    def download_gcs_to_text(remote_file_name: str):
+        """Download from the bucket using path."""
+        parsed = urlparse(remote_file_name)
+        bucket_name = parsed.netloc
+        source_blob_name = parsed.path.lstrip("/")
+        try:
+            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+            bucket = storage_client.bucket(bucket_name)
+            blob = bucket.blob(source_blob_name)
+
+            if not blob.exists():
+                return blob.exists(), None
+            else:
+                return blob.exists(), blob.download_as_text()
+        except Exception as e:
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
+
+    @staticmethod
+    @retry_decorator(retries=2, backoff=5)
+    def download_gcs_file(remote_file_name: str, destination_file_name: str):
+        """Download from the bucket using path."""
+        parsed = urlparse(remote_file_name)
+        bucket_name = parsed.netloc
+        source_blob_name = parsed.path.lstrip("/")
+        try:
+            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+            bucket = storage_client.bucket(bucket_name)
+            blob = bucket.blob(source_blob_name)
+
+            if not blob.exists():
+                raise FileNotFoundError(
+                    f"Blob {source_blob_name} not found in bucket {bucket_name}"
+                )
+            blob.download_to_filename(destination_file_name)
+            LOG.info(
+                "Downloaded storage object {} from bucket {} to local file {}.".format(
+                    source_blob_name, bucket_name, destination_file_name
+                )
+            )
+        except Exception as e:
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
+
+    @staticmethod
+    @retry_decorator(retries=2, backoff=5)
+    def upload_gcs_blob(bucket_name, source_file_name, destination_blob_name):
+        """Uploads a file to the bucket."""
+
+        try:
+            storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+            bucket = storage_client.bucket(bucket_name)
+            blob = bucket.blob(destination_blob_name)
+            blob.upload_from_filename(source_file_name)
+
+            LOG.info(
+                f"File {source_file_name} uploaded to {destination_blob_name} in bucket {bucket_name}."
+            )
+            return f"gs://{bucket_name}/{destination_blob_name}"
+        except Exception as e:
+            raise RuntimeError(f"Failed to upload {source_file_name}: {str(e)}") from e
+
+    @staticmethod
+    def get_gcs_file_hash(remote_file_path: str) -> str:
+        """
+        Get the hash of a file stored in Google Cloud Storage.
+        """
+        parsed = urlparse(remote_file_path)
+        storage_client = storage.Client(project=GcpRunner.get_gcp_project_id())
+        bucket_name = parsed.netloc
+        blob_name = parsed.path.lstrip("/")
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.get_blob(blob_name)
+
+        if not blob:
+            raise FileNotFoundError(f"File {blob_name} not found in bucket {bucket_name}")
+
+        return blob.crc32c
+
+    @staticmethod
+    def get_local_file_hash(file_path: str) -> str:
+        """
+        Calculate CRC32C hash of a local file.
+
+        Args:
+            file_path: Path to the local file
+
+        Returns:
+            Base64-encoded string of the file's CRC32C hash
+        """
+        crc32c_hash = crcmod.predefined.Crc("crc-32c")
+
+        with open(file_path, "rb") as f:
+            # Read the file in chunks to handle large files efficiently
+            for chunk in iter(lambda: f.read(4096), b""):
+                crc32c_hash.update(chunk)
+
+        # Convert to base64 to match GCS format
+        return base64.b64encode(crc32c_hash.digest()).decode("utf-8")
+
+    @staticmethod
+    def compare_gcs_and_local_file_hashes(remote_file_path: str, local_file_path: str) -> bool:
+        """
+        Compare hashes of a GCS file and a local file to check if they're identical.
+
+        Args:
+            remote_file_path: URI of the remote object in GCS
+            local_file_path: Path to the local file to compare
+
+        Returns:
+            True if files are identical, False otherwise
+        """
+        try:
+            gcs_hash = GcpRunner.get_gcs_file_hash(remote_file_path)
+            local_hash = GcpRunner.get_local_file_hash(local_file_path)
+
+            LOG.info(
+                f"Local hash of {local_file_path}: {local_hash}. GCS file {remote_file_path} hash: {gcs_hash}"
+            )
+
+            return gcs_hash == local_hash
+
+        except Exception as e:
+            LOG.info(f"Error comparing files: {str(e)}")
+            return False
+
+    @staticmethod
+    def download_zipline_dataproc_jar(
+        remote_file_path: str, local_file_path: str, version: str, jar_name: str
+    ):
+        source_path = os.path.join(remote_file_path, "release", version, "jars", jar_name)
+        dest_path = os.path.join(local_file_path, jar_name)
+
+        are_identical = (
+            GcpRunner.compare_gcs_and_local_file_hashes(source_path, dest_path)
+            if os.path.exists(dest_path)
+            else False
+        )
+
+        if are_identical:
+            LOG.info(f"{dest_path} matches GCS {source_path}")
+        else:
+            LOG.info(f"{dest_path} does NOT match GCS {source_path}")
+            LOG.info(f"Downloading {jar_name} from GCS...")
+
+            GcpRunner.download_gcs_file(source_path, dest_path)
+        return dest_path
+
+    def generate_dataproc_submitter_args(
+        self,
+        user_args: str,
+        version: str,
+        customer_artifact_prefix: str,
+        job_type: JobType = JobType.SPARK,
+        metadata_conf_path: str = None,
+    ):
+        parsed = urlparse(customer_artifact_prefix)
+        source_blob_name = parsed.path.lstrip("/")
+
+        gcs_files = []
+
+        # upload to `metadata` folder
+        if metadata_conf_path:
+            destination_file_path = os.path.join(
+                source_blob_name,
+                "metadata",
+                self.job_id,
+                f"{extract_filename_from_path(metadata_conf_path)}",
+            )
+            gcs_files.append(
+                GcpRunner.upload_gcs_blob(
+                    get_customer_warehouse_bucket(), metadata_conf_path, destination_file_path
+                )
+            )
+
+        gcs_file_args = ",".join(gcs_files)
+        release_prefix = os.path.join(customer_artifact_prefix, "release", version, "jars")
+
+        # include jar uri. should also already be in the bucket
+        jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_JAR_DEFAULT}")
+
+        final_args = "{user_args} --jar-uri={jar_uri} --job-type={job_type} --main-class={main_class} --zipline-version={zipline_version} --job-id={job_id}"
+
+        if job_type == JobType.FLINK:
+            main_class = "ai.chronon.flink.FlinkJob"
+            flink_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_JAR_DEFAULT}")
+            enable_pubsub = GcpRunner.is_pubsub_enabled()
+            flink_pubsub_connector_jar_uri = os.path.join(
+                release_prefix, f"{ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT}"
+            )
+            base_formatted_args = (
+                final_args.format(
+                    user_args=user_args,
+                    jar_uri=jar_uri,
+                    job_type=job_type.value,
+                    main_class=main_class,
+                    zipline_version=self._version,
+                    job_id=self.job_id,
+                )
+                + f" --flink-main-jar-uri={flink_jar_uri}"
+            )
+            if enable_pubsub:
+                base_formatted_args += f" --flink-pubsub-jar-uri={flink_pubsub_connector_jar_uri}"
+            return base_formatted_args
+
+        elif job_type == JobType.SPARK:
+            main_class = "ai.chronon.spark.Driver"
+            return " ".join(
+                [
+                    final_args.format(
+                        user_args=user_args,
+                        jar_uri=jar_uri,
+                        job_type=job_type.value,
+                        main_class=main_class,
+                        zipline_version=self._version,
+                        job_id=self.job_id,
+                    ),
+                    "--is-gcp",
+                    f"--gcp-project-id={GcpRunner.get_gcp_project_id()}",
+                    f"--gcp-bigtable-instance-id={GcpRunner.get_gcp_bigtable_instance_id()}",
+                    f"--files={gcs_file_args}" if gcs_file_args else "",
+                ]
+            )
+        else:
+            raise ValueError(f"Invalid job type: {job_type}")
+
+    @staticmethod
+    def get_state_dataproc_job(job_id):
+        jobs_info_str = check_output(
+            f"gcloud dataproc jobs describe {job_id} --region={GcpRunner.get_gcp_region_id()} "
+            f"--project={GcpRunner.get_gcp_project_id()} --format=json"
+        ).decode("utf-8")
+        job_info = json.loads(jobs_info_str)
+        return job_info.get("status", {}).get("state", "")
+
+    def run_dataproc_flink_streaming(self):
+        user_args = {
+            "--groupby-name": self.conf_metadata_name,
+            "--kafka-bootstrap": self.kafka_bootstrap,
+            "--online-class": ZIPLINE_GCP_ONLINE_CLASS_DEFAULT,
+            "-ZGCP_PROJECT_ID": GcpRunner.get_gcp_project_id(),
+            "-ZGCP_BIGTABLE_INSTANCE_ID": GcpRunner.get_gcp_bigtable_instance_id(),
+            "--validate-rows": self.validate_rows,
+            "--streaming-manifest-path": self.streaming_manifest_path,
+            "--streaming-checkpoint-path": self.streaming_checkpoint_path,
+            "--local-zipline-version": self._version,
+            # Need these for extracting metadata name in submitter
+            "--local-conf-path": self.local_abs_conf_path,
+            "--original-mode": self.mode,
+            "--conf-type": self.conf_type,
+        }
+
+        args = self._args.get("args")
+        if "check-if-job-is-running" in args:
+            user_args["--streaming-mode"] = "check-if-job-is-running"
+        elif "deploy" in args:
+            user_args["--streaming-mode"] = "deploy"
+
+        flag_args = {"--validate": self.validate, "--enable-debug": self.enable_debug}
+
+        # Set the savepoint deploy strategy
+        if self.latest_savepoint:
+            flag_args["--latest-savepoint"] = self.latest_savepoint
+        elif self.custom_savepoint:
+            user_args["--custom-savepoint"] = self.custom_savepoint
+        else:
+            flag_args["--no-savepoint"] = self.no_savepoint
+
+        # Set version check deploy
+        if self.version_check:
+            flag_args["--version-check"] = self.version_check
+
+        # Set additional jars
+        if self.additional_jars:
+            user_args["--additional-jars"] = self.additional_jars
+
+        user_args_str = " ".join(f"{key}={value}" for key, value in user_args.items() if value)
+        # if online args are set we add them to the user_args_str
+        if self.online_args:
+            user_args_str += " " + self.online_args
+
+        flag_args_str = " ".join(key for key, value in flag_args.items() if value)
+        dataproc_args = self.generate_dataproc_submitter_args(
+            job_type=JobType.FLINK,
+            version=self._version,
+            customer_artifact_prefix=self._remote_artifact_prefix,
+            user_args=" ".join([user_args_str, flag_args_str]),
+        )
+        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+
+        return command
+
+    def run(self):
+        command_list = []
+        if self.mode == "info":
+            command_list.append(
+                "python3 {script} --conf {conf} --ds {ds} --repo {repo}".format(
+                    script=self.render_info, conf=self.conf, ds=self.ds, repo=self.repo
+                )
+            )
+        elif self.sub_help or self.mode == "fetch":
+            entrypoint = "ai.chronon.online.fetcher.FetcherMain"
+            command_list.append(
+                "java -cp {jar} {entrypoint} {subcommand} {args}".format(
+                    jar=self.jar_path,
+                    entrypoint=entrypoint,
+                    args="--help" if self.sub_help else self._gen_final_args(),
+                    subcommand=ROUTES[self.conf_type][self.mode],
+                )
+            )
+        elif self.mode == "metastore":
+            # We could presumably support other metastore options but
+            # for now only poking for a particular partition is supported.
+            args = self._args.get("args")
+            supported_subcommands = ["check-partitions"]
+            assert "check-partitions" in args, (
+                f"Must specify one of the following subcommands: {supported_subcommands}"
+            )
+            assert "--partition-names" in args, (
+                "Must specify a list of `--partition-names=schema.table/pk1=pv1/pk2=pv2"
+            )
+
+            dataproc_args = self.generate_dataproc_submitter_args(
+                # for now, self.conf is the only local file that requires uploading to gcs
+                user_args=self._gen_final_args(),
+                version=self._version,
+                customer_artifact_prefix=self._remote_artifact_prefix,
+                metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None,
+            )
+            command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+            command_list.append(command)
+        elif self.mode in ["streaming", "streaming-client"]:
+            args = self._args.get("args")
+            # streaming mode
+            command = self.run_dataproc_flink_streaming()
+            command_list.append(command)
+        else:
+            if self.parallelism > 1:
+                assert self.start_ds is not None and self.ds is not None, (
+                    "To use parallelism, please specify --start-ds and --end-ds to "
+                    "break down into multiple backfill jobs"
+                )
+                date_ranges = split_date_range(self.start_ds, self.ds, self.parallelism)
+                for start_ds, end_ds in date_ranges:
+                    user_args = ("{subcommand} {args} {additional_args}").format(
+                        subcommand=ROUTES[self.conf_type][self.mode],
+                        args=self._gen_final_args(
+                            start_ds=start_ds,
+                            end_ds=end_ds,
+                            # overriding the conf here because we only want the
+                            # filename, not the full path. When we upload this to
+                            # GCS, the full path does get reflected on GCS. But
+                            # when we include the gcs file path as part of dataproc,
+                            # the file is copied to root and not the complete path
+                            # is copied.
+                            override_conf_path=(
+                                extract_filename_from_path(self.conf) if self.conf else None
+                            ),
+                        ),
+                        additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                    )
+
+                    dataproc_args = self.generate_dataproc_submitter_args(
+                        # for now, self.conf is the only local file that requires uploading to gcs
+                        user_args=user_args,
+                        version=self._version,
+                        customer_artifact_prefix=self._remote_artifact_prefix,
+                        metadata_conf_path=str(os.path.join(self.repo, self.conf))
+                        if self.conf
+                        else None,
+                    )
+                    command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+                    command_list.append(command)
+            else:
+                user_args = ("{subcommand} {args} {additional_args}").format(
+                    subcommand=ROUTES[self.conf_type][self.mode],
+                    args=self._gen_final_args(
+                        start_ds=self.start_ds,
+                        # overriding the conf here because we only want the filename,
+                        # not the full path. When we upload this to GCS, the full path
+                        # does get reflected on GCS. But when we include the gcs file
+                        # path as part of dataproc, the file is copied to root and
+                        # not the complete path is copied.
+                        override_conf_path=(
+                            extract_filename_from_path(self.conf) if self.conf else None
+                        ),
+                    ),
+                    additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
+                )
+                dataproc_args = self.generate_dataproc_submitter_args(
+                    user_args=user_args,
+                    version=self._version,
+                    customer_artifact_prefix=self._remote_artifact_prefix,
+                    metadata_conf_path=str(os.path.join(self.repo, self.conf))
+                    if self.conf
+                    else None,
+                )
+                command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+                command_list.append(command)
+
+        if len(command_list) > 1:
+            # parallel backfill mode
+            with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
+                LOG.info(
+                    "Running args list {} with pool size {}".format(command_list, self.parallelism)
+                )
+                pool.map(check_call, command_list)
+        elif len(command_list) == 1:
+            output = check_output(command_list[0]).decode("utf-8").split("\n")
+            print(*output, sep="\n")
+
+            dataproc_submitter_id_str = "Dataproc submitter job id"
+
+            dataproc_submitter_logs = [s for s in output if dataproc_submitter_id_str in s]
+
+            submitted_job_id = None
+            if dataproc_submitter_logs:
+                log = dataproc_submitter_logs[0]
+                submitted_job_id = (
+                    log[log.index(dataproc_submitter_id_str) + len(dataproc_submitter_id_str) + 1 :]
+                ).strip()
+
+            if not self.disable_cloud_logging and submitted_job_id:
+                LOG.info(
+                    """
+                    <-----------------------------------------------------------------------------------
+                    ------------------------------------------------------------------------------------
+                                                  DATAPROC LOGS
+                    ------------------------------------------------------------------------------------
+                    ------------------------------------------------------------------------------------>
+                    """
+                )
+                check_call(
+                    f"gcloud dataproc jobs wait {submitted_job_id} --region={GcpRunner.get_gcp_region_id()} "
+                    f"--project={GcpRunner.get_gcp_project_id()}"
+                )
+
+                # Fetch the final job state
+                job_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
+
+                LOG.info(
+                    "<<<<<<<<<<<<<<<<-----------------JOB STATUS----------------->>>>>>>>>>>>>>>>>"
+                )
+                if job_state != "DONE":
+                    LOG.info(
+                        f"Job {submitted_job_id} is not in DONE state. Current state: {job_state}"
+                    )
+                    raise RuntimeError(f"Job {submitted_job_id} failed.")
+                else:
+                    LOG.info(f"Job {submitted_job_id} is in DONE state.")
+                    return
+
+            # If streaming deploy job, poll and check for final
+            if (
+                submitted_job_id
+                and self.mode in ["streaming", "streaming-client"]
+                and "deploy" in self._args.get("args")
+            ):
+                # Poll the dataproc job id for 5 minutes until the job
+                total_time_seconds = 5 * 60
+                interval_seconds = 10
+                start_time = time.time()
+                while time.time() - start_time < total_time_seconds:
+                    current_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
+
+                    non_terminal_states = ["SETUP_DONE", "RUNNING", "PENDING", "STATE_UNSPECIFIED"]
+                    if current_state not in non_terminal_states:
+                        raise RuntimeError(
+                            f"Flink job is not in {non_terminal_states}. "
+                            f"Current state: {current_state}"
+                        )
+
+                    manifest_path = os.path.join(
+                        self.streaming_manifest_path, self.conf_metadata_name, "manifest.txt"
+                    )
+                    manifest_exists, raw_manifest = self.download_gcs_to_text(str(manifest_path))
+
+                    if manifest_exists:
+                        manifest = raw_manifest.strip()
+                        LOG.info(
+                            f"Checking Flink manifest to confirm deployment. Manifest: [{manifest}]"
+                        )
+                        manifest_tuples = manifest.split(",")
+
+                        flink_job_id = [
+                            f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")
+                        ][0]
+                        parent_job_id = [
+                            f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")
+                        ][0]
+
+                        if parent_job_id == submitted_job_id:
+                            LOG.info(
+                                f"Flink job has been deployed successfully. Flink job ID = [{flink_job_id}]."
+                                f" Dataproc job ID = [{submitted_job_id}]"
+                            )
+                            break
+                        else:
+                            LOG.info(
+                                f"Flink manifest not updated with new Dataproc job id {submitted_job_id}."
+                            )
+                    LOG.info(f"Sleeping for {interval_seconds} seconds...")
+                    time.sleep(interval_seconds)
+                else:
+                    raise RuntimeError(
+                        f"Failed to confirm Flink manifest for new deployment with Dataproc job id {submitted_job_id}."
+                    )
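
The jar bootstrap above (compare_gcs_and_local_file_hashes plus download_zipline_dataproc_jar) can be driven directly; a minimal sketch, assuming a hypothetical artifact prefix and project (only the names defined in gcp.py come from the package, all values are placeholders):

import os

from ai.chronon.repo.constants import ZIPLINE_DIRECTORY
from ai.chronon.repo.gcp import ZIPLINE_GCP_JAR_DEFAULT, GcpRunner

# Placeholder values for illustration only.
os.environ["GCP_PROJECT_ID"] = "my-gcp-project"  # assumed to be read from the environment by get_environ_arg()
artifact_prefix = "gs://my-zipline-artifacts"    # hypothetical customer bucket

# Compares the local CRC32C hash against the GCS object and re-downloads only on mismatch.
jar_path = GcpRunner.download_zipline_dataproc_jar(
    artifact_prefix, ZIPLINE_DIRECTORY, "0.0.32", ZIPLINE_GCP_JAR_DEFAULT
)
print(jar_path)  # <ZIPLINE_DIRECTORY>/cloud_gcp_lib_deploy.jar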
ai/chronon/repo/gitpython_utils.py ADDED
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from git import Repo
+
+
+def get_default_origin_branch(path, repo: Optional[Repo] = None):
+    if not repo:
+        repo = Repo(path, search_parent_directories=True)
+    return repo.remotes.origin.refs.HEAD.reference.name.split("/")[-1]
+
+
+def get_current_branch(path, repo: Optional[Repo] = None):
+    if not repo:
+        repo = Repo(path, search_parent_directories=True)
+    return repo.active_branch.name
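
A short usage sketch for the two GitPython helpers above; the path is a placeholder for any directory inside a git checkout:

from ai.chronon.repo.gitpython_utils import get_current_branch, get_default_origin_branch

repo_path = "."  # placeholder: any path inside a cloned repository

# Branch that origin/HEAD points at, e.g. "main".
print(get_default_origin_branch(repo_path))

# Branch currently checked out in the working tree.
print(get_current_branch(repo_path))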