awx-zipline-ai 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent/ttypes.py +6 -6
- ai/chronon/airflow_helpers.py +20 -23
- ai/chronon/cli/__init__.py +0 -0
- ai/chronon/cli/compile/__init__.py +0 -0
- ai/chronon/cli/compile/column_hashing.py +40 -17
- ai/chronon/cli/compile/compile_context.py +13 -17
- ai/chronon/cli/compile/compiler.py +59 -36
- ai/chronon/cli/compile/conf_validator.py +251 -99
- ai/chronon/cli/compile/display/__init__.py +0 -0
- ai/chronon/cli/compile/display/class_tracker.py +6 -16
- ai/chronon/cli/compile/display/compile_status.py +10 -10
- ai/chronon/cli/compile/display/diff_result.py +79 -14
- ai/chronon/cli/compile/fill_templates.py +3 -8
- ai/chronon/cli/compile/parse_configs.py +10 -17
- ai/chronon/cli/compile/parse_teams.py +38 -34
- ai/chronon/cli/compile/serializer.py +3 -9
- ai/chronon/cli/compile/version_utils.py +42 -0
- ai/chronon/cli/git_utils.py +2 -13
- ai/chronon/cli/logger.py +0 -2
- ai/chronon/constants.py +1 -1
- ai/chronon/group_by.py +47 -47
- ai/chronon/join.py +46 -32
- ai/chronon/logger.py +1 -2
- ai/chronon/model.py +9 -4
- ai/chronon/query.py +2 -2
- ai/chronon/repo/__init__.py +1 -2
- ai/chronon/repo/aws.py +17 -31
- ai/chronon/repo/cluster.py +121 -50
- ai/chronon/repo/compile.py +14 -8
- ai/chronon/repo/constants.py +1 -1
- ai/chronon/repo/default_runner.py +32 -54
- ai/chronon/repo/explore.py +70 -73
- ai/chronon/repo/extract_objects.py +6 -9
- ai/chronon/repo/gcp.py +89 -88
- ai/chronon/repo/gitpython_utils.py +3 -2
- ai/chronon/repo/hub_runner.py +145 -55
- ai/chronon/repo/hub_uploader.py +2 -1
- ai/chronon/repo/init.py +12 -5
- ai/chronon/repo/join_backfill.py +19 -5
- ai/chronon/repo/run.py +42 -39
- ai/chronon/repo/serializer.py +4 -12
- ai/chronon/repo/utils.py +72 -63
- ai/chronon/repo/zipline.py +3 -19
- ai/chronon/repo/zipline_hub.py +211 -39
- ai/chronon/resources/__init__.py +0 -0
- ai/chronon/resources/gcp/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/__init__.py +0 -0
- ai/chronon/resources/gcp/group_bys/test/data.py +13 -17
- ai/chronon/resources/gcp/joins/__init__.py +0 -0
- ai/chronon/resources/gcp/joins/test/data.py +4 -8
- ai/chronon/resources/gcp/sources/__init__.py +0 -0
- ai/chronon/resources/gcp/sources/test/data.py +9 -6
- ai/chronon/resources/gcp/teams.py +9 -21
- ai/chronon/source.py +2 -4
- ai/chronon/staging_query.py +60 -19
- ai/chronon/types.py +3 -2
- ai/chronon/utils.py +21 -68
- ai/chronon/windows.py +2 -4
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/METADATA +48 -24
- awx_zipline_ai-0.3.1.dist-info/RECORD +96 -0
- awx_zipline_ai-0.3.1.dist-info/top_level.txt +4 -0
- gen_thrift/__init__.py +0 -0
- {ai/chronon → gen_thrift}/api/ttypes.py +327 -197
- {ai/chronon/api → gen_thrift}/common/ttypes.py +9 -39
- gen_thrift/eval/ttypes.py +660 -0
- {ai/chronon → gen_thrift}/hub/ttypes.py +12 -131
- {ai/chronon → gen_thrift}/observability/ttypes.py +343 -180
- {ai/chronon → gen_thrift}/planner/ttypes.py +326 -45
- ai/chronon/eval/__init__.py +0 -122
- ai/chronon/eval/query_parsing.py +0 -19
- ai/chronon/eval/sample_tables.py +0 -100
- ai/chronon/eval/table_scan.py +0 -186
- ai/chronon/orchestration/ttypes.py +0 -4406
- ai/chronon/resources/gcp/README.md +0 -174
- ai/chronon/resources/gcp/zipline-cli-install.sh +0 -54
- awx_zipline_ai-0.2.1.dist-info/RECORD +0 -93
- awx_zipline_ai-0.2.1.dist-info/licenses/LICENSE +0 -202
- awx_zipline_ai-0.2.1.dist-info/top_level.txt +0 -3
- /jars/__init__.py → /__init__.py +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/WHEEL +0 -0
- {awx_zipline_ai-0.2.1.dist-info → awx_zipline_ai-0.3.1.dist-info}/entry_points.txt +0 -0
- {ai/chronon → gen_thrift}/api/__init__.py +0 -0
- {ai/chronon/api/common → gen_thrift/api}/constants.py +0 -0
- {ai/chronon/api → gen_thrift}/common/__init__.py +0 -0
- {ai/chronon/api → gen_thrift/common}/constants.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/__init__.py +0 -0
- {ai/chronon/fetcher → gen_thrift/eval}/constants.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/__init__.py +0 -0
- {ai/chronon/hub → gen_thrift/fetcher}/constants.py +0 -0
- {ai/chronon → gen_thrift}/fetcher/ttypes.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/__init__.py +0 -0
- {ai/chronon/observability → gen_thrift/hub}/constants.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/__init__.py +0 -0
- {ai/chronon/orchestration → gen_thrift/observability}/constants.py +0 -0
- {ai/chronon → gen_thrift}/planner/__init__.py +0 -0
- {ai/chronon → gen_thrift}/planner/constants.py +0 -0
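
Note that the generated Thrift modules move from the ai.chronon namespace to a new gen_thrift top-level package (see the {ai/chronon → gen_thrift} renames above), so downstream imports have to follow. A minimal before/after sketch, assuming the sub-module layout under gen_thrift mirrors the old one as the renames suggest; the imported name api_ttypes is illustrative only:

# 0.2.1 layout: generated Thrift types lived under the ai.chronon namespace.
# 0.3.1 layout: the same generated modules are published under gen_thrift.
# A defensive shim for code that must run against both versions:
try:
    from gen_thrift.api import ttypes as api_ttypes  # 0.3.1 layout
except ImportError:
    from ai.chronon.api import ttypes as api_ttypes  # 0.2.1 layout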
ai/chronon/repo/gcp.py
CHANGED

@@ -38,9 +38,7 @@ class GcpRunner(Runner):
     def __init__(self, args):
         self._remote_artifact_prefix = args.get("artifact_prefix")
         if not self._remote_artifact_prefix:
-            raise ValueError(
-                "GCP artifact prefix not set."
-            )
+            raise ValueError("GCP artifact prefix not set.")
 
         self._version = args.get("version")
         gcp_jar_path = GcpRunner.download_zipline_dataproc_jar(
@@ -55,18 +53,13 @@ class GcpRunner(Runner):
             self._version,
             ZIPLINE_GCP_SERVICE_JAR,
         )
-        jar_path = (
-            f"{service_jar_path}:{gcp_jar_path}"
-            if args["mode"] == "fetch"
-            else gcp_jar_path
-        )
+        jar_path = f"{gcp_jar_path}:{service_jar_path}" if args["mode"] == "fetch" else gcp_jar_path
 
         self._args = args
         self.job_id = str(uuid.uuid4())
 
         super().__init__(args, os.path.expanduser(jar_path))
 
-
     @staticmethod
     def get_gcp_project_id() -> str:
         return get_environ_arg("GCP_PROJECT_ID")
@@ -101,9 +94,7 @@ class GcpRunner(Runner):
             else:
                 return blob.exists(), blob.download_as_text()
         except Exception as e:
-            raise RuntimeError(
-                f"Failed to download {source_blob_name}: {str(e)}"
-            ) from e
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
 
     @staticmethod
     @retry_decorator(retries=2, backoff=5)
@@ -128,9 +119,7 @@ class GcpRunner(Runner):
                 )
             )
         except Exception as e:
-            raise RuntimeError(
-                f"Failed to download {source_blob_name}: {str(e)}"
-            ) from e
+            raise RuntimeError(f"Failed to download {source_blob_name}: {str(e)}") from e
 
     @staticmethod
     @retry_decorator(retries=2, backoff=5)
@@ -163,9 +152,7 @@ class GcpRunner(Runner):
         blob = bucket.get_blob(blob_name)
 
         if not blob:
-            raise FileNotFoundError(
-                f"File {blob_name} not found in bucket {bucket_name}"
-            )
+            raise FileNotFoundError(f"File {blob_name} not found in bucket {bucket_name}")
 
         return blob.crc32c
 
@@ -191,9 +178,7 @@ class GcpRunner(Runner):
         return base64.b64encode(crc32c_hash.digest()).decode("utf-8")
 
     @staticmethod
-    def compare_gcs_and_local_file_hashes(
-        remote_file_path: str, local_file_path: str
-    ) -> bool:
+    def compare_gcs_and_local_file_hashes(remote_file_path: str, local_file_path: str) -> bool:
         """
         Compare hashes of a GCS file and a local file to check if they're identical.
 
@@ -219,15 +204,14 @@ class GcpRunner(Runner):
             return False
 
     @staticmethod
-    def download_zipline_dataproc_jar(remote_file_path: str, local_file_path: str, version: str, jar_name: str
+    def download_zipline_dataproc_jar(
+        remote_file_path: str, local_file_path: str, version: str, jar_name: str
     ):
        source_path = os.path.join(remote_file_path, "release", version, "jars", jar_name)
        dest_path = os.path.join(local_file_path, jar_name)
 
        are_identical = (
-            GcpRunner.compare_gcs_and_local_file_hashes(
-                source_path, dest_path
-            )
+            GcpRunner.compare_gcs_and_local_file_hashes(source_path, dest_path)
            if os.path.exists(dest_path)
            else False
        )
@@ -235,9 +219,7 @@ class GcpRunner(Runner):
        if are_identical:
            LOG.info(f"{dest_path} matches GCS {source_path}")
        else:
-            LOG.info(
-                f"{dest_path} does NOT match GCS {source_path}"
-            )
+            LOG.info(f"{dest_path} does NOT match GCS {source_path}")
            LOG.info(f"Downloading {jar_name} from GCS...")
 
            GcpRunner.download_gcs_file(source_path, dest_path)
@@ -251,7 +233,6 @@ class GcpRunner(Runner):
        job_type: JobType = JobType.SPARK,
        metadata_conf_path: str = None,
    ):
-
        parsed = urlparse(customer_artifact_prefix)
        source_blob_name = parsed.path.lstrip("/")
 
@@ -263,7 +244,7 @@ class GcpRunner(Runner):
                source_blob_name,
                "metadata",
                self.job_id,
-                f"{extract_filename_from_path(metadata_conf_path)}"
+                f"{extract_filename_from_path(metadata_conf_path)}",
            )
            gcs_files.append(
                GcpRunner.upload_gcs_blob(
@@ -283,35 +264,42 @@ class GcpRunner(Runner):
            main_class = "ai.chronon.flink.FlinkJob"
            flink_jar_uri = os.path.join(release_prefix, f"{ZIPLINE_GCP_FLINK_JAR_DEFAULT}")
            enable_pubsub = GcpRunner.is_pubsub_enabled()
-            flink_pubsub_connector_jar_uri = os.path.join(
-                …
+            flink_pubsub_connector_jar_uri = os.path.join(
+                release_prefix, f"{ZIPLINE_GCP_FLINK_PUBSUB_JAR_DEFAULT}"
+            )
+            base_formatted_args = (
+                final_args.format(
                    user_args=user_args,
                    jar_uri=jar_uri,
                    job_type=job_type.value,
                    main_class=main_class,
                    zipline_version=self._version,
                    job_id=self.job_id,
-            )
+                )
+                + f" --flink-main-jar-uri={flink_jar_uri}"
+            )
            if enable_pubsub:
                base_formatted_args += f" --flink-pubsub-jar-uri={flink_pubsub_connector_jar_uri}"
            return base_formatted_args
 
        elif job_type == JobType.SPARK:
            main_class = "ai.chronon.spark.Driver"
-            return " ".join(
-                …
+            return " ".join(
+                [
+                    final_args.format(
+                        user_args=user_args,
+                        jar_uri=jar_uri,
+                        job_type=job_type.value,
+                        main_class=main_class,
+                        zipline_version=self._version,
+                        job_id=self.job_id,
+                    ),
+                    "--is-gcp",
+                    f"--gcp-project-id={GcpRunner.get_gcp_project_id()}",
+                    f"--gcp-bigtable-instance-id={GcpRunner.get_gcp_bigtable_instance_id()}",
+                    f"--files={gcs_file_args}" if gcs_file_args else "",
+                ]
+            )
        else:
            raise ValueError(f"Invalid job type: {job_type}")
 
@@ -335,7 +323,6 @@ class GcpRunner(Runner):
            "--streaming-manifest-path": self.streaming_manifest_path,
            "--streaming-checkpoint-path": self.streaming_checkpoint_path,
            "--local-zipline-version": self._version,
-
            # Need these for extracting metadata name in submitter
            "--local-conf-path": self.local_abs_conf_path,
            "--original-mode": self.mode,
@@ -405,19 +392,19 @@ class GcpRunner(Runner):
        # for now only poking for a particular partition is supported.
        args = self._args.get("args")
        supported_subcommands = ["check-partitions"]
-        assert (
-            "…
-        )
-        assert (
-            "…
-        )
+        assert "check-partitions" in args, (
+            f"Must specify one of the following subcommands: {supported_subcommands}"
+        )
+        assert "--partition-names" in args, (
+            "Must specify a list of `--partition-names=schema.table/pk1=pv1/pk2=pv2"
+        )
 
        dataproc_args = self.generate_dataproc_submitter_args(
            # for now, self.conf is the only local file that requires uploading to gcs
            user_args=self._gen_final_args(),
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None
+            metadata_conf_path=str(os.path.join(self.repo, self.conf)) if self.conf else None,
        )
        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
@@ -446,14 +433,10 @@ class GcpRunner(Runner):
                # the file is copied to root and not the complete path
                # is copied.
                override_conf_path=(
-                    extract_filename_from_path(self.conf)
-                    if self.conf
-                    else None
+                    extract_filename_from_path(self.conf) if self.conf else None
                ),
            ),
-            additional_args=os.environ.get(
-                "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-            ),
+            additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
        )
 
        dataproc_args = self.generate_dataproc_submitter_args(
@@ -461,11 +444,11 @@ class GcpRunner(Runner):
            user_args=user_args,
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf))
-            …
-            …
-            f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
+            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            if self.conf
+            else None,
        )
+        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
    else:
        user_args = ("{subcommand} {args} {additional_args}").format(
@@ -481,15 +464,15 @@ class GcpRunner(Runner):
                    extract_filename_from_path(self.conf) if self.conf else None
                ),
            ),
-            additional_args=os.environ.get(
-                "CHRONON_CONFIG_ADDITIONAL_ARGS", ""
-            ),
+            additional_args=os.environ.get("CHRONON_CONFIG_ADDITIONAL_ARGS", ""),
        )
        dataproc_args = self.generate_dataproc_submitter_args(
            user_args=user_args,
            version=self._version,
            customer_artifact_prefix=self._remote_artifact_prefix,
-            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            metadata_conf_path=str(os.path.join(self.repo, self.conf))
+            if self.conf
+            else None,
        )
        command = f"java -cp {self.jar_path} {DATAPROC_ENTRY} {dataproc_args}"
        command_list.append(command)
@@ -498,9 +481,7 @@ class GcpRunner(Runner):
        # parallel backfill mode
        with multiprocessing.Pool(processes=int(self.parallelism)) as pool:
            LOG.info(
-                "Running args list {} with pool size {}".format(
-                    command_list, self.parallelism
-                )
+                "Running args list {} with pool size {}".format(command_list, self.parallelism)
            )
            pool.map(check_call, command_list)
    elif len(command_list) == 1:
@@ -518,7 +499,6 @@ class GcpRunner(Runner):
            log[log.index(dataproc_submitter_id_str) + len(dataproc_submitter_id_str) + 1 :]
        ).strip()
 
-
        if not self.disable_cloud_logging and submitted_job_id:
            LOG.info(
                """
@@ -537,17 +517,24 @@ class GcpRunner(Runner):
        # Fetch the final job state
        job_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
 
-        LOG.info(
-            …
+        LOG.info(
+            "<<<<<<<<<<<<<<<<-----------------JOB STATUS----------------->>>>>>>>>>>>>>>>>"
+        )
+        if job_state != "DONE":
+            LOG.info(
+                f"Job {submitted_job_id} is not in DONE state. Current state: {job_state}"
+            )
            raise RuntimeError(f"Job {submitted_job_id} failed.")
        else:
            LOG.info(f"Job {submitted_job_id} is in DONE state.")
            return
 
    # If streaming deploy job, poll and check for final
-    if (
-        …
+    if (
+        submitted_job_id
+        and self.mode in ["streaming", "streaming-client"]
+        and "deploy" in self._args.get("args")
+    ):
        # Poll the dataproc job id for 5 minutes until the job
        total_time_seconds = 5 * 60
        interval_seconds = 10
@@ -555,28 +542,42 @@ class GcpRunner(Runner):
        while time.time() - start_time < total_time_seconds:
            current_state = GcpRunner.get_state_dataproc_job(submitted_job_id)
 
-            non_terminal_states = [
+            non_terminal_states = ["SETUP_DONE", "RUNNING", "PENDING", "STATE_UNSPECIFIED"]
            if current_state not in non_terminal_states:
-                raise RuntimeError(
-                    …
+                raise RuntimeError(
+                    f"Flink job is not in {non_terminal_states}. "
+                    f"Current state: {current_state}"
+                )
 
-            manifest_path = os.path.join(
+            manifest_path = os.path.join(
+                self.streaming_manifest_path, self.conf_metadata_name, "manifest.txt"
+            )
            manifest_exists, raw_manifest = self.download_gcs_to_text(str(manifest_path))
 
            if manifest_exists:
                manifest = raw_manifest.strip()
-                LOG.info(
+                LOG.info(
+                    f"Checking Flink manifest to confirm deployment. Manifest: [{manifest}]"
+                )
                manifest_tuples = manifest.split(",")
 
-                flink_job_id = [
-                    …
+                flink_job_id = [
+                    f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")
+                ][0]
+                parent_job_id = [
+                    f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")
+                ][0]
 
                if parent_job_id == submitted_job_id:
-                    LOG.info(
-                        …
+                    LOG.info(
+                        f"Flink job has been deployed successfully. Flink job ID = [{flink_job_id}]."
+                        f" Dataproc job ID = [{submitted_job_id}]"
+                    )
                    break
                else:
-                    LOG.info(
+                    LOG.info(
+                        f"Flink manifest not updated with new Dataproc job id {submitted_job_id}."
+                    )
                LOG.info(f"Sleeping for {interval_seconds} seconds...")
                time.sleep(interval_seconds)
            else:
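
The streaming deploy path above polls a manifest.txt under the streaming manifest path and extracts flinkJobId and parentJobId from it. A standalone sketch of that parsing step, assuming a single comma-separated line of key=value pairs as implied by the code; the sample manifest contents and job id below are made up for illustration:

# Sketch of the manifest check performed after a streaming "deploy" submission.
raw_manifest = "flinkJobId=abc123,parentJobId=dataproc-uuid"  # illustrative values
submitted_job_id = "dataproc-uuid"                            # illustrative value

manifest_tuples = raw_manifest.strip().split(",")
flink_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("flinkJobId")][0]
parent_job_id = [f.split("=")[1] for f in manifest_tuples if f.startswith("parentJobId")][0]

# The deployment is treated as confirmed once the manifest's parentJobId matches
# the Dataproc job id that was just submitted.
if parent_job_id == submitted_job_id:
    print(f"Flink job {flink_job_id} deployed by Dataproc job {submitted_job_id}")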
ai/chronon/repo/gitpython_utils.py
CHANGED

@@ -6,9 +6,10 @@ from git import Repo
 def get_default_origin_branch(path, repo: Optional[Repo] = None):
     if not repo:
         repo = Repo(path, search_parent_directories=True)
-    return repo.remotes.origin.refs.HEAD.reference.name.split(…
+    return repo.remotes.origin.refs.HEAD.reference.name.split("/")[-1]
+
 
 def get_current_branch(path, repo: Optional[Repo] = None):
     if not repo:
         repo = Repo(path, search_parent_directories=True)
-    return repo.active_branch.name
+    return repo.active_branch.name