acdc_aws_etl_pipeline 0.6.4-py3-none-any.whl → 0.6.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acdc_aws_etl_pipeline/upload/metadata_submitter.py +388 -157
- {acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/METADATA +2 -1
- {acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/RECORD +4 -4
- {acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/WHEEL +0 -0
acdc_aws_etl_pipeline/upload/metadata_submitter.py

@@ -9,6 +9,12 @@ from gen3.index import Gen3Index
 from gen3.submission import Gen3Submission
 import logging
 from datetime import datetime
+import jwt
+from typing import Dict, List
+import re
+import pandas as pd
+import uuid
+from acdc_aws_etl_pipeline.validate.validate import write_parquet_to_db
 
 logger = logging.getLogger(__name__)
 
@@ -201,16 +207,17 @@ def read_metadata_json_s3(s3_uri: str, session) -> dict:
     logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {s3_uri}")
     return data
 
-def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
+def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = None) -> list:
     """
-    Read a DataImportOrder.txt file from S3 and return node order as a list.
+    Read a DataImportOrder.txt file from S3 and return node order as a list, optionally excluding some nodes.
 
     Args:
         s3_uri (str): S3 URI to the DataImportOrder.txt file.
         session (boto3.Session): Boto3 session.
+        exclude_nodes (list, optional): Node names to exclude from result.
 
     Returns:
-        list: Node names (order as listed in file).
+        list: Node names (order as listed in file), optionally excluding nodes in exclude_nodes.
 
     Raises:
         ValueError: If the provided S3 URI does not point to DataImportOrder.txt.
@@ -224,7 +231,11 @@ def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
     obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
     content = obj['Body'].read().decode('utf-8')
     import_order = [line.rstrip() for line in content.splitlines() if line.strip()]
-    logger.debug(f"
+    logger.debug(f"Raw import order from S3 file: {import_order}")
+    if exclude_nodes is not None:
+        import_order = [node for node in import_order if node not in exclude_nodes]
+        logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
+    logger.debug(f"Final import order from S3 file {s3_uri}: {import_order}")
     return import_order
 
 def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
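To illustrate the new `exclude_nodes` parameter added above, here is a minimal, hypothetical usage sketch; the bucket, key, and profile names are placeholders and not part of the package:

```python
# Hypothetical call showing the new exclude_nodes filter; the S3 URI and
# profile name below are placeholders, not values from the package.
import boto3
from acdc_aws_etl_pipeline.upload.metadata_submitter import read_data_import_order_txt_s3

session = boto3.Session(profile_name="example-profile")
order = read_data_import_order_txt_s3(
    "s3://example-bucket/DataImportOrder.txt",
    session,
    exclude_nodes=["program", "project"],
)
# Nodes named in exclude_nodes are dropped; the order of the remaining nodes is preserved.
print(order)
```

Passing `exclude_nodes=None` (the default) keeps the previous behaviour of returning every node listed in the file.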
@@ -347,7 +358,27 @@ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) ->
         logger.error(f"Error parsing Gen3 API key from AWS Secrets Manager: {e}")
         raise e
 
-def create_gen3_submission_class(api_key: dict, api_endpoint: str):
+
+def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
+    """
+    Extracts the URL from a JSON Web Token (JWT) credential.
+
+    Args:
+        jwt_string (string): The JSON Web Token (JWT) credential.
+
+    Returns:
+        str: The extracted URL.
+    """
+    logger.info("Decoding JWT to extract API URL.")
+    url = jwt.decode(jwt_token, options={"verify_signature": False}).get('iss', '')
+    if '/user' in url:
+        url = url.split('/user')[0]
+    url = f"{url}/api/{api_version}"
+    logger.info(f"Extracted API URL from JWT: {url}")
+    return url
+
+
+def create_gen3_submission_class(api_key: dict):
     """
     Create and authenticate a Gen3Submission client using a temporary file for API key.
 
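The endpoint derivation added above relies only on the unverified `iss` claim of the JWT. The snippet below reproduces that logic with PyJWT against a made-up issuer URL, purely as an illustration:

```python
# Demonstration of the 'iss'-based endpoint derivation used above; the issuer
# URL and signing key are placeholders, and the token is decoded with
# signature verification disabled, exactly as in the diffed helper.
import jwt

token = jwt.encode({"iss": "https://data.example.org/user"}, "dummy-secret", algorithm="HS256")

url = jwt.decode(token, options={"verify_signature": False}).get("iss", "")
if "/user" in url:
    url = url.split("/user")[0]
print(f"{url}/api/v0")  # -> https://data.example.org/api/v0
```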
@@ -357,185 +388,385 @@ def create_gen3_submission_class(api_key: dict, api_endpoint: str):
 
     Returns:
         Gen3Submission: An authenticated Gen3Submission object.
-
-    Notes:
-        The temporary file storing the API key is deleted after use.
     """
-
-
+    logger.debug("Extracting JWT token from API key dict.")
+    jwt_token = api_key['api_key']
+    logger.info("Inferring API endpoint from JWT token.")
+    api_endpoint = infer_api_endpoint_from_jwt(jwt_token)
+    logger.debug(f"Inferred API endpoint: {api_endpoint}")
     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
-
-    submit =
-
-    try:
-        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", dir="/tmp") as tmp_file:
-            json.dump(api_key, tmp_file)
-            tmp_api_key_path = tmp_file.name
-        auth = Gen3Auth(refresh_file=tmp_api_key_path)
-        submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-        return submit
-    finally:
-        if tmp_api_key_path and os.path.exists(tmp_api_key_path):
-            try:
-                os.remove(tmp_api_key_path)
-                logger.debug(f"Temporary API key file {tmp_api_key_path} deleted.")
-            except Exception as e:
-                logger.warning(f"Failed to delete temporary API key file {tmp_api_key_path}: {e}")
+    auth = Gen3Auth(refresh_token=api_key)
+    submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+    return submit
 
-def write_submission_results(results, output_path, mode='w'):
-    with open(output_path, mode) as f:
-        json.dump(results, f, indent=4)
 
-def
-
-
-
+def submit_data_chunks(
+    split_json_list: list,
+    node: str,
+    gen3_submitter,
     project_id: str,
-
-
-
-
-    max_retries: int = 5,
-    write_submission_results_path: str = None
-):
+    max_retries: int,
+    file_path: str,
+    program_id: str = "program1"
+) -> List[Dict]:
     """
-    Submit
+    Submit each chunk of data (in split_json_list) for a given node to Gen3, using retry logic and logging on failures.
 
     Args:
-
-
-
-        project_id (str):
-
-
-
-        exclude_nodes (list, optional): List of node names to skip (default: ["project", "program", "acknowledgement", "publication"]).
-        max_retries (int, optional): Maximum number of retry attempts per node chunk. Default: 5.
+        split_json_list (list): List of JSON-serializable chunked data to submit.
+        node (str): Name of the data node being submitted.
+        gen3_submitter: A Gen3Submission instance for making submissions.
+        project_id (str): The project identifier within Gen3.
+        max_retries (int): Maximum number of retry attempts per chunk on failure.
+        file_path (str): Path of the file that was submitted. Used only for data capture.
+        program_id (str, optional): The Gen3 program id (default: "program1").
 
     Returns:
-
+        List[Dict]: List of response dictionaries for each submitted chunk.
 
     Raises:
-        Exception:
+        Exception: If submission fails after all retry attempts for any chunk.
+    """
+
+    n_json_data = len(split_json_list)
+    response_results = []
+
+    for index, jsn in enumerate(split_json_list):
+        progress_str = f"{index + 1}/{n_json_data}"
+
+        submission_success = False
+        last_exception = None
 
-
-
+        attempt = 0
+        while attempt <= max_retries:
+            try:
+                if attempt == 0:
+                    log_msg = (
+                        f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                        f"Split: {progress_str:<5}"
+                    )
+                    logger.info(log_msg)
+                else:
+                    log_msg = (
+                        f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                        f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
+                    )
+                    logger.warning(log_msg)
+
+                res = gen3_submitter.submit_record(program_id, project_id, jsn)
+                res.update({"file_path": file_path})
+                response_results.append(res)
+                submission_success = True
+                logger.info(
+                    f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                    f"Node: {node:<12} | Split: {progress_str:<5}"
+                )
+                break  # Success
+
+            except Exception as e:
+                last_exception = e
+                logger.error(
+                    f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                )
+
+                if attempt < max_retries:
+                    import time
+                    time.sleep(0.2)
+                else:
+                    logger.critical(
+                        f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                        f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                    )
+            attempt += 1
+
+        if not submission_success:
+            # After retries, still failed
+            raise Exception(
+                f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
+                f"Last error: {last_exception}"
+            )
+
+    logger.info(f"Finished submitting node '{node}'.")
+    return response_results
+
+
+def flatten_submission_results(submission_results: List[Dict]) -> List[Dict]:
     """
+    Flattens a list of Gen3 submission result dictionaries into a single list of entity dictionaries.
 
-
-
-
+    For each submission result, this function processes its entities (if any),
+    extracting the 'project_id' and 'submitter_id' from the 'unique_keys' field (if present)
+    into the top-level entity dictionary for easy access.
 
-
-    exclude_nodes = ["project", "program", "acknowledgement", "publication"]
+    Any submission result that does not have a code of 200 or lacks entities is skipped, and a warning is logged.
 
-
-
+    Args:
+        submission_results (List[Dict]):
+            A list of Gen3 submission result dictionaries, each containing at least a "code" and "entities" entry.
 
-
-
+    Returns:
+        List[Dict]:
+            A flat list, where each element is an entity dictionary (with keys 'project_id' and 'submitter_id' added if available).
+    """
+    flat_list_dict = []
+    total = len(submission_results)
+    logger.info(f"Flattening {total} submission result(s)...")
+
+    for idx, obj in enumerate(submission_results, 1):
+        transaction_id = obj.get("transaction_id")
+        code = obj.get("code")
+        if code != 200:
+            logger.warning(f"Skipping submission result at index {idx-1} (code={code})")
+            continue
+
+        entities = obj.get("entities")
+
+        if entities is None:
+            logger.warning(f"No entities found in submission result at index {idx-1}")
+            continue
+
+        logger.info(f"Processing submission result {idx} of {total}, {len(entities)} entities")
+
+        for entity in entities:
+            unique_keys = entity.get("unique_keys", [{}])
+            if unique_keys and isinstance(unique_keys, list):
+                keys = unique_keys[0]
+                entity["project_id"] = keys.get("project_id")
+                entity["submitter_id"] = keys.get("submitter_id")
+            entity["transaction_id"] = transaction_id
+            entity["file_path"] = obj.get("file_path", '')
+            flat_list_dict.append(entity)
+
+    # renaming cols
+    for entity in flat_list_dict:
+        entity["gen3_guid"] = entity.pop("id", None)
+        entity["node"] = entity.pop("type", None)
+
+    logger.info(f"Finished flattening. Total entities: {len(flat_list_dict)}")
+    return flat_list_dict
+
+
+def find_version_from_path(path):
+    version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
+    found_versions = []
+
+    for segment in path.split('/'):
+        match = version_pattern.match(segment)
+        if match:
+            found_versions.append(match.group(1))
+
+    if not found_versions:
+        return None
+
+    if len(found_versions) > 1:
+        logger.warning("more than one match found in path for version string")
+
+    return found_versions[-1]
+
+
+def collect_versions_from_metadata_file_list(metadata_file_list):
+    versions = []
+    for file_path in metadata_file_list:
+        version = find_version_from_path(file_path)
+        if version:
+            versions.append(version)
+    versions = list(set(versions))
+    if len(versions) > 1:
+        logger.error(f"more than one version found in metadata file list: {metadata_file_list}")
+        raise
+    return versions[0]
+
+
+class MetadataSubmitter:
+    def __init__(
+        self,
+        metadata_file_list: list,
+        api_key: dict,
+        project_id: str,
+        data_import_order_path: str,
+        program_id: str = "program1",
+        max_size_kb: int = 100,
+        exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
+        max_retries: int = 3,
+        aws_profile: str = None
+    ):
+        """
+        Initialises a MetadataSubmitter for submitting a set of metadata JSON files to a Gen3 data commons endpoint, in order.
 
+        Args:
+            metadata_file_list (list): List of local file paths or S3 URIs to metadata .json files, one per node type.
+            api_key (dict): Gen3 API key as a parsed dictionary.
+            project_id (str): Gen3 project ID to submit data to.
+            data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying node submission order.
+            program_id (str, optional): Gen3 program ID (default: "program1").
+            max_size_kb (int, optional): Maximum size per submission chunk, in KB (default: 100).
+            exclude_nodes (list, optional): List of node names to skip during submission (default: ["project", "program", "acknowledgement", "publication"]).
+            max_retries (int, optional): Maximum number of retry attempts per node chunk (default: 3).
+            aws_profile (str, optional): AWS CLI named profile to use for boto3 session (default: None).
+        """
+        self.metadata_file_list = metadata_file_list
+        self.api_key = api_key
+        self.project_id = project_id
+        self.data_import_order_path = data_import_order_path
+        self.program_id = program_id
+        self.max_size_kb = max_size_kb
+        self.exclude_nodes = exclude_nodes
+        self.max_retries = max_retries
+        self.submission_results = []
+        self.aws_profile = aws_profile
+        self.boto3_session = self._create_boto3_session()
+        logger.info("MetadataSubmitter initialised.")
+
+    def _create_gen3_submission_class(self):
+        return create_gen3_submission_class(self.api_key)
+
+    def _create_boto3_session(self):
+        return create_boto3_session(self.aws_profile)
+
+    def _read_data_import_order(self, data_import_order_path: str, exclude_nodes: list[str], boto3_session = None):
         if is_s3_uri(data_import_order_path):
-
-
-            logger.debug(f"Import order from S3: {import_order}")
+            session = boto3_session or self.boto3_session
+            return read_data_import_order_txt_s3(data_import_order_path, session, exclude_nodes)
         else:
-
-            import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
-            logger.debug(f"Import order from file: {import_order}")
+            return read_data_import_order_txt(data_import_order_path, exclude_nodes)
 
-
+    def _prepare_json_chunks(self, metadata_file_path: str, max_size_kb: int) -> List[List[Dict]]:
+        """
+        Read JSON data from a given file path and split it into chunks,
+        each with a maximum size of ``max_size_kb`` kilobytes.
 
-
-
-
-            continue
-        file = file_map.get(node)
-        if not file:
-            logger.info(f"Skipping node '{node}' (not present in file list).")
-            continue
+        Args:
+            metadata_file_path (str): File path (local or S3 URI) to the JSON data.
+            max_size_kb (int): Maximum allowed size (in kilobytes) for each chunk.
 
-
+        Returns:
+            list: A list of chunks, where each chunk is a list of dictionaries
+                containing JSON data.
+        """
+        logger.info(f"Reading metadata json from {metadata_file_path}")
+        if is_s3_uri(metadata_file_path):
+            session = self.boto3_session
+            data = read_metadata_json_s3(metadata_file_path, session)
+        else:
+            data = read_metadata_json(metadata_file_path)
+        return split_json_objects(data, max_size_kb)
 
-
-
-
-                json_data = read_metadata_json_s3(file, boto3_session)
-            else:
-                logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
-                json_data = read_metadata_json(file)
-        except Exception as e:
-            logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
-            raise Exception(f"Failed to read JSON metadata for node '{node}' from {file}: {e}")
+    def _create_file_map(self):
+        """
+        Generate a mapping from node names to metadata file paths.
 
-
-
-
-            f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
-        )
+        This method infers the node name for each file in `self.metadata_file_list`
+        and returns a dictionary where the keys are node names and the values
+        are the corresponding file paths.
 
-
-
-
-
-
-
-
-
-                f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
-                f"Split: {progress_str:<5}"
-                if attempt == 0 else
-                f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
-                f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
-            )
-            logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
-
-            res = submit.submit_record("program1", project_id, jsn)
-
-            if write_submission_results_path is not None:
-                log_filename = os.path.join(
-                    log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
-                )
-                abs_log_filename = os.path.abspath(log_filename)
-                with open(abs_log_filename, "a") as f:
-                    json.dump(res, f)
-                    f.write("\n")
-                logger.info(
-                    f"Wrote submission response to log file: {abs_log_filename}"
-                )
-
-            logger.info(
-                f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
-                f"Node: {node:<12} | Split: {progress_str:<5}"
-            )
-            submission_success = True
-            break  # Success
-
-        except Exception as e:
-            last_exception = e
-            logger.error(
-                f"Error submitting chunk {progress_str} for node '{node}': {e}"
-            )
-            if attempt < max_retries:
-                import time
-                time.sleep(0.2)
-            else:
-                logger.critical(
-                    f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                    f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                )
-
-        if not submission_success:
-            # After retries, still failed
-            raise Exception(
-                f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
-                f"Last error: {last_exception}"
-            )
+        Returns:
+            dict: Dictionary mapping node names (str) to their associated metadata file paths (str).
+        """
+        file_map = {
+            get_node_from_file_path(file): file
+            for file in self.metadata_file_list
+        }
+        return file_map
 
-
+    def submit_metadata(self) -> List[Dict]:
+        """
+        Submits metadata for each node defined in the data import order, except those in the exclude list.
 
-
+        For each node, this method retrieves the corresponding metadata file, splits the JSON data
+        into size-constrained chunks, and submits each chunk to the Gen3 submission API. Responses
+        from all submissions are gathered and returned as a list.
 
-
-
-
+        Returns:
+            List[Dict]: A list of response dictionaries returned from the Gen3 metadata submissions.
+        """
+        gen3_submitter = self._create_gen3_submission_class()
+        data_import_order = self._read_data_import_order(self.data_import_order_path, self.exclude_nodes, self.boto3_session)
+        file_map = self._create_file_map()
+        output_response_list_dict = []
+
+        logger.info("Starting metadata submission.")
+        for node in data_import_order:
+
+            if node in self.exclude_nodes:
+                logger.info(f"Skipping node '{node}' (in exclude list).")
+                continue
+            file_path = file_map.get(node)
+            if not file_path:
+                logger.info(f"Skipping node '{node}' (not present in file list).")
+                continue
+
+            logger.info(f"Processing file '{file_path}' for node '{node}'.")
+            logger.info("Splitting JSON data into chunks.")
+            json_chunks = self._prepare_json_chunks(file_path, self.max_size_kb)
+
+            logger.info("Submitting chunks to Gen3.")
+            response_list = submit_data_chunks(
+                split_json_list=json_chunks,
+                node=node,
+                file_path=file_path,
+                gen3_submitter=gen3_submitter,
+                project_id=self.project_id,
+                max_retries=self.max_retries,
+                program_id=self.program_id
+            )
+            output_response_list_dict.extend(response_list)
+
+        self.submission_results = output_response_list_dict
+        return output_response_list_dict
+
+    def upload_metadata_submission_results(
+        self,
+        dataset_root: str,
+        database: str,
+        table: str,
+        partition_cols: list = ["upload_datetime"],
+    ):
+        """
+        Uploads the submission results to s3 and parquet table.
+
+        Args:
+            dataset_root (str): S3 path where the parquet files will be stored
+                (e.g., "s3://acdc-dataops-metadata/metadata_upload/").
+            database (str): Database name for storing the metadata upload
+                (e.g., "acdc_dataops_metadata_db").
+            table (str): Table name for storing the metadata upload
+                (e.g., "metadata_upload").
+            partition_cols (list, optional): List of column names to partition the parquet table by.
+                Defaults to ["upload_datetime"].
+        """
+        logger.info("Collecting version from metadata file list.")
+        version = collect_versions_from_metadata_file_list(self.metadata_file_list)
+        logger.info(f"Extracted version: {version}")
+
+        logger.info("Inferring API endpoint from JWT.")
+        api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
+        logger.info(f"Using API endpoint: {api_endpoint}")
+
+        upload_datetime = datetime.now().isoformat()
+        upload_id = str(uuid.uuid4())
+        logger.info(f"Upload datetime: {upload_datetime}")
+        logger.info(f"Generated upload ID: {upload_id}")
+
+        logger.info("Flattening submission results for upload.")
+        flattened_results = flatten_submission_results(self.submission_results)
+        logger.info(f"Flattened {len(flattened_results)} submission result entries.")
+
+        logger.info("Converting flattened results to DataFrame.")
+        flattened_results_df = pd.DataFrame(flattened_results)
+        flattened_results_df['upload_datetime'] = upload_datetime
+        flattened_results_df['upload_id'] = upload_id
+        flattened_results_df['api_endpoint'] = api_endpoint
+        flattened_results_df['version'] = version
+
+        logger.info(
+            f"Writing DataFrame to parquet and S3/table: "
+            f"dataset_root={dataset_root}, database={database}, table={table}, partition_cols={partition_cols}"
+        )
+        write_parquet_to_db(
+            df=flattened_results_df,
+            dataset_root=dataset_root,
+            database=database,
+            table=table,
+            partition_cols=partition_cols
+        )
+        logger.info("Metadata submission results upload complete.")
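Taken together, the new MetadataSubmitter class ties these helpers into a single flow. A hedged end-to-end sketch follows; all paths, IDs, and the credentials file name are assumptions for illustration, while the dataset_root, database, and table values are the examples given in the docstring above:

```python
# Putting the new pieces together; every literal below (file paths, project id,
# profile and credentials file) is illustrative, not taken from the package.
import json

from acdc_aws_etl_pipeline.upload.metadata_submitter import MetadataSubmitter

with open("credentials.json") as fh:   # a downloaded Gen3 API key file (assumed)
    api_key = json.load(fh)

submitter = MetadataSubmitter(
    metadata_file_list=[
        "s3://example-bucket/metadata/v1.2.3/subject.json",
        "s3://example-bucket/metadata/v1.2.3/sample.json",
    ],
    api_key=api_key,
    project_id="example_project",
    data_import_order_path="s3://example-bucket/metadata/v1.2.3/DataImportOrder.txt",
    aws_profile="example-profile",
)

results = submitter.submit_metadata()            # submits node by node, chunked and retried
submitter.upload_metadata_submission_results(    # records the responses as a parquet table
    dataset_root="s3://acdc-dataops-metadata/metadata_upload/",
    database="acdc_dataops_metadata_db",
    table="metadata_upload",
)
```

Note that `upload_metadata_submission_results` derives a release version from the metadata file paths, which is why the illustrative S3 keys include a `v1.2.3` segment for `collect_versions_from_metadata_file_list` to find.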
{acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.6.4
+Version: 0.6.6
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -17,6 +17,7 @@ Requires-Dist: dbt-core (==1.9.4)
 Requires-Dist: gen3 (>=4.27.4,<5.0.0)
 Requires-Dist: gen3_validator (>=2.0.0,<3.0.0)
 Requires-Dist: numpy (<2.0.0)
+Requires-Dist: pyjwt (>=2.10.1,<3.0.0)
 Requires-Dist: pytest
 Requires-Dist: python-dotenv
 Requires-Dist: pytz (>=2025.2,<2026.0)
{acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/RECORD

@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6
 acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
 acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
 acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
-acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=
+acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=k5q5hRkj-dWo25z9nVZI2eNh0xnmQU8TPDffSSnQlUY,29906
 acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
 acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
 acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
 acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
 acdc_aws_etl_pipeline/validate/validate.py,sha256=zLqK9i92FsRAaBOGdY-G7-vb0e6tmkoUXhY6zCfbjN8,24895
-acdc_aws_etl_pipeline-0.6.
-acdc_aws_etl_pipeline-0.6.
-acdc_aws_etl_pipeline-0.6.
+acdc_aws_etl_pipeline-0.6.6.dist-info/METADATA,sha256=n2wMY9pJS49KUdUmhzd-JkPJHx7Fe4XMmMLGH4kI1eo,2926
+acdc_aws_etl_pipeline-0.6.6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+acdc_aws_etl_pipeline-0.6.6.dist-info/RECORD,,

{acdc_aws_etl_pipeline-0.6.4.dist-info → acdc_aws_etl_pipeline-0.6.6.dist-info}/WHEEL: File without changes