acdc_aws_etl_pipeline 0.4.5__tar.gz → 0.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/PKG-INFO +1 -1
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/pyproject.toml +1 -1
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +101 -91
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/README.md +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
- {acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
{acdc_aws_etl_pipeline-0.4.5 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py (+101 -91)

@@ -418,111 +418,121 @@ def submit_metadata(
     Notes:
         Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
     """
-
+
     timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
     log_dir = f"submission_logs/{timestamp}"
     os.makedirs(log_dir, exist_ok=True)
-
+
     if exclude_nodes is None:
         exclude_nodes = ["project", "program", "acknowledgement", "publication"]

     logger.info("Starting metadata submission process.")
     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
-    submit = create_gen3_submission_class(api_key, api_endpoint)
-
-    if is_s3_uri(data_import_order_path):
-        logger.info(f"Reading import order from S3: {data_import_order_path}")
-        import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
-        logger.debug(f"Import order from S3: {import_order}")
-    else:
-        logger.info(f"Reading import order from file: {data_import_order_path}")
-        import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
-        logger.debug(f"Import order from file: {import_order}")
-
-    # Map node name to file for fast access and avoid repeatedly scanning file_list
-    file_map = {get_node_from_file_path(file): file for file in file_list}
-
-    for node in import_order:
-        if node in exclude_nodes:
-            logger.info(f"Skipping node '{node}' (in exclude list).")
-            continue
-        file = file_map.get(node)
-        if not file:
-            logger.info(f"Skipping node '{node}' (not present in file list).")
-            continue
-
-        logger.info(f"Processing file '{file}' for node '{node}'.")
-
-        try:
-            if is_s3_uri(file):
-                logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
-                json_data = read_metadata_json_s3(file, boto3_session)
-            else:
-                logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
-                json_data = read_metadata_json(file)
-        except Exception as e:
-            logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
-            continue
-
-        if not json_data:
-            logger.info(f"Skipping node '{node}' due to errors in reading JSON.")
-            continue
-
-        split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
-        n_json_data = len(split_json_list)
-        logger.info(
-            f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
-        )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        submit = create_gen3_submission_class(api_key, api_endpoint)
+
+        if is_s3_uri(data_import_order_path):
+            logger.info(f"Reading import order from S3: {data_import_order_path}")
+            import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
+            logger.debug(f"Import order from S3: {import_order}")
+        else:
+            logger.info(f"Reading import order from file: {data_import_order_path}")
+            import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
+            logger.debug(f"Import order from file: {import_order}")
+
+        file_map = {get_node_from_file_path(file): file for file in file_list}
+
+        for node in import_order:
+            if node in exclude_nodes:
+                logger.info(f"Skipping node '{node}' (in exclude list).")
+                continue
+            file = file_map.get(node)
+            if not file:
+                logger.info(f"Skipping node '{node}' (not present in file list).")
+                continue
+
+            logger.info(f"Processing file '{file}' for node '{node}'.")
+
+            try:
+                if is_s3_uri(file):
+                    logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
+                    json_data = read_metadata_json_s3(file, boto3_session)
+                else:
+                    logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
+                    json_data = read_metadata_json(file)
+            except Exception as e:
+                logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
+                raise Exception(f"Failed to read JSON metadata for node '{node}' from {file}: {e}")
+
+            split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
+            n_json_data = len(split_json_list)
+            logger.info(
+                f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
+            )
+
+            for index, jsn in enumerate(split_json_list):
+                progress_str = f"{index + 1}/{n_json_data}"
+
+                submission_success = False
+                last_exception = None
+                for attempt in range(max_retries + 1):
+                    try:
+                        log_msg = (
+                            f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                            f"Split: {progress_str:<5}"
+                            if attempt == 0 else
+                            f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                            f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
                         )
-
-
-
-
+                        logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
+
+                        res = submit.submit_record("program1", project_id, jsn)
+
+                        if write_submission_results_path is not None:
+                            log_filename = os.path.join(
+                                log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+                            )
+                            abs_log_filename = os.path.abspath(log_filename)
+                            with open(abs_log_filename, "a") as f:
+                                json.dump(res, f)
+                                f.write("\n")
+                            logger.info(
+                                f"Wrote submission response to log file: {abs_log_filename}"
+                            )
+
                         logger.info(
-                            f"
+                            f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                            f"Node: {node:<12} | Split: {progress_str:<5}"
                         )
+                        submission_success = True
+                        break  # Success

-
-
-
+                    except Exception as e:
+                        last_exception = e
+                        logger.error(
+                            f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                        )
+                        if attempt < max_retries:
+                            import time
+                            time.sleep(0.2)
+                        else:
+                            logger.critical(
+                                f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                                f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                            )
+
+                if not submission_success:
+                    # After retries, still failed
+                    raise Exception(
+                        f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
+                        f"Last error: {last_exception}"
                     )
-                break  # Successful, move to next chunk

-
-                logger.error(
-                    f"Error submitting chunk {progress_str} for node '{node}': {e}"
-                )
-                if attempt < max_retries:
-                    import time
-                    time.sleep(0.2)
-                else:
-                    logger.critical(
-                        f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                        f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                    )
-                    raise
+            logger.info(f"Finished submitting node '{node}'.")

-    logger.info(
+        logger.info("--- Submission process complete ---")

-
+    except Exception as exc:
+        logger.exception(f"Critical error during submission process: {exc}")
+        raise
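The practical effect of this hunk: in 0.4.6 the entire submission body is wrapped in an outer try/except that logs via logger.exception and re-raises, a failed JSON read now raises instead of skipping the node with continue, and each chunk is retried up to max_retries additional times (with a fixed 0.2 s pause) before a summary exception is raised. The sketch below illustrates that per-chunk retry-and-raise flow only; submit_with_retries, submit_fn, chunks and the default values are illustrative stand-ins, not names or defaults from the package.

import logging
import time

logger = logging.getLogger(__name__)

def submit_with_retries(submit_fn, chunks, max_retries=3, backoff_s=0.2):
    """Illustrative sketch of the per-chunk retry-and-raise flow added in 0.4.6.

    submit_fn stands in for the Gen3 submission call and chunks for the list
    produced by split_json_objects; the defaults here are assumptions, not the
    package's actual values.
    """
    n_chunks = len(chunks)
    for index, chunk in enumerate(chunks):
        progress_str = f"{index + 1}/{n_chunks}"
        submission_success = False
        last_exception = None
        for attempt in range(max_retries + 1):
            try:
                submit_fn(chunk)              # the actual submission attempt
                submission_success = True
                break                         # success: stop retrying this chunk
            except Exception as exc:
                last_exception = exc
                logger.error(f"Error submitting chunk {progress_str}: {exc}")
                if attempt < max_retries:
                    time.sleep(backoff_s)     # brief pause before the next attempt
        if not submission_success:
            # After all retries the chunk still failed: surface it to the caller
            raise Exception(
                f"Failed to submit chunk {progress_str} after {max_retries + 1} attempts. "
                f"Last error: {last_exception}"
            )

Compared with 0.4.5, where a failed read only logged an error and skipped the node and the retry loop ended in a bare raise, the new flow surfaces every persistent failure to the caller with a message naming the failing chunk and the last underlying error.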