acdc_aws_etl_pipeline 0.4.3.tar.gz → 0.4.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/PKG-INFO +1 -1
  2. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/pyproject.toml +1 -1
  3. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +101 -91
  4. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/README.md +0 -0
  5. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  6. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
  7. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  8. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
  9. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
  10. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  11. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  12. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
  14. {acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
{acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.4.3
+Version: 0.4.6
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
{acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "acdc_aws_etl_pipeline"
-version = "0.4.3"
+version = "0.4.6"
 description = "Tools for ACDC ETL pipeline"
 authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
 readme = "README.md"
{acdc_aws_etl_pipeline-0.4.3 → acdc_aws_etl_pipeline-0.4.6}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py
@@ -418,111 +418,121 @@ def submit_metadata(
     Notes:
         Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
     """
-
+
     timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
     log_dir = f"submission_logs/{timestamp}"
     os.makedirs(log_dir, exist_ok=True)
-
+
     if exclude_nodes is None:
         exclude_nodes = ["project", "program", "acknowledgement", "publication"]

     logger.info("Starting metadata submission process.")
     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
-    submit = create_gen3_submission_class(api_key, api_endpoint)
-
-    if is_s3_uri(data_import_order_path):
-        logger.info(f"Reading import order from S3: {data_import_order_path}")
-        import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
-        logger.debug(f"Import order from S3: {import_order}")
-    else:
-        logger.info(f"Reading import order from file: {data_import_order_path}")
-        import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
-        logger.debug(f"Import order from file: {import_order}")
-
-    # Map node name to file for fast access and avoid repeatedly scanning file_list
-    file_map = {get_node_from_file_path(file): file for file in file_list}
-
-    for node in import_order:
-        if node in exclude_nodes:
-            logger.info(f"Skipping node '{node}' (in exclude list).")
-            continue
-        file = file_map.get(node)
-        if not file:
-            logger.info(f"Skipping node '{node}' (not present in file list).")
-            continue
-
-        logger.info(f"Processing file '{file}' for node '{node}'.")
-
-        try:
-            if is_s3_uri(file):
-                logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
-                json_data = read_metadata_json_s3(file, boto3_session)
-            else:
-                logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
-                json_data = read_metadata_json(file)
-        except Exception as e:
-            logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
-            continue
-
-        if not json_data:
-            logger.info(f"Skipping node '{node}' due to errors in reading JSON.")
-            continue
-
-        split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
-        n_json_data = len(split_json_list)
-        logger.info(
-            f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
-        )

-        for index, jsn in enumerate(split_json_list):
-            progress_str = f"{index + 1}/{n_json_data}"
-
-            for attempt in range(max_retries + 1):
-                try:
-                    log_msg = (
-                        f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5}"
-                        if attempt == 0 else
-                        f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
-                    )
-                    logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
-
-                    res = submit.submit_record("program1", project_id, jsn)
-
-                    # writing submission results as log json
-                    if write_submission_results_path is not None:
-                        log_filename = os.path.join(
-                            log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+    try:
+        submit = create_gen3_submission_class(api_key, api_endpoint)
+
+        if is_s3_uri(data_import_order_path):
+            logger.info(f"Reading import order from S3: {data_import_order_path}")
+            import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
+            logger.debug(f"Import order from S3: {import_order}")
+        else:
+            logger.info(f"Reading import order from file: {data_import_order_path}")
+            import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
+            logger.debug(f"Import order from file: {import_order}")
+
+        file_map = {get_node_from_file_path(file): file for file in file_list}
+
+        for node in import_order:
+            if node in exclude_nodes:
+                logger.info(f"Skipping node '{node}' (in exclude list).")
+                continue
+            file = file_map.get(node)
+            if not file:
+                logger.info(f"Skipping node '{node}' (not present in file list).")
+                continue
+
+            logger.info(f"Processing file '{file}' for node '{node}'.")
+
+            try:
+                if is_s3_uri(file):
+                    logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
+                    json_data = read_metadata_json_s3(file, boto3_session)
+                else:
+                    logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
+                    json_data = read_metadata_json(file)
+            except Exception as e:
+                logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
+                raise Exception(f"Failed to read JSON metadata for node '{node}' from {file}: {e}")
+
+            split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
+            n_json_data = len(split_json_list)
+            logger.info(
+                f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
+            )
+
+            for index, jsn in enumerate(split_json_list):
+                progress_str = f"{index + 1}/{n_json_data}"
+
+                submission_success = False
+                last_exception = None
+                for attempt in range(max_retries + 1):
+                    try:
+                        log_msg = (
+                            f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                            f"Split: {progress_str:<5}"
+                            if attempt == 0 else
+                            f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                            f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
                         )
-                        abs_log_filename = os.path.abspath(log_filename)
-                        with open(abs_log_filename, "a") as f:
-                            json.dump(res, f)
-                            f.write("\n")
+                        logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
+
+                        res = submit.submit_record("program1", project_id, jsn)
+
+                        if write_submission_results_path is not None:
+                            log_filename = os.path.join(
+                                log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+                            )
+                            abs_log_filename = os.path.abspath(log_filename)
+                            with open(abs_log_filename, "a") as f:
+                                json.dump(res, f)
+                                f.write("\n")
+                            logger.info(
+                                f"Wrote submission response to log file: {abs_log_filename}"
+                            )
+
                         logger.info(
-                            f"Wrote submission response to log file: {abs_log_filename}"
+                            f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                            f"Node: {node:<12} | Split: {progress_str:<5}"
                         )
+                        submission_success = True
+                        break  # Success

-                    logger.info(
-                        f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
-                        f"Node: {node:<12} | Split: {progress_str:<5}"
+                    except Exception as e:
+                        last_exception = e
+                        logger.error(
+                            f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                        )
+                        if attempt < max_retries:
+                            import time
+                            time.sleep(0.2)
+                        else:
+                            logger.critical(
+                                f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                                f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                            )
+
+                if not submission_success:
+                    # After retries, still failed
+                    raise Exception(
+                        f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
+                        f"Last error: {last_exception}"
                     )
-                    break  # Successful, move to next chunk

-                except Exception as e:
-                    logger.error(
-                        f"Error submitting chunk {progress_str} for node '{node}': {e}"
-                    )
-                    if attempt < max_retries:
-                        import time
-                        time.sleep(0.2)
-                    else:
-                        logger.critical(
-                            f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                            f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                        )
-                        raise
+            logger.info(f"Finished submitting node '{node}'.")

-        logger.info(f"Finished submitting node '{node}'.")
+        logger.info("--- Submission process complete ---")

-        logger.info("--- Submission process complete ---")
+    except Exception as exc:
+        logger.exception(f"Critical error during submission process: {exc}")
+        raise
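
The substantive change in metadata_submitter.py is error handling. In 0.4.3, a node whose JSON could not be read was skipped, and a chunk that exhausted its retries logged a critical message and re-raised from inside the retry loop. In 0.4.6 the whole submission body is wrapped in an outer try/except that logs via logger.exception and re-raises, read failures now raise instead of skipping the node, and the retry loop tracks submission_success and last_exception so that exhausted retries raise an exception carrying the last error. In isolation, the retry-then-raise shape it adopts looks roughly like the sketch below; this is not the package's API, and submit_fn, chunk, max_retries, and delay_s are hypothetical stand-ins.

# Hedged sketch of the retry-then-raise pattern introduced in 0.4.6.
# submit_fn, chunk, max_retries, and delay_s are illustrative, not the package's API.
import logging
import time

logger = logging.getLogger(__name__)

def submit_with_retries(submit_fn, chunk, max_retries=3, delay_s=0.2):
    last_exception = None
    for attempt in range(max_retries + 1):
        try:
            return submit_fn(chunk)  # success: caller gets the API response
        except Exception as e:
            last_exception = e
            logger.error("Attempt %d/%d failed: %s", attempt + 1, max_retries + 1, e)
            if attempt < max_retries:
                time.sleep(delay_s)  # brief pause before the next attempt
    # All attempts failed: surface the failure instead of moving on silently
    raise Exception(f"Submission failed after {max_retries + 1} attempts: {last_exception}")

The practical effect of the release is that a JSON read failure now aborts the run instead of silently skipping the node, and any failure surfaces through logger.exception with the last error attached rather than being visible only in per-chunk log lines.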