acdc_aws_etl_pipeline 0.4.5-py3-none-any.whl → 0.4.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -419,110 +419,123 @@ def submit_metadata(
  Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
  """

+ # redefine to use local cache in /tmp
+ os.environ['XDG_CACHE_HOME'] = '/tmp/.cache'
+
  timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
  log_dir = f"submission_logs/{timestamp}"
  os.makedirs(log_dir, exist_ok=True)
-
+
  if exclude_nodes is None:
  exclude_nodes = ["project", "program", "acknowledgement", "publication"]

  logger.info("Starting metadata submission process.")
  logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
- submit = create_gen3_submission_class(api_key, api_endpoint)
-
- if is_s3_uri(data_import_order_path):
- logger.info(f"Reading import order from S3: {data_import_order_path}")
- import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
- logger.debug(f"Import order from S3: {import_order}")
- else:
- logger.info(f"Reading import order from file: {data_import_order_path}")
- import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
- logger.debug(f"Import order from file: {import_order}")
-
- # Map node name to file for fast access and avoid repeatedly scanning file_list
- file_map = {get_node_from_file_path(file): file for file in file_list}
-
- for node in import_order:
- if node in exclude_nodes:
- logger.info(f"Skipping node '{node}' (in exclude list).")
- continue
- file = file_map.get(node)
- if not file:
- logger.info(f"Skipping node '{node}' (not present in file list).")
- continue
-
- logger.info(f"Processing file '{file}' for node '{node}'.")
-
- try:
- if is_s3_uri(file):
- logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
- json_data = read_metadata_json_s3(file, boto3_session)
- else:
- logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
- json_data = read_metadata_json(file)
- except Exception as e:
- logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
- continue
-
- if not json_data:
- logger.info(f"Skipping node '{node}' due to errors in reading JSON.")
- continue
-
- split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
- n_json_data = len(split_json_list)
- logger.info(
- f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
- )

- for index, jsn in enumerate(split_json_list):
- progress_str = f"{index + 1}/{n_json_data}"
-
- for attempt in range(max_retries + 1):
- try:
- log_msg = (
- f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
- f"Split: {progress_str:<5}"
- if attempt == 0 else
- f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
- f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
- )
- logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
-
- res = submit.submit_record("program1", project_id, jsn)
-
- # writing submission results as log json
- if write_submission_results_path is not None:
- log_filename = os.path.join(
- log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+ try:
+ submit = create_gen3_submission_class(api_key, api_endpoint)
+
+ if is_s3_uri(data_import_order_path):
+ logger.info(f"Reading import order from S3: {data_import_order_path}")
+ import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
+ logger.debug(f"Import order from S3: {import_order}")
+ else:
+ logger.info(f"Reading import order from file: {data_import_order_path}")
+ import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
+ logger.debug(f"Import order from file: {import_order}")
+
+ file_map = {get_node_from_file_path(file): file for file in file_list}
+
+ for node in import_order:
+ if node in exclude_nodes:
+ logger.info(f"Skipping node '{node}' (in exclude list).")
+ continue
+ file = file_map.get(node)
+ if not file:
+ logger.info(f"Skipping node '{node}' (not present in file list).")
+ continue
+
+ logger.info(f"Processing file '{file}' for node '{node}'.")
+
+ try:
+ if is_s3_uri(file):
+ logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
+ json_data = read_metadata_json_s3(file, boto3_session)
+ else:
+ logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
+ json_data = read_metadata_json(file)
+ except Exception as e:
+ logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
+ raise Exception(f"Failed to read JSON metadata for node '{node}' from {file}: {e}")
+
+ split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
+ n_json_data = len(split_json_list)
+ logger.info(
+ f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
+ )
+
+ for index, jsn in enumerate(split_json_list):
+ progress_str = f"{index + 1}/{n_json_data}"
+
+ submission_success = False
+ last_exception = None
+ for attempt in range(max_retries + 1):
+ try:
+ log_msg = (
+ f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+ f"Split: {progress_str:<5}"
+ if attempt == 0 else
+ f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+ f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
  )
- abs_log_filename = os.path.abspath(log_filename)
- with open(abs_log_filename, "a") as f:
- json.dump(res, f)
- f.write("\n")
+ logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
+
+ res = submit.submit_record("program1", project_id, jsn)
+
+ if write_submission_results_path is not None:
+ log_filename = os.path.join(
+ log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+ )
+ abs_log_filename = os.path.abspath(log_filename)
+ with open(abs_log_filename, "a") as f:
+ json.dump(res, f)
+ f.write("\n")
+ logger.info(
+ f"Wrote submission response to log file: {abs_log_filename}"
+ )
+
  logger.info(
- f"Wrote submission response to log file: {abs_log_filename}"
+ f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+ f"Node: {node:<12} | Split: {progress_str:<5}"
  )
+ submission_success = True
+ break # Success

- logger.info(
- f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
- f"Node: {node:<12} | Split: {progress_str:<5}"
+ except Exception as e:
+ last_exception = e
+ logger.error(
+ f"Error submitting chunk {progress_str} for node '{node}': {e}"
+ )
+ if attempt < max_retries:
+ import time
+ time.sleep(0.2)
+ else:
+ logger.critical(
+ f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+ f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+ )
+
+ if not submission_success:
+ # After retries, still failed
+ raise Exception(
+ f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
+ f"Last error: {last_exception}"
  )
- break # Successful, move to next chunk

- except Exception as e:
- logger.error(
- f"Error submitting chunk {progress_str} for node '{node}': {e}"
- )
- if attempt < max_retries:
- import time
- time.sleep(0.2)
- else:
- logger.critical(
- f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
- f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
- )
- raise
+ logger.info(f"Finished submitting node '{node}'.")

- logger.info(f"Finished submitting node '{node}'.")
+ logger.info("--- Submission process complete ---")

- logger.info("--- Submission process complete ---")
+ except Exception as exc:
+ logger.exception(f"Critical error during submission process: {exc}")
+ raise
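
Taken together, the 0.4.7 changes alter how failures surface to a caller of submit_metadata: an unreadable metadata JSON file now raises instead of skipping that node, a chunk that still fails after max_retries + 1 attempts raises a descriptive exception after the retry loop, and the whole run is wrapped in a try/except that logs the error before re-raising. The sketch below shows one way a caller might drive this version. It is only an illustration: it assumes submit_metadata accepts keyword arguments matching the names used in its body (the full signature is not shown in this hunk), and every endpoint, bucket, and file name is a hypothetical placeholder.

    import json
    import boto3
    from acdc_aws_etl_pipeline.upload.metadata_submitter import submit_metadata

    # Hypothetical Gen3 credentials file; the exact form expected by
    # create_gen3_submission_class is not shown in this diff.
    with open("credentials.json") as f:
        api_key = json.load(f)

    try:
        submit_metadata(
            api_key=api_key,
            api_endpoint="https://data.example.org",          # hypothetical commons endpoint
            project_id="project1",                            # program is hardcoded to "program1" in submit_record
            data_import_order_path="s3://my-bucket/DataImportOrder.txt",  # hypothetical
            file_list=["s3://my-bucket/metadata/subject.json"],           # hypothetical
            boto3_session=boto3.Session(),
            exclude_nodes=None,        # defaults to project/program/acknowledgement/publication
            max_size_kb=500,           # assumed chunk-size value passed to split_json_objects
            max_retries=3,             # each chunk gets max_retries + 1 attempts
            write_submission_results_path="logs",  # non-None writes per-chunk responses under submission_logs/<timestamp>/
        )
    except Exception as exc:
        # As of 0.4.7 an unreadable metadata file or a chunk that exhausts its
        # retries aborts the whole run; in 0.4.5 an unreadable file was skipped
        # and the loop moved on to the next node.
        print(f"Metadata submission failed: {exc}")
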
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: acdc_aws_etl_pipeline
- Version: 0.4.5
+ Version: 0.4.7
  Summary: Tools for ACDC ETL pipeline
  Author: JoshuaHarris391
  Author-email: harjo391@gmail.com
@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=0meo4Sq6o0EPvgfFu0QPMQ5ZEHDiSEMsVG
  acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
  acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
  acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
- acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=9ms1X3rjP4INGfNSr4GsM6FkTOfT3JRF3PxAN6pisFE,20316
+ acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=agRDalSU8cyNQ0rm0BTm-U8pdWe93w3roQ-kPBhpxw0,21071
  acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
  acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
  acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
  acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
  acdc_aws_etl_pipeline/validate/validate.py,sha256=fTa76YvixCWOGkAIuR7CZ2WryMJcpc2wvSOHLZDEknc,28159
- acdc_aws_etl_pipeline-0.4.5.dist-info/METADATA,sha256=AclICNhbZYTn6D5sMY6bvpWRIZTaXHHnsynyzgRW-Po,2853
- acdc_aws_etl_pipeline-0.4.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- acdc_aws_etl_pipeline-0.4.5.dist-info/RECORD,,
+ acdc_aws_etl_pipeline-0.4.7.dist-info/METADATA,sha256=2EoWAI7fk6Naqd2g-YAslv-JdSS0cxyssO32tUsh9yE,2853
+ acdc_aws_etl_pipeline-0.4.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ acdc_aws_etl_pipeline-0.4.7.dist-info/RECORD,,