acdc_aws_etl_pipeline 0.6.5__py3-none-any.whl → 0.6.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
acdc_aws_etl_pipeline/upload/metadata_submitter.py
@@ -9,6 +9,12 @@ from gen3.index import Gen3Index
  from gen3.submission import Gen3Submission
  import logging
  from datetime import datetime
+ import jwt
+ from typing import Dict, List
+ import re
+ import pandas as pd
+ import uuid
+ from acdc_aws_etl_pipeline.validate.validate import write_parquet_to_db

  logger = logging.getLogger(__name__)

@@ -201,16 +207,17 @@ def read_metadata_json_s3(s3_uri: str, session) -> dict:
      logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {s3_uri}")
      return data

- def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
+ def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = None) -> list:
      """
-     Read a DataImportOrder.txt file from S3 and return node order as a list.
+     Read a DataImportOrder.txt file from S3 and return node order as a list, optionally excluding some nodes.

      Args:
          s3_uri (str): S3 URI to the DataImportOrder.txt file.
          session (boto3.Session): Boto3 session.
+         exclude_nodes (list, optional): Node names to exclude from the result.

      Returns:
-         list: Node names (order as listed in file).
+         list: Node names (order as listed in file), optionally excluding nodes in exclude_nodes.

      Raises:
          ValueError: If the provided S3 URI does not point to DataImportOrder.txt.
@@ -224,7 +231,11 @@ def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
      obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
      content = obj['Body'].read().decode('utf-8')
      import_order = [line.rstrip() for line in content.splitlines() if line.strip()]
-     logger.debug(f"Read import order from S3 file: {import_order}")
+     logger.debug(f"Raw import order from S3 file: {import_order}")
+     if exclude_nodes is not None:
+         import_order = [node for node in import_order if node not in exclude_nodes]
+         logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
+     logger.debug(f"Final import order from S3 file {s3_uri}: {import_order}")
      return import_order

  def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
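As an aside (not part of the package diff), a minimal sketch of how the new exclude_nodes parameter could be used; the AWS profile, bucket, key and node names below are hypothetical:

    import boto3
    from acdc_aws_etl_pipeline.upload.metadata_submitter import read_data_import_order_txt_s3

    session = boto3.Session(profile_name="my-profile")
    s3_uri = "s3://example-bucket/metadata/v1.2.3/DataImportOrder.txt"

    # Full order as listed in the file, then the same call with two nodes filtered out.
    all_nodes = read_data_import_order_txt_s3(s3_uri, session)
    filtered = read_data_import_order_txt_s3(s3_uri, session, exclude_nodes=["program", "project"])
    # File order is preserved; entries listed in exclude_nodes are simply dropped.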
@@ -347,7 +358,27 @@ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) ->
          logger.error(f"Error parsing Gen3 API key from AWS Secrets Manager: {e}")
          raise e

- def create_gen3_submission_class(api_key: dict, api_endpoint: str):
+
+ def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
+     """
+     Extracts the URL from a JSON Web Token (JWT) credential.
+
+     Args:
+         jwt_token (str): The JSON Web Token (JWT) credential.
+
+     Returns:
+         str: The extracted URL.
+     """
+     logger.info("Decoding JWT to extract API URL.")
+     url = jwt.decode(jwt_token, options={"verify_signature": False}).get('iss', '')
+     if '/user' in url:
+         url = url.split('/user')[0]
+     url = f"{url}/api/{api_version}"
+     logger.info(f"Extracted API URL from JWT: {url}")
+     return url
+
+
+ def create_gen3_submission_class(api_key: dict):
      """
      Create and authenticate a Gen3Submission client using a temporary file for API key.

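The endpoint inference above relies only on the token's unverified iss claim. A self-contained sketch of the same steps (not part of the package), using a throwaway token whose issuer URL and signing secret are made up:

    import jwt

    # Throwaway token whose issuer mimics a Gen3 commons ("<host>/user").
    token = jwt.encode({"iss": "https://data.example.org/user"}, key="dummy-secret", algorithm="HS256")

    # Mirror infer_api_endpoint_from_jwt: read `iss` without verifying the signature,
    # strip the trailing "/user", and append the API prefix.
    issuer = jwt.decode(token, options={"verify_signature": False}).get("iss", "")
    base = issuer.split("/user")[0] if "/user" in issuer else issuer
    print(f"{base}/api/v0")  # -> https://data.example.org/api/v0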
@@ -357,185 +388,385 @@ def create_gen3_submission_class(api_key: dict, api_endpoint: str):

      Returns:
          Gen3Submission: An authenticated Gen3Submission object.
-
-     Notes:
-         The temporary file storing the API key is deleted after use.
      """
-     import tempfile
-
+     logger.debug("Extracting JWT token from API key dict.")
+     jwt_token = api_key['api_key']
+     logger.info("Inferring API endpoint from JWT token.")
+     api_endpoint = infer_api_endpoint_from_jwt(jwt_token)
+     logger.debug(f"Inferred API endpoint: {api_endpoint}")
      logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
-     tmp_api_key_path = None
-     submit = None
-
-     try:
-         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", dir="/tmp") as tmp_file:
-             json.dump(api_key, tmp_file)
-             tmp_api_key_path = tmp_file.name
-         auth = Gen3Auth(refresh_file=tmp_api_key_path)
-         submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-         return submit
-     finally:
-         if tmp_api_key_path and os.path.exists(tmp_api_key_path):
-             try:
-                 os.remove(tmp_api_key_path)
-                 logger.debug(f"Temporary API key file {tmp_api_key_path} deleted.")
-             except Exception as e:
-                 logger.warning(f"Failed to delete temporary API key file {tmp_api_key_path}: {e}")
+     auth = Gen3Auth(refresh_token=api_key)
+     submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+     return submit

- def write_submission_results(results, output_path, mode='w'):
-     with open(output_path, mode) as f:
-         json.dump(results, f, indent=4)

- def submit_metadata(
-     file_list: list,
-     api_key: str,
-     api_endpoint: str,
+ def submit_data_chunks(
+     split_json_list: list,
+     node: str,
+     gen3_submitter,
      project_id: str,
-     data_import_order_path: str,
-     boto3_session,
-     max_size_kb: int = 50,
-     exclude_nodes: list = None,
-     max_retries: int = 5,
-     write_submission_results_path: str = None
- ):
+     max_retries: int,
+     file_path: str,
+     program_id: str = "program1"
+ ) -> List[Dict]:
      """
-     Submit a set of metadata JSON files to a Gen3 data commons endpoint, in order.
+     Submit each chunk of data (in split_json_list) for a given node to Gen3, using retry logic and logging on failures.

      Args:
-         file_list (list): List of paths (local or S3 URIs) to metadata .json files, one per node type.
-         api_key (str): Gen3 API key (parsed dict or JSON string).
-         api_endpoint (str): Gen3 data commons endpoint URL.
-         project_id (str): Gen3 project ID to submit data to.
-         data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying submission order.
-         boto3_session (boto3.Session): Existing AWS/boto3 session for S3 & secret usage.
-         max_size_kb (int, optional): Maximum size per submission chunk, in KB. Default: 50.
-         exclude_nodes (list, optional): List of node names to skip (default: ["project", "program", "acknowledgement", "publication"]).
-         max_retries (int, optional): Maximum number of retry attempts per node chunk. Default: 5.
+         split_json_list (list): List of JSON-serializable chunked data to submit.
+         node (str): Name of the data node being submitted.
+         gen3_submitter: A Gen3Submission instance for making submissions.
+         project_id (str): The project identifier within Gen3.
+         max_retries (int): Maximum number of retry attempts per chunk on failure.
+         file_path (str): Path of the file that was submitted. Used only for data capture.
+         program_id (str, optional): The Gen3 program id (default: "program1").

      Returns:
-         None
+         List[Dict]: List of response dictionaries for each submitted chunk.

      Raises:
-         Exception: On critical submission failure for any chunk.
+         Exception: If submission fails after all retry attempts for any chunk.
+     """
+
+     n_json_data = len(split_json_list)
+     response_results = []
+
+     for index, jsn in enumerate(split_json_list):
+         progress_str = f"{index + 1}/{n_json_data}"
+
+         submission_success = False
+         last_exception = None

-     Notes:
-         Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
+         attempt = 0
+         while attempt <= max_retries:
+             try:
+                 if attempt == 0:
+                     log_msg = (
+                         f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                         f"Split: {progress_str:<5}"
+                     )
+                     logger.info(log_msg)
+                 else:
+                     log_msg = (
+                         f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                         f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
+                     )
+                     logger.warning(log_msg)
+
+                 res = gen3_submitter.submit_record(program_id, project_id, jsn)
+                 res.update({"file_path": file_path})
+                 response_results.append(res)
+                 submission_success = True
+                 logger.info(
+                     f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                     f"Node: {node:<12} | Split: {progress_str:<5}"
+                 )
+                 break  # Success
+
+             except Exception as e:
+                 last_exception = e
+                 logger.error(
+                     f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                 )
+
+                 if attempt < max_retries:
+                     import time
+                     time.sleep(0.2)
+                 else:
+                     logger.critical(
+                         f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                         f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                     )
+             attempt += 1
+
+         if not submission_success:
+             # After retries, still failed
+             raise Exception(
+                 f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
+                 f"Last error: {last_exception}"
+             )
+
+     logger.info(f"Finished submitting node '{node}'.")
+     return response_results
+
+
+ def flatten_submission_results(submission_results: List[Dict]) -> List[Dict]:
      """
+     Flattens a list of Gen3 submission result dictionaries into a single list of entity dictionaries.

-     timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
-     log_dir = f"submission_logs/{timestamp}"
-     os.makedirs(log_dir, exist_ok=True)
+     For each submission result, this function processes its entities (if any),
+     extracting the 'project_id' and 'submitter_id' from the 'unique_keys' field (if present)
+     into the top-level entity dictionary for easy access.

-     if exclude_nodes is None:
-         exclude_nodes = ["project", "program", "acknowledgement", "publication"]
+     Any submission result that does not have a code of 200 or lacks entities is skipped, and a warning is logged.

-     logger.info("Starting metadata submission process.")
-     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
+     Args:
+         submission_results (List[Dict]):
+             A list of Gen3 submission result dictionaries, each containing at least a "code" and "entities" entry.

-     try:
-         submit = create_gen3_submission_class(api_key, api_endpoint)
+     Returns:
+         List[Dict]:
+             A flat list, where each element is an entity dictionary (with keys 'project_id' and 'submitter_id' added if available).
+     """
+     flat_list_dict = []
+     total = len(submission_results)
+     logger.info(f"Flattening {total} submission result(s)...")
+
+     for idx, obj in enumerate(submission_results, 1):
+         transaction_id = obj.get("transaction_id")
+         code = obj.get("code")
+         if code != 200:
+             logger.warning(f"Skipping submission result at index {idx-1} (code={code})")
+             continue
+
+         entities = obj.get("entities")
+
+         if entities is None:
+             logger.warning(f"No entities found in submission result at index {idx-1}")
+             continue
+
+         logger.info(f"Processing submission result {idx} of {total}, {len(entities)} entities")
+
+         for entity in entities:
+             unique_keys = entity.get("unique_keys", [{}])
+             if unique_keys and isinstance(unique_keys, list):
+                 keys = unique_keys[0]
+                 entity["project_id"] = keys.get("project_id")
+                 entity["submitter_id"] = keys.get("submitter_id")
+             entity["transaction_id"] = transaction_id
+             entity["file_path"] = obj.get("file_path", '')
+             flat_list_dict.append(entity)
+
+     # renaming cols
+     for entity in flat_list_dict:
+         entity["gen3_guid"] = entity.pop("id", None)
+         entity["node"] = entity.pop("type", None)
+
+     logger.info(f"Finished flattening. Total entities: {len(flat_list_dict)}")
+     return flat_list_dict
+
+
+ def find_version_from_path(path):
+     version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
+     found_versions = []
+
+     for segment in path.split('/'):
+         match = version_pattern.match(segment)
+         if match:
+             found_versions.append(match.group(1))
+
+     if not found_versions:
+         return None
+
+     if len(found_versions) > 1:
+         logger.warning("more than one match found in path for version string")
+
+     return found_versions[-1]
+
+
+ def collect_versions_from_metadata_file_list(metadata_file_list):
+     versions = []
+     for file_path in metadata_file_list:
+         version = find_version_from_path(file_path)
+         if version:
+             versions.append(version)
+     versions = list(set(versions))
+     if len(versions) > 1:
+         logger.error(f"more than one version found in metadata file list: {metadata_file_list}")
+         raise ValueError(f"more than one version found in metadata file list: {metadata_file_list}")
+     return versions[0]
+
+
+ class MetadataSubmitter:
+     def __init__(
+         self,
+         metadata_file_list: list,
+         api_key: dict,
+         project_id: str,
+         data_import_order_path: str,
+         program_id: str = "program1",
+         max_size_kb: int = 100,
+         exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
+         max_retries: int = 3,
+         aws_profile: str = None
+     ):
+         """
+         Initialises a MetadataSubmitter for submitting a set of metadata JSON files to a Gen3 data commons endpoint, in order.

+         Args:
+             metadata_file_list (list): List of local file paths or S3 URIs to metadata .json files, one per node type.
+             api_key (dict): Gen3 API key as a parsed dictionary.
+             project_id (str): Gen3 project ID to submit data to.
+             data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying node submission order.
+             program_id (str, optional): Gen3 program ID (default: "program1").
+             max_size_kb (int, optional): Maximum size per submission chunk, in KB (default: 100).
+             exclude_nodes (list, optional): List of node names to skip during submission (default: ["project", "program", "acknowledgement", "publication"]).
+             max_retries (int, optional): Maximum number of retry attempts per node chunk (default: 3).
+             aws_profile (str, optional): AWS CLI named profile to use for boto3 session (default: None).
+         """
+         self.metadata_file_list = metadata_file_list
+         self.api_key = api_key
+         self.project_id = project_id
+         self.data_import_order_path = data_import_order_path
+         self.program_id = program_id
+         self.max_size_kb = max_size_kb
+         self.exclude_nodes = exclude_nodes
+         self.max_retries = max_retries
+         self.submission_results = []
+         self.aws_profile = aws_profile
+         self.boto3_session = self._create_boto3_session()
+         logger.info("MetadataSubmitter initialised.")
+
+     def _create_gen3_submission_class(self):
+         return create_gen3_submission_class(self.api_key)
+
+     def _create_boto3_session(self):
+         return create_boto3_session(self.aws_profile)
+
+     def _read_data_import_order(self, data_import_order_path: str, exclude_nodes: list[str], boto3_session = None):
          if is_s3_uri(data_import_order_path):
-             logger.info(f"Reading import order from S3: {data_import_order_path}")
-             import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
-             logger.debug(f"Import order from S3: {import_order}")
+             session = boto3_session or self.boto3_session
+             return read_data_import_order_txt_s3(data_import_order_path, session, exclude_nodes)
          else:
-             logger.info(f"Reading import order from file: {data_import_order_path}")
-             import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
-             logger.debug(f"Import order from file: {import_order}")
+             return read_data_import_order_txt(data_import_order_path, exclude_nodes)

-         file_map = {get_node_from_file_path(file): file for file in file_list}
+     def _prepare_json_chunks(self, metadata_file_path: str, max_size_kb: int) -> List[List[Dict]]:
+         """
+         Read JSON data from a given file path and split it into chunks,
+         each with a maximum size of ``max_size_kb`` kilobytes.

-         for node in import_order:
-             if node in exclude_nodes:
-                 logger.info(f"Skipping node '{node}' (in exclude list).")
-                 continue
-             file = file_map.get(node)
-             if not file:
-                 logger.info(f"Skipping node '{node}' (not present in file list).")
-                 continue
+         Args:
+             metadata_file_path (str): File path (local or S3 URI) to the JSON data.
+             max_size_kb (int): Maximum allowed size (in kilobytes) for each chunk.

-             logger.info(f"Processing file '{file}' for node '{node}'.")
+         Returns:
+             list: A list of chunks, where each chunk is a list of dictionaries
+                 containing JSON data.
+         """
+         logger.info(f"Reading metadata json from {metadata_file_path}")
+         if is_s3_uri(metadata_file_path):
+             session = self.boto3_session
+             data = read_metadata_json_s3(metadata_file_path, session)
+         else:
+             data = read_metadata_json(metadata_file_path)
+         return split_json_objects(data, max_size_kb)

-             try:
-                 if is_s3_uri(file):
-                     logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
-                     json_data = read_metadata_json_s3(file, boto3_session)
-                 else:
-                     logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
-                     json_data = read_metadata_json(file)
-             except Exception as e:
-                 logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
-                 raise Exception(f"Failed to read JSON metadata for node '{node}' from {file}: {e}")
+     def _create_file_map(self):
+         """
+         Generate a mapping from node names to metadata file paths.

-             split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
-             n_json_data = len(split_json_list)
-             logger.info(
-                 f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
-             )
+         This method infers the node name for each file in `self.metadata_file_list`
+         and returns a dictionary where the keys are node names and the values
+         are the corresponding file paths.

-             for index, jsn in enumerate(split_json_list):
-                 progress_str = f"{index + 1}/{n_json_data}"
-
-                 submission_success = False
-                 last_exception = None
-                 for attempt in range(max_retries + 1):
-                     try:
-                         log_msg = (
-                             f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
-                             f"Split: {progress_str:<5}"
-                             if attempt == 0 else
-                             f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
-                             f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
-                         )
-                         logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
-
-                         res = submit.submit_record("program1", project_id, jsn)
-
-                         if write_submission_results_path is not None:
-                             log_filename = os.path.join(
-                                 log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
-                             )
-                             abs_log_filename = os.path.abspath(log_filename)
-                             with open(abs_log_filename, "a") as f:
-                                 json.dump(res, f)
-                                 f.write("\n")
-                             logger.info(
-                                 f"Wrote submission response to log file: {abs_log_filename}"
-                             )
-
-                         logger.info(
-                             f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
-                             f"Node: {node:<12} | Split: {progress_str:<5}"
-                         )
-                         submission_success = True
-                         break  # Success
-
-                     except Exception as e:
-                         last_exception = e
-                         logger.error(
-                             f"Error submitting chunk {progress_str} for node '{node}': {e}"
-                         )
-                         if attempt < max_retries:
-                             import time
-                             time.sleep(0.2)
-                         else:
-                             logger.critical(
-                                 f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                                 f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                             )
-
-                 if not submission_success:
-                     # After retries, still failed
-                     raise Exception(
-                         f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
-                         f"Last error: {last_exception}"
-                     )
+         Returns:
+             dict: Dictionary mapping node names (str) to their associated metadata file paths (str).
+         """
+         file_map = {
+             get_node_from_file_path(file): file
+             for file in self.metadata_file_list
+         }
+         return file_map

-             logger.info(f"Finished submitting node '{node}'.")
+     def submit_metadata(self) -> List[Dict]:
+         """
+         Submits metadata for each node defined in the data import order, except those in the exclude list.

-         logger.info("--- Submission process complete ---")
+         For each node, this method retrieves the corresponding metadata file, splits the JSON data
+         into size-constrained chunks, and submits each chunk to the Gen3 submission API. Responses
+         from all submissions are gathered and returned as a list.

-     except Exception as exc:
-         logger.exception(f"Critical error during submission process: {exc}")
-         raise
+         Returns:
+             List[Dict]: A list of response dictionaries returned from the Gen3 metadata submissions.
+         """
+         gen3_submitter = self._create_gen3_submission_class()
+         data_import_order = self._read_data_import_order(self.data_import_order_path, self.exclude_nodes, self.boto3_session)
+         file_map = self._create_file_map()
+         output_response_list_dict = []
+
+         logger.info("Starting metadata submission.")
+         for node in data_import_order:
+
+             if node in self.exclude_nodes:
+                 logger.info(f"Skipping node '{node}' (in exclude list).")
+                 continue
+             file_path = file_map.get(node)
+             if not file_path:
+                 logger.info(f"Skipping node '{node}' (not present in file list).")
+                 continue
+
+             logger.info(f"Processing file '{file_path}' for node '{node}'.")
+             logger.info("Splitting JSON data into chunks.")
+             json_chunks = self._prepare_json_chunks(file_path, self.max_size_kb)
+
+             logger.info("Submitting chunks to Gen3.")
+             response_list = submit_data_chunks(
+                 split_json_list=json_chunks,
+                 node=node,
+                 file_path=file_path,
+                 gen3_submitter=gen3_submitter,
+                 project_id=self.project_id,
+                 max_retries=self.max_retries,
+                 program_id=self.program_id
+             )
+             output_response_list_dict.extend(response_list)
+
+         self.submission_results = output_response_list_dict
+         return output_response_list_dict
+
+     def upload_metadata_submission_results(
+         self,
+         dataset_root: str,
+         database: str,
+         table: str,
+         partition_cols: list = ["upload_datetime"],
+     ):
+         """
+         Uploads the submission results to S3 as a partitioned parquet table.
+
+         Args:
+             dataset_root (str): S3 path where the parquet files will be stored
+                 (e.g., "s3://acdc-dataops-metadata/metadata_upload/").
+             database (str): Database name for storing the metadata upload
+                 (e.g., "acdc_dataops_metadata_db").
+             table (str): Table name for storing the metadata upload
+                 (e.g., "metadata_upload").
+             partition_cols (list, optional): List of column names to partition the parquet table by.
+                 Defaults to ["upload_datetime"].
+         """
+         logger.info("Collecting version from metadata file list.")
+         version = collect_versions_from_metadata_file_list(self.metadata_file_list)
+         logger.info(f"Extracted version: {version}")
+
+         logger.info("Inferring API endpoint from JWT.")
+         api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
+         logger.info(f"Using API endpoint: {api_endpoint}")
+
+         upload_datetime = datetime.now().isoformat()
+         upload_id = str(uuid.uuid4())
+         logger.info(f"Upload datetime: {upload_datetime}")
+         logger.info(f"Generated upload ID: {upload_id}")
+
+         logger.info("Flattening submission results for upload.")
+         flattened_results = flatten_submission_results(self.submission_results)
+         logger.info(f"Flattened {len(flattened_results)} submission result entries.")
+
+         logger.info("Converting flattened results to DataFrame.")
+         flattened_results_df = pd.DataFrame(flattened_results)
+         flattened_results_df['upload_datetime'] = upload_datetime
+         flattened_results_df['upload_id'] = upload_id
+         flattened_results_df['api_endpoint'] = api_endpoint
+         flattened_results_df['version'] = version
+
+         logger.info(
+             f"Writing DataFrame to parquet and S3/table: "
+             f"dataset_root={dataset_root}, database={database}, table={table}, partition_cols={partition_cols}"
+         )
+         write_parquet_to_db(
+             df=flattened_results_df,
+             dataset_root=dataset_root,
+             database=database,
+             table=table,
+             partition_cols=partition_cols
+         )
+         logger.info("Metadata submission results upload complete.")
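Taken together, the MetadataSubmitter class replaces the old submit_metadata function. A hedged usage sketch (not part of the package): every path, profile, project and database name below is a placeholder, and api_key is assumed to be a parsed Gen3 credentials.json dict whose 'api_key' field holds the JWT:

    from acdc_aws_etl_pipeline.upload.metadata_submitter import MetadataSubmitter

    submitter = MetadataSubmitter(
        metadata_file_list=[
            "s3://example-bucket/metadata/v1.2.3/subject.json",
            "s3://example-bucket/metadata/v1.2.3/sample.json",
        ],
        api_key={"api_key": "<gen3-jwt>"},  # parsed credentials.json; 'api_key' holds the JWT
        project_id="example_project",
        data_import_order_path="s3://example-bucket/metadata/v1.2.3/DataImportOrder.txt",
        aws_profile="my-profile",
    )

    results = submitter.submit_metadata()  # list of per-chunk Gen3 response dicts
    submitter.upload_metadata_submission_results(
        dataset_root="s3://example-bucket/metadata_upload/",
        database="example_metadata_db",
        table="metadata_upload",
    )

Note that upload_metadata_submission_results derives its version column from the single v1.2.3-style segment in the metadata file paths, so the file list must carry exactly one such version.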
acdc_aws_etl_pipeline-0.6.5.dist-info/METADATA → acdc_aws_etl_pipeline-0.6.7.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: acdc_aws_etl_pipeline
- Version: 0.6.5
+ Version: 0.6.7
  Summary: Tools for ACDC ETL pipeline
  Author: JoshuaHarris391
  Author-email: harjo391@gmail.com
@@ -17,6 +17,7 @@ Requires-Dist: dbt-core (==1.9.4)
  Requires-Dist: gen3 (>=4.27.4,<5.0.0)
  Requires-Dist: gen3_validator (>=2.0.0,<3.0.0)
  Requires-Dist: numpy (<2.0.0)
+ Requires-Dist: pyjwt (>=2.10.1,<3.0.0)
  Requires-Dist: pytest
  Requires-Dist: python-dotenv
  Requires-Dist: pytz (>=2025.2,<2026.0)
acdc_aws_etl_pipeline-0.6.5.dist-info/RECORD → acdc_aws_etl_pipeline-0.6.7.dist-info/RECORD
@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6
  acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
  acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
  acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
- acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=6brNnwh5rpyzIf13ZGC_srcP_U0GRtne_sWoiM5CMnw,21059
+ acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=k5q5hRkj-dWo25z9nVZI2eNh0xnmQU8TPDffSSnQlUY,29906
  acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
  acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
  acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
  acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
  acdc_aws_etl_pipeline/validate/validate.py,sha256=zLqK9i92FsRAaBOGdY-G7-vb0e6tmkoUXhY6zCfbjN8,24895
- acdc_aws_etl_pipeline-0.6.5.dist-info/METADATA,sha256=DALzNwPj8iXFMTkLFrk14N4kEVkwayKBwr-FLDgBVeM,2887
- acdc_aws_etl_pipeline-0.6.5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- acdc_aws_etl_pipeline-0.6.5.dist-info/RECORD,,
+ acdc_aws_etl_pipeline-0.6.7.dist-info/METADATA,sha256=m-PsBTula6gGZyZo5_6DEkhi36YZMVWvulvDkOQwM2Y,2926
+ acdc_aws_etl_pipeline-0.6.7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ acdc_aws_etl_pipeline-0.6.7.dist-info/RECORD,,