acdc_aws_etl_pipeline 0.6.9__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acdc_aws_etl_pipeline/upload/metadata_submitter.py +626 -349
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/METADATA +2 -1
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/RECORD +4 -4
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/WHEEL +0 -0
--- a/acdc_aws_etl_pipeline/upload/metadata_submitter.py
+++ b/acdc_aws_etl_pipeline/upload/metadata_submitter.py
@@ -1,34 +1,41 @@
 import os
-
-
-
+import sys
+import time
 import json
 import boto3
+from botocore.exceptions import BotoCoreError, ClientError
 from gen3.auth import Gen3Auth
-from gen3.index import Gen3Index
 from gen3.submission import Gen3Submission
 import logging
 from datetime import datetime
 import jwt
-
+import requests
+from typing import Any, Dict, List, Optional
 import re
 import pandas as pd
 import uuid
-from acdc_aws_etl_pipeline.validate.validate import
+from acdc_aws_etl_pipeline.validate.validate import (
+    write_parquet_to_db,
+)
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+# redefine to use local cache in /tmp
+os.environ['XDG_CACHE_HOME'] = '/tmp/.cache'
 
 logger = logging.getLogger(__name__)
 
-def create_boto3_session(aws_profile: str = None):
+def create_boto3_session(aws_profile: Optional[str] = None):
     """
     Create and return a boto3 Session object using an optional AWS profile.
 
     Args:
-        aws_profile (str, optional): The AWS CLI named profile to use
+        aws_profile (str, optional): The AWS CLI named profile to use.
+            If None, uses default credentials.
 
     Returns:
         boto3.Session: The created session instance.
     """
-    logger.debug(
+    logger.debug("Creating boto3 session with aws_profile=%s", aws_profile)
     return boto3.Session(profile_name=aws_profile) if aws_profile else boto3.Session()
 
 def is_s3_uri(s3_uri: str) -> bool:
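Note: the module now pins `XDG_CACHE_HOME` to `/tmp/.cache` at import time, before any library consults the cache location; a plausible motivation (an assumption, not stated in the diff) is running in an environment such as AWS Lambda where only `/tmp` is writable. A minimal sketch of the same pattern, with the override applied before the imports that may use it:

    import os

    # Redirect the XDG cache to a writable path (assumption: read-only
    # filesystem apart from /tmp, as on AWS Lambda).
    os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

    import requests  # imported after the override so any cache lookups honour it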
@@ -41,7 +48,7 @@ def is_s3_uri(s3_uri: str) -> bool:
     Returns:
         bool: True if the string starts with 's3://', False otherwise.
     """
-    logger.debug(
+    logger.debug("Checking if %s is an S3 URI.", s3_uri)
     return s3_uri.startswith("s3://")
 
 def get_filename(file_path: str) -> str:
@@ -55,7 +62,11 @@ def get_filename(file_path: str) -> str:
         str: The filename (with extension).
     """
     filename = file_path.split("/")[-1]
-    logger.debug(
+    logger.debug(
+        "Extracted filename '%s' from file_path '%s'.",
+        filename,
+        file_path,
+    )
     return filename
 
 def get_node_from_file_path(file_path: str) -> str:
@@ -70,7 +81,7 @@ def get_node_from_file_path(file_path: str) -> str:
     """
     filename = get_filename(file_path)
     node = filename.split(".")[0]
-    logger.debug(
+    logger.debug("Extracted node '%s' from filename '%s'.", node, filename)
     return node
 
 def list_metadata_jsons(metadata_dir: str) -> list:
@@ -87,11 +98,18 @@ def list_metadata_jsons(metadata_dir: str) -> list:
         Exception: If there is an error reading the directory.
     """
     try:
-        logger.info(
+        logger.info(
+            "Listing .json files in metadata directory: %s",
+            metadata_dir,
+        )
         files = os.listdir(metadata_dir)
-        return [
-
-
+        return [
+            os.path.abspath(os.path.join(metadata_dir, file_name))
+            for file_name in files
+            if file_name.endswith(".json")
+        ]
+    except OSError as e:
+        logger.error("Error listing metadata JSONs in %s: %s", metadata_dir, e)
         raise
 
 def find_data_import_order_file(metadata_dir: str) -> str:
@@ -108,16 +126,22 @@ def find_data_import_order_file(metadata_dir: str) -> str:
         FileNotFoundError: If no such file is found.
     """
     try:
-        logger.info(
+        logger.info("Searching for DataImportOrder.txt in %s", metadata_dir)
         files = [os.path.join(metadata_dir, f) for f in os.listdir(metadata_dir)]
        order_files = [f for f in files if "DataImportOrder.txt" in f]
         if not order_files:
             logger.error("No DataImportOrder.txt file found in the given directory.")
-            raise FileNotFoundError(
-
+            raise FileNotFoundError(
+                "No DataImportOrder.txt file found in the given directory."
+            )
+        logger.debug("Found DataImportOrder.txt file: %s", order_files[0])
         return order_files[0]
-    except
-        logger.error(
+    except OSError as e:
+        logger.error(
+            "Error finding DataImportOrder.txt in %s: %s",
+            metadata_dir,
+            e,
+        )
         raise
 
 def list_metadata_jsons_s3(s3_uri: str, session) -> list:
@@ -125,13 +149,14 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
     List all .json files in an S3 "directory" (prefix).
 
     Args:
-        s3_uri (str): S3 URI to the metadata directory
+        s3_uri (str): S3 URI to the metadata directory
+            (e.g. "s3://my-bucket/path/to/dir").
         session (boto3.Session): An active boto3 Session.
 
     Returns:
         list: List of S3 URIs for all .json files found under the prefix.
     """
-    logger.info(
+    logger.info("Listing .json files in S3 metadata directory: %s", s3_uri)
     s3 = session.client('s3')
     bucket = s3_uri.split("/")[2]
     prefix = "/".join(s3_uri.split("/")[3:])
@@ -144,7 +169,7 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
         for obj in objects.get('Contents', [])
         if obj['Key'].endswith(".json")
     ]
-    logger.debug(
+    logger.debug("Found %s .json files in S3 at %s", len(result), s3_uri)
     return result
 
 def find_data_import_order_file_s3(s3_uri: str, session) -> str:
@@ -161,16 +186,29 @@ def find_data_import_order_file_s3(s3_uri: str, session) -> str:
     Raises:
         FileNotFoundError: If the file does not exist in the specified prefix.
     """
-    logger.info(
+    logger.info(
+        "Searching for DataImportOrder.txt in S3 metadata directory: %s",
+        s3_uri,
+    )
     s3 = session.client('s3')
     bucket = s3_uri.split("/")[2]
     prefix = "/".join(s3_uri.split("/")[3:])
     objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
-    order_files = [
+    order_files = [
+        obj['Key']
+        for obj in objects.get('Contents', [])
+        if obj['Key'].endswith("DataImportOrder.txt")
+    ]
     if not order_files:
         logger.error("No DataImportOrder.txt file found in the given S3 directory.")
-        raise FileNotFoundError(
-
+        raise FileNotFoundError(
+            "No DataImportOrder.txt file found in the given directory."
+        )
+    logger.debug(
+        "Found DataImportOrder.txt file in S3: s3://%s/%s",
+        bucket,
+        order_files[0],
+    )
     return f"s3://{bucket}/{order_files[0]}"
 
 def read_metadata_json(file_path: str) -> dict:
@@ -183,10 +221,14 @@ def read_metadata_json(file_path: str) -> dict:
     Returns:
         dict or list: Parsed contents of the JSON file.
     """
-    logger.info(
-    with open(file_path, "r") as f:
+    logger.info("Reading metadata json from local file: %s", file_path)
+    with open(file_path, "r", encoding="utf-8") as f:
         data = json.load(f)
-    logger.debug(
+    logger.debug(
+        "Read %s objects from %s",
+        len(data) if isinstance(data, list) else 'object',
+        file_path,
+    )
     return data
 
 def read_metadata_json_s3(s3_uri: str, session) -> dict:
@@ -200,11 +242,18 @@ def read_metadata_json_s3(s3_uri: str, session) -> dict:
     Returns:
         dict or list: Parsed JSON object from S3 file.
     """
-    logger.info(
+    logger.info("Reading metadata json from S3 file: %s", s3_uri)
     s3 = session.client('s3')
-    obj = s3.get_object(
+    obj = s3.get_object(
+        Bucket=s3_uri.split("/")[2],
+        Key="/".join(s3_uri.split("/")[3:]),
+    )
     data = json.loads(obj['Body'].read().decode('utf-8'))
-    logger.debug(
+    logger.debug(
+        "Read %s objects from %s",
+        len(data) if isinstance(data, list) else 'object',
+        s3_uri,
+    )
     return data
 
 def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = None) -> list:
@@ -224,20 +273,41 @@ def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = No
     """
     filename = s3_uri.split("/")[-1]
     if 'DataImportOrder.txt' not in filename:
-        logger.error(
-        raise ValueError(
-
+        logger.error("File %s is not a DataImportOrder.txt file", filename)
+        raise ValueError(
+            f"File {filename} is not a DataImportOrder.txt file"
+        )
+    logger.info(
+        "Reading DataImportOrder.txt from S3 file: %s",
+        s3_uri,
+    )
     s3 = session.client('s3')
-    obj = s3.get_object(
+    obj = s3.get_object(
+        Bucket=s3_uri.split("/")[2],
+        Key="/".join(s3_uri.split("/")[3:]),
+    )
     content = obj['Body'].read().decode('utf-8')
-    import_order = [
-
+    import_order = [
+        line.rstrip()
+        for line in content.splitlines()
+        if line.strip()
+    ]
+    logger.debug("Raw import order from S3 file: %s", import_order)
     if exclude_nodes is not None:
         import_order = [node for node in import_order if node not in exclude_nodes]
-        logger.debug(
-
+        logger.debug(
+            "Import order after excluding nodes %s: %s",
+            exclude_nodes,
+            import_order,
+        )
+    logger.debug(
+        "Final import order from S3 file %s: %s",
+        s3_uri,
+        import_order,
+    )
     return import_order
 
+
 def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
     """
     Read DataImportOrder.txt from local file, optionally excluding some nodes.
@@ -253,17 +323,26 @@ def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
         FileNotFoundError: If the file is not found.
     """
     try:
-        logger.info(
-
+        logger.info(
+            "Reading DataImportOrder.txt from local file: %s",
+            file_path,
+        )
+        with open(file_path, "r", encoding="utf-8") as f:
             import_order = [line.rstrip() for line in f if line.strip()]
-        logger.debug(
+        logger.debug("Raw import order from file: %s", import_order)
         if exclude_nodes is not None:
-            import_order = [
-
-
+            import_order = [
+                node for node in import_order if node not in exclude_nodes
+            ]
+            logger.debug(
+                "Import order after excluding nodes %s: %s",
+                exclude_nodes,
+                import_order,
+            )
+        logger.debug("Final import order from %s: %s", file_path, import_order)
         return import_order
     except FileNotFoundError:
-        logger.error(
+        logger.error("Error: DataImportOrder.txt not found in %s", file_path)
         return []
 
 def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
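For orientation, a usage sketch of the import-order readers (the file path and contents below are illustrative, not taken from the package): they return node names in file order, with blank lines dropped and any excluded nodes filtered out.

    # DataImportOrder.txt (illustrative contents):
    #   program
    #   project
    #   subject
    #   sample
    order = read_data_import_order_txt(
        "metadata/DataImportOrder.txt",        # hypothetical local path
        exclude_nodes=["program", "project"],
    )
    # order == ["subject", "sample"]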
@@ -280,7 +359,12 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
     Returns:
         list: List of lists. Each sublist size (JSON-serialized) <= max_size_kb.
     """
-    logger.info(
+    logger.info(
+        "Splitting JSON objects into max %s KB chunks. Total items: %s",
+        max_size_kb,
+        len(json_list),
+    )
+
     def get_size_in_kb(obj):
         """
         Get the size in kilobytes of the JSON-serialized object.
@@ -291,12 +375,11 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
         Returns:
             float: Size of the object in kilobytes.
         """
-        import sys
         size_kb = sys.getsizeof(json.dumps(obj)) / 1024
-        logger.debug(
+        logger.debug("Calculated size: %.2f KB", size_kb)
         return size_kb
 
-    def split_list(
+    def split_list(items):
         """
         Recursively split the list so each chunk fits within max_size_kb.
 
@@ -306,20 +389,34 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
         Returns:
             list: List of sublists.
         """
-        if get_size_in_kb(
-            logger.debug(
-
-
-
-
-
+        if get_size_in_kb(items) <= max_size_kb:
+            logger.debug(
+                "Split length %s is within max size %s KB.",
+                len(items),
+                max_size_kb,
+            )
+            return [items]
+        mid = len(items) // 2
+        left_list = items[:mid]
+        right_list = items[mid:]
+        logger.debug(
+            "Splitting list at index %s: left %s, right %s",
+            mid,
+            len(left_list),
+            len(right_list),
+        )
         return split_list(left_list) + split_list(right_list)
 
     split_lists = split_list(json_list)
     if print_results:
         for i, lst in enumerate(split_lists):
-            logger.info(
-
+            logger.info(
+                "List %s size: %.2f KB, contains %s objects",
+                i + 1,
+                get_size_in_kb(lst),
+                len(lst),
+            )
+    logger.debug("Total splits: %s", len(split_lists))
     return split_lists
 
 def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) -> dict:
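As a behavioural sketch (the record contents are illustrative), the splitter halves the input list recursively until every sublist's JSON serialisation fits under max_size_kb, so each returned chunk can be submitted in a single request:

    records = [{"type": "subject", "submitter_id": f"subject_{i}"} for i in range(5000)]
    chunks = split_json_objects(records, max_size_kb=50)
    # Every chunk serialises to at most ~50 KB, measured the same way the
    # function does it, via sys.getsizeof(json.dumps(...)) / 1024.
    assert all(sys.getsizeof(json.dumps(c)) / 1024 <= 50 for c in chunks)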
@@ -337,29 +434,40 @@ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) ->
     Raises:
         Exception: On failure to retrieve or parse the secret.
     """
-    logger.info(
-
+    logger.info(
+        "Retrieving Gen3 API key from AWS Secrets Manager: "
+        "secret_name=%s, region=%s",
+        secret_name,
+        region_name,
+    )
+    client = session.client(
+        service_name='secretsmanager',
+        region_name=region_name,
+    )
     try:
         get_secret_value_response = client.get_secret_value(
-            SecretId=secret_name
+            SecretId=secret_name,
         )
-    except
-        logger.error(
-        raise
+    except (BotoCoreError, ClientError) as e:
+        logger.error("Error getting secret value from AWS Secrets Manager: %s", e)
+        raise
 
     secret = get_secret_value_response['SecretString']
 
     try:
         secret = json.loads(secret)
         api_key = secret
-        logger.debug(
+        logger.debug("Retrieved Gen3 API key from secret %s", secret_name)
         return api_key
-    except
-        logger.error(
-        raise
+    except (json.JSONDecodeError, TypeError) as e:
+        logger.error("Error parsing Gen3 API key from AWS Secrets Manager: %s", e)
+        raise
 
 
-def infer_api_endpoint_from_jwt(
+def infer_api_endpoint_from_jwt(
+    jwt_token: str,
+    api_version: str = 'v0',
+) -> str:
     """
     Extracts the URL from a JSON Web Token (JWT) credential.
 
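The reworked secret retrieval narrows the exception handling to botocore and JSON-decoding errors. For reference, a minimal standalone sketch of the same Secrets Manager pattern (the secret name and region are illustrative):

    import json
    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    client = boto3.Session().client("secretsmanager", region_name="ap-southeast-2")
    try:
        raw = client.get_secret_value(SecretId="gen3/api-key")["SecretString"]  # hypothetical secret name
        api_key = json.loads(raw)  # expected to hold the Gen3 credential, including an "api_key" JWT
    except (BotoCoreError, ClientError, json.JSONDecodeError):
        raise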
@@ -370,11 +478,14 @@ def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
         str: The extracted URL.
     """
     logger.info("Decoding JWT to extract API URL.")
-    url = jwt.decode(
+    url = jwt.decode(
+        jwt_token,
+        options={"verify_signature": False},
+    ).get('iss', '')
     if '/user' in url:
         url = url.split('/user')[0]
     url = f"{url}/api/{api_version}"
-    logger.info(
+    logger.info("Extracted API URL from JWT: %s", url)
     return url
 
 
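The endpoint inference reads the token's iss claim with PyJWT's unverified decode and appends the API prefix; a standalone sketch of that derivation (the token value is a placeholder, so the snippet only illustrates the shape of the logic):

    import jwt

    token = "<gen3 api_key JWT>"  # placeholder; decode requires a real encoded token
    claims = jwt.decode(token, options={"verify_signature": False})
    issuer = claims.get("iss", "")                      # e.g. "https://commons.example.org/user"
    base = issuer.split("/user")[0] if "/user" in issuer else issuer
    endpoint = f"{base}/api/v0"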
@@ -393,190 +504,16 @@ def create_gen3_submission_class(api_key: dict):
     jwt_token = api_key['api_key']
     logger.info("Inferring API endpoint from JWT token.")
     api_endpoint = infer_api_endpoint_from_jwt(jwt_token)
-    logger.debug(
-    logger.info(
+    logger.debug("Inferred API endpoint: %s", api_endpoint)
+    logger.info(
+        "Creating Gen3Submission class for endpoint: %s",
+        api_endpoint,
+    )
     auth = Gen3Auth(refresh_token=api_key)
     submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
     return submit
 
 
-def submit_data_chunks(
-    split_json_list: list,
-    node: str,
-    gen3_submitter,
-    project_id: str,
-    max_retries: int,
-    file_path: str,
-    program_id: str = "program1"
-) -> List[Dict]:
-    """
-    Submit each chunk of data (in split_json_list) for a given node to Gen3, using retry logic and logging on failures.
-
-    Args:
-        split_json_list (list): List of JSON-serializable chunked data to submit.
-        node (str): Name of the data node being submitted.
-        gen3_submitter: A Gen3Submission instance for making submissions.
-        project_id (str): The project identifier within Gen3.
-        max_retries (int): Maximum number of retry attempts per chunk on failure.
-        file_path (str): Path of the file that was submitted. Used only for data capture.
-        program_id (str, optional): The Gen3 program id (default: "program1").
-
-    Returns:
-        List[Dict]: List of response dictionaries for each submitted chunk.
-
-    Raises:
-        Exception: If submission fails after all retry attempts for any chunk.
-    """
-
-    n_json_data = len(split_json_list)
-    response_results = []
-
-    for index, jsn in enumerate(split_json_list):
-        progress_str = f"{index + 1}/{n_json_data}"
-
-        submission_success = False
-        last_exception = None
-
-        attempt = 0
-        while attempt <= max_retries:
-            try:
-                if attempt == 0:
-                    log_msg = (
-                        f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5}"
-                    )
-                    logger.info(log_msg)
-                else:
-                    log_msg = (
-                        f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
-                    )
-                    logger.warning(log_msg)
-
-                res = gen3_submitter.submit_record(program_id, project_id, jsn)
-                res.update({"file_path": file_path})
-                response_results.append(res)
-                submission_success = True
-                logger.info(
-                    f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
-                    f"Node: {node:<12} | Split: {progress_str:<5}"
-                )
-                break  # Success
-
-            except Exception as e:
-                last_exception = e
-                logger.error(
-                    f"Error submitting chunk {progress_str} for node '{node}': {e}"
-                )
-
-                if attempt < max_retries:
-                    import time
-                    time.sleep(0.2)
-                else:
-                    logger.critical(
-                        f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                        f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                    )
-            attempt += 1
-
-        if not submission_success:
-            # After retries, still failed
-            raise Exception(
-                f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
-                f"Last error: {last_exception}"
-            )
-
-    logger.info(f"Finished submitting node '{node}'.")
-    return response_results
-
-
-def flatten_submission_results(submission_results: List[Dict]) -> List[Dict]:
-    """
-    Flattens a list of Gen3 submission result dictionaries into a single list of entity dictionaries.
-
-    For each submission result, this function processes its entities (if any),
-    extracting the 'project_id' and 'submitter_id' from the 'unique_keys' field (if present)
-    into the top-level entity dictionary for easy access.
-
-    Any submission result that does not have a code of 200 or lacks entities is skipped, and a warning is logged.
-
-    Args:
-        submission_results (List[Dict]):
-            A list of Gen3 submission result dictionaries, each containing at least a "code" and "entities" entry.
-
-    Returns:
-        List[Dict]:
-            A flat list, where each element is an entity dictionary (with keys 'project_id' and 'submitter_id' added if available).
-    """
-    flat_list_dict = []
-    total = len(submission_results)
-    logger.info(f"Flattening {total} submission result(s)...")
-
-    for idx, obj in enumerate(submission_results, 1):
-        transaction_id = obj.get("transaction_id")
-        code = obj.get("code")
-        if code != 200:
-            logger.warning(f"Skipping submission result at index {idx-1} (code={code})")
-            continue
-
-        entities = obj.get("entities")
-
-        if entities is None:
-            logger.warning(f"No entities found in submission result at index {idx-1}")
-            continue
-
-        logger.info(f"Processing submission result {idx} of {total}, {len(entities)} entities")
-
-        for entity in entities:
-            unique_keys = entity.get("unique_keys", [{}])
-            if unique_keys and isinstance(unique_keys, list):
-                keys = unique_keys[0]
-                entity["project_id"] = keys.get("project_id")
-                entity["submitter_id"] = keys.get("submitter_id")
-            entity["transaction_id"] = transaction_id
-            entity["file_path"] = obj.get("file_path", '')
-            flat_list_dict.append(entity)
-
-    # renaming cols
-    for entity in flat_list_dict:
-        entity["gen3_guid"] = entity.pop("id", None)
-        entity["node"] = entity.pop("type", None)
-
-    logger.info(f"Finished flattening. Total entities: {len(flat_list_dict)}")
-    return flat_list_dict
-
-
-def find_version_from_path(path):
-    version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
-    found_versions = []
-
-    for segment in path.split('/'):
-        match = version_pattern.match(segment)
-        if match:
-            found_versions.append(match.group(1))
-
-    if not found_versions:
-        return None
-
-    if len(found_versions) > 1:
-        logger.warning("more than one match found in path for version string")
-
-    return found_versions[-1]
-
-
-def collect_versions_from_metadata_file_list(metadata_file_list):
-    versions = []
-    for file_path in metadata_file_list:
-        version = find_version_from_path(file_path)
-        if version:
-            versions.append(version)
-    versions = list(set(versions))
-    if len(versions) > 1:
-        logger.error(f"more than one version found in metadata file list: {metadata_file_list}")
-        raise
-    return versions[0]
-
-
 class MetadataSubmitter:
     def __init__(
         self,
@@ -584,49 +521,440 @@ class MetadataSubmitter:
         api_key: dict,
         project_id: str,
         data_import_order_path: str,
+        dataset_root: str,
+        database: str,
+        table: str,
         program_id: str = "program1",
         max_size_kb: int = 100,
-        exclude_nodes:
+        exclude_nodes: Optional[List[str]] = None,
         max_retries: int = 3,
-        aws_profile: str = None
+        aws_profile: str = None,
+        partition_cols: Optional[List[str]] = None,
+        upload_to_database: bool = True
     ):
         """
-        Initialises a MetadataSubmitter for submitting a set of metadata JSON
+        Initialises a MetadataSubmitter for submitting a set of metadata JSON
+        files to a Gen3 data commons endpoint, in order.
+
+        **Workflow Overview:**
+        1. **Node Traversal:** The submitter iterates through each node defined in the
+           `data_import_order` list.
+        2. **File Resolution:** For each node name, it locates the corresponding JSON file
+           (e.g., `node.json`) from the provided file list.
+        3. **Chunking:** The JSON file is read and split into manageable chunks based on size.
+        4. **Submission:** Each chunk is submitted to the Gen3 Sheepdog API via `gen3.submission`.
+        5. **Response Handling:** The API response, which includes the `submission_id` for
+           the records, is captured.
+        6. **Persistence:** The response data is flattened, converted into a DataFrame, and
+           written to Parquet files in S3. These records are also registered in a specific
+           upload table within the configured database for audit and tracking.
 
         Args:
-            metadata_file_list (list): List of local file paths or S3 URIs to
+            metadata_file_list (list): List of local file paths or S3 URIs to
+                metadata .json files, one per node type.
             api_key (dict): Gen3 API key as a parsed dictionary.
-            project_id (str): Gen3 project ID to submit data to.
-            data_import_order_path (str): Path or S3 URI to DataImportOrder.txt
+            project_id (str): Gen3 project ID to submit data to (e.g., "internal-project").
+            data_import_order_path (str): Path or S3 URI to DataImportOrder.txt
+                specifying node submission order.
+            dataset_root (str): S3 path where the parquet files will be stored.
+                Example: "s3://acdc-dataops-metadata/metadata_upload/"
+            database (str): Database name for storing the metadata upload.
+                Example: "acdc_dataops_metadata_db"
+            table (str): Table name for storing the metadata upload.
+                Example: "metadata_upload"
             program_id (str, optional): Gen3 program ID (default: "program1").
-            max_size_kb (int, optional): Maximum size per submission chunk,
-
-
-
+            max_size_kb (int, optional): Maximum size per submission chunk,
+                in KB (default: 100).
+            exclude_nodes (list, optional): List of node names to skip during
+                submission. Defaults to ["project", "program", "acknowledgement", "publication"].
+            max_retries (int, optional): Maximum number of retry attempts per
+                node chunk (default: 3).
+            aws_profile (str, optional): AWS CLI named profile to use for boto3
+                session (default: None).
+            partition_cols (list, optional): List of column names to partition the parquet table by.
+                Defaults to ["upload_datetime"].
+            upload_to_database (bool, optional): Whether to upload the metadata to a database.
+                Defaults to True. The database is defined by dataset_root, database, and table.
         """
         self.metadata_file_list = metadata_file_list
         self.api_key = api_key
         self.project_id = project_id
         self.data_import_order_path = data_import_order_path
+        self.dataset_root = dataset_root
+        self.database = database
+        self.table = table
         self.program_id = program_id
         self.max_size_kb = max_size_kb
-        self.exclude_nodes = exclude_nodes
+        self.exclude_nodes = exclude_nodes or [
+            "project",
+            "program",
+            "acknowledgement",
+            "publication",
+        ]
         self.max_retries = max_retries
         self.submission_results = []
         self.aws_profile = aws_profile
+        self.partition_cols = partition_cols or ["upload_datetime"]
+        self.upload_to_database = upload_to_database
         self.boto3_session = self._create_boto3_session()
         logger.info("MetadataSubmitter initialised.")
 
     def _create_gen3_submission_class(self):
+        """Helper to instantiate the Gen3Submission class using the provided API key."""
         return create_gen3_submission_class(self.api_key)
-
+
     def _create_boto3_session(self):
+        """Helper to create a boto3 session using the provided AWS profile."""
         return create_boto3_session(self.aws_profile)
 
-    def
+    def _flatten_submission_results(self, submission_results: List[Dict]) -> List[Dict]:
+        """
+        Flattens a list of Gen3 submission result dictionaries into a single
+        list of entity dictionaries.
+
+        For each submission result, this function processes its entities (if any),
+        extracting the 'project_id' and 'submitter_id' from the 'unique_keys'
+        field (if present) into the top-level entity dictionary for easy access.
+
+        Any submission result that does not have a code of 200 or lacks entities
+        is skipped, and a warning is logged.
+
+        Args:
+            submission_results (List[Dict]):
+                A list of Gen3 submission result dictionaries, each containing at
+                least a "code" and "entities" entry.
+
+        Returns:
+            List[Dict]:
+                A flat list, where each element is an entity dictionary
+                (with keys 'project_id' and 'submitter_id' added if available).
+        """
+        flat_list_dict = []
+        total = len(submission_results)
+        logger.info("Flattening %s submission result(s)...", total)
+
+        for idx, obj in enumerate(submission_results, 1):
+            transaction_id = obj.get("transaction_id")
+            code = obj.get("code")
+            if code != 200:
+                logger.warning(
+                    "Skipping submission result at index %s (code=%s)",
+                    idx - 1,
+                    code,
+                )
+                continue
+
+            entities = obj.get("entities")
+
+            if entities is None:
+                logger.warning("No entities found in submission result at index %s", idx - 1)
+                continue
+
+            logger.info(
+                "Processing submission result %s of %s, %s entities",
+                idx,
+                total,
+                len(entities),
+            )
+
+            for entity in entities:
+                unique_keys = entity.get("unique_keys", [{}])
+                if unique_keys and isinstance(unique_keys, list):
+                    keys = unique_keys[0]
+                    entity["project_id"] = keys.get("project_id")
+                    entity["submitter_id"] = keys.get("submitter_id")
+                entity["transaction_id"] = transaction_id
+                entity["file_path"] = obj.get("file_path", '')
+                flat_list_dict.append(entity)
+
+        # renaming cols
+        for entity in flat_list_dict:
+            entity["gen3_guid"] = entity.pop("id", None)
+            entity["node"] = entity.pop("type", None)
+
+        logger.info("Finished flattening. Total entities: %s", len(flat_list_dict))
+        return flat_list_dict
+
+    def _find_version_from_path(self, path: str) -> Optional[str]:
+        """
+        Extracts a semantic version string (e.g., '1.0.0' or 'v1.0.0') from a file path.
+
+        Args:
+            path (str): The file path to inspect.
+
+        Returns:
+            Optional[str]: The extracted version string if found, otherwise None.
+        """
+        version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
+        found_versions = []
+
+        for segment in path.split('/'):
+            match = version_pattern.match(segment)
+            if match:
+                found_versions.append(match.group(1))
+
+        if not found_versions:
+            return None
+
+        if len(found_versions) > 1:
+            logger.warning("more than one match found in path for version string")
+
+        return found_versions[-1]
+
+    def _collect_versions_from_metadata_file_list(self) -> str:
+        """
+        Extract and validate version information from the internal list of metadata
+        file paths (self.metadata_file_list).
+
+        Returns:
+            str: The single version found in the file list.
+
+        Raises:
+            ValueError: If more than one version is found across the files,
+                or if no version is found at all.
+        """
+        versions = []
+        for file_path in self.metadata_file_list:
+            version = self._find_version_from_path(file_path)
+            if version:
+                versions.append(version)
+        versions = list(set(versions))
+        if len(versions) > 1:
+            logger.error(
+                "more than one version found in metadata file list: %s",
+                self.metadata_file_list,
+            )
+            raise ValueError(
+                "More than one version found in metadata file list: %s"
+                % self.metadata_file_list
+            )
+        if not versions:
+            raise ValueError(
+                "No version found in metadata file list: %s" % self.metadata_file_list
+            )
+        return versions[0]
+
+    def _upload_submission_results(self, submission_results: list):
+        """
+        Uploads the submission results to S3 and a Parquet table.
+
+        This function performs the final step of the pipeline:
+        1. Flattens the submission response structure.
+        2. Prepares a DataFrame with metadata (upload_id, datetime, version).
+        3. Writes the DataFrame to Parquet files in S3 and registers them in the
+           database configured via `self.database` and `self.table`.
+
+        **Retry Mechanism:**
+        Uses the `tenacity` library to retry the upload if it fails.
+        - Stop: After `self.max_retries` attempts.
+        - Wait: Exponential backoff starting at 1s, doubling up to 10s.
+
+        Args:
+            submission_results (list): List of submission results to upload.
+
+        Configuration used (from __init__):
+            dataset_root (str): e.g. "s3://acdc-dataops-metadata/metadata_upload/"
+            database (str): e.g. "acdc_dataops_metadata_db"
+            table (str): e.g. "metadata_upload"
+            partition_cols (list): e.g. ["upload_datetime"]
+        """
+
+        @retry(
+            stop=stop_after_attempt(self.max_retries),
+            wait=wait_exponential(multiplier=1, max=10)
+        )
+        def inner_upload():
+            logger.debug("Collecting version from metadata file list.")
+            version = self._collect_versions_from_metadata_file_list()
+            logger.debug("Extracted version: %s", version)
+
+            logger.debug("Inferring API endpoint from JWT.")
+            api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
+            logger.debug("Using API endpoint: %s", api_endpoint)
+
+            upload_datetime = datetime.now().isoformat()
+            upload_id = str(uuid.uuid4())
+            logger.debug("Upload datetime: %s", upload_datetime)
+            logger.debug("Generated upload ID: %s", upload_id)
+
+            logger.debug("Flattening submission results for upload.")
+            flattened_results = self._flatten_submission_results(submission_results)
+            logger.debug(
+                "Flattened %s submission result entries.",
+                len(flattened_results),
+            )
+
+            logger.debug("Converting flattened results to DataFrame.")
+            flattened_results_df = pd.DataFrame(flattened_results)
+            flattened_results_df['upload_datetime'] = upload_datetime
+            flattened_results_df['upload_id'] = upload_id
+            flattened_results_df['api_endpoint'] = api_endpoint
+            flattened_results_df['version'] = version
+
+            logger.info(
+                "Writing DataFrame to parquet and S3/table: "
+                "dataset_root=%s, database=%s, table=%s, partition_cols=%s",
+                self.dataset_root,
+                self.database,
+                self.table,
+                self.partition_cols,
+            )
+            write_parquet_to_db(
+                df=flattened_results_df,
+                dataset_root=self.dataset_root,
+                database=self.database,
+                table=self.table,
+                partition_cols=self.partition_cols,
+            )
+            logger.info(
+                "\033[94m[SUCCESS]\033[0m Metadata submission results upload complete. "
+                "Uploaded to dataset_root=%s, database=%s, table=%s.",
+                self.dataset_root,
+                self.database,
+                self.table,
+            )
+
+        # Execute the decorated inner function
+        try:
+            inner_upload()
+        except Exception as e:
+            logger.critical("Failed to upload submission results after %s attempts.", self.max_retries)
+            raise e
+
+    def _submit_data_chunks(
+        self,
+        split_json_list: list,
+        node: str,
+        gen3_submitter,
+        file_path: str,
+        upload_to_database: bool = True
+    ) -> List[Dict]:
+        """
+        Submit each chunk of data (in split_json_list) for a given node to Gen3,
+        using retry logic and logging on failures.
+
+        Upon completion of each chunk (success or failure), the response is uploaded
+        to the configured S3 Parquet table using `_upload_submission_results`.
+
+        Args:
+            split_json_list (list): List of JSON-serializable chunked data to
+                submit.
+            node (str): Name of the data node being submitted (e.g., "program").
+            gen3_submitter: A Gen3Submission instance for making submissions.
+            file_path (str): Path of the file that was submitted.
+                Used only for data capture in the result logs.
+
+        Returns:
+            List[Dict]: List of response dictionaries for each submitted chunk.
+
+        Raises:
+            RuntimeError: If submission fails after all retry attempts for any chunk.
+        """
+        n_json_data = len(split_json_list)
+
+        for index, jsn in enumerate(split_json_list):
+            # Holds results for the current chunk
+            current_chunk_response: List[Dict[str, Any]] = []
+            progress_str = f"{index + 1}/{n_json_data}"
+
+            submission_success = False
+            last_exception: Optional[Exception] = None
+
+            attempt = 0
+            while attempt <= self.max_retries:
+                try:
+                    if attempt == 0:
+                        logger.info(
+                            "[SUBMIT] | Project: %-10s | Node: %-12s | "
+                            "Split: %-5s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                        )
+                    else:
+                        logger.warning(
+                            "[RETRY] | Project: %-10s | Node: %-12s | "
+                            "Split: %-5s | "
+                            "Attempt: %s/%s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                            attempt,
+                            self.max_retries,
+                        )
+
+                    res = gen3_submitter.submit_record(self.program_id, self.project_id, jsn)
+                    res.update({"file_path": file_path})
+                    current_chunk_response.append(res)
+                    submission_success = True
+                    logger.info(
+                        "\033[92m[SUCCESS]\033[0m | Project: %-10s | "
+                        "Node: %-12s | Split: %-5s",
+                        self.project_id,
+                        node,
+                        progress_str,
+                    )
+                    break  # Success
+
+                except (
+                    requests.exceptions.RequestException,
+                    ValueError,
+                    TypeError,
+                ) as e:
+                    last_exception = e
+                    logger.error(
+                        "Error submitting chunk %s for node '%s': %s",
+                        progress_str,
+                        node,
+                        e,
+                    )
+
+                    if attempt < self.max_retries:
+                        time.sleep(0.2)
+                    else:
+                        logger.critical(
+                            "\033[91m[FAILED]\033[0m | Project: %-10s | "
+                            "Node: %-12s | Split: %-5s | Error: %s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                            e,
+                        )
+                attempt += 1
+
+
+            if upload_to_database:
+                # Also submitting data chunk response info to s3 and parquet table
+                logger.info("Submitting data chunk response info to S3 and Parquet table.")
+                self._upload_submission_results(submission_results=current_chunk_response)
+
+            if not submission_success:
+                # After retries, still failed
+                raise RuntimeError(
+                    (
+                        "Failed to submit chunk %s for node '%s' after %s attempts. "
+                        "Last error: %s"
+                    )
+                    % (progress_str, node, self.max_retries + 1, last_exception)
+                ) from last_exception
+
+        logger.info("Finished submitting node '%s'.", node)
+
+
+    def _read_data_import_order(
+        self,
+        data_import_order_path: str,
+        exclude_nodes: List[str],
+        boto3_session=None,
+    ):
+        """Helper to read the data import order from local disk or S3."""
         if is_s3_uri(data_import_order_path):
             session = boto3_session or self.boto3_session
-            return read_data_import_order_txt_s3(
+            return read_data_import_order_txt_s3(
+                data_import_order_path,
+                session,
+                exclude_nodes,
+            )
         else:
             return read_data_import_order_txt(data_import_order_path, exclude_nodes)
 
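Pulling the new constructor arguments together, a usage sketch of the reworked class (the bucket, database, secret, and directory names below are illustrative, not taken from the package):

    session = create_boto3_session()
    api_key = get_gen3_api_key_aws_secret(
        secret_name="gen3/api-key",              # hypothetical secret name
        region_name="ap-southeast-2",
        session=session,
    )

    submitter = MetadataSubmitter(
        metadata_file_list=list_metadata_jsons("metadata/v1.2.3"),   # hypothetical dir; path carries the version segment
        api_key=api_key,
        project_id="internal-project",
        data_import_order_path="metadata/v1.2.3/DataImportOrder.txt",
        dataset_root="s3://acdc-dataops-metadata/metadata_upload/",
        database="acdc_dataops_metadata_db",
        table="metadata_upload",
        upload_to_database=True,
    )
    submitter.submit_metadata()

Note that 0.7.1 removes the separate upload_metadata_submission_results() step: when upload_to_database is True, each chunk's response is written to the Parquet upload table from inside _submit_data_chunks.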
@@ -643,7 +971,7 @@ class MetadataSubmitter:
             list: A list of chunks, where each chunk is a list of dictionaries
                 containing JSON data.
         """
-        logger.info(
+        logger.info("Reading metadata json from %s", metadata_file_path)
         if is_s3_uri(metadata_file_path):
             session = self.boto3_session
             data = read_metadata_json_s3(metadata_file_path, session)
@@ -660,113 +988,62 @@ class MetadataSubmitter:
            are the corresponding file paths.
 
         Returns:
-            dict: Dictionary mapping node names (str) to their associated metadata file paths
+            dict: Dictionary mapping node names (str) to their associated metadata file paths.
         """
         file_map = {
-            get_node_from_file_path(
-            for
+            get_node_from_file_path(file_path): file_path
+            for file_path in self.metadata_file_list
         }
         return file_map
 
-    def submit_metadata(self) -> List[Dict]:
+    def submit_metadata(self) -> List[Dict[str, Any]]:
         """
         Submits metadata for each node defined in the data import order, except those in the exclude list.
-
-
-
-
+
+        **Detailed Process:**
+        1. **Order Resolution:** The function reads the import order to determine the sequence of nodes.
+        2. **File Mapping:** It finds the matching `node.json` file for each node in the order.
+        3. **Chunk & Submit:** For every file, the JSON content is split into chunks and submitted
+           to the Sheepdog API via `gen3.submission`.
+        4. **Audit Logging:** The API response (containing `submission_id`) is flattened and
+           converted to a DataFrame. This is then written to Parquet files in S3 and registered
+           in the configured upload table.
 
         Returns:
-            List[Dict]: A list of response dictionaries returned from the Gen3 metadata submissions.
+            List[Dict[str, Any]]: A list of response dictionaries returned from the Gen3 metadata submissions.
+                Each dictionary contains the response from submitting a chunk of metadata for a given node.
+                The keys in the dictionary are "node_name", "response", and "status_code".
         """
         gen3_submitter = self._create_gen3_submission_class()
-        data_import_order = self._read_data_import_order(
+        data_import_order = self._read_data_import_order(
+            self.data_import_order_path,
+            self.exclude_nodes,
+            self.boto3_session,
+        )
         file_map = self._create_file_map()
-        output_response_list_dict = []
 
         logger.info("Starting metadata submission.")
-        for node in data_import_order:
 
+        for node in data_import_order:
             if node in self.exclude_nodes:
-                logger.info(
+                logger.info("Skipping node '%s' (in exclude list).", node)
                 continue
             file_path = file_map.get(node)
             if not file_path:
-                logger.info(
+                logger.info("Skipping node '%s' (not present in file list).", node)
                 continue
 
-            logger.info(
+            logger.info("Processing file '%s' for node '%s'.", file_path, node)
             logger.info("Splitting JSON data into chunks.")
            json_chunks = self._prepare_json_chunks(file_path, self.max_size_kb)
 
             logger.info("Submitting chunks to Gen3.")
-
+            self._submit_data_chunks(
                 split_json_list=json_chunks,
                 node=node,
-                file_path=file_path,
                 gen3_submitter=gen3_submitter,
-
-
-                program_id=self.program_id
+                file_path=file_path,
+                upload_to_database=self.upload_to_database
             )
-
-
-        self.submission_results = output_response_list_dict
-        return output_response_list_dict
+
 
-    def upload_metadata_submission_results(
-        self,
-        dataset_root: str,
-        database: str,
-        table: str,
-        partition_cols: list = ["upload_datetime"],
-    ):
-        """
-        Uploads the submission results to s3 and parquet table.
-
-        Args:
-            dataset_root (str): S3 path where the parquet files will be stored
-                (e.g., "s3://acdc-dataops-metadata/metadata_upload/").
-            database (str): Database name for storing the metadata upload
-                (e.g., "acdc_dataops_metadata_db").
-            table (str): Table name for storing the metadata upload
-                (e.g., "metadata_upload").
-            partition_cols (list, optional): List of column names to partition the parquet table by.
-                Defaults to ["upload_datetime"].
-        """
-        logger.info("Collecting version from metadata file list.")
-        version = collect_versions_from_metadata_file_list(self.metadata_file_list)
-        logger.info(f"Extracted version: {version}")
-
-        logger.info("Inferring API endpoint from JWT.")
-        api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
-        logger.info(f"Using API endpoint: {api_endpoint}")
-
-        upload_datetime = datetime.now().isoformat()
-        upload_id = str(uuid.uuid4())
-        logger.info(f"Upload datetime: {upload_datetime}")
-        logger.info(f"Generated upload ID: {upload_id}")
-
-        logger.info("Flattening submission results for upload.")
-        flattened_results = flatten_submission_results(self.submission_results)
-        logger.info(f"Flattened {len(flattened_results)} submission result entries.")
-
-        logger.info("Converting flattened results to DataFrame.")
-        flattened_results_df = pd.DataFrame(flattened_results)
-        flattened_results_df['upload_datetime'] = upload_datetime
-        flattened_results_df['upload_id'] = upload_id
-        flattened_results_df['api_endpoint'] = api_endpoint
-        flattened_results_df['version'] = version
-
-        logger.info(
-            f"Writing DataFrame to parquet and S3/table: "
-            f"dataset_root={dataset_root}, database={database}, table={table}, partition_cols={partition_cols}"
-        )
-        write_parquet_to_db(
-            df=flattened_results_df,
-            dataset_root=dataset_root,
-            database=database,
-            table=table,
-            partition_cols=partition_cols
-        )
-        logger.info("Metadata submission results upload complete.")
--- a/acdc_aws_etl_pipeline-0.6.9.dist-info/METADATA
+++ b/acdc_aws_etl_pipeline-0.7.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.
+Version: 0.7.1
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -23,6 +23,7 @@ Requires-Dist: python-dotenv
 Requires-Dist: pytz (>=2025.2,<2026.0)
 Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
 Requires-Dist: s3fs (==2025.10.0)
+Requires-Dist: tenacity (>=8.2,<10.0)
 Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
 Description-Content-Type: text/markdown
 
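The new tenacity requirement backs the retry decorator used in _upload_submission_results; a minimal sketch of that pattern (the function body is a placeholder):

    from tenacity import retry, stop_after_attempt, wait_exponential

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))
    def upload_with_retries():
        # Any exception raised here triggers another attempt, with exponential
        # backoff capped at 10 seconds, until three attempts have been made.
        ...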
--- a/acdc_aws_etl_pipeline-0.6.9.dist-info/RECORD
+++ b/acdc_aws_etl_pipeline-0.7.1.dist-info/RECORD
@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6
 acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
 acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
 acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
-acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=
+acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=2PVuv-mvjnO-FxVZHiYfTDlbioEo-JsTcvNZY6v2n40,38331
 acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
 acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
 acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
 acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
 acdc_aws_etl_pipeline/validate/validate.py,sha256=zLqK9i92FsRAaBOGdY-G7-vb0e6tmkoUXhY6zCfbjN8,24895
-acdc_aws_etl_pipeline-0.
-acdc_aws_etl_pipeline-0.
-acdc_aws_etl_pipeline-0.
+acdc_aws_etl_pipeline-0.7.1.dist-info/METADATA,sha256=WddwCKf3KV4-JsKtsegk5dxu6dWKvXx8YANvZZKbRGs,2964
+acdc_aws_etl_pipeline-0.7.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+acdc_aws_etl_pipeline-0.7.1.dist-info/RECORD,,
{acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/WHEEL: file without changes.