acdc_aws_etl_pipeline 0.5.8__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14) hide show
  1. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/PKG-INFO +2 -1
  2. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/pyproject.toml +3 -1
  3. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/ingest/ingest.py +81 -8
  4. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/README.md +0 -0
  5. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  6. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  7. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
  8. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
  9. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +0 -0
  10. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  11. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  12. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
  14. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: acdc_aws_etl_pipeline
3
- Version: 0.5.8
3
+ Version: 0.6.0
4
4
  Summary: Tools for ACDC ETL pipeline
5
5
  Author: JoshuaHarris391
6
6
  Author-email: harjo391@gmail.com
@@ -21,6 +21,7 @@ Requires-Dist: pytest
21
21
  Requires-Dist: python-dotenv
22
22
  Requires-Dist: pytz (>=2025.2,<2026.0)
23
23
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
24
+ Requires-Dist: s3fs (==2025.10.0)
24
25
  Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
25
26
  Description-Content-Type: text/markdown
26
27
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "acdc_aws_etl_pipeline"
3
- version = "0.5.8"
3
+ version = "0.6.0"
4
4
  description = "Tools for ACDC ETL pipeline"
5
5
  authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
6
6
  readme = "README.md"
@@ -18,6 +18,8 @@ gen3 = ">=4.27.4, <5.0.0"
18
18
  gen3_validator = ">=1.1.2,<2.0.0"
19
19
  pytest = "*"
20
20
  pytz = ">=2025.2,<2026.0"
21
+ s3fs = "2025.10.0"
22
+
21
23
 
22
24
  [tool.poetry.group.dev.dependencies]
23
25
  ipykernel = "^6.30.1"
@@ -9,6 +9,9 @@ from datetime import datetime
9
9
  from botocore.exceptions import ClientError
10
10
  import logging
11
11
  import pytz # Replaced tzlocal with pytz
12
+ import s3fs
13
+ from typing import Dict
14
+
12
15
 
13
16
  logger = logging.getLogger(__name__)
14
17
  logger.setLevel(logging.INFO)
@@ -125,15 +128,84 @@ def read_json_robust(uri: str) -> pd.DataFrame:
125
128
  raise RuntimeError(f"Failed to read JSON from {uri}: {last_err}")
126
129
  return df
127
130
 
128
def read_xlsx_robust(s3_uri: str) -> dict[str, pd.DataFrame]:
    """
    Read an XLSX file from the given S3 URI using pandas with the openpyxl engine.

    All cell values are read as strings and pandas' default NA conversion is
    disabled, so blank cells come back as empty strings rather than NaN.
    Returns a dictionary of DataFrames keyed by sheet name. If the workbook
    contains exactly one sheet, the key is the file name (minus extension)
    taken from s3_uri instead.

    Args:
        s3_uri (str): The S3 URI of the XLSX file.

    Returns:
        dict[str, pd.DataFrame]: Dictionary mapping sheet name (or file name) to DataFrame.

    Raises:
        ValueError: If the S3 URI is invalid.
        RuntimeError: If reading the XLSX file fails.
    """
    # Validate before handing the URI to pandas/s3fs so callers get a clear error.
    if not isinstance(s3_uri, str) or not s3_uri.startswith("s3://"):
        logger.error(f"Invalid S3 URI: {s3_uri}")
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    try:
        logger.debug(f"Attempting to read XLSX from {s3_uri}")
        # sheet_name=None reads every sheet and always returns a dict.
        df_dict = pd.read_excel(
            s3_uri,
            sheet_name=None,
            engine="openpyxl",
            dtype=str,
            keep_default_na=False,
        )
        logger.debug(f"Successfully read XLSX from {s3_uri}: sheets={list(df_dict.keys())}")

        if len(df_dict) == 1:
            # Single-sheet workbook: key the result by the file name instead of
            # the (often meaningless) default sheet name.
            import os
            file_name = os.path.splitext(os.path.basename(s3_uri))[0]
            only_df = next(iter(df_dict.values()))
            logger.debug(f"Only one sheet found, renaming key to file name: {file_name}")
            return {file_name: only_df}

        return df_dict
    except Exception as e:
        logger.error(f"Failed to read XLSX from {s3_uri}: {e}")
        # Chain the original exception (consistent with flatten_xlsx_dict) so
        # the root cause is preserved in tracebacks.
        raise RuntimeError(f"Failed to read XLSX from {s3_uri}: {e}") from e
178
def flatten_xlsx_dict(df_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Flatten a dictionary of sheet DataFrames into one combined DataFrame.

    Each input DataFrame gains a "sheet_name" column recording which sheet it
    came from; the tagged DataFrames are then concatenated with a fresh index.

    Args:
        df_dict (dict[str, pd.DataFrame]): Mapping of sheet names to DataFrames.

    Returns:
        pd.DataFrame: Single concatenated DataFrame with a "sheet_name" column.

    Raises:
        ValueError: If the provided dictionary is empty.
        RuntimeError: If an error occurs during DataFrame concatenation.
    """
    # Guard first: pd.concat on an empty list would raise a less helpful error.
    if not df_dict:
        logger.error("The df_dict provided to flatten_xlsx_dict is empty.")
        raise ValueError("Input dictionary of DataFrames is empty.")

    try:
        # assign() returns a tagged copy, leaving the caller's DataFrames untouched.
        tagged = [
            frame.assign(sheet_name=sheet) for sheet, frame in df_dict.items()
        ]
        return pd.concat(tagged, ignore_index=True)
    except Exception as e:
        logger.error(f"Failed to flatten XLSX dictionary: {e}")
        raise RuntimeError(f"Failed to flatten XLSX dictionary: {e}") from e
137
209
 
138
210
 
139
211
  def get_format(uri: str) -> str:
@@ -473,7 +545,8 @@ def ingest_table_to_parquet_dataset(
473
545
  elif file_format == "csv":
474
546
  df = read_csv_robust(uri)
475
547
  elif file_format == "xlsx":
476
- df = read_xlsx_robust(uri)
548
+ df_dict = read_xlsx_robust(uri)
549
+ df = flatten_xlsx_dict(df_dict)
477
550
  else:
478
551
  logger.error(f"Unsupported file format: {file_format} for file {uri}")
479
552
  raise ValueError(f"Unsupported file format: {file_format}")