acdc_aws_etl_pipeline 0.5.8__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14) hide show
  1. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/PKG-INFO +2 -1
  2. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/pyproject.toml +3 -1
  3. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/ingest/ingest.py +81 -8
  4. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/README.md +0 -0
  5. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  6. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  7. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
  8. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
  9. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +0 -0
  10. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  11. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  12. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
  14. {acdc_aws_etl_pipeline-0.5.8 → acdc_aws_etl_pipeline-0.6.0}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: acdc_aws_etl_pipeline
3
- Version: 0.5.8
3
+ Version: 0.6.0
4
4
  Summary: Tools for ACDC ETL pipeline
5
5
  Author: JoshuaHarris391
6
6
  Author-email: harjo391@gmail.com
@@ -21,6 +21,7 @@ Requires-Dist: pytest
21
21
  Requires-Dist: python-dotenv
22
22
  Requires-Dist: pytz (>=2025.2,<2026.0)
23
23
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
24
+ Requires-Dist: s3fs (==2025.10.0)
24
25
  Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
25
26
  Description-Content-Type: text/markdown
26
27
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "acdc_aws_etl_pipeline"
3
- version = "0.5.8"
3
+ version = "0.6.0"
4
4
  description = "Tools for ACDC ETL pipeline"
5
5
  authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
6
6
  readme = "README.md"
@@ -18,6 +18,8 @@ gen3 = ">=4.27.4, <5.0.0"
18
18
  gen3_validator = ">=1.1.2,<2.0.0"
19
19
  pytest = "*"
20
20
  pytz = ">=2025.2,<2026.0"
21
+ s3fs = "2025.10.0"
22
+
21
23
 
22
24
  [tool.poetry.group.dev.dependencies]
23
25
  ipykernel = "^6.30.1"
@@ -9,6 +9,9 @@ from datetime import datetime
9
9
  from botocore.exceptions import ClientError
10
10
  import logging
11
11
  import pytz # Replaced tzlocal with pytz
12
+ import s3fs
13
+ from typing import Dict
14
+
12
15
 
13
16
  logger = logging.getLogger(__name__)
14
17
  logger.setLevel(logging.INFO)
@@ -125,15 +128,84 @@ def read_json_robust(uri: str) -> pd.DataFrame:
125
128
  raise RuntimeError(f"Failed to read JSON from {uri}: {last_err}")
126
129
  return df
127
130
 
128
def read_xlsx_robust(s3_uri: str) -> dict[str, pd.DataFrame]:
    """
    Read an XLSX file from the given S3 URI using pandas with the openpyxl engine.

    All cell values are read as strings and pandas' default NA conversion is
    disabled, so blank cells come back as empty strings rather than NaN.
    Returns a dictionary of DataFrames keyed by sheet name. If the workbook
    contains exactly one sheet, the key is the file name (minus extension)
    taken from s3_uri instead.

    Args:
        s3_uri (str): The S3 URI of the XLSX file.

    Returns:
        dict[str, pd.DataFrame]: Dictionary mapping sheet name (or file name) to DataFrame.

    Raises:
        ValueError: If the S3 URI is invalid.
        RuntimeError: If reading the XLSX file fails.
    """
    # Validate before handing the URI to pandas/s3fs so callers get a clear error.
    if not isinstance(s3_uri, str) or not s3_uri.startswith("s3://"):
        logger.error(f"Invalid S3 URI: {s3_uri}")
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    try:
        logger.debug(f"Attempting to read XLSX from {s3_uri}")
        # sheet_name=None reads every sheet and always returns a dict.
        df_dict = pd.read_excel(
            s3_uri,
            sheet_name=None,
            engine="openpyxl",
            dtype=str,
            keep_default_na=False,
        )
        logger.debug(f"Successfully read XLSX from {s3_uri}: sheets={list(df_dict.keys())}")

        if len(df_dict) == 1:
            # Single-sheet workbook: key the result by the file name instead of
            # the (often meaningless) default sheet name.
            import os
            file_name = os.path.splitext(os.path.basename(s3_uri))[0]
            only_df = next(iter(df_dict.values()))
            logger.debug(f"Only one sheet found, renaming key to file name: {file_name}")
            return {file_name: only_df}

        return df_dict
    except Exception as e:
        logger.error(f"Failed to read XLSX from {s3_uri}: {e}")
        # Chain the original exception (consistent with flatten_xlsx_dict) so
        # the root cause is preserved in tracebacks.
        raise RuntimeError(f"Failed to read XLSX from {s3_uri}: {e}") from e
178
def flatten_xlsx_dict(df_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Flatten a dictionary of sheet DataFrames into one combined DataFrame.

    Each input DataFrame gains a "sheet_name" column recording which sheet it
    came from; the tagged DataFrames are then concatenated with a fresh index.

    Args:
        df_dict (dict[str, pd.DataFrame]): Mapping of sheet names to DataFrames.

    Returns:
        pd.DataFrame: Single concatenated DataFrame with a "sheet_name" column.

    Raises:
        ValueError: If the provided dictionary is empty.
        RuntimeError: If an error occurs during DataFrame concatenation.
    """
    # Guard first: pd.concat on an empty list would raise a less helpful error.
    if not df_dict:
        logger.error("The df_dict provided to flatten_xlsx_dict is empty.")
        raise ValueError("Input dictionary of DataFrames is empty.")

    try:
        # assign() returns a tagged copy, leaving the caller's DataFrames untouched.
        tagged = [
            frame.assign(sheet_name=sheet) for sheet, frame in df_dict.items()
        ]
        return pd.concat(tagged, ignore_index=True)
    except Exception as e:
        logger.error(f"Failed to flatten XLSX dictionary: {e}")
        raise RuntimeError(f"Failed to flatten XLSX dictionary: {e}") from e
137
209
 
138
210
 
139
211
  def get_format(uri: str) -> str:
@@ -473,7 +545,8 @@ def ingest_table_to_parquet_dataset(
473
545
  elif file_format == "csv":
474
546
  df = read_csv_robust(uri)
475
547
  elif file_format == "xlsx":
476
- df = read_xlsx_robust(uri)
548
+ df_dict = read_xlsx_robust(uri)
549
+ df = flatten_xlsx_dict(df_dict)
477
550
  else:
478
551
  logger.error(f"Unsupported file format: {file_format} for file {uri}")
479
552
  raise ValueError(f"Unsupported file format: {file_format}")