acdc_aws_etl_pipeline 0.5.8__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acdc_aws_etl_pipeline/ingest/ingest.py +81 -8
- {acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/METADATA +2 -1
- {acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/RECORD +4 -4
- {acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/WHEEL +0 -0
acdc_aws_etl_pipeline/ingest/ingest.py

```diff
@@ -9,6 +9,9 @@ from datetime import datetime
 from botocore.exceptions import ClientError
 import logging
 import pytz # Replaced tzlocal with pytz
+import s3fs
+from typing import Dict
+
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
```
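The newly pinned s3fs dependency is what lets pandas resolve `s3://` URIs through fsspec, so the ingest readers can open objects in place rather than downloading them first. A minimal sketch of the effect; the bucket and key below are hypothetical placeholders:

```python
import pandas as pd

# With s3fs installed, any fsspec-aware pandas reader accepts an S3 URI
# directly; no explicit boto3 download step is needed.
df = pd.read_csv("s3://example-bucket/raw/table.csv")  # hypothetical URI
```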
```diff
@@ -125,15 +128,84 @@ def read_json_robust(uri: str) -> pd.DataFrame:
     raise RuntimeError(f"Failed to read JSON from {uri}: {last_err}")
     return df
 
-def read_xlsx_robust(
+def read_xlsx_robust(s3_uri: str) -> dict[str, pd.DataFrame]:
+    """
+    Reads an XLSX file from the given S3 URI using pandas with the openpyxl engine.
+    Reads all values as strings and disables default NA values.
+    Returns a dictionary of dataframes, with the sheet name as the key.
+    If there is only one sheet, the key is set to the file name (minus extension) from s3_uri.
+
+    Args:
+        s3_uri (str): The S3 URI of the XLSX file.
+
+    Returns:
+        dict[str, pd.DataFrame]: Dictionary mapping sheet name (or file name) to DataFrame.
+
+    Raises:
+        RuntimeError: If reading the XLSX file fails,
+        ValueError: If the S3 URI is invalid.
+    """
+    # Check that s3_uri is a valid S3 URI
+    if not isinstance(s3_uri, str) or not s3_uri.startswith("s3://"):
+        logger.error(f"Invalid S3 URI: {s3_uri}")
+        raise ValueError(f"Invalid S3 URI: {s3_uri}")
+    try:
+        logger.debug(f"Attempting to read XLSX from {s3_uri}")
+        # Read all sheets, always returns dict
+        df_dict = pd.read_excel(
+            s3_uri,
+            sheet_name=None,
+            engine="openpyxl",
+            dtype=str,
+            keep_default_na=False
+        )
+        logger.debug(f"Successfully read XLSX from {s3_uri}: sheets={list(df_dict.keys())}")
+
+        if len(df_dict) == 1:
+            # Only one sheet, rename the key to the file name (no extension)
+            import os
+            file_name = os.path.splitext(os.path.basename(s3_uri))[0]
+            only_df = next(iter(df_dict.values()))
+            logger.debug(f"Only one sheet found, renaming key to file name: {file_name}")
+            return {file_name: only_df}
+
+        return df_dict
+    except Exception as e:
+        logger.error(f"Failed to read XLSX from {s3_uri}: {e}")
+        raise RuntimeError(f"Failed to read XLSX from {s3_uri}: {e}")
+
+
+def flatten_xlsx_dict(df_dict: dict[str, pd.DataFrame]) -> pd.DataFrame:
+    """
+    Flattens a dictionary of DataFrames (representing Excel sheets) into a single DataFrame.
+
+    Adds a "sheet_name" column to each DataFrame, indicating the originating sheet,
+    then concatenates all DataFrames into one.
+
+    Args:
+        df_dict (dict[str, pd.DataFrame]): Dictionary mapping sheet names to DataFrames.
+
+    Returns:
+        pd.DataFrame: Concatenated DataFrame with an added "sheet_name" column.
+
+    Raises:
+        ValueError: If the provided dictionary is empty.
+        RuntimeError: If an error occurs during DataFrame concatenation.
+    """
+    if not df_dict:
+        logger.error("The df_dict provided to flatten_xlsx_dict is empty.")
+        raise ValueError("Input dictionary of DataFrames is empty.")
+
+    dfs_with_sheet = []
     try:
-
-
-
-
+        for sheet_name, df in df_dict.items():
+            df_with_sheet = df.copy()
+            df_with_sheet["sheet_name"] = sheet_name
+            dfs_with_sheet.append(df_with_sheet)
+        return pd.concat(dfs_with_sheet, ignore_index=True)
     except Exception as e:
-        logger.error(f"Failed to
-        raise RuntimeError(f"Failed to
+        logger.error(f"Failed to flatten XLSX dictionary: {e}")
+        raise RuntimeError(f"Failed to flatten XLSX dictionary: {e}") from e
 
 
 def get_format(uri: str) -> str:
```
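Together the two new helpers take a workbook from S3 to a single long DataFrame in two calls. A minimal usage sketch, assuming the module path shown in this diff; the URI and sheet contents are hypothetical:

```python
from acdc_aws_etl_pipeline.ingest.ingest import flatten_xlsx_dict, read_xlsx_robust

# Hypothetical multi-sheet workbook; every cell arrives as str because
# read_xlsx_robust passes dtype=str and keep_default_na=False.
sheets = read_xlsx_robust("s3://example-bucket/raw/cohort.xlsx")

# One concatenated frame, each row tagged with its originating sheet.
flat = flatten_xlsx_dict(sheets)
print(flat["sheet_name"].unique())
```

Note the single-sheet edge case: read_xlsx_robust rekeys a one-sheet workbook to the file stem, so the "sheet_name" column then carries the file name rather than the sheet's original name.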
```diff
@@ -473,7 +545,8 @@ def ingest_table_to_parquet_dataset(
     elif file_format == "csv":
         df = read_csv_robust(uri)
     elif file_format == "xlsx":
-
+        df_dict = read_xlsx_robust(uri)
+        df = flatten_xlsx_dict(df_dict)
     else:
         logger.error(f"Unsupported file format: {file_format} for file {uri}")
         raise ValueError(f"Unsupported file format: {file_format}")
```
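The xlsx branch now routes through the two new helpers, landing all sheets of a workbook in one frame before the Parquet write. A self-contained sketch of the resulting shape, with stand-in data invented for illustration:

```python
import pandas as pd

# Stand-in for read_xlsx_robust output on a hypothetical two-sheet workbook.
sheets = {
    "sites": pd.DataFrame({"id": ["S1", "S2"]}),
    "samples": pd.DataFrame({"id": ["X9"]}),
}

# Equivalent of flatten_xlsx_dict: tag each frame with its sheet, then concat.
flat = pd.concat(
    [df.assign(sheet_name=name) for name, df in sheets.items()],
    ignore_index=True,
)
# flat -> columns ["id", "sheet_name"], three rows, provenance preserved.
```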
{acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.5.8
+Version: 0.6.0
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -21,6 +21,7 @@ Requires-Dist: pytest
 Requires-Dist: python-dotenv
 Requires-Dist: pytz (>=2025.2,<2026.0)
 Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
+Requires-Dist: s3fs (==2025.10.0)
 Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
 Description-Content-Type: text/markdown
 
```
{acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/RECORD

```diff
@@ -1,5 +1,5 @@
 acdc_aws_etl_pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-acdc_aws_etl_pipeline/ingest/ingest.py,sha256=
+acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6XuFA1RZAU,26846
 acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
 acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
 acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
@@ -9,6 +9,6 @@ acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSH
 acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
 acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
 acdc_aws_etl_pipeline/validate/validate.py,sha256=fTa76YvixCWOGkAIuR7CZ2WryMJcpc2wvSOHLZDEknc,28159
-acdc_aws_etl_pipeline-0.5.8.dist-info/METADATA,sha256=
-acdc_aws_etl_pipeline-0.5.8.dist-info/WHEEL,sha256=
-acdc_aws_etl_pipeline-0.5.8.dist-info/RECORD,,
+acdc_aws_etl_pipeline-0.6.0.dist-info/METADATA,sha256=gWzCbMUrx9WA9Zy8b6fma_L-KmryghobYWoXp8EokNQ,2887
+acdc_aws_etl_pipeline-0.6.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+acdc_aws_etl_pipeline-0.6.0.dist-info/RECORD,,
```

{acdc_aws_etl_pipeline-0.5.8.dist-info → acdc_aws_etl_pipeline-0.6.0.dist-info}/WHEEL

File without changes