acdc_aws_etl_pipeline 0.6.2.tar.gz → 0.6.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/PKG-INFO +2 -2
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/pyproject.toml +2 -2
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/validate/validate.py +76 -140
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/README.md +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.6.2
+Version: 0.6.4
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com

@@ -15,7 +15,7 @@ Requires-Dist: boto3
 Requires-Dist: dbt-athena (==1.9.4)
 Requires-Dist: dbt-core (==1.9.4)
 Requires-Dist: gen3 (>=4.27.4,<5.0.0)
-Requires-Dist: gen3_validator (>=
+Requires-Dist: gen3_validator (>=2.0.0,<3.0.0)
 Requires-Dist: numpy (<2.0.0)
 Requires-Dist: pytest
 Requires-Dist: python-dotenv
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "acdc_aws_etl_pipeline"
-version = "0.6.2"
+version = "0.6.4"
 description = "Tools for ACDC ETL pipeline"
 authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
 readme = "README.md"

@@ -15,7 +15,7 @@ numpy = "<2.0.0"
 pyyaml = ">=6.0.2,<7.0.0"
 tzlocal = ">=5.3.1,<6.0.0"
 gen3 = ">=4.27.4, <5.0.0"
-gen3_validator = ">=
+gen3_validator = ">=2.0.0,<3.0.0"
 pytest = "*"
 pytz = ">=2025.2,<2026.0"
 s3fs = "2025.10.0"
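Both metadata files carry the same two changes: the package version moves to 0.6.4 and gen3_validator is now constrained to the 2.x line. A minimal sketch (standard library only; the pin strings are copied from the diff above, the check itself is purely illustrative and not part of the package) for confirming an environment picked up the new pins:

# Illustrative check, not part of this package: report installed versions of the
# distributions whose pins changed in this release.
from importlib.metadata import PackageNotFoundError, version

for dist, pin in [("acdc_aws_etl_pipeline", "0.6.4"), ("gen3_validator", ">=2.0.0,<3.0.0")]:
    try:
        print(f"{dist}: installed {version(dist)} (release pins {pin})")
    except PackageNotFoundError:
        print(f"{dist}: not installed in this environment")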
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/validate/validate.py

@@ -534,165 +534,112 @@ def validate_pipeline(
     write_back_root: str,
     parquet_root: str,
     glue_database: str,
-    root_node: str =
-):
+    root_node: str = "project",
+) -> None:
     """
-    Orchestrate
-
-    to S3 and Athena/Glue in usable format.
-
-    :param study_id: The identifier of the study whose files/results to process (e.g., 'ausdiab').
-    :type study_id: str
-    :param schema_s3_uri: Full S3 URI to the JSON schema file.
-    :type schema_s3_uri: str
-    :param validation_s3_uri: S3 prefix/folder where validation result files (.json) are stored.
-    :type validation_s3_uri: str
-    :param write_back_root: S3 prefix/folder to write generated artifacts/reports to.
-    :type write_back_root: str
-    :param parquet_root: S3 prefix/folder for writing Parquet files for Athena.
-    :type parquet_root: str
-    :param glue_database: The Glue database name to register tables into for Athena.
-    :type glue_database: str
-    :param root_node: The root node in the schema graph for link validation.
-    :type root_node: str
-
-    :raises Exception: If any step in the process fails.
-
-    .. note::
-        This function is typically used in an ETL or validation pipeline step for data quality assurance.
+    Orchestrate the validation workflow for a study: load + resolve schema, find the latest
+    validation artefacts, validate, and persist results to S3 and Athena/Glue.
 
-
+    Args:
+        study_id: Study identifier (e.g. "ausdiab").
+        schema_s3_uri: Full S3 URI to the JSON schema file.
+        validation_s3_uri: S3 prefix containing validation result files (.json).
+        write_back_root: S3 prefix to write generated artefacts/reports to.
+        parquet_root: S3 prefix for Parquet outputs (Athena).
+        glue_database: Glue database to register tables into.
+        root_node: Root node in the schema graph for link validation.
 
-
-
-    3. Identifies the most recent validation results for the requested study.
-    4. Downloads the relevant data files to a temp directory.
-    5. Runs validation and builds both full and summary DataFrames of the results.
-    6. Writes CSV summaries to S3 for reporting.
-    7. Writes full and summarized validation results as Parquet to S3 and registers in Glue.
-
-    **Example**
-
-    >>> validate_pipeline(
-    ...     study_id='ausdiab',
-    ...     schema_s3_uri='s3://bucket/schemas/my_schema.json',
-    ...     validation_s3_uri='s3://bucket/validation/',
-    ...     write_back_root='s3://bucket/validation/',
-    ...     parquet_root='s3://bucket/validation/athena/',
-    ...     glue_database='validation_db',
-    ...     root_node='project'  # example value for root_node
-    ... )
+    Raises:
+        RuntimeError: When expected inputs are missing or a pipeline step fails.
     """
+    # NOTE: root_node is currently unused in the original implementation.
+    # Keeping it in the signature for forward compatibility.
+
     try:
         schema = load_schema_from_s3_uri(schema_s3_uri)
         schema_path = write_schema_to_temp_file(schema)
+        logger.info("Schema loaded and written to temp file: %s", schema_path)
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Schema loading/writing failed.")
+        raise RuntimeError("Schema loading/writing failed.") from e
 
     try:
-        logger.info(
+        logger.info("Instantiating schema resolver with schema path: %s", schema_path)
         resolver = gen3_validator.ResolveSchema(schema_path=schema_path)
         resolver.resolve_schema()
         logger.info("Schema resolved.")
     except Exception as e:
-        logger.
-        raise
-
-    try:
-        logger.info(f"Getting schema version")
-        dd = gen3_validator.DataDictionary(schema_path = schema_path)
-        dd.parse_schema()
-        schema_version = dd.get_schema_version(dd.schema)
-        logger.info(f"Schema version: {schema_version}")
-    except Exception as e:
-        logger.error(f"Failed to get schema version: {e}")
-        raise
+        logger.exception("Failed to instantiate or resolve schema.")
+        raise RuntimeError("Schema resolution failed.") from e
 
     try:
         metadata_table = pd.DataFrame(create_metadata_table(validation_s3_uri))
-        logger.info(
+        logger.info("Metadata table created from S3 (%s rows).", len(metadata_table))
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to create metadata table.")
+        raise RuntimeError("Metadata table creation failed.") from e
 
     try:
-
-        if
-
-            raise Exception(f"No latest validation files found for study {study_id}.")
+        latest_metadata, latest_validation_id = get_latest_validation_for_study(metadata_table, study_id)
+        if latest_metadata is None or latest_metadata.empty:
+            raise RuntimeError(f"No latest validation files found for study '{study_id}'.")
 
-
-
-        logger.info(
-        logger.info(f"Latest validation S3 URIs: {latest_validation_s3_uris}")
+        latest_validation_s3_uris = latest_metadata["s3_uri"].tolist()
+        logger.info("Latest validation id: %s", latest_validation_id)
+        logger.info("Latest validation file count: %s", len(latest_validation_s3_uris))
 
-        downloaded_files_dir,
-    except Exception as e:
-        logger.error(f"Failed to retrieve and download latest validation files: {e}")
-        raise
-
-    try:
+        downloaded_files_dir, _downloaded_files = download_s3_files_to_temp_dir(latest_validation_s3_uris)
         files_in_dir = os.listdir(downloaded_files_dir)
-        logger.info(
-        for
-            logger.info(
+        logger.info("Downloaded files dir: %s", downloaded_files_dir)
+        for name in files_in_dir:
+            logger.info(" - %s", name)
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to retrieve/download latest validation files.")
+        raise RuntimeError("Retrieval/download of validation files failed.") from e
 
     try:
-        logger.info(
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-        logger.info("Validation completed.")
-
-        summary = gen3_validator.ValidateSummary(validator)
-        summary.flatten_validation_results()
-        full_validation_results_df = summary.flattened_results_to_pd()
-        full_validation_results_df['validation_id'] = latest_validation_id
-        full_validation_results_df['study_id'] = study_id
-        full_validation_results_df['schema_version'] = schema_version
-        full_validation_results_df['linkage_pass'] = str(linkage_pass)
-        full_validation_results_df['n_unlinked_foreign_keys'] = str(unlinked_fk)
-        full_validation_results_df['linkage_results_dict'] = truncate_linkage_results(linkage_results)
-        summarised_validation_results_df = summary.collapse_flatten_results_to_pd()
-        summarised_validation_results_df['validation_id'] = latest_validation_id
-        summarised_validation_results_df['study_id'] = study_id
-        summarised_validation_results_df['schema_version'] = schema_version
-        logger.info("Validation results processed to DataFrame.")
+        logger.info("Reading JSON validation files from: %s", downloaded_files_dir)
+
+        data_by_file = {}
+        for name in files_in_dir:
+            if not name.endswith(".json"):
+                continue
+            full_path = os.path.join(downloaded_files_dir, name)
+            with open(full_path, "r") as fh:
+                data_by_file[name] = json.load(fh)
+
+        logger.info("Validating %s JSON file(s).", len(data_by_file))
+
+        results: list[dict] = []
+        for filename, obj in data_by_file.items():
+            logger.info("Validating %s", filename)
+            results.extend(gen3_validator.validate.validate_list_dict(obj, resolver.schema_resolved))
+
+        full_validation_results_df = pd.DataFrame(results)
+        full_validation_results_df["validation_id"] = latest_validation_id
+        full_validation_results_df["study_id"] = study_id
+        logger.info("Validation completed (%s rows).", len(full_validation_results_df))
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Validation failed.")
+        raise RuntimeError("Validation failed.") from e
+
+    write_back_s3_uri = (
+        f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
+    )
 
-    write_back_s3_uri = f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
     try:
-        logger.info(
+        logger.info("Writing CSV results to S3: %s", write_back_s3_uri)
         write_df_to_s3(full_validation_results_df, write_back_s3_uri, "full_validation_results.csv")
-        write_df_to_s3(summarised_validation_results_df, write_back_s3_uri, "summarised_validation_results.csv")
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to write CSV validation results to S3.")
+        raise RuntimeError("Writing CSV results failed.") from e
 
     try:
-        logger.info(
+        logger.info(
+            "Writing Parquet to Glue DB '%s', table '%s'.",
+            glue_database,
+            "full_validation_results",
+        )
         write_parquet_to_db(
             df=full_validation_results_df,
             dataset_root=parquet_root,

@@ -701,19 +648,8 @@ def validate_pipeline(
             partition_cols=["validation_id"],
             compression="snappy",
             mode="append",
-            schema_evolution=True
-        )
-        logger.info(f"Writing summarised_validation_results to database '{glue_database}', table 'summarised_validation_results'")
-        write_parquet_to_db(
-            df=summarised_validation_results_df,
-            dataset_root=parquet_root,
-            database=glue_database,
-            table="summarised_validation_results",
-            partition_cols=["validation_id"],
-            compression="snappy",
-            mode="append",
-            schema_evolution=True
+            schema_evolution=True,
         )
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to write validation results to Parquet/Glue.")
+        raise RuntimeError("Writing Parquet/Glue results failed.") from e
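The call pattern from the removed docstring example still matches the refactored signature. A minimal usage sketch follows; the study id, S3 URIs, and Glue database name are the illustrative values from that old example, not real resources, and it shows how failures now surface as a RuntimeError chained to the original exception:

# Illustrative invocation of the refactored pipeline (values are placeholders taken
# from the removed docstring example, not real S3 buckets or Glue databases).
from acdc_aws_etl_pipeline.validate.validate import validate_pipeline

try:
    validate_pipeline(
        study_id="ausdiab",
        schema_s3_uri="s3://bucket/schemas/my_schema.json",
        validation_s3_uri="s3://bucket/validation/",
        write_back_root="s3://bucket/validation/",
        parquet_root="s3://bucket/validation/athena/",
        glue_database="validation_db",
        root_node="project",  # now defaults to "project"; currently unused by the implementation
    )
except RuntimeError as err:
    # Every stage wraps its failure in RuntimeError and chains the original exception.
    print(f"Pipeline failed: {err} (caused by: {err.__cause__!r})")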
The remaining files listed above are unchanged between 0.6.2 and 0.6.4; only their paths move from acdc_aws_etl_pipeline-0.6.2 to acdc_aws_etl_pipeline-0.6.4.