acdc_aws_etl_pipeline 0.6.3__tar.gz → 0.6.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/PKG-INFO +2 -2
  2. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/pyproject.toml +2 -2
  3. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/validate/validate.py +76 -140
  4. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/README.md +0 -0
  5. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  6. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
  7. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  8. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
  9. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
  10. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +0 -0
  11. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  12. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  14. {acdc_aws_etl_pipeline-0.6.3 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: acdc_aws_etl_pipeline
- Version: 0.6.3
+ Version: 0.6.4
  Summary: Tools for ACDC ETL pipeline
  Author: JoshuaHarris391
  Author-email: harjo391@gmail.com
@@ -15,7 +15,7 @@ Requires-Dist: boto3
  Requires-Dist: dbt-athena (==1.9.4)
  Requires-Dist: dbt-core (==1.9.4)
  Requires-Dist: gen3 (>=4.27.4,<5.0.0)
- Requires-Dist: gen3_validator (>=1.1.2,<2.0.0)
+ Requires-Dist: gen3_validator (>=2.0.0,<3.0.0)
  Requires-Dist: numpy (<2.0.0)
  Requires-Dist: pytest
  Requires-Dist: python-dotenv
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "acdc_aws_etl_pipeline"
- version = "0.6.3"
+ version = "0.6.4"
  description = "Tools for ACDC ETL pipeline"
  authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
  readme = "README.md"
@@ -15,7 +15,7 @@ numpy = "<2.0.0"
  pyyaml = ">=6.0.2,<7.0.0"
  tzlocal = ">=5.3.1,<6.0.0"
  gen3 = ">=4.27.4, <5.0.0"
- gen3_validator = ">=1.1.2,<2.0.0"
+ gen3_validator = ">=2.0.0,<3.0.0"
  pytest = "*"
  pytz = ">=2025.2,<2026.0"
  s3fs = "2025.10.0"
@@ -534,165 +534,112 @@ def validate_pipeline(
  write_back_root: str,
  parquet_root: str,
  glue_database: str,
- root_node: str = 'project' # <-- added root_node as an argument
- ):
+ root_node: str = "project",
+ ) -> None:
  """
- Orchestrate an entire validation workflow for a study, from loading and resolving a schema,
- collecting validation result files, running validation and result summarization, and persisting results
- to S3 and Athena/Glue in usable format.
-
- :param study_id: The identifier of the study whose files/results to process (e.g., 'ausdiab').
- :type study_id: str
- :param schema_s3_uri: Full S3 URI to the JSON schema file.
- :type schema_s3_uri: str
- :param validation_s3_uri: S3 prefix/folder where validation result files (.json) are stored.
- :type validation_s3_uri: str
- :param write_back_root: S3 prefix/folder to write generated artifacts/reports to.
- :type write_back_root: str
- :param parquet_root: S3 prefix/folder for writing Parquet files for Athena.
- :type parquet_root: str
- :param glue_database: The Glue database name to register tables into for Athena.
- :type glue_database: str
- :param root_node: The root node in the schema graph for link validation.
- :type root_node: str
-
- :raises Exception: If any step in the process fails.
-
- .. note::
- This function is typically used in an ETL or validation pipeline step for data quality assurance.
+ Orchestrate the validation workflow for a study: load + resolve schema, find the latest
+ validation artefacts, validate, and persist results to S3 and Athena/Glue.
 
- **Process Steps**
+ Args:
+ study_id: Study identifier (e.g. "ausdiab").
+ schema_s3_uri: Full S3 URI to the JSON schema file.
+ validation_s3_uri: S3 prefix containing validation result files (.json).
+ write_back_root: S3 prefix to write generated artefacts/reports to.
+ parquet_root: S3 prefix for Parquet outputs (Athena).
+ glue_database: Glue database to register tables into.
+ root_node: Root node in the schema graph for link validation.
 
- 1. Loads the latest schema from S3 and writes it to a temp file.
- 2. Resolves the Gen3 data dictionary schema using the validator library.
- 3. Identifies the most recent validation results for the requested study.
- 4. Downloads the relevant data files to a temp directory.
- 5. Runs validation and builds both full and summary DataFrames of the results.
- 6. Writes CSV summaries to S3 for reporting.
- 7. Writes full and summarized validation results as Parquet to S3 and registers in Glue.
-
- **Example**
-
- >>> validate_pipeline(
- ... study_id='ausdiab',
- ... schema_s3_uri='s3://bucket/schemas/my_schema.json',
- ... validation_s3_uri='s3://bucket/validation/',
- ... write_back_root='s3://bucket/validation/',
- ... parquet_root='s3://bucket/validation/athena/',
- ... glue_database='validation_db',
- ... root_node='project' # example value for root_node
- ... )
+ Raises:
+ RuntimeError: When expected inputs are missing or a pipeline step fails.
  """
+ # NOTE: root_node is currently unused in the original implementation.
+ # Keeping it in the signature for forward compatibility.
+
  try:
  schema = load_schema_from_s3_uri(schema_s3_uri)
  schema_path = write_schema_to_temp_file(schema)
+ logger.info("Schema loaded and written to temp file: %s", schema_path)
  except Exception as e:
- logger.error(f"Schema loading or writing failed: {e}")
- raise
+ logger.exception("Schema loading/writing failed.")
+ raise RuntimeError("Schema loading/writing failed.") from e
 
  try:
- logger.info(f"Instantiating schema resolver with schema path: {schema_path}")
+ logger.info("Instantiating schema resolver with schema path: %s", schema_path)
  resolver = gen3_validator.ResolveSchema(schema_path=schema_path)
  resolver.resolve_schema()
  logger.info("Schema resolved.")
  except Exception as e:
- logger.error(f"Failed to instantiate or resolve schema: {e}")
- raise
-
- try:
- logger.info(f"Getting schema version")
- dd = gen3_validator.DataDictionary(schema_path = schema_path)
- dd.parse_schema()
- schema_version = dd.get_schema_version(dd.schema)
- logger.info(f"Schema version: {schema_version}")
- except Exception as e:
- logger.error(f"Failed to get schema version: {e}")
- raise
+ logger.exception("Failed to instantiate or resolve schema.")
+ raise RuntimeError("Schema resolution failed.") from e
 
  try:
  metadata_table = pd.DataFrame(create_metadata_table(validation_s3_uri))
- logger.info(f"Metadata table created from S3 ({len(metadata_table)} rows).")
+ logger.info("Metadata table created from S3 (%s rows).", len(metadata_table))
  except Exception as e:
- logger.error(f"Failed to create metadata table: {e}")
- raise
+ logger.exception("Failed to create metadata table.")
+ raise RuntimeError("Metadata table creation failed.") from e
 
  try:
- latest_validation_metadata_table, latest_validation_id = get_latest_validation_for_study(metadata_table, study_id)
- if latest_validation_metadata_table is None or latest_validation_metadata_table.empty:
- logger.error(f"No latest validation files found for study {study_id}.")
- raise Exception(f"No latest validation files found for study {study_id}.")
+ latest_metadata, latest_validation_id = get_latest_validation_for_study(metadata_table, study_id)
+ if latest_metadata is None or latest_metadata.empty:
+ raise RuntimeError(f"No latest validation files found for study '{study_id}'.")
 
- latest_validation_file_names = latest_validation_metadata_table['file_name'].tolist()
- latest_validation_s3_uris = latest_validation_metadata_table['s3_uri'].tolist()
- logger.info(f"Latest validation file names: {latest_validation_file_names}")
- logger.info(f"Latest validation S3 URIs: {latest_validation_s3_uris}")
+ latest_validation_s3_uris = latest_metadata["s3_uri"].tolist()
+ logger.info("Latest validation id: %s", latest_validation_id)
+ logger.info("Latest validation file count: %s", len(latest_validation_s3_uris))
 
- downloaded_files_dir, downloaded_files = download_s3_files_to_temp_dir(latest_validation_s3_uris)
- except Exception as e:
- logger.error(f"Failed to retrieve and download latest validation files: {e}")
- raise
-
- try:
+ downloaded_files_dir, _downloaded_files = download_s3_files_to_temp_dir(latest_validation_s3_uris)
  files_in_dir = os.listdir(downloaded_files_dir)
- logger.info(f"Files in downloaded_files_dir: {downloaded_files_dir}")
- for f in files_in_dir:
- logger.info(f" - {f}")
+ logger.info("Downloaded files dir: %s", downloaded_files_dir)
+ for name in files_in_dir:
+ logger.info(" - %s", name)
  except Exception as e:
- logger.error(f"Failed to list or log downloaded files: {e}")
- raise
+ logger.exception("Failed to retrieve/download latest validation files.")
+ raise RuntimeError("Retrieval/download of validation files failed.") from e
 
  try:
- logger.info(f"Starting validation for files in: {downloaded_files_dir}")
- data = gen3_validator.ParseData(data_folder_path=downloaded_files_dir)
-
- logger.info("Validating links")
- linkage_validator = gen3_validator.Linkage()
- linkage_config = linkage_validator.generate_config(data.data_dict)
- linkage_results = linkage_validator.validate_links(
- data_map=data.data_dict,
- config=linkage_config,
- root_node=root_node
- )
- unlinked_fk = count_unlinked_foreign_keys(linkage_results)
- linkage_pass = True if unlinked_fk == 0 else False
-
- logger.info("Links validation complete | Please check in athena later.")
-
- logger.info("Validating data")
-
- validator = gen3_validator.Validate(data_map=data.data_dict, resolved_schema=resolver.schema_resolved)
- validator.validate_schema()
- logger.info("Validation completed.")
-
- summary = gen3_validator.ValidateSummary(validator)
- summary.flatten_validation_results()
- full_validation_results_df = summary.flattened_results_to_pd()
- full_validation_results_df['validation_id'] = latest_validation_id
- full_validation_results_df['study_id'] = study_id
- full_validation_results_df['schema_version'] = schema_version
- full_validation_results_df['linkage_pass'] = str(linkage_pass)
- full_validation_results_df['n_unlinked_foreign_keys'] = str(unlinked_fk)
- full_validation_results_df['linkage_results_dict'] = truncate_linkage_results(linkage_results)
- summarised_validation_results_df = summary.collapse_flatten_results_to_pd()
- summarised_validation_results_df['validation_id'] = latest_validation_id
- summarised_validation_results_df['study_id'] = study_id
- summarised_validation_results_df['schema_version'] = schema_version
- logger.info("Validation results processed to DataFrame.")
+ logger.info("Reading JSON validation files from: %s", downloaded_files_dir)
+
+ data_by_file = {}
+ for name in files_in_dir:
+ if not name.endswith(".json"):
+ continue
+ full_path = os.path.join(downloaded_files_dir, name)
+ with open(full_path, "r") as fh:
+ data_by_file[name] = json.load(fh)
+
+ logger.info("Validating %s JSON file(s).", len(data_by_file))
+
+ results: list[dict] = []
+ for filename, obj in data_by_file.items():
+ logger.info("Validating %s", filename)
+ results.extend(gen3_validator.validate.validate_list_dict(obj, resolver.schema_resolved))
+
+ full_validation_results_df = pd.DataFrame(results)
+ full_validation_results_df["validation_id"] = latest_validation_id
+ full_validation_results_df["study_id"] = study_id
+ logger.info("Validation completed (%s rows).", len(full_validation_results_df))
  except Exception as e:
- logger.error(f"Validation failed: {e}")
- raise
+ logger.exception("Validation failed.")
+ raise RuntimeError("Validation failed.") from e
+
+ write_back_s3_uri = (
+ f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
+ )
 
- write_back_s3_uri = f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
  try:
- logger.info(f"Writing out validation results to S3 location: {write_back_s3_uri}")
+ logger.info("Writing CSV results to S3: %s", write_back_s3_uri)
  write_df_to_s3(full_validation_results_df, write_back_s3_uri, "full_validation_results.csv")
- write_df_to_s3(summarised_validation_results_df, write_back_s3_uri, "summarised_validation_results.csv")
  except Exception as e:
- logger.error(f"Failed to write CSV validation results to S3: {e}")
- raise
+ logger.exception("Failed to write CSV validation results to S3.")
+ raise RuntimeError("Writing CSV results failed.") from e
 
  try:
- logger.info(f"Writing full_validation_results to database '{glue_database}', table 'full_validation_results'")
+ logger.info(
+ "Writing Parquet to Glue DB '%s', table '%s'.",
+ glue_database,
+ "full_validation_results",
+ )
  write_parquet_to_db(
  df=full_validation_results_df,
  dataset_root=parquet_root,
@@ -701,19 +648,8 @@ def validate_pipeline(
  partition_cols=["validation_id"],
  compression="snappy",
  mode="append",
- schema_evolution=True
- )
- logger.info(f"Writing summarised_validation_results to database '{glue_database}', table 'summarised_validation_results'")
- write_parquet_to_db(
- df=summarised_validation_results_df,
- dataset_root=parquet_root,
- database=glue_database,
- table="summarised_validation_results",
- partition_cols=["validation_id"],
- compression="snappy",
- mode="append",
- schema_evolution=True
+ schema_evolution=True,
  )
  except Exception as e:
- logger.error(f"Failed to write validation results to Parquet/Glue: {e}")
- raise
+ logger.exception("Failed to write validation results to Parquet/Glue.")
+ raise RuntimeError("Writing Parquet/Glue results failed.") from e