acdc_aws_etl_pipeline 0.6.2.tar.gz → 0.6.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/PKG-INFO +2 -2
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/pyproject.toml +2 -2
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/validate/validate.py +76 -140
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/README.md +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.6.2
+Version: 0.6.4
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com

@@ -15,7 +15,7 @@ Requires-Dist: boto3
 Requires-Dist: dbt-athena (==1.9.4)
 Requires-Dist: dbt-core (==1.9.4)
 Requires-Dist: gen3 (>=4.27.4,<5.0.0)
-Requires-Dist: gen3_validator (>=
+Requires-Dist: gen3_validator (>=2.0.0,<3.0.0)
 Requires-Dist: numpy (<2.0.0)
 Requires-Dist: pytest
 Requires-Dist: python-dotenv
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "acdc_aws_etl_pipeline"
-version = "0.6.2"
+version = "0.6.4"
 description = "Tools for ACDC ETL pipeline"
 authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
 readme = "README.md"

@@ -15,7 +15,7 @@ numpy = "<2.0.0"
 pyyaml = ">=6.0.2,<7.0.0"
 tzlocal = ">=5.3.1,<6.0.0"
 gen3 = ">=4.27.4, <5.0.0"
-gen3_validator = ">=
+gen3_validator = ">=2.0.0,<3.0.0"
 pytest = "*"
 pytz = ">=2025.2,<2026.0"
 s3fs = "2025.10.0"
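Both metadata files carry the same two changes: the package version moves to 0.6.4 and gen3_validator is now constrained to the 2.x line. A minimal sketch (standard library only; the pin strings are copied from the diff above, the check itself is purely illustrative and not part of the package) for confirming an environment picked up the new pins:

# Illustrative check, not part of this package: report installed versions of the
# distributions whose pins changed in this release.
from importlib.metadata import PackageNotFoundError, version

for dist, pin in [("acdc_aws_etl_pipeline", "0.6.4"), ("gen3_validator", ">=2.0.0,<3.0.0")]:
    try:
        print(f"{dist}: installed {version(dist)} (release pins {pin})")
    except PackageNotFoundError:
        print(f"{dist}: not installed in this environment")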
{acdc_aws_etl_pipeline-0.6.2 → acdc_aws_etl_pipeline-0.6.4}/src/acdc_aws_etl_pipeline/validate/validate.py

@@ -534,165 +534,112 @@ def validate_pipeline(
     write_back_root: str,
     parquet_root: str,
     glue_database: str,
-    root_node: str =
-):
+    root_node: str = "project",
+) -> None:
     """
-    Orchestrate
-
-    to S3 and Athena/Glue in usable format.
-
-    :param study_id: The identifier of the study whose files/results to process (e.g., 'ausdiab').
-    :type study_id: str
-    :param schema_s3_uri: Full S3 URI to the JSON schema file.
-    :type schema_s3_uri: str
-    :param validation_s3_uri: S3 prefix/folder where validation result files (.json) are stored.
-    :type validation_s3_uri: str
-    :param write_back_root: S3 prefix/folder to write generated artifacts/reports to.
-    :type write_back_root: str
-    :param parquet_root: S3 prefix/folder for writing Parquet files for Athena.
-    :type parquet_root: str
-    :param glue_database: The Glue database name to register tables into for Athena.
-    :type glue_database: str
-    :param root_node: The root node in the schema graph for link validation.
-    :type root_node: str
-
-    :raises Exception: If any step in the process fails.
-
-    .. note::
-        This function is typically used in an ETL or validation pipeline step for data quality assurance.
+    Orchestrate the validation workflow for a study: load + resolve schema, find the latest
+    validation artefacts, validate, and persist results to S3 and Athena/Glue.
 
-
+    Args:
+        study_id: Study identifier (e.g. "ausdiab").
+        schema_s3_uri: Full S3 URI to the JSON schema file.
+        validation_s3_uri: S3 prefix containing validation result files (.json).
+        write_back_root: S3 prefix to write generated artefacts/reports to.
+        parquet_root: S3 prefix for Parquet outputs (Athena).
+        glue_database: Glue database to register tables into.
+        root_node: Root node in the schema graph for link validation.
 
-
-
-    3. Identifies the most recent validation results for the requested study.
-    4. Downloads the relevant data files to a temp directory.
-    5. Runs validation and builds both full and summary DataFrames of the results.
-    6. Writes CSV summaries to S3 for reporting.
-    7. Writes full and summarized validation results as Parquet to S3 and registers in Glue.
-
-    **Example**
-
-    >>> validate_pipeline(
-    ...     study_id='ausdiab',
-    ...     schema_s3_uri='s3://bucket/schemas/my_schema.json',
-    ...     validation_s3_uri='s3://bucket/validation/',
-    ...     write_back_root='s3://bucket/validation/',
-    ...     parquet_root='s3://bucket/validation/athena/',
-    ...     glue_database='validation_db',
-    ...     root_node='project'  # example value for root_node
-    ... )
+    Raises:
+        RuntimeError: When expected inputs are missing or a pipeline step fails.
     """
+    # NOTE: root_node is currently unused in the original implementation.
+    # Keeping it in the signature for forward compatibility.
+
     try:
         schema = load_schema_from_s3_uri(schema_s3_uri)
         schema_path = write_schema_to_temp_file(schema)
+        logger.info("Schema loaded and written to temp file: %s", schema_path)
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Schema loading/writing failed.")
+        raise RuntimeError("Schema loading/writing failed.") from e
 
     try:
-        logger.info(
+        logger.info("Instantiating schema resolver with schema path: %s", schema_path)
         resolver = gen3_validator.ResolveSchema(schema_path=schema_path)
         resolver.resolve_schema()
         logger.info("Schema resolved.")
     except Exception as e:
-        logger.
-        raise
-
-    try:
-        logger.info(f"Getting schema version")
-        dd = gen3_validator.DataDictionary(schema_path = schema_path)
-        dd.parse_schema()
-        schema_version = dd.get_schema_version(dd.schema)
-        logger.info(f"Schema version: {schema_version}")
-    except Exception as e:
-        logger.error(f"Failed to get schema version: {e}")
-        raise
+        logger.exception("Failed to instantiate or resolve schema.")
+        raise RuntimeError("Schema resolution failed.") from e
 
     try:
         metadata_table = pd.DataFrame(create_metadata_table(validation_s3_uri))
-        logger.info(
+        logger.info("Metadata table created from S3 (%s rows).", len(metadata_table))
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to create metadata table.")
+        raise RuntimeError("Metadata table creation failed.") from e
 
     try:
-
-        if
-
-            raise Exception(f"No latest validation files found for study {study_id}.")
+        latest_metadata, latest_validation_id = get_latest_validation_for_study(metadata_table, study_id)
+        if latest_metadata is None or latest_metadata.empty:
+            raise RuntimeError(f"No latest validation files found for study '{study_id}'.")
 
-
-
-        logger.info(
-        logger.info(f"Latest validation S3 URIs: {latest_validation_s3_uris}")
+        latest_validation_s3_uris = latest_metadata["s3_uri"].tolist()
+        logger.info("Latest validation id: %s", latest_validation_id)
+        logger.info("Latest validation file count: %s", len(latest_validation_s3_uris))
 
-        downloaded_files_dir,
-    except Exception as e:
-        logger.error(f"Failed to retrieve and download latest validation files: {e}")
-        raise
-
-    try:
+        downloaded_files_dir, _downloaded_files = download_s3_files_to_temp_dir(latest_validation_s3_uris)
         files_in_dir = os.listdir(downloaded_files_dir)
-        logger.info(
-        for
-            logger.info(
+        logger.info("Downloaded files dir: %s", downloaded_files_dir)
+        for name in files_in_dir:
+            logger.info(" - %s", name)
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to retrieve/download latest validation files.")
+        raise RuntimeError("Retrieval/download of validation files failed.") from e
 
     try:
-        logger.info(
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-        logger.info("Validation completed.")
-
-        summary = gen3_validator.ValidateSummary(validator)
-        summary.flatten_validation_results()
-        full_validation_results_df = summary.flattened_results_to_pd()
-        full_validation_results_df['validation_id'] = latest_validation_id
-        full_validation_results_df['study_id'] = study_id
-        full_validation_results_df['schema_version'] = schema_version
-        full_validation_results_df['linkage_pass'] = str(linkage_pass)
-        full_validation_results_df['n_unlinked_foreign_keys'] = str(unlinked_fk)
-        full_validation_results_df['linkage_results_dict'] = truncate_linkage_results(linkage_results)
-        summarised_validation_results_df = summary.collapse_flatten_results_to_pd()
-        summarised_validation_results_df['validation_id'] = latest_validation_id
-        summarised_validation_results_df['study_id'] = study_id
-        summarised_validation_results_df['schema_version'] = schema_version
-        logger.info("Validation results processed to DataFrame.")
+        logger.info("Reading JSON validation files from: %s", downloaded_files_dir)
+
+        data_by_file = {}
+        for name in files_in_dir:
+            if not name.endswith(".json"):
+                continue
+            full_path = os.path.join(downloaded_files_dir, name)
+            with open(full_path, "r") as fh:
+                data_by_file[name] = json.load(fh)
+
+        logger.info("Validating %s JSON file(s).", len(data_by_file))
+
+        results: list[dict] = []
+        for filename, obj in data_by_file.items():
+            logger.info("Validating %s", filename)
+            results.extend(gen3_validator.validate.validate_list_dict(obj, resolver.schema_resolved))
+
+        full_validation_results_df = pd.DataFrame(results)
+        full_validation_results_df["validation_id"] = latest_validation_id
+        full_validation_results_df["study_id"] = study_id
+        logger.info("Validation completed (%s rows).", len(full_validation_results_df))
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Validation failed.")
+        raise RuntimeError("Validation failed.") from e
+
+    write_back_s3_uri = (
+        f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
+    )
 
-    write_back_s3_uri = f"{write_back_root.rstrip('/')}/study_id={study_id}/validation_id={latest_validation_id}/"
     try:
-        logger.info(
+        logger.info("Writing CSV results to S3: %s", write_back_s3_uri)
         write_df_to_s3(full_validation_results_df, write_back_s3_uri, "full_validation_results.csv")
-        write_df_to_s3(summarised_validation_results_df, write_back_s3_uri, "summarised_validation_results.csv")
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to write CSV validation results to S3.")
+        raise RuntimeError("Writing CSV results failed.") from e
 
     try:
-        logger.info(
+        logger.info(
+            "Writing Parquet to Glue DB '%s', table '%s'.",
+            glue_database,
+            "full_validation_results",
+        )
         write_parquet_to_db(
             df=full_validation_results_df,
             dataset_root=parquet_root,

@@ -701,19 +648,8 @@ def validate_pipeline(
             partition_cols=["validation_id"],
             compression="snappy",
             mode="append",
-            schema_evolution=True
-        )
-        logger.info(f"Writing summarised_validation_results to database '{glue_database}', table 'summarised_validation_results'")
-        write_parquet_to_db(
-            df=summarised_validation_results_df,
-            dataset_root=parquet_root,
-            database=glue_database,
-            table="summarised_validation_results",
-            partition_cols=["validation_id"],
-            compression="snappy",
-            mode="append",
-            schema_evolution=True
+            schema_evolution=True,
         )
     except Exception as e:
-        logger.
-        raise
+        logger.exception("Failed to write validation results to Parquet/Glue.")
+        raise RuntimeError("Writing Parquet/Glue results failed.") from e
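The call pattern from the removed docstring example still matches the refactored signature. A minimal usage sketch follows; the study id, S3 URIs, and Glue database name are the illustrative values from that old example, not real resources, and it shows how failures now surface as a RuntimeError chained to the original exception:

# Illustrative invocation of the refactored pipeline (values are placeholders taken
# from the removed docstring example, not real S3 buckets or Glue databases).
from acdc_aws_etl_pipeline.validate.validate import validate_pipeline

try:
    validate_pipeline(
        study_id="ausdiab",
        schema_s3_uri="s3://bucket/schemas/my_schema.json",
        validation_s3_uri="s3://bucket/validation/",
        write_back_root="s3://bucket/validation/",
        parquet_root="s3://bucket/validation/athena/",
        glue_database="validation_db",
        root_node="project",  # now defaults to "project"; currently unused by the implementation
    )
except RuntimeError as err:
    # Every stage wraps its failure in RuntimeError and chains the original exception.
    print(f"Pipeline failed: {err} (caused by: {err.__cause__!r})")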
The remaining files listed above are unchanged between 0.6.2 and 0.6.4; only their paths move from acdc_aws_etl_pipeline-0.6.2 to acdc_aws_etl_pipeline-0.6.4.