acdc_aws_etl_pipeline 0.7.6__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (15) hide show
  1. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/PKG-INFO +22 -12
  2. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/README.md +21 -11
  3. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/pyproject.toml +1 -1
  4. acdc_aws_etl_pipeline-0.8.0/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +115 -0
  5. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +12 -2
  6. acdc_aws_etl_pipeline-0.7.6/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -2
  7. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  8. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
  9. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  10. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
  11. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  12. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  14. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
  15. {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: acdc_aws_etl_pipeline
3
- Version: 0.7.6
3
+ Version: 0.8.0
4
4
  Summary: Tools for ACDC ETL pipeline
5
5
  Author: JoshuaHarris391
6
6
  Author-email: harjo391@gmail.com
@@ -32,19 +32,29 @@ Infrastructure and code for the ACDC ETL pipeline and data operations in AWS
32
32
 
33
33
  ## Documentation
34
34
 
35
- - [Dictionary deployment](docs/dictionary_deployment.md)
36
- - [Data ingestion](docs/data_ingestion.md)
37
- - [Data validation](docs/data_validation.md)
38
- - [Data transformation (dbt)](docs/data_transformation_dbt.md)
39
- - [Data releases](docs/write_data_release.md)
40
- - [Synthetic data generation](docs/synthetic_data_generation.md)
41
- - [REST API upload to sheepdog](docs/rest_api_sheepdog_upload.md)
42
- - [Data deletion](docs/data_deletion.md)
43
- - [IndexD file registration](docs/indexd_registration.md)
44
- - [Querying Athena](docs/querying_athena.md)
45
- - [Writing Athena queries to JSON](docs/write_athena_queries_to_json.md)
35
+ ### Core Configuration & Management
36
+ - [Deployment Configuration Guide](docs/config.md)
37
+ - [Dictionary Deployment](docs/dictionary_deployment.md)
38
+ - [Service Management](docs/service_management.md)
39
+ - [Kubernetes Utilities](docs/k8s_utilities.md)
46
40
  - [Troubleshooting](docs/troubleshooting.md)
47
41
 
42
+ ### Data Lifecycle & ETL
43
+ - [Data Ingestion](docs/data_ingestion.md)
44
+ - [Data Validation](docs/data_validation.md)
45
+ - [Data Transformation (dbt)](docs/data_transformation_dbt.md)
46
+ - [Data Releases](docs/write_data_release.md)
47
+ - [Data Deletion](docs/data_deletion.md)
48
+
49
+ ### Metadata & Registry Operations
50
+ - [REST API Upload to Sheepdog](docs/rest_api_sheepdog_upload.md)
51
+ - [IndexD File Registration](docs/indexd_registration.md)
52
+ - [Synthetic Data Generation](docs/synthetic_data_generation.md)
53
+
54
+ ### Analysis & Querying
55
+ - [Querying Athena](docs/querying_athena.md)
56
+ - [Writing Athena Queries to JSON](docs/write_athena_queries_to_json.md)
57
+
48
58
  ## Library and source code (`src/acdc_aws_etl_pipeline`)
49
59
 
50
60
  The Python package in [`src/acdc_aws_etl_pipeline`](src/acdc_aws_etl_pipeline) provides reusable utilities for ingestion, validation, uploads, and Athena/Glue operations used across the pipeline and services.
@@ -3,19 +3,29 @@ Infrastructure and code for the ACDC ETL pipeline and data operations in AWS
3
3
 
4
4
  ## Documentation
5
5
 
6
- - [Dictionary deployment](docs/dictionary_deployment.md)
7
- - [Data ingestion](docs/data_ingestion.md)
8
- - [Data validation](docs/data_validation.md)
9
- - [Data transformation (dbt)](docs/data_transformation_dbt.md)
10
- - [Data releases](docs/write_data_release.md)
11
- - [Synthetic data generation](docs/synthetic_data_generation.md)
12
- - [REST API upload to sheepdog](docs/rest_api_sheepdog_upload.md)
13
- - [Data deletion](docs/data_deletion.md)
14
- - [IndexD file registration](docs/indexd_registration.md)
15
- - [Querying Athena](docs/querying_athena.md)
16
- - [Writing Athena queries to JSON](docs/write_athena_queries_to_json.md)
6
+ ### Core Configuration & Management
7
+ - [Deployment Configuration Guide](docs/config.md)
8
+ - [Dictionary Deployment](docs/dictionary_deployment.md)
9
+ - [Service Management](docs/service_management.md)
10
+ - [Kubernetes Utilities](docs/k8s_utilities.md)
17
11
  - [Troubleshooting](docs/troubleshooting.md)
18
12
 
13
+ ### Data Lifecycle & ETL
14
+ - [Data Ingestion](docs/data_ingestion.md)
15
+ - [Data Validation](docs/data_validation.md)
16
+ - [Data Transformation (dbt)](docs/data_transformation_dbt.md)
17
+ - [Data Releases](docs/write_data_release.md)
18
+ - [Data Deletion](docs/data_deletion.md)
19
+
20
+ ### Metadata & Registry Operations
21
+ - [REST API Upload to Sheepdog](docs/rest_api_sheepdog_upload.md)
22
+ - [IndexD File Registration](docs/indexd_registration.md)
23
+ - [Synthetic Data Generation](docs/synthetic_data_generation.md)
24
+
25
+ ### Analysis & Querying
26
+ - [Querying Athena](docs/querying_athena.md)
27
+ - [Writing Athena Queries to JSON](docs/write_athena_queries_to_json.md)
28
+
19
29
  ## Library and source code (`src/acdc_aws_etl_pipeline`)
20
30
 
21
31
  The Python package in [`src/acdc_aws_etl_pipeline`](src/acdc_aws_etl_pipeline) provides reusable utilities for ingestion, validation, uploads, and Athena/Glue operations used across the pipeline and services.
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "acdc_aws_etl_pipeline"
3
- version = "0.7.6"
3
+ version = "0.8.0"
4
4
  description = "Tools for ACDC ETL pipeline"
5
5
  authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
6
6
  readme = "README.md"
@@ -0,0 +1,115 @@
1
+ import os
2
+ import logging
3
+ from gen3.auth import Gen3Auth
4
+ from gen3.submission import Gen3Submission
5
+ from typing import Optional, List
6
+ from acdc_aws_etl_pipeline.upload.metadata_submitter import (
7
+ create_boto3_session,
8
+ get_gen3_api_key_aws_secret,
9
+ infer_api_endpoint_from_jwt,
10
+ create_gen3_submission_class
11
+ )
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
def _read_import_order(
    import_order_file: str, exclude_nodes: List[str]
) -> List[str]:
    """Read node names from the import order file, dropping excluded ones.

    Args:
        import_order_file (str): Path to the DataImportOrder.txt file, one
            node name per line.
        exclude_nodes (List[str]): Node names that must not be returned.

    Returns:
        List[str]: Node names in file order, without blank lines and without
        any excluded node. An empty list is returned (and an error is logged)
        when the file does not exist.
    """
    try:
        with open(import_order_file, "r", encoding="utf-8") as f:
            nodes = [line.strip() for line in f]
    except FileNotFoundError:
        logger.error(
            "DataImportOrder.txt not found in %s", import_order_file
        )
        return []

    excluded = set(exclude_nodes)
    # Blank lines (e.g. a trailing newline at the end of the file) would
    # otherwise be treated as a node named "" and sent to the API.
    return [node for node in nodes if node and node not in excluded]


def delete_project_metadata(
    import_order_file: str,
    project_id: str,
    aws_secret_name: str,
    aws_profile: Optional[str] = None,
    aws_region: str = "us-east-1",
    exclude_nodes: Optional[List[str]] = None,
    prompt_for_confirmation: bool = True,
    program_name: str = "program1",
) -> None:
    """
    Deletes node metadata for a project through the gen3 api endpoint.

    The nodes to delete are read from a DataImportOrder.txt file; deletion
    runs in the reverse of the order listed in that file, one node at a time.
    A failure on one node is logged and does not stop the remaining nodes
    from being attempted.

    Args:
        import_order_file (str): The path to the import order file.
        project_id (str): The ID of the project.
        aws_secret_name (str): The name of the AWS secret containing the Gen3
            API key.
        aws_profile (str, optional): AWS profile name. ``None`` is replaced
            with "default".
        aws_region (str, optional): AWS region. Default is "us-east-1";
            an explicit ``None`` is replaced with "ap-southeast-2".
        exclude_nodes (list, optional): A list of node names to exclude from
            the deletion. Default is ["project", "program",
            "acknowledgement", "publication"].
        prompt_for_confirmation (bool): Whether to prompt for confirmation
            before deletion. Only the exact answer "yes" proceeds.
        program_name (str): The Gen3 program the project belongs to. Default
            is "program1", which was previously hard-coded.

    Returns:
        None
    """
    if aws_region is None:
        aws_region = "ap-southeast-2"
    if aws_profile is None:
        aws_profile = "default"
    if exclude_nodes is None:
        exclude_nodes = [
            "project",
            "program",
            "acknowledgement",
            "publication",
        ]

    nodes_to_delete = _read_import_order(import_order_file, exclude_nodes)
    if not nodes_to_delete:
        # Nothing to do: skip the secret lookup and the confirmation prompt
        # rather than asking the user to confirm an empty deletion.
        logger.warning(
            "No nodes to delete for project %s; nothing was deleted.",
            project_id,
        )
        return
    nodes_to_delete.reverse()  # Reverse the import order for deletion

    # AWS and Gen3 Authentication
    session = create_boto3_session(aws_profile)
    api_key_dict = get_gen3_api_key_aws_secret(
        aws_secret_name, aws_region, session
    )
    if api_key_dict is None:
        logger.error("API key not found in AWS secret %s", aws_secret_name)
        return

    sub = create_gen3_submission_class(api_key_dict)

    if prompt_for_confirmation:
        confirm = input(
            "Do you want to delete the metadata? (yes/no): "
        ).strip().lower()
        if confirm != "yes":
            logger.info("Deletion cancelled by user.")
            return

    for node in nodes_to_delete:
        logger.info("[DELETE] | Project: %-10s | Node: %-12s", project_id, node)
        try:
            sub.delete_nodes(program_name, project_id, [node])
        except Exception as e:
            # Deliberately best-effort: log the failure and keep going so one
            # bad node does not block deletion of the others.
            logger.error(
                "\033[91m[FAILED]\033[0m | Project: %-10s | Node: %-12s | "
                "Error: %s",
                project_id,
                node,
                e,
            )
        else:
            logger.info(
                "\033[92m[SUCCESS]\033[0m | Project: %-10s | Node: %-12s",
                project_id,
                node,
            )
@@ -596,7 +596,9 @@ class MetadataSubmitter:
596
596
  self.aws_profile = aws_profile
597
597
  self.partition_cols = partition_cols or ["upload_datetime"]
598
598
  self.upload_to_database = upload_to_database
599
- self.boto3_session = self._create_boto3_session()
599
+ self.boto3_session = None
600
+ if self.upload_to_database:
601
+ self.boto3_session = self._create_boto3_session()
600
602
  logger.info("MetadataSubmitter initialised.")
601
603
 
602
604
  def _create_gen3_submission_class(self):
@@ -996,10 +998,13 @@ class MetadataSubmitter:
996
998
  }
997
999
  return file_map
998
1000
 
999
- def submit_metadata(self) -> List[Dict[str, Any]]:
1001
+ def submit_metadata(self, specific_node: Optional[str] = None) -> List[Dict[str, Any]]:
1000
1002
  """
1001
1003
  Submits metadata for each node defined in the data import order, except those in the exclude list.
1002
1004
 
1005
+ Args:
1006
+ specific_node (Optional[str]): If provided, only submits metadata for the specified node.
1007
+
1003
1008
  **Detailed Process:**
1004
1009
  1. **Order Resolution:** The function reads the import order to determine the sequence of nodes.
1005
1010
  2. **File Mapping:** It finds the matching `node.json` file for each node in the order.
@@ -1023,6 +1028,11 @@ class MetadataSubmitter:
1023
1028
  file_map = self._create_file_map()
1024
1029
 
1025
1030
  logger.info("Starting metadata submission.")
1031
+
1032
+ if specific_node:
1033
+ if specific_node not in data_import_order:
1034
+ raise ValueError(f"Node '{specific_node}' not found in data import order.")
1035
+ data_import_order = [specific_node]
1026
1036
 
1027
1037
  for node in data_import_order:
1028
1038
  if node in self.exclude_nodes:
@@ -1,2 +0,0 @@
1
- from acdc_aws_etl_pipeline.upload.metadata_submitter import *
2
-