acdc_aws_etl_pipeline 0.7.6__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/PKG-INFO +22 -12
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/README.md +21 -11
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/pyproject.toml +1 -1
- acdc_aws_etl_pipeline-0.8.0/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +115 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +12 -2
- acdc_aws_etl_pipeline-0.7.6/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +0 -2
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
- {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
{acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.7.6
+Version: 0.8.0
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -32,19 +32,29 @@ Infrastructure and code for the ACDC ETL pipeline and data operations in AWS
 
 ## Documentation
 
-
-- [
-- [
-- [
-- [
-- [Synthetic data generation](docs/synthetic_data_generation.md)
-- [REST API upload to sheepdog](docs/rest_api_sheepdog_upload.md)
-- [Data deletion](docs/data_deletion.md)
-- [IndexD file registration](docs/indexd_registration.md)
-- [Querying Athena](docs/querying_athena.md)
-- [Writing Athena queries to JSON](docs/write_athena_queries_to_json.md)
+### Core Configuration & Management
+- [Deployment Configuration Guide](docs/config.md)
+- [Dictionary Deployment](docs/dictionary_deployment.md)
+- [Service Management](docs/service_management.md)
+- [Kubernetes Utilities](docs/k8s_utilities.md)
 - [Troubleshooting](docs/troubleshooting.md)
 
+### Data Lifecycle & ETL
+- [Data Ingestion](docs/data_ingestion.md)
+- [Data Validation](docs/data_validation.md)
+- [Data Transformation (dbt)](docs/data_transformation_dbt.md)
+- [Data Releases](docs/write_data_release.md)
+- [Data Deletion](docs/data_deletion.md)
+
+### Metadata & Registry Operations
+- [REST API Upload to Sheepdog](docs/rest_api_sheepdog_upload.md)
+- [IndexD File Registration](docs/indexd_registration.md)
+- [Synthetic Data Generation](docs/synthetic_data_generation.md)
+
+### Analysis & Querying
+- [Querying Athena](docs/querying_athena.md)
+- [Writing Athena Queries to JSON](docs/write_athena_queries_to_json.md)
+
 ## Library and source code (`src/acdc_aws_etl_pipeline`)
 
 The Python package in [`src/acdc_aws_etl_pipeline`](src/acdc_aws_etl_pipeline) provides reusable utilities for ingestion, validation, uploads, and Athena/Glue operations used across the pipeline and services.
{acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/README.md

@@ -3,19 +3,29 @@ Infrastructure and code for the ACDC ETL pipeline and data operations in AWS
 
 ## Documentation
 
-
-- [
-- [
-- [
-- [
-- [Synthetic data generation](docs/synthetic_data_generation.md)
-- [REST API upload to sheepdog](docs/rest_api_sheepdog_upload.md)
-- [Data deletion](docs/data_deletion.md)
-- [IndexD file registration](docs/indexd_registration.md)
-- [Querying Athena](docs/querying_athena.md)
-- [Writing Athena queries to JSON](docs/write_athena_queries_to_json.md)
+### Core Configuration & Management
+- [Deployment Configuration Guide](docs/config.md)
+- [Dictionary Deployment](docs/dictionary_deployment.md)
+- [Service Management](docs/service_management.md)
+- [Kubernetes Utilities](docs/k8s_utilities.md)
 - [Troubleshooting](docs/troubleshooting.md)
 
+### Data Lifecycle & ETL
+- [Data Ingestion](docs/data_ingestion.md)
+- [Data Validation](docs/data_validation.md)
+- [Data Transformation (dbt)](docs/data_transformation_dbt.md)
+- [Data Releases](docs/write_data_release.md)
+- [Data Deletion](docs/data_deletion.md)
+
+### Metadata & Registry Operations
+- [REST API Upload to Sheepdog](docs/rest_api_sheepdog_upload.md)
+- [IndexD File Registration](docs/indexd_registration.md)
+- [Synthetic Data Generation](docs/synthetic_data_generation.md)
+
+### Analysis & Querying
+- [Querying Athena](docs/querying_athena.md)
+- [Writing Athena Queries to JSON](docs/write_athena_queries_to_json.md)
+
 ## Library and source code (`src/acdc_aws_etl_pipeline`)
 
 The Python package in [`src/acdc_aws_etl_pipeline`](src/acdc_aws_etl_pipeline) provides reusable utilities for ingestion, validation, uploads, and Athena/Glue operations used across the pipeline and services.
acdc_aws_etl_pipeline-0.8.0/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py (new file)

@@ -0,0 +1,115 @@
+import os
+import logging
+from gen3.auth import Gen3Auth
+from gen3.submission import Gen3Submission
+from typing import Optional, List
+from acdc_aws_etl_pipeline.upload.metadata_submitter import (
+    create_boto3_session,
+    get_gen3_api_key_aws_secret,
+    infer_api_endpoint_from_jwt,
+    create_gen3_submission_class
+)
+
+logger = logging.getLogger(__name__)
+
+def delete_project_metadata(
+    import_order_file: str,
+    project_id: str,
+    aws_secret_name: str,
+    aws_profile: Optional[str] = None,
+    aws_region: str = "us-east-1",
+    exclude_nodes: Optional[List[str]] = None,
+    prompt_for_confirmation: bool = True,
+):
+    """
+    Deletes metadata json files from the gen3 api endpoint. Deletion depends on
+    a DataImportOrder.txt file, which defines the order of the nodes to be
+    deleted.
+
+    Args:
+        import_order_file (str): The path to the import order file
+        project_id (str): The ID of the project.
+        aws_secret_name (str): The name of the AWS secret containing the Gen3
+            API key.
+        aws_profile (str, optional): AWS profile name.
+        aws_region (str, optional): AWS region. Default is "ap-southeast-2".
+        exclude_nodes (list): A list of node names to exclude from the
+            deletion. Default is ["project", "program", "acknowledgement",
+            "publication"].
+        prompt_for_confirmation (bool): Whether to prompt for confirmation
+            before deletion.
+
+    Returns:
+        None
+    """
+
+    if aws_region is None:
+        aws_region = "ap-southeast-2"
+    if aws_profile is None:
+        aws_profile = "default"
+
+    if exclude_nodes is None:
+        exclude_nodes = [
+            "project",
+            "program",
+            "acknowledgement",
+            "publication",
+        ]
+
+    def get_import_order(import_order_file):
+        try:
+            with open(import_order_file, "r", encoding="utf-8") as f:
+                import_order = [line.rstrip() for line in f]
+            import_order = [
+                node for node in import_order if node not in exclude_nodes
+            ]
+            return import_order
+        except FileNotFoundError:
+            logger.error(
+                "DataImportOrder.txt not found in %s", import_order_file
+            )
+            return []
+
+    ordered_import_nodes = get_import_order(import_order_file)
+
+    # AWS and Gen3 Authentication
+    session = create_boto3_session(aws_profile)
+    api_key_dict = get_gen3_api_key_aws_secret(
+        aws_secret_name, aws_region, session
+    )
+    if api_key_dict is None:
+        logger.error("API key not found in AWS secret %s", aws_secret_name)
+        return
+
+    sub = create_gen3_submission_class(api_key_dict)
+
+    final_ordered_import_nodes = [
+        node for node in ordered_import_nodes if node not in exclude_nodes
+    ]
+    final_ordered_import_nodes.reverse()  # Reverse the order for deletion
+
+    if prompt_for_confirmation:
+        confirm = input(
+            "Do you want to delete the metadata? (yes/no): "
+        ).strip().lower()
+        if confirm != "yes":
+            logger.info("Deletion cancelled by user.")
+            return
+
+    for node in final_ordered_import_nodes:
+        logger.info("[DELETE] | Project: %-10s | Node: %-12s", project_id, node)
+        try:
+            sub.delete_nodes("program1", project_id, [node])
+            logger.info(
+                "\033[92m[SUCCESS]\033[0m | Project: %-10s | Node: %-12s",
+                project_id,
+                node,
+            )
+        except Exception as e:
+            logger.error(
+                "\033[91m[FAILED]\033[0m | Project: %-10s | Node: %-12s | "
+                "Error: %s",
+                project_id,
+                node,
+                e,
+            )
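The new `delete_project_metadata` function is the module's single entry point. A minimal invocation sketch, assuming a local `DataImportOrder.txt` and an AWS secret containing the Gen3 API key; the project ID, secret name, and profile below are hypothetical placeholders:

```python
import logging
from acdc_aws_etl_pipeline.upload.metadata_deleter import delete_project_metadata

logging.basicConfig(level=logging.INFO)  # surfaces the [DELETE]/[SUCCESS]/[FAILED] log lines

# All values below are placeholders for illustration only.
delete_project_metadata(
    import_order_file="DataImportOrder.txt",   # defines node order; deletion runs in reverse
    project_id="example_project",              # hypothetical project ID
    aws_secret_name="gen3-api-key",            # hypothetical secret holding the Gen3 API key
    aws_profile="default",
    aws_region="ap-southeast-2",
    exclude_nodes=["project", "program", "acknowledgement", "publication"],
    prompt_for_confirmation=True,              # asks "yes"/"no" before deleting anything
)
```

Because the import order is reversed before deletion, child nodes are removed before their parent nodes.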
{acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py

@@ -596,7 +596,9 @@ class MetadataSubmitter:
         self.aws_profile = aws_profile
         self.partition_cols = partition_cols or ["upload_datetime"]
         self.upload_to_database = upload_to_database
-        self.boto3_session =
+        self.boto3_session = None
+        if self.upload_to_database:
+            self.boto3_session = self._create_boto3_session()
         logger.info("MetadataSubmitter initialised.")
 
     def _create_gen3_submission_class(self):
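This change appears to make session creation lazy: when `upload_to_database` is false, `MetadataSubmitter` no longer creates a boto3 session at construction time, and `self.boto3_session` stays `None` unless database uploads are enabled.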
@@ -996,10 +998,13 @@ class MetadataSubmitter:
         }
         return file_map
 
-    def submit_metadata(self) -> List[Dict[str, Any]]:
+    def submit_metadata(self, specific_node: Optional[str] = None) -> List[Dict[str, Any]]:
         """
         Submits metadata for each node defined in the data import order, except those in the exclude list.
 
+        Args:
+            specific_node (Optional[str]): If provided, only submits metadata for the specified node.
+
         **Detailed Process:**
         1. **Order Resolution:** The function reads the import order to determine the sequence of nodes.
         2. **File Mapping:** It finds the matching `node.json` file for each node in the order.
@@ -1023,6 +1028,11 @@ class MetadataSubmitter:
         file_map = self._create_file_map()
 
         logger.info("Starting metadata submission.")
+
+        if specific_node:
+            if specific_node not in data_import_order:
+                raise ValueError(f"Node '{specific_node}' not found in data import order.")
+            data_import_order = [specific_node]
 
         for node in data_import_order:
             if node in self.exclude_nodes:
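A short usage sketch of the new `specific_node` parameter, assuming an already-configured `MetadataSubmitter` instance named `submitter` (its constructor arguments are project-specific and omitted here; the node name "sample" is hypothetical):

```python
# Previous behaviour: submit every node in the data import order.
results = submitter.submit_metadata()

# New in 0.8.0: submit only one node. Raises ValueError if the node
# is not present in the data import order.
results = submitter.submit_metadata(specific_node="sample")
```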
The remaining files listed above with +0 -0 (for example {acdc_aws_etl_pipeline-0.7.6 → acdc_aws_etl_pipeline-0.8.0}/src/acdc_aws_etl_pipeline/__init__.py) are renamed only by the version change in the directory prefix; their contents are unchanged.