acdc_aws_etl_pipeline 0.3.9__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/PKG-INFO +2 -2
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/README.md +1 -1
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/pyproject.toml +1 -1
- acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +217 -0
- acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +2 -0
- acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +526 -0
- acdc_aws_etl_pipeline-0.3.9/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -184
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
- {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.3.9
+Version: 0.4.1
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -54,7 +54,7 @@ python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/ac
 
 
 # Deploying to staging
-VERSION=
+VERSION=v1.0.0
 bash services/dictionary/pull_dict.sh "https://raw.githubusercontent.com/AustralianBioCommons/acdc-schema-json/refs/tags/${VERSION}/dictionary/prod_dict/acdc_schema.json"
 python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/acdc_schema_${VERSION}.json" s3://gen3schema-cad-staging-biocommons.org.au/cad.json
 ```
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/README.md
@@ -28,7 +28,7 @@ python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/ac
 
 
 # Deploying to staging
-VERSION=
+VERSION=v1.0.0
 bash services/dictionary/pull_dict.sh "https://raw.githubusercontent.com/AustralianBioCommons/acdc-schema-json/refs/tags/${VERSION}/dictionary/prod_dict/acdc_schema.json"
 python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/acdc_schema_${VERSION}.json" s3://gen3schema-cad-staging-biocommons.org.au/cad.json
 ```
acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py
@@ -0,0 +1,217 @@
+import os
+import sys
+from gen3.auth import Gen3Auth
+from gen3.index import Gen3Index
+from gen3.submission import Gen3Submission
+import json
+from datetime import datetime
+import uuid
+import shutil
+
+
+def get_import_order(project_name, folder_path, import_order_file=None, exclude_nodes=None):
+    path = import_order_file or os.path.join(folder_path, project_name, "DataImportOrder.txt")
+    try:
+        with open(path, "r") as f:
+            import_order = [line.rstrip() for line in f]
+        if exclude_nodes is not None:
+            import_order = [node for node in import_order if node not in exclude_nodes]
+        return import_order
+    except FileNotFoundError:
+        print(f"Error: DataImportOrder.txt not found in {path}")
+        return []
+
+def read_json(json_fn, base_dir, project_id, ab_path=False):
+    try:
+        if ab_path:
+            json_path = os.path.join(base_dir, json_fn)
+        else:
+            json_path = os.path.join(base_dir, project_id, json_fn)
+        with open(json_path, 'r') as f:
+            schema = json.load(f)
+        print(f'{json_path} successfully loaded')
+        return schema
+    except FileNotFoundError:
+        print(f"Error: JSON file {json_path} not found.")
+        return None
+    except json.JSONDecodeError:
+        print(f"Error: JSON file {json_path} is not valid.")
+        return None
+
+def split_json_objects(json_list, max_size_kb=400, print_results=False):
+    def get_size_in_kb(obj):
+        return sys.getsizeof(json.dumps(obj)) / 1024
+
+    def split_list(json_list):
+        if get_size_in_kb(json_list) <= max_size_kb:
+            return [json_list]
+        mid = len(json_list) // 2
+        left_list = json_list[:mid]
+        right_list = json_list[mid:]
+        return split_list(left_list) + split_list(right_list)
+
+    split_lists = split_list(json_list)
+    if print_results:
+        for i, lst in enumerate(split_lists):
+            print(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
+    return split_lists
+
+def process_node(node, sub, project_id, dry_run, read_json_args, split_json_objects_args, ab_path, max_retries=3):
+    if dry_run:
+        print(f"DRY RUN\t| {project_id}\t| {node} would be submitted")
+        return
+
+    print(f"\n\nIMPORTING\t| {project_id}\t| {node}")
+    json_data = read_json(f"{node}.json", *read_json_args, ab_path=ab_path)
+
+    if json_data is None:
+        print(f"SKIPPING\t| {project_id}\t| {node} due to errors in reading JSON")
+        return
+
+    json_split = split_json_objects(json_data, **split_json_objects_args)
+    n_json_data = len(json_split)
+
+    for index, jsn in enumerate(json_split):
+        retries = 0
+        while retries < max_retries:
+            try:
+                print(f"SUBMITTING\t| {project_id}\t| {node}\t| {index + 1}/{n_json_data} data splits")
+                sub.submit_record("program1", project_id, jsn)
+                print(f"SUCCESS\t| Imported: {project_id}\t| {node}")
+                break
+            except Exception as e:
+                retries += 1
+                print(f"ERROR\t| {project_id}\t| {node}: {e} | Retry {retries}/{max_retries}")
+                if retries == max_retries:
+                    print(f"FAILED\t| {project_id}\t| {node} after {max_retries} retries")
+
+def submit_metadata(
+    base_dir: str,
+    project_id: str,
+    api_endpoint: str,
+    credentials: str,
+    exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
+    dry_run: bool = False,
+    max_submission_size_kb: int = 400,
+    retries=5,
+    disable_input: bool = False,
+    specific_node: str = None,
+    ab_path: bool = False,
+    import_order_file: str = None,
+):
+    """
+    Submits metadata json files to the gen3 api endpoint. Submission depends on a DataImportOrder.txt file, which defines the order of the nodes to be imported.
+
+    Args:
+        base_dir (str): The path to the folder containing the metadata .json files. Should not contain project_id folder
+        project_id (str): The ID of the project.
+        api_endpoint (str): Gen3 API endpoint.
+        credentials (str): The path to the file containing the API credentials.
+        exclude_nodes (list): A list of node names to exclude from the import. Default is ["project", "program", "acknowledgement", "publication"].
+        dry_run (bool): If True, perform a dry run without actual submission. Default is False.
+        max_submission_size_kb (int): The maximum size of each submission in kilobytes. Default is 400 KB.
+        disable_input (bool): If True, disable user input confirmation. Default is False.
+        specific_node (str): If not None, only submit the specified node.
+        ab_path (bool): If True, use the absolute path to the base_dir.
+        import_order_file (str): The absolute path to the import order file, if not defined the program will look for os.path.join(folder_path, project_name, "DataImportOrder.txt")
+
+    Returns:
+        None
+    """
+
+    if specific_node is None:
+        ordered_import_nodes = get_import_order(
+            project_id, base_dir, import_order_file=import_order_file, exclude_nodes=exclude_nodes
+        )
+        final_ordered_import_nodes = [
+            node for node in ordered_import_nodes if node not in exclude_nodes
+        ]
+
+    # creating auth and submission objects
+    auth = Gen3Auth(refresh_file=credentials)
+    sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+
+    if not dry_run and not disable_input:
+        confirm = input("Do you want to submit the metadata? (yes/no): ").strip().lower()
+        if confirm != "yes":
+            print("Submission cancelled by user.")
+            return
+
+    read_json_args = (base_dir, project_id)
+    split_json_objects_args = {
+        "max_size_kb": max_submission_size_kb,
+        "print_results": True,
+    }
+
+    if specific_node:
+        process_node(
+            specific_node,
+            sub,
+            project_id,
+            dry_run,
+            read_json_args,
+            split_json_objects_args,
+            ab_path,
+            retries,
+        )
+        print(f"Done. {project_id} | {specific_node} metadata submitted")
+        return
+
+    for node in final_ordered_import_nodes:
+        process_node(
+            node,
+            sub,
+            project_id,
+            dry_run,
+            read_json_args,
+            split_json_objects_args,
+            ab_path,
+            retries,
+        )
+
+
+def delete_metadata(import_order_file: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"], prompt_for_confirmation: bool = True):
+    """
+    Deletes metadata json files from the gen3 api endpoint. Deletion depends on a DataImportOrder.txt file, which defines the order of the nodes to be deleted.
+
+    Args:
+        import_order_file (str): The path to the import order file
+        project_id (str): The ID of the project.
+        api_endpoint (str): Gen3 API endpoint.
+        credentials (str): The path to the file containing the API credentials.
+        exclude_nodes (list): A list of node names to exclude from the deletion. Default is ["project", "program", "acknowledgement", "publication"].
+
+    Returns:
+        None
+    """
+
+    def get_import_order(import_order_file):
+        try:
+            with open(import_order_file, "r") as f:
+                import_order = [line.rstrip() for line in f]
+                import_order = [node for node in import_order if node not in exclude_nodes]
+                return import_order
+        except FileNotFoundError:
+            print(f"Error: DataImportOrder.txt not found in {import_order_file}")
+            return []
+
+    ordered_import_nodes = get_import_order(import_order_file)
+    auth = Gen3Auth(refresh_file=credentials)
+    sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+
+    final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
+    final_ordered_import_nodes.reverse() # Reverse the order for deletion
+
+    if prompt_for_confirmation:
+        confirm = input("Do you want to delete the metadata? (yes/no): ").strip().lower()
+        if confirm != 'yes':
+            print("Deletion cancelled by user.")
+            return
+
+    for node in final_ordered_import_nodes:
+        print(f"\n\n=== Deleting: {project_id} | {node} ===")
+        try:
+            sub.delete_nodes("program1", project_id, [node])
+            print(f"=== Successfully Deleted: {node} ===")
+        except Exception as e:
+            print(f"=== Error deleting {node}: {e} ===")
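For orientation, a minimal usage sketch of the gen3datasubmitter module added in 0.4.1. The endpoint URL, credentials path, base directory, and project ID below are hypothetical placeholders, not values taken from this package; only the function names and signatures come from the diff above.

```python
# Sketch only: illustrates the 0.4.1 call signatures; all paths/IDs/endpoints are placeholders.
from acdc_aws_etl_pipeline.upload.gen3datasubmitter import submit_metadata, delete_metadata

# Submit metadata for one project, reading <base_dir>/<project_id>/<node>.json files in the
# order given by DataImportOrder.txt. dry_run=True only prints what would be submitted.
submit_metadata(
    base_dir="metadata",                      # hypothetical folder holding <project_id>/<node>.json
    project_id="example_project",             # hypothetical Gen3 project ID
    api_endpoint="https://gen3.example.org",  # hypothetical Gen3 API endpoint
    credentials="credentials.json",           # path to a Gen3 API key file
    dry_run=True,
    disable_input=True,                       # skip the interactive yes/no confirmation
)

# Delete previously submitted nodes in reverse import order (prompts for confirmation by default).
delete_metadata(
    import_order_file="metadata/example_project/DataImportOrder.txt",
    project_id="example_project",
    api_endpoint="https://gen3.example.org",
    credentials="credentials.json",
)
```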
acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py
@@ -0,0 +1,526 @@
+import json
+import boto3
+from gen3.auth import Gen3Auth
+from gen3.index import Gen3Index
+from gen3.submission import Gen3Submission
+import logging
+import os
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+def create_boto3_session(aws_profile: str = None):
+    """
+    Create and return a boto3 Session object using an optional AWS profile.
+
+    Args:
+        aws_profile (str, optional): The AWS CLI named profile to use for credentials. If None, uses default credentials.
+
+    Returns:
+        boto3.Session: The created session instance.
+    """
+    logger.debug(f"Creating boto3 session with aws_profile={aws_profile}")
+    return boto3.Session(profile_name=aws_profile) if aws_profile else boto3.Session()
+
+def is_s3_uri(s3_uri: str) -> bool:
+    """
+    Check if the provided URI is a valid S3 URI.
+
+    Args:
+        s3_uri (str): The string to check.
+
+    Returns:
+        bool: True if the string starts with 's3://', False otherwise.
+    """
+    logger.debug(f"Checking if {s3_uri} is an S3 URI.")
+    return s3_uri.startswith("s3://")
+
+def get_filename(file_path: str) -> str:
+    """
+    Extract the filename from a file path.
+
+    Args:
+        file_path (str): The full path to a file.
+
+    Returns:
+        str: The filename (with extension).
+    """
+    filename = file_path.split("/")[-1]
+    logger.debug(f"Extracted filename '{filename}' from file_path '{file_path}'.")
+    return filename
+
+def get_node_from_file_path(file_path: str) -> str:
+    """
+    Extract the node name from a file path, assuming file is named as 'node.json'.
+
+    Args:
+        file_path (str): The file path.
+
+    Returns:
+        str: The base node name before the extension.
+    """
+    filename = get_filename(file_path)
+    node = filename.split(".")[0]
+    logger.debug(f"Extracted node '{node}' from filename '{filename}'.")
+    return node
+
+def list_metadata_jsons(metadata_dir: str) -> list:
+    """
+    List all .json files in a given directory.
+
+    Args:
+        metadata_dir (str): Directory containing metadata JSON files.
+
+    Returns:
+        list: List of absolute paths to all .json files in the directory.
+
+    Raises:
+        Exception: If there is an error reading the directory.
+    """
+    try:
+        logger.info(f"Listing .json files in metadata directory: {metadata_dir}")
+        files = os.listdir(metadata_dir)
+        return [os.path.abspath(os.path.join(metadata_dir, f)) for f in files if f.endswith(".json")]
+    except Exception as e:
+        logger.error(f"Error listing metadata JSONs in {metadata_dir}: {e}")
+        raise
+
+def find_data_import_order_file(metadata_dir: str) -> str:
+    """
+    Find the DataImportOrder.txt file within a directory.
+
+    Args:
+        metadata_dir (str): Directory to search in.
+
+    Returns:
+        str: Full path to the DataImportOrder.txt file.
+
+    Raises:
+        FileNotFoundError: If no such file is found.
+    """
+    try:
+        logger.info(f"Searching for DataImportOrder.txt in {metadata_dir}")
+        files = [os.path.join(metadata_dir, f) for f in os.listdir(metadata_dir)]
+        order_files = [f for f in files if "DataImportOrder.txt" in f]
+        if not order_files:
+            logger.error("No DataImportOrder.txt file found in the given directory.")
+            raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
+        logger.debug(f"Found DataImportOrder.txt file: {order_files[0]}")
+        return order_files[0]
+    except Exception as e:
+        logger.error(f"Error finding DataImportOrder.txt in {metadata_dir}: {e}")
+        raise
+
+def list_metadata_jsons_s3(s3_uri: str, session) -> list:
+    """
+    List all .json files in an S3 "directory" (prefix).
+
+    Args:
+        s3_uri (str): S3 URI to the metadata directory (e.g. "s3://my-bucket/path/to/dir").
+        session (boto3.Session): An active boto3 Session.
+
+    Returns:
+        list: List of S3 URIs for all .json files found under the prefix.
+    """
+    logger.info(f"Listing .json files in S3 metadata directory: {s3_uri}")
+    s3 = session.client('s3')
+    bucket = s3_uri.split("/")[2]
+    prefix = "/".join(s3_uri.split("/")[3:])
+    if prefix and not prefix.endswith("/"):
+        prefix += "/" # Ensure prefix ends with a slash for directories
+
+    objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
+    result = [
+        f"s3://{bucket}/{obj['Key']}"
+        for obj in objects.get('Contents', [])
+        if obj['Key'].endswith(".json")
+    ]
+    logger.debug(f"Found {len(result)} .json files in S3 at {s3_uri}")
+    return result
+
+def find_data_import_order_file_s3(s3_uri: str, session) -> str:
+    """
+    Search for the DataImportOrder.txt file in an S3 directory.
+
+    Args:
+        s3_uri (str): S3 URI specifying the directory/prefix to search.
+        session (boto3.Session): An active boto3 Session.
+
+    Returns:
+        str: Full S3 URI of the found DataImportOrder.txt file.
+
+    Raises:
+        FileNotFoundError: If the file does not exist in the specified prefix.
+    """
+    logger.info(f"Searching for DataImportOrder.txt in S3 metadata directory: {s3_uri}")
+    s3 = session.client('s3')
+    bucket = s3_uri.split("/")[2]
+    prefix = "/".join(s3_uri.split("/")[3:])
+    objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
+    order_files = [obj['Key'] for obj in objects.get('Contents', []) if obj['Key'].endswith("DataImportOrder.txt")]
+    if not order_files:
+        logger.error("No DataImportOrder.txt file found in the given S3 directory.")
+        raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
+    logger.debug(f"Found DataImportOrder.txt file in S3: s3://{bucket}/{order_files[0]}")
+    return f"s3://{bucket}/{order_files[0]}"
+
+def read_metadata_json(file_path: str) -> dict:
+    """
+    Read and return a JSON file from the local file system.
+
+    Args:
+        file_path (str): Path to the .json file.
+
+    Returns:
+        dict or list: Parsed contents of the JSON file.
+    """
+    logger.info(f"Reading metadata json from local file: {file_path}")
+    with open(file_path, "r") as f:
+        data = json.load(f)
+    logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {file_path}")
+    return data
+
+def read_metadata_json_s3(s3_uri: str, session) -> dict:
+    """
+    Read and return JSON data from an S3 file.
+
+    Args:
+        s3_uri (str): Full S3 URI to the .json file.
+        session (boto3.Session): Boto3 session.
+
+    Returns:
+        dict or list: Parsed JSON object from S3 file.
+    """
+    logger.info(f"Reading metadata json from S3 file: {s3_uri}")
+    s3 = session.client('s3')
+    obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
+    data = json.loads(obj['Body'].read().decode('utf-8'))
+    logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {s3_uri}")
+    return data
+
+def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
+    """
+    Read a DataImportOrder.txt file from S3 and return node order as a list.
+
+    Args:
+        s3_uri (str): S3 URI to the DataImportOrder.txt file.
+        session (boto3.Session): Boto3 session.
+
+    Returns:
+        list: Node names (order as listed in file).
+
+    Raises:
+        ValueError: If the provided S3 URI does not point to DataImportOrder.txt.
+    """
+    filename = s3_uri.split("/")[-1]
+    if 'DataImportOrder.txt' not in filename:
+        logger.error(f"File {filename} is not a DataImportOrder.txt file")
+        raise ValueError(f"File {filename} is not a DataImportOrder.txt file")
+    logger.info(f"Reading DataImportOrder.txt from S3 file: {s3_uri}")
+    s3 = session.client('s3')
+    obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
+    content = obj['Body'].read().decode('utf-8')
+    import_order = [line.rstrip() for line in content.splitlines() if line.strip()]
+    logger.debug(f"Read import order from S3 file: {import_order}")
+    return import_order
+
+def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
+    """
+    Read DataImportOrder.txt from local file, optionally excluding some nodes.
+
+    Args:
+        file_path (str): Path to DataImportOrder.txt.
+        exclude_nodes (list): Node names to exclude from result.
+
+    Returns:
+        list: Node names, excludes specified nodes, keeps listed order.
+
+    Raises:
+        FileNotFoundError: If the file is not found.
+    """
+    try:
+        logger.info(f"Reading DataImportOrder.txt from local file: {file_path}")
+        with open(file_path, "r") as f:
+            import_order = [line.rstrip() for line in f if line.strip()]
+        logger.debug(f"Raw import order from file: {import_order}")
+        if exclude_nodes is not None:
+            import_order = [node for node in import_order if node not in exclude_nodes]
+            logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
+        logger.debug(f"Final import order from {file_path}: {import_order}")
+        return import_order
+    except FileNotFoundError:
+        logger.error(f"Error: DataImportOrder.txt not found in {file_path}")
+        return []
+
+def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
+    """
+    Split a list of JSON-serializable objects into size-limited chunks.
+
+    Each chunk/list, when JSON-serialized, will not exceed max_size_kb kilobytes.
+
+    Args:
+        json_list (list): List of JSON serializable objects.
+        max_size_kb (int, optional): Max chunk size in KB. Default: 50.
+        print_results (bool, optional): If True, info log the size/count per chunk. Default: False.
+
+    Returns:
+        list: List of lists. Each sublist size (JSON-serialized) <= max_size_kb.
+    """
+    logger.info(f"Splitting JSON objects into max {max_size_kb} KB chunks. Total items: {len(json_list)}")
+    def get_size_in_kb(obj):
+        """
+        Get the size in kilobytes of the JSON-serialized object.
+
+        Args:
+            obj: JSON-serializable object.
+
+        Returns:
+            float: Size of the object in kilobytes.
+        """
+        import sys
+        size_kb = sys.getsizeof(json.dumps(obj)) / 1024
+        logger.debug(f"Calculated size: {size_kb:.2f} KB")
+        return size_kb
+
+    def split_list(json_list):
+        """
+        Recursively split the list so each chunk fits within max_size_kb.
+
+        Args:
+            json_list (list): List to split.
+
+        Returns:
+            list: List of sublists.
+        """
+        if get_size_in_kb(json_list) <= max_size_kb:
+            logger.debug(f"Split length {len(json_list)} is within max size {max_size_kb} KB.")
+            return [json_list]
+        mid = len(json_list) // 2
+        left_list = json_list[:mid]
+        right_list = json_list[mid:]
+        logger.debug(f"Splitting list at index {mid}: left {len(left_list)}, right {len(right_list)}")
+        return split_list(left_list) + split_list(right_list)
+
+    split_lists = split_list(json_list)
+    if print_results:
+        for i, lst in enumerate(split_lists):
+            logger.info(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
+    logger.debug(f"Total splits: {len(split_lists)}")
+    return split_lists
+
+def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) -> dict:
+    """
+    Retrieve a Gen3 API key stored as a secret in AWS Secrets Manager and parse it as a dict.
+
+    Args:
+        secret_name (str): Name of the AWS secret.
+        region_name (str): AWS region where the secret is located.
+        session (boto3.Session): Boto3 session.
+
+    Returns:
+        dict: Parsed Gen3 API key.
+
+    Raises:
+        Exception: On failure to retrieve or parse the secret.
+    """
+    logger.info(f"Retrieving Gen3 API key from AWS Secrets Manager: secret_name={secret_name}, region={region_name}")
+    client = session.client(service_name='secretsmanager', region_name=region_name)
+    try:
+        get_secret_value_response = client.get_secret_value(
+            SecretId=secret_name
+        )
+    except Exception as e:
+        logger.error(f"Error getting secret value from AWS Secrets Manager: {e}")
+        raise e
+
+    secret = get_secret_value_response['SecretString']
+
+    try:
+        secret = json.loads(secret)
+        api_key = secret
+        logger.debug(f"Retrieved Gen3 API key from secret {secret_name}")
+        return api_key
+    except Exception as e:
+        logger.error(f"Error parsing Gen3 API key from AWS Secrets Manager: {e}")
+        raise e
+
+def create_gen3_submission_class(api_key: dict, api_endpoint: str):
+    """
+    Create and authenticate a Gen3Submission client using a temporary file for API key.
+
+    Args:
+        api_key (dict): The Gen3 API key as Python dict.
+        api_endpoint (str): Gen3 endpoint (hostname/base API URL).
+
+    Returns:
+        Gen3Submission: An authenticated Gen3Submission object.
+
+    Notes:
+        The temporary file storing the API key is deleted after use.
+    """
+    import tempfile
+
+    logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
+    tmp_api_key_path = None
+    submit = None
+
+    try:
+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", dir="/tmp") as tmp_file:
+            json.dump(api_key, tmp_file)
+            tmp_api_key_path = tmp_file.name
+        auth = Gen3Auth(refresh_file=tmp_api_key_path)
+        submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+        return submit
+    finally:
+        if tmp_api_key_path and os.path.exists(tmp_api_key_path):
+            try:
+                os.remove(tmp_api_key_path)
+                logger.debug(f"Temporary API key file {tmp_api_key_path} deleted.")
+            except Exception as e:
+                logger.warning(f"Failed to delete temporary API key file {tmp_api_key_path}: {e}")
+
+def write_submission_results(results, output_path, mode='w'):
+    with open(output_path, mode) as f:
+        json.dump(results, f, indent=4)
+
+def submit_metadata(
+    file_list: list,
+    api_key: str,
+    api_endpoint: str,
+    project_id: str,
+    data_import_order_path: str,
+    boto3_session,
+    max_size_kb: int = 50,
+    exclude_nodes: list = None,
+    max_retries: int = 5,
+):
+    """
+    Submit a set of metadata JSON files to a Gen3 data commons endpoint, in order.
+
+    Args:
+        file_list (list): List of paths (local or S3 URIs) to metadata .json files, one per node type.
+        api_key (str): Gen3 API key (parsed dict or JSON string).
+        api_endpoint (str): Gen3 data commons endpoint URL.
+        project_id (str): Gen3 project ID to submit data to.
+        data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying submission order.
+        boto3_session (boto3.Session): Existing AWS/boto3 session for S3 & secret usage.
+        max_size_kb (int, optional): Maximum size per submission chunk, in KB. Default: 50.
+        exclude_nodes (list, optional): List of node names to skip (default: ["project", "program", "acknowledgement", "publication"]).
+        max_retries (int, optional): Maximum number of retry attempts per node chunk. Default: 5.
+
+    Returns:
+        None
+
+    Raises:
+        Exception: On critical submission failure for any chunk.
+
+    Notes:
+        Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
+    """
+
+    timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
+    log_dir = f"submission_logs/{timestamp}"
+    os.makedirs(log_dir, exist_ok=True)
+
+    if exclude_nodes is None:
+        exclude_nodes = ["project", "program", "acknowledgement", "publication"]
+
+    logger.info("Starting metadata submission process.")
+    logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
+    submit = create_gen3_submission_class(api_key, api_endpoint)
+
+    if is_s3_uri(data_import_order_path):
+        logger.info(f"Reading import order from S3: {data_import_order_path}")
+        import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
+        logger.debug(f"Import order from S3: {import_order}")
+    else:
+        logger.info(f"Reading import order from file: {data_import_order_path}")
+        import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
+        logger.debug(f"Import order from file: {import_order}")
+
+    # Map node name to file for fast access and avoid repeatedly scanning file_list
+    file_map = {get_node_from_file_path(file): file for file in file_list}
+
+    for node in import_order:
+        if node in exclude_nodes:
+            logger.info(f"Skipping node '{node}' (in exclude list).")
+            continue
+        file = file_map.get(node)
+        if not file:
+            logger.info(f"Skipping node '{node}' (not present in file list).")
+            continue
+
+        logger.info(f"Processing file '{file}' for node '{node}'.")
+
+        try:
+            if is_s3_uri(file):
+                logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
+                json_data = read_metadata_json_s3(file, boto3_session)
+            else:
+                logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
+                json_data = read_metadata_json(file)
+        except Exception as e:
+            logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
+            continue
+
+        if not json_data:
+            logger.info(f"Skipping node '{node}' due to errors in reading JSON.")
+            continue
+
+        split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
+        n_json_data = len(split_json_list)
+        logger.info(
+            f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
+        )
+
+        for index, jsn in enumerate(split_json_list):
+            progress_str = f"{index + 1}/{n_json_data}"
+
+            for attempt in range(max_retries + 1):
+                try:
+                    log_msg = (
+                        f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                        f"Split: {progress_str:<5}"
+                        if attempt == 0 else
+                        f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                        f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
+                    )
+                    logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
+
+                    res = submit.submit_record("program1", project_id, jsn)
+
+                    # writing submission results as log json
+                    log_filename = os.path.join(
+                        log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+                    )
+                    abs_log_filename = os.path.abspath(log_filename)
+                    with open(abs_log_filename, "a") as f:
+                        json.dump(res, f)
+                        f.write("\n")
+                    logger.info(
+                        f"Wrote submission response to log file: {abs_log_filename}"
+                    )
+
+                    logger.info(
+                        f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                        f"Node: {node:<12} | Split: {progress_str:<5}"
+                    )
+                    break # Successful, move to next chunk
+
+                except Exception as e:
+                    logger.error(
+                        f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                    )
+                    if attempt < max_retries:
+                        import time
+                        time.sleep(0.2)
+                    else:
+                        logger.critical(
+                            f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                            f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                        )
+                        raise
+
+        logger.info(f"Finished submitting node '{node}'.")
+
+    logger.info("--- Submission process complete ---")
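The new metadata_submitter module wires AWS (boto3 sessions, Secrets Manager, S3) into the Gen3 submission flow. Below is a minimal sketch of how its pieces might be combined; the bucket, secret name, region, endpoint, and project ID are hypothetical placeholders, while the function names and signatures come from the diff above.

```python
# Sketch only: bucket, secret name, region, endpoint, and project ID are placeholders.
from acdc_aws_etl_pipeline.upload.metadata_submitter import (
    create_boto3_session,
    get_gen3_api_key_aws_secret,
    list_metadata_jsons_s3,
    find_data_import_order_file_s3,
    submit_metadata,
)

session = create_boto3_session()  # or create_boto3_session(aws_profile="my-profile")

# Gen3 API key stored as a JSON secret in AWS Secrets Manager (hypothetical name/region)
api_key = get_gen3_api_key_aws_secret("gen3/api-key", "ap-southeast-2", session)

# Metadata .json files and DataImportOrder.txt staged under an S3 prefix (hypothetical)
prefix = "s3://example-bucket/metadata/example_project"
file_list = list_metadata_jsons_s3(prefix, session)
order_file = find_data_import_order_file_s3(prefix, session)

submit_metadata(
    file_list=file_list,
    api_key=api_key,
    api_endpoint="https://gen3.example.org",  # hypothetical Gen3 endpoint
    project_id="example_project",
    data_import_order_path=order_file,
    boto3_session=session,
    max_size_kb=50,  # chunk size passed to split_json_objects
)
```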
acdc_aws_etl_pipeline-0.3.9/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py
@@ -1,184 +0,0 @@
-import os
-import sys
-from gen3.auth import Gen3Auth
-from gen3.index import Gen3Index
-from gen3.submission import Gen3Submission
-import json
-from datetime import datetime
-import uuid
-import shutil
-
-
-def submit_metadata(base_dir: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
-                    dry_run: bool = False, max_submission_size_kb: int = 400, retries = 5, disable_input: bool = False,
-                    specific_node: str = None, ab_path: bool = False, import_order_file: str = None):
-    """
-    Submits metadata json files to the gen3 api endpoint. Submission depends on a DataImportOrder.txt file, which defines the order of the nodes to be imported.
-
-    Args:
-        base_dir (str): The path to the folder containing the metadata .json files. Should not contain project_id folder
-        project_id (str): The ID of the project.
-        api_endpoint (str): Gen3 API endpoint.
-        credentials (str): The path to the file containing the API credentials.
-        exclude_nodes (list): A list of node names to exclude from the import. Default is ["project", "program", "acknowledgement", "publication"].
-        dry_run (bool): If True, perform a dry run without actual submission. Default is False.
-        max_submission_size_kb (int): The maximum size of each submission in kilobytes. Default is 400 KB.
-        disable_input (bool): If True, disable user input confirmation. Default is False.
-        specific_node (str): If not None, only submit the specified node.
-        ab_path (bool): If True, use the absolute path to the base_dir.
-        import_order_file (str): The absolute path to the import order file, if not defined the program will look for os.path.join(folder_path, project_name, "DataImportOrder.txt")
-
-    Returns:
-        None
-    """
-
-    def get_import_order(project_name, folder_path):
-        path = import_order_file or os.path.join(folder_path, project_name, "DataImportOrder.txt")
-        try:
-            with open(path, "r") as f:
-                import_order = [line.rstrip() for line in f]
-                import_order = [node for node in import_order if node not in exclude_nodes]
-                return import_order
-        except FileNotFoundError:
-            print(f"Error: DataImportOrder.txt not found in {path}")
-            return []
-
-    def read_json(json_fn, ab_path: bool = False):
-        try:
-            if ab_path:
-                json_path = os.path.join(base_dir, json_fn)
-            else:
-                json_path = os.path.join(base_dir, project_id, json_fn)
-            with open(json_path, 'r') as f:
-                schema = json.load(f)
-            print(f'{json_path} successfully loaded')
-            return schema
-        except FileNotFoundError:
-            print(f"Error: JSON file {json_path} not found.")
-            return None
-        except json.JSONDecodeError:
-            print(f"Error: JSON file {json_path} is not valid.")
-            return None
-
-    if specific_node is None:
-        ordered_import_nodes = get_import_order(project_id, base_dir)
-        final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
-
-    # creating auth and submission objects
-    auth = Gen3Auth(refresh_file=credentials)
-    sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-
-
-    if not dry_run and not disable_input:
-        confirm = input("Do you want to submit the metadata? (yes/no): ").strip().lower()
-        if confirm != 'yes':
-            print("Submission cancelled by user.")
-            return
-
-
-    def split_json_objects(json_list, max_size_kb=max_submission_size_kb, print_results=False):
-        def get_size_in_kb(obj):
-            return sys.getsizeof(json.dumps(obj)) / 1024
-
-        def split_list(json_list):
-            if get_size_in_kb(json_list) <= max_size_kb:
-                return [json_list]
-
-            mid = len(json_list) // 2
-            left_list = json_list[:mid]
-            right_list = json_list[mid:]
-
-            return split_list(left_list) + split_list(right_list)
-
-        split_lists = split_list(json_list)
-
-        if print_results:
-            for i, lst in enumerate(split_lists):
-                print(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
-
-        return split_lists
-
-
-    def process_node(node, sub, project_id, dry_run, max_retries=3):
-        if dry_run:
-            print(f"DRY RUN\t| {project_id}\t| {node} would be submitted")
-            return
-
-        print(f"\n\nIMPORTING\t| {project_id}\t| {node}")
-        json_data = read_json(f"{node}.json", ab_path=ab_path)
-
-        if json_data is None:
-            print(f"SKIPPING\t| {project_id}\t| {node} due to errors in reading JSON")
-            return
-
-        json_split = split_json_objects(json_data, max_size_kb=max_submission_size_kb, print_results=True)
-        n_json_data = len(json_split)
-
-        for index, jsn in enumerate(json_split):
-            retries = 0
-            while retries < max_retries:
-                try:
-                    print(f"SUBMITTING\t| {project_id}\t| {node}\t| {index + 1}/{n_json_data} data splits")
-                    sub.submit_record("program1", project_id, jsn)
-                    print(f"SUCCESS\t| Imported: {project_id}\t| {node}")
-                    break
-                except Exception as e:
-                    retries += 1
-                    print(f"ERROR\t| {project_id}\t| {node}: {e} | Retry {retries}/{max_retries}")
-                    if retries == max_retries:
-                        print(f"FAILED\t| {project_id}\t| {node} after {max_retries} retries")
-
-
-    if specific_node:
-        process_node(specific_node, sub, project_id, dry_run, retries)
-        return print(f"Done. {project_id} | {specific_node} metadata submitted")
-
-    for node in final_ordered_import_nodes:
-        process_node(node, sub, project_id, dry_run, retries)
-
-
-def delete_metadata(import_order_file: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"], prompt_for_confirmation: bool = True):
-    """
-    Deletes metadata json files from the gen3 api endpoint. Deletion depends on a DataImportOrder.txt file, which defines the order of the nodes to be deleted.
-
-    Args:
-        import_order_file (str): The path to the import order file
-        project_id (str): The ID of the project.
-        api_endpoint (str): Gen3 API endpoint.
-        credentials (str): The path to the file containing the API credentials.
-        exclude_nodes (list): A list of node names to exclude from the deletion. Default is ["project", "program", "acknowledgement", "publication"].
-
-    Returns:
-        None
-    """
-
-    def get_import_order(import_order_file):
-        try:
-            with open(import_order_file, "r") as f:
-                import_order = [line.rstrip() for line in f]
-                import_order = [node for node in import_order if node not in exclude_nodes]
-                return import_order
-        except FileNotFoundError:
-            print(f"Error: DataImportOrder.txt not found in {import_order_file}")
-            return []
-
-    ordered_import_nodes = get_import_order(import_order_file)
-    auth = Gen3Auth(refresh_file=credentials)
-    sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-
-    final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
-    final_ordered_import_nodes.reverse() # Reverse the order for deletion
-
-    if prompt_for_confirmation:
-        confirm = input("Do you want to delete the metadata? (yes/no): ").strip().lower()
-        if confirm != 'yes':
-            print("Deletion cancelled by user.")
-            return
-
-    for node in final_ordered_import_nodes:
-        print(f"\n\n=== Deleting: {project_id} | {node} ===")
-        try:
-            sub.delete_nodes("program1", project_id, [node])
-            print(f"=== Successfully Deleted: {node} ===")
-        except Exception as e:
-            print(f"=== Error deleting {node}: {e} ===")
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/__init__.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/ingest/ingest.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/__init__.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/athena_utils.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/release_writer.py
RENAMED
File without changes
{acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/validate/validate.py
RENAMED
File without changes