acdc_aws_etl_pipeline 0.3.9__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (15)
  1. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/PKG-INFO +2 -2
  2. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/README.md +1 -1
  3. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/pyproject.toml +1 -1
  4. acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +217 -0
  5. acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/metadata_deleter.py +2 -0
  6. acdc_aws_etl_pipeline-0.4.1/src/acdc_aws_etl_pipeline/upload/metadata_submitter.py +526 -0
  7. acdc_aws_etl_pipeline-0.3.9/src/acdc_aws_etl_pipeline/upload/gen3datasubmitter.py +0 -184
  8. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/__init__.py +0 -0
  9. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/ingest/ingest.py +0 -0
  10. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/__init__.py +0 -0
  11. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py +0 -0
  12. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/athena_utils.py +0 -0
  13. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/dbt_utils.py +0 -0
  14. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/utils/release_writer.py +0 -0
  15. {acdc_aws_etl_pipeline-0.3.9 → acdc_aws_etl_pipeline-0.4.1}/src/acdc_aws_etl_pipeline/validate/validate.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: acdc_aws_etl_pipeline
- Version: 0.3.9
+ Version: 0.4.1
  Summary: Tools for ACDC ETL pipeline
  Author: JoshuaHarris391
  Author-email: harjo391@gmail.com
@@ -54,7 +54,7 @@ python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/ac


  # Deploying to staging
- VERSION=v0.7.7
+ VERSION=v1.0.0
  bash services/dictionary/pull_dict.sh "https://raw.githubusercontent.com/AustralianBioCommons/acdc-schema-json/refs/tags/${VERSION}/dictionary/prod_dict/acdc_schema.json"
  python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/acdc_schema_${VERSION}.json" s3://gen3schema-cad-staging-biocommons.org.au/cad.json
  ```
@@ -28,7 +28,7 @@ python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/ac


  # Deploying to staging
- VERSION=v0.7.7
+ VERSION=v1.0.0
  bash services/dictionary/pull_dict.sh "https://raw.githubusercontent.com/AustralianBioCommons/acdc-schema-json/refs/tags/${VERSION}/dictionary/prod_dict/acdc_schema.json"
  python3 services/dictionary/upload_dictionary.py "services/dictionary/schemas/acdc_schema_${VERSION}.json" s3://gen3schema-cad-staging-biocommons.org.au/cad.json
  ```
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "acdc_aws_etl_pipeline"
- version = "0.3.9"
+ version = "0.4.1"
  description = "Tools for ACDC ETL pipeline"
  authors = ["JoshuaHarris391 <harjo391@gmail.com>"]
  readme = "README.md"
@@ -0,0 +1,217 @@
+ import os
+ import sys
+ from gen3.auth import Gen3Auth
+ from gen3.index import Gen3Index
+ from gen3.submission import Gen3Submission
+ import json
+ from datetime import datetime
+ import uuid
+ import shutil
+
+
+ def get_import_order(project_name, folder_path, import_order_file=None, exclude_nodes=None):
+     path = import_order_file or os.path.join(folder_path, project_name, "DataImportOrder.txt")
+     try:
+         with open(path, "r") as f:
+             import_order = [line.rstrip() for line in f]
+         if exclude_nodes is not None:
+             import_order = [node for node in import_order if node not in exclude_nodes]
+         return import_order
+     except FileNotFoundError:
+         print(f"Error: DataImportOrder.txt not found in {path}")
+         return []
+
+ def read_json(json_fn, base_dir, project_id, ab_path=False):
+     try:
+         if ab_path:
+             json_path = os.path.join(base_dir, json_fn)
+         else:
+             json_path = os.path.join(base_dir, project_id, json_fn)
+         with open(json_path, 'r') as f:
+             schema = json.load(f)
+         print(f'{json_path} successfully loaded')
+         return schema
+     except FileNotFoundError:
+         print(f"Error: JSON file {json_path} not found.")
+         return None
+     except json.JSONDecodeError:
+         print(f"Error: JSON file {json_path} is not valid.")
+         return None
+
+ def split_json_objects(json_list, max_size_kb=400, print_results=False):
+     def get_size_in_kb(obj):
+         return sys.getsizeof(json.dumps(obj)) / 1024
+
+     def split_list(json_list):
+         if get_size_in_kb(json_list) <= max_size_kb:
+             return [json_list]
+         mid = len(json_list) // 2
+         left_list = json_list[:mid]
+         right_list = json_list[mid:]
+         return split_list(left_list) + split_list(right_list)
+
+     split_lists = split_list(json_list)
+     if print_results:
+         for i, lst in enumerate(split_lists):
+             print(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
+     return split_lists
+
+ def process_node(node, sub, project_id, dry_run, read_json_args, split_json_objects_args, ab_path, max_retries=3):
+     if dry_run:
+         print(f"DRY RUN\t| {project_id}\t| {node} would be submitted")
+         return
+
+     print(f"\n\nIMPORTING\t| {project_id}\t| {node}")
+     json_data = read_json(f"{node}.json", *read_json_args, ab_path=ab_path)
+
+     if json_data is None:
+         print(f"SKIPPING\t| {project_id}\t| {node} due to errors in reading JSON")
+         return
+
+     json_split = split_json_objects(json_data, **split_json_objects_args)
+     n_json_data = len(json_split)
+
+     for index, jsn in enumerate(json_split):
+         retries = 0
+         while retries < max_retries:
+             try:
+                 print(f"SUBMITTING\t| {project_id}\t| {node}\t| {index + 1}/{n_json_data} data splits")
+                 sub.submit_record("program1", project_id, jsn)
+                 print(f"SUCCESS\t| Imported: {project_id}\t| {node}")
+                 break
+             except Exception as e:
+                 retries += 1
+                 print(f"ERROR\t| {project_id}\t| {node}: {e} | Retry {retries}/{max_retries}")
+                 if retries == max_retries:
+                     print(f"FAILED\t| {project_id}\t| {node} after {max_retries} retries")
+
+ def submit_metadata(
+     base_dir: str,
+     project_id: str,
+     api_endpoint: str,
+     credentials: str,
+     exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
+     dry_run: bool = False,
+     max_submission_size_kb: int = 400,
+     retries=5,
+     disable_input: bool = False,
+     specific_node: str = None,
+     ab_path: bool = False,
+     import_order_file: str = None,
+ ):
+     """
+     Submits metadata json files to the gen3 api endpoint. Submission depends on a DataImportOrder.txt file, which defines the order of the nodes to be imported.
+
+     Args:
+         base_dir (str): The path to the folder containing the metadata .json files. Should not contain project_id folder
+         project_id (str): The ID of the project.
+         api_endpoint (str): Gen3 API endpoint.
+         credentials (str): The path to the file containing the API credentials.
+         exclude_nodes (list): A list of node names to exclude from the import. Default is ["project", "program", "acknowledgement", "publication"].
+         dry_run (bool): If True, perform a dry run without actual submission. Default is False.
+         max_submission_size_kb (int): The maximum size of each submission in kilobytes. Default is 400 KB.
+         disable_input (bool): If True, disable user input confirmation. Default is False.
+         specific_node (str): If not None, only submit the specified node.
+         ab_path (bool): If True, use the absolute path to the base_dir.
+         import_order_file (str): The absolute path to the import order file, if not defined the program will look for os.path.join(folder_path, project_name, "DataImportOrder.txt")
+
+     Returns:
+         None
+     """
+
+     if specific_node is None:
+         ordered_import_nodes = get_import_order(
+             project_id, base_dir, import_order_file=import_order_file, exclude_nodes=exclude_nodes
+         )
+         final_ordered_import_nodes = [
+             node for node in ordered_import_nodes if node not in exclude_nodes
+         ]
+
+     # creating auth and submission objects
+     auth = Gen3Auth(refresh_file=credentials)
+     sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+
+     if not dry_run and not disable_input:
+         confirm = input("Do you want to submit the metadata? (yes/no): ").strip().lower()
+         if confirm != "yes":
+             print("Submission cancelled by user.")
+             return
+
+     read_json_args = (base_dir, project_id)
+     split_json_objects_args = {
+         "max_size_kb": max_submission_size_kb,
+         "print_results": True,
+     }
+
+     if specific_node:
+         process_node(
+             specific_node,
+             sub,
+             project_id,
+             dry_run,
+             read_json_args,
+             split_json_objects_args,
+             ab_path,
+             retries,
+         )
+         print(f"Done. {project_id} | {specific_node} metadata submitted")
+         return
+
+     for node in final_ordered_import_nodes:
+         process_node(
+             node,
+             sub,
+             project_id,
+             dry_run,
+             read_json_args,
+             split_json_objects_args,
+             ab_path,
+             retries,
+         )
+
+
+ def delete_metadata(import_order_file: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"], prompt_for_confirmation: bool = True):
+     """
+     Deletes metadata json files from the gen3 api endpoint. Deletion depends on a DataImportOrder.txt file, which defines the order of the nodes to be deleted.
+
+     Args:
+         import_order_file (str): The path to the import order file
+         project_id (str): The ID of the project.
+         api_endpoint (str): Gen3 API endpoint.
+         credentials (str): The path to the file containing the API credentials.
+         exclude_nodes (list): A list of node names to exclude from the deletion. Default is ["project", "program", "acknowledgement", "publication"].
+
+     Returns:
+         None
+     """
+
+     def get_import_order(import_order_file):
+         try:
+             with open(import_order_file, "r") as f:
+                 import_order = [line.rstrip() for line in f]
+             import_order = [node for node in import_order if node not in exclude_nodes]
+             return import_order
+         except FileNotFoundError:
+             print(f"Error: DataImportOrder.txt not found in {import_order_file}")
+             return []
+
+     ordered_import_nodes = get_import_order(import_order_file)
+     auth = Gen3Auth(refresh_file=credentials)
+     sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+
+     final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
+     final_ordered_import_nodes.reverse() # Reverse the order for deletion
+
+     if prompt_for_confirmation:
+         confirm = input("Do you want to delete the metadata? (yes/no): ").strip().lower()
+         if confirm != 'yes':
+             print("Deletion cancelled by user.")
+             return
+
+     for node in final_ordered_import_nodes:
+         print(f"\n\n=== Deleting: {project_id} | {node} ===")
+         try:
+             sub.delete_nodes("program1", project_id, [node])
+             print(f"=== Successfully Deleted: {node} ===")
+         except Exception as e:
+             print(f"=== Error deleting {node}: {e} ===")
@@ -0,0 +1,2 @@
+ from acdc_aws_etl_pipeline.upload.metadata_submitter import *
+
@@ -0,0 +1,526 @@
+ import json
+ import boto3
+ from gen3.auth import Gen3Auth
+ from gen3.index import Gen3Index
+ from gen3.submission import Gen3Submission
+ import logging
+ import os
+ from datetime import datetime
+
+ logger = logging.getLogger(__name__)
+
+ def create_boto3_session(aws_profile: str = None):
+     """
+     Create and return a boto3 Session object using an optional AWS profile.
+
+     Args:
+         aws_profile (str, optional): The AWS CLI named profile to use for credentials. If None, uses default credentials.
+
+     Returns:
+         boto3.Session: The created session instance.
+     """
+     logger.debug(f"Creating boto3 session with aws_profile={aws_profile}")
+     return boto3.Session(profile_name=aws_profile) if aws_profile else boto3.Session()
+
+ def is_s3_uri(s3_uri: str) -> bool:
+     """
+     Check if the provided URI is a valid S3 URI.
+
+     Args:
+         s3_uri (str): The string to check.
+
+     Returns:
+         bool: True if the string starts with 's3://', False otherwise.
+     """
+     logger.debug(f"Checking if {s3_uri} is an S3 URI.")
+     return s3_uri.startswith("s3://")
+
+ def get_filename(file_path: str) -> str:
+     """
+     Extract the filename from a file path.
+
+     Args:
+         file_path (str): The full path to a file.
+
+     Returns:
+         str: The filename (with extension).
+     """
+     filename = file_path.split("/")[-1]
+     logger.debug(f"Extracted filename '{filename}' from file_path '{file_path}'.")
+     return filename
+
+ def get_node_from_file_path(file_path: str) -> str:
+     """
+     Extract the node name from a file path, assuming file is named as 'node.json'.
+
+     Args:
+         file_path (str): The file path.
+
+     Returns:
+         str: The base node name before the extension.
+     """
+     filename = get_filename(file_path)
+     node = filename.split(".")[0]
+     logger.debug(f"Extracted node '{node}' from filename '{filename}'.")
+     return node
+
+ def list_metadata_jsons(metadata_dir: str) -> list:
+     """
+     List all .json files in a given directory.
+
+     Args:
+         metadata_dir (str): Directory containing metadata JSON files.
+
+     Returns:
+         list: List of absolute paths to all .json files in the directory.
+
+     Raises:
+         Exception: If there is an error reading the directory.
+     """
+     try:
+         logger.info(f"Listing .json files in metadata directory: {metadata_dir}")
+         files = os.listdir(metadata_dir)
+         return [os.path.abspath(os.path.join(metadata_dir, f)) for f in files if f.endswith(".json")]
+     except Exception as e:
+         logger.error(f"Error listing metadata JSONs in {metadata_dir}: {e}")
+         raise
+
+ def find_data_import_order_file(metadata_dir: str) -> str:
+     """
+     Find the DataImportOrder.txt file within a directory.
+
+     Args:
+         metadata_dir (str): Directory to search in.
+
+     Returns:
+         str: Full path to the DataImportOrder.txt file.
+
+     Raises:
+         FileNotFoundError: If no such file is found.
+     """
+     try:
+         logger.info(f"Searching for DataImportOrder.txt in {metadata_dir}")
+         files = [os.path.join(metadata_dir, f) for f in os.listdir(metadata_dir)]
+         order_files = [f for f in files if "DataImportOrder.txt" in f]
+         if not order_files:
+             logger.error("No DataImportOrder.txt file found in the given directory.")
+             raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
+         logger.debug(f"Found DataImportOrder.txt file: {order_files[0]}")
+         return order_files[0]
+     except Exception as e:
+         logger.error(f"Error finding DataImportOrder.txt in {metadata_dir}: {e}")
+         raise
+
+ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
+     """
+     List all .json files in an S3 "directory" (prefix).
+
+     Args:
+         s3_uri (str): S3 URI to the metadata directory (e.g. "s3://my-bucket/path/to/dir").
+         session (boto3.Session): An active boto3 Session.
+
+     Returns:
+         list: List of S3 URIs for all .json files found under the prefix.
+     """
+     logger.info(f"Listing .json files in S3 metadata directory: {s3_uri}")
+     s3 = session.client('s3')
+     bucket = s3_uri.split("/")[2]
+     prefix = "/".join(s3_uri.split("/")[3:])
+     if prefix and not prefix.endswith("/"):
+         prefix += "/" # Ensure prefix ends with a slash for directories
+
+     objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
+     result = [
+         f"s3://{bucket}/{obj['Key']}"
+         for obj in objects.get('Contents', [])
+         if obj['Key'].endswith(".json")
+     ]
+     logger.debug(f"Found {len(result)} .json files in S3 at {s3_uri}")
+     return result
+
+ def find_data_import_order_file_s3(s3_uri: str, session) -> str:
+     """
+     Search for the DataImportOrder.txt file in an S3 directory.
+
+     Args:
+         s3_uri (str): S3 URI specifying the directory/prefix to search.
+         session (boto3.Session): An active boto3 Session.
+
+     Returns:
+         str: Full S3 URI of the found DataImportOrder.txt file.
+
+     Raises:
+         FileNotFoundError: If the file does not exist in the specified prefix.
+     """
+     logger.info(f"Searching for DataImportOrder.txt in S3 metadata directory: {s3_uri}")
+     s3 = session.client('s3')
+     bucket = s3_uri.split("/")[2]
+     prefix = "/".join(s3_uri.split("/")[3:])
+     objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
+     order_files = [obj['Key'] for obj in objects.get('Contents', []) if obj['Key'].endswith("DataImportOrder.txt")]
+     if not order_files:
+         logger.error("No DataImportOrder.txt file found in the given S3 directory.")
+         raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
+     logger.debug(f"Found DataImportOrder.txt file in S3: s3://{bucket}/{order_files[0]}")
+     return f"s3://{bucket}/{order_files[0]}"
+
+ def read_metadata_json(file_path: str) -> dict:
+     """
+     Read and return a JSON file from the local file system.
+
+     Args:
+         file_path (str): Path to the .json file.
+
+     Returns:
+         dict or list: Parsed contents of the JSON file.
+     """
+     logger.info(f"Reading metadata json from local file: {file_path}")
+     with open(file_path, "r") as f:
+         data = json.load(f)
+     logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {file_path}")
+     return data
+
+ def read_metadata_json_s3(s3_uri: str, session) -> dict:
+     """
+     Read and return JSON data from an S3 file.
+
+     Args:
+         s3_uri (str): Full S3 URI to the .json file.
+         session (boto3.Session): Boto3 session.
+
+     Returns:
+         dict or list: Parsed JSON object from S3 file.
+     """
+     logger.info(f"Reading metadata json from S3 file: {s3_uri}")
+     s3 = session.client('s3')
+     obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
+     data = json.loads(obj['Body'].read().decode('utf-8'))
+     logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {s3_uri}")
+     return data
+
+ def read_data_import_order_txt_s3(s3_uri: str, session) -> list:
+     """
+     Read a DataImportOrder.txt file from S3 and return node order as a list.
+
+     Args:
+         s3_uri (str): S3 URI to the DataImportOrder.txt file.
+         session (boto3.Session): Boto3 session.
+
+     Returns:
+         list: Node names (order as listed in file).
+
+     Raises:
+         ValueError: If the provided S3 URI does not point to DataImportOrder.txt.
+     """
+     filename = s3_uri.split("/")[-1]
+     if 'DataImportOrder.txt' not in filename:
+         logger.error(f"File {filename} is not a DataImportOrder.txt file")
+         raise ValueError(f"File {filename} is not a DataImportOrder.txt file")
+     logger.info(f"Reading DataImportOrder.txt from S3 file: {s3_uri}")
+     s3 = session.client('s3')
+     obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
+     content = obj['Body'].read().decode('utf-8')
+     import_order = [line.rstrip() for line in content.splitlines() if line.strip()]
+     logger.debug(f"Read import order from S3 file: {import_order}")
+     return import_order
+
+ def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
+     """
+     Read DataImportOrder.txt from local file, optionally excluding some nodes.
+
+     Args:
+         file_path (str): Path to DataImportOrder.txt.
+         exclude_nodes (list): Node names to exclude from result.
+
+     Returns:
+         list: Node names, excludes specified nodes, keeps listed order.
+
+     Raises:
+         FileNotFoundError: If the file is not found.
+     """
+     try:
+         logger.info(f"Reading DataImportOrder.txt from local file: {file_path}")
+         with open(file_path, "r") as f:
+             import_order = [line.rstrip() for line in f if line.strip()]
+         logger.debug(f"Raw import order from file: {import_order}")
+         if exclude_nodes is not None:
+             import_order = [node for node in import_order if node not in exclude_nodes]
+             logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
+         logger.debug(f"Final import order from {file_path}: {import_order}")
+         return import_order
+     except FileNotFoundError:
+         logger.error(f"Error: DataImportOrder.txt not found in {file_path}")
+         return []
+
+ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
+     """
+     Split a list of JSON-serializable objects into size-limited chunks.
+
+     Each chunk/list, when JSON-serialized, will not exceed max_size_kb kilobytes.
+
+     Args:
+         json_list (list): List of JSON serializable objects.
+         max_size_kb (int, optional): Max chunk size in KB. Default: 50.
+         print_results (bool, optional): If True, info log the size/count per chunk. Default: False.
+
+     Returns:
+         list: List of lists. Each sublist size (JSON-serialized) <= max_size_kb.
+     """
+     logger.info(f"Splitting JSON objects into max {max_size_kb} KB chunks. Total items: {len(json_list)}")
+     def get_size_in_kb(obj):
+         """
+         Get the size in kilobytes of the JSON-serialized object.
+
+         Args:
+             obj: JSON-serializable object.
+
+         Returns:
+             float: Size of the object in kilobytes.
+         """
+         import sys
+         size_kb = sys.getsizeof(json.dumps(obj)) / 1024
+         logger.debug(f"Calculated size: {size_kb:.2f} KB")
+         return size_kb
+
+     def split_list(json_list):
+         """
+         Recursively split the list so each chunk fits within max_size_kb.
+
+         Args:
+             json_list (list): List to split.
+
+         Returns:
+             list: List of sublists.
+         """
+         if get_size_in_kb(json_list) <= max_size_kb:
+             logger.debug(f"Split length {len(json_list)} is within max size {max_size_kb} KB.")
+             return [json_list]
+         mid = len(json_list) // 2
+         left_list = json_list[:mid]
+         right_list = json_list[mid:]
+         logger.debug(f"Splitting list at index {mid}: left {len(left_list)}, right {len(right_list)}")
+         return split_list(left_list) + split_list(right_list)
+
+     split_lists = split_list(json_list)
+     if print_results:
+         for i, lst in enumerate(split_lists):
+             logger.info(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
+     logger.debug(f"Total splits: {len(split_lists)}")
+     return split_lists
+
+ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) -> dict:
+     """
+     Retrieve a Gen3 API key stored as a secret in AWS Secrets Manager and parse it as a dict.
+
+     Args:
+         secret_name (str): Name of the AWS secret.
+         region_name (str): AWS region where the secret is located.
+         session (boto3.Session): Boto3 session.
+
+     Returns:
+         dict: Parsed Gen3 API key.
+
+     Raises:
+         Exception: On failure to retrieve or parse the secret.
+     """
+     logger.info(f"Retrieving Gen3 API key from AWS Secrets Manager: secret_name={secret_name}, region={region_name}")
+     client = session.client(service_name='secretsmanager', region_name=region_name)
+     try:
+         get_secret_value_response = client.get_secret_value(
+             SecretId=secret_name
+         )
+     except Exception as e:
+         logger.error(f"Error getting secret value from AWS Secrets Manager: {e}")
+         raise e
+
+     secret = get_secret_value_response['SecretString']
+
+     try:
+         secret = json.loads(secret)
+         api_key = secret
+         logger.debug(f"Retrieved Gen3 API key from secret {secret_name}")
+         return api_key
+     except Exception as e:
+         logger.error(f"Error parsing Gen3 API key from AWS Secrets Manager: {e}")
+         raise e
+
+ def create_gen3_submission_class(api_key: dict, api_endpoint: str):
+     """
+     Create and authenticate a Gen3Submission client using a temporary file for API key.
+
+     Args:
+         api_key (dict): The Gen3 API key as Python dict.
+         api_endpoint (str): Gen3 endpoint (hostname/base API URL).
+
+     Returns:
+         Gen3Submission: An authenticated Gen3Submission object.
+
+     Notes:
+         The temporary file storing the API key is deleted after use.
+     """
+     import tempfile
+
+     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
+     tmp_api_key_path = None
+     submit = None
+
+     try:
+         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".json", dir="/tmp") as tmp_file:
+             json.dump(api_key, tmp_file)
+             tmp_api_key_path = tmp_file.name
+         auth = Gen3Auth(refresh_file=tmp_api_key_path)
+         submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
+         return submit
+     finally:
+         if tmp_api_key_path and os.path.exists(tmp_api_key_path):
+             try:
+                 os.remove(tmp_api_key_path)
+                 logger.debug(f"Temporary API key file {tmp_api_key_path} deleted.")
+             except Exception as e:
+                 logger.warning(f"Failed to delete temporary API key file {tmp_api_key_path}: {e}")
+
+ def write_submission_results(results, output_path, mode='w'):
+     with open(output_path, mode) as f:
+         json.dump(results, f, indent=4)
+
+ def submit_metadata(
+     file_list: list,
+     api_key: str,
+     api_endpoint: str,
+     project_id: str,
+     data_import_order_path: str,
+     boto3_session,
+     max_size_kb: int = 50,
+     exclude_nodes: list = None,
+     max_retries: int = 5,
+ ):
+     """
+     Submit a set of metadata JSON files to a Gen3 data commons endpoint, in order.
+
+     Args:
+         file_list (list): List of paths (local or S3 URIs) to metadata .json files, one per node type.
+         api_key (str): Gen3 API key (parsed dict or JSON string).
+         api_endpoint (str): Gen3 data commons endpoint URL.
+         project_id (str): Gen3 project ID to submit data to.
+         data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying submission order.
+         boto3_session (boto3.Session): Existing AWS/boto3 session for S3 & secret usage.
+         max_size_kb (int, optional): Maximum size per submission chunk, in KB. Default: 50.
+         exclude_nodes (list, optional): List of node names to skip (default: ["project", "program", "acknowledgement", "publication"]).
+         max_retries (int, optional): Maximum number of retry attempts per node chunk. Default: 5.
+
+     Returns:
+         None
+
+     Raises:
+         Exception: On critical submission failure for any chunk.
+
+     Notes:
+         Each file is split into size-friendly chunks before submit. Local and S3 files are supported.
+     """
+
+     timestamp = datetime.now().strftime("%Y%d%m-%H%M%S")
+     log_dir = f"submission_logs/{timestamp}"
+     os.makedirs(log_dir, exist_ok=True)
+
+     if exclude_nodes is None:
+         exclude_nodes = ["project", "program", "acknowledgement", "publication"]
+
+     logger.info("Starting metadata submission process.")
+     logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
+     submit = create_gen3_submission_class(api_key, api_endpoint)
+
+     if is_s3_uri(data_import_order_path):
+         logger.info(f"Reading import order from S3: {data_import_order_path}")
+         import_order = read_data_import_order_txt_s3(data_import_order_path, boto3_session)
+         logger.debug(f"Import order from S3: {import_order}")
+     else:
+         logger.info(f"Reading import order from file: {data_import_order_path}")
+         import_order = read_data_import_order_txt(data_import_order_path, exclude_nodes)
+         logger.debug(f"Import order from file: {import_order}")
+
+     # Map node name to file for fast access and avoid repeatedly scanning file_list
+     file_map = {get_node_from_file_path(file): file for file in file_list}
+
+     for node in import_order:
+         if node in exclude_nodes:
+             logger.info(f"Skipping node '{node}' (in exclude list).")
+             continue
+         file = file_map.get(node)
+         if not file:
+             logger.info(f"Skipping node '{node}' (not present in file list).")
+             continue
+
+         logger.info(f"Processing file '{file}' for node '{node}'.")
+
+         try:
+             if is_s3_uri(file):
+                 logger.info(f"Reading JSON data for node '{node}' from S3 file: {file}")
+                 json_data = read_metadata_json_s3(file, boto3_session)
+             else:
+                 logger.info(f"Reading JSON data for node '{node}' from local file: {file}")
+                 json_data = read_metadata_json(file)
+         except Exception as e:
+             logger.error(f"Error reading JSON for node '{node}' from {file}: {e}")
+             continue
+
+         if not json_data:
+             logger.info(f"Skipping node '{node}' due to errors in reading JSON.")
+             continue
+
+         split_json_list = split_json_objects(json_data, max_size_kb=max_size_kb)
+         n_json_data = len(split_json_list)
+         logger.info(
+             f"--- Starting submission process for node '{node}' ({n_json_data} chunks) ---"
+         )
+
+         for index, jsn in enumerate(split_json_list):
+             progress_str = f"{index + 1}/{n_json_data}"
+
+             for attempt in range(max_retries + 1):
+                 try:
+                     log_msg = (
+                         f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
+                         f"Split: {progress_str:<5}"
+                         if attempt == 0 else
+                         f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
+                         f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
+                     )
+                     logger.info(log_msg) if attempt == 0 else logger.warning(log_msg)
+
+                     res = submit.submit_record("program1", project_id, jsn)
+
+                     # writing submission results as log json
+                     log_filename = os.path.join(
+                         log_dir, f"{project_id}_{node}_split{index + 1}_of_{n_json_data}.json"
+                     )
+                     abs_log_filename = os.path.abspath(log_filename)
+                     with open(abs_log_filename, "a") as f:
+                         json.dump(res, f)
+                         f.write("\n")
+                     logger.info(
+                         f"Wrote submission response to log file: {abs_log_filename}"
+                     )
+
+                     logger.info(
+                         f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
+                         f"Node: {node:<12} | Split: {progress_str:<5}"
+                     )
+                     break # Successful, move to next chunk
+
+                 except Exception as e:
+                     logger.error(
+                         f"Error submitting chunk {progress_str} for node '{node}': {e}"
+                     )
+                     if attempt < max_retries:
+                         import time
+                         time.sleep(0.2)
+                     else:
+                         logger.critical(
+                             f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
+                             f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
+                         )
+                         raise
+
+         logger.info(f"Finished submitting node '{node}'.")
+
+     logger.info("--- Submission process complete ---")
@@ -1,184 +0,0 @@
- import os
- import sys
- from gen3.auth import Gen3Auth
- from gen3.index import Gen3Index
- from gen3.submission import Gen3Submission
- import json
- from datetime import datetime
- import uuid
- import shutil
-
-
- def submit_metadata(base_dir: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
-                     dry_run: bool = False, max_submission_size_kb: int = 400, retries = 5, disable_input: bool = False,
-                     specific_node: str = None, ab_path: bool = False, import_order_file: str = None):
-     """
-     Submits metadata json files to the gen3 api endpoint. Submission depends on a DataImportOrder.txt file, which defines the order of the nodes to be imported.
-
-     Args:
-         base_dir (str): The path to the folder containing the metadata .json files. Should not contain project_id folder
-         project_id (str): The ID of the project.
-         api_endpoint (str): Gen3 API endpoint.
-         credentials (str): The path to the file containing the API credentials.
-         exclude_nodes (list): A list of node names to exclude from the import. Default is ["project", "program", "acknowledgement", "publication"].
-         dry_run (bool): If True, perform a dry run without actual submission. Default is False.
-         max_submission_size_kb (int): The maximum size of each submission in kilobytes. Default is 400 KB.
-         disable_input (bool): If True, disable user input confirmation. Default is False.
-         specific_node (str): If not None, only submit the specified node.
-         ab_path (bool): If True, use the absolute path to the base_dir.
-         import_order_file (str): The absolute path to the import order file, if not defined the program will look for os.path.join(folder_path, project_name, "DataImportOrder.txt")
-
-     Returns:
-         None
-     """
-
-     def get_import_order(project_name, folder_path):
-         path = import_order_file or os.path.join(folder_path, project_name, "DataImportOrder.txt")
-         try:
-             with open(path, "r") as f:
-                 import_order = [line.rstrip() for line in f]
-             import_order = [node for node in import_order if node not in exclude_nodes]
-             return import_order
-         except FileNotFoundError:
-             print(f"Error: DataImportOrder.txt not found in {path}")
-             return []
-
-     def read_json(json_fn, ab_path: bool = False):
-         try:
-             if ab_path:
-                 json_path = os.path.join(base_dir, json_fn)
-             else:
-                 json_path = os.path.join(base_dir, project_id, json_fn)
-             with open(json_path, 'r') as f:
-                 schema = json.load(f)
-             print(f'{json_path} successfully loaded')
-             return schema
-         except FileNotFoundError:
-             print(f"Error: JSON file {json_path} not found.")
-             return None
-         except json.JSONDecodeError:
-             print(f"Error: JSON file {json_path} is not valid.")
-             return None
-
-     if specific_node is None:
-         ordered_import_nodes = get_import_order(project_id, base_dir)
-         final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
-
-     # creating auth and submission objects
-     auth = Gen3Auth(refresh_file=credentials)
-     sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-
-
-     if not dry_run and not disable_input:
-         confirm = input("Do you want to submit the metadata? (yes/no): ").strip().lower()
-         if confirm != 'yes':
-             print("Submission cancelled by user.")
-             return
-
-
-     def split_json_objects(json_list, max_size_kb=max_submission_size_kb, print_results=False):
-         def get_size_in_kb(obj):
-             return sys.getsizeof(json.dumps(obj)) / 1024
-
-         def split_list(json_list):
-             if get_size_in_kb(json_list) <= max_size_kb:
-                 return [json_list]
-
-             mid = len(json_list) // 2
-             left_list = json_list[:mid]
-             right_list = json_list[mid:]
-
-             return split_list(left_list) + split_list(right_list)
-
-         split_lists = split_list(json_list)
-
-         if print_results:
-             for i, lst in enumerate(split_lists):
-                 print(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
-
-         return split_lists
-
-
-     def process_node(node, sub, project_id, dry_run, max_retries=3):
-         if dry_run:
-             print(f"DRY RUN\t| {project_id}\t| {node} would be submitted")
-             return
-
-         print(f"\n\nIMPORTING\t| {project_id}\t| {node}")
-         json_data = read_json(f"{node}.json", ab_path=ab_path)
-
-         if json_data is None:
-             print(f"SKIPPING\t| {project_id}\t| {node} due to errors in reading JSON")
-             return
-
-         json_split = split_json_objects(json_data, max_size_kb=max_submission_size_kb, print_results=True)
-         n_json_data = len(json_split)
-
-         for index, jsn in enumerate(json_split):
-             retries = 0
-             while retries < max_retries:
-                 try:
-                     print(f"SUBMITTING\t| {project_id}\t| {node}\t| {index + 1}/{n_json_data} data splits")
-                     sub.submit_record("program1", project_id, jsn)
-                     print(f"SUCCESS\t| Imported: {project_id}\t| {node}")
-                     break
-                 except Exception as e:
-                     retries += 1
-                     print(f"ERROR\t| {project_id}\t| {node}: {e} | Retry {retries}/{max_retries}")
-                     if retries == max_retries:
-                         print(f"FAILED\t| {project_id}\t| {node} after {max_retries} retries")
-
-
-     if specific_node:
-         process_node(specific_node, sub, project_id, dry_run, retries)
-         return print(f"Done. {project_id} | {specific_node} metadata submitted")
-
-     for node in final_ordered_import_nodes:
-         process_node(node, sub, project_id, dry_run, retries)
-
-
- def delete_metadata(import_order_file: str, project_id: str, api_endpoint: str, credentials: str, exclude_nodes: list = ["project", "program", "acknowledgement", "publication"], prompt_for_confirmation: bool = True):
-     """
-     Deletes metadata json files from the gen3 api endpoint. Deletion depends on a DataImportOrder.txt file, which defines the order of the nodes to be deleted.
-
-     Args:
-         import_order_file (str): The path to the import order file
-         project_id (str): The ID of the project.
-         api_endpoint (str): Gen3 API endpoint.
-         credentials (str): The path to the file containing the API credentials.
-         exclude_nodes (list): A list of node names to exclude from the deletion. Default is ["project", "program", "acknowledgement", "publication"].
-
-     Returns:
-         None
-     """
-
-     def get_import_order(import_order_file):
-         try:
-             with open(import_order_file, "r") as f:
-                 import_order = [line.rstrip() for line in f]
-             import_order = [node for node in import_order if node not in exclude_nodes]
-             return import_order
-         except FileNotFoundError:
-             print(f"Error: DataImportOrder.txt not found in {import_order_file}")
-             return []
-
-     ordered_import_nodes = get_import_order(import_order_file)
-     auth = Gen3Auth(refresh_file=credentials)
-     sub = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
-
-     final_ordered_import_nodes = [node for node in ordered_import_nodes if node not in exclude_nodes]
-     final_ordered_import_nodes.reverse() # Reverse the order for deletion
-
-     if prompt_for_confirmation:
-         confirm = input("Do you want to delete the metadata? (yes/no): ").strip().lower()
-         if confirm != 'yes':
-             print("Deletion cancelled by user.")
-             return
-
-     for node in final_ordered_import_nodes:
-         print(f"\n\n=== Deleting: {project_id} | {node} ===")
-         try:
-             sub.delete_nodes("program1", project_id, [node])
-             print(f"=== Successfully Deleted: {node} ===")
-         except Exception as e:
-             print(f"=== Error deleting {node}: {e} ===")