acdc_aws_etl_pipeline 0.6.9__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- acdc_aws_etl_pipeline/upload/metadata_submitter.py +626 -349
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/METADATA +2 -1
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/RECORD +4 -4
- {acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/WHEEL +0 -0
--- a/acdc_aws_etl_pipeline/upload/metadata_submitter.py
+++ b/acdc_aws_etl_pipeline/upload/metadata_submitter.py
@@ -1,34 +1,41 @@
 import os
-
-
-
+import sys
+import time
 import json
 import boto3
+from botocore.exceptions import BotoCoreError, ClientError
 from gen3.auth import Gen3Auth
-from gen3.index import Gen3Index
 from gen3.submission import Gen3Submission
 import logging
 from datetime import datetime
 import jwt
-
+import requests
+from typing import Any, Dict, List, Optional
 import re
 import pandas as pd
 import uuid
-from acdc_aws_etl_pipeline.validate.validate import
+from acdc_aws_etl_pipeline.validate.validate import (
+    write_parquet_to_db,
+)
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+# redefine to use local cache in /tmp
+os.environ['XDG_CACHE_HOME'] = '/tmp/.cache'
 
 logger = logging.getLogger(__name__)
 
-def create_boto3_session(aws_profile: str = None):
+def create_boto3_session(aws_profile: Optional[str] = None):
     """
     Create and return a boto3 Session object using an optional AWS profile.
 
     Args:
-        aws_profile (str, optional): The AWS CLI named profile to use
+        aws_profile (str, optional): The AWS CLI named profile to use.
+            If None, uses default credentials.
 
     Returns:
         boto3.Session: The created session instance.
     """
-    logger.debug(
+    logger.debug("Creating boto3 session with aws_profile=%s", aws_profile)
     return boto3.Session(profile_name=aws_profile) if aws_profile else boto3.Session()
 
 def is_s3_uri(s3_uri: str) -> bool:
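Note: the module now pins `XDG_CACHE_HOME` to `/tmp/.cache` at import time, before any library consults the cache location; a plausible motivation (an assumption, not stated in the diff) is running in an environment such as AWS Lambda where only `/tmp` is writable. A minimal sketch of the same pattern, with the override applied before the imports that may use it:

    import os

    # Redirect the XDG cache to a writable path (assumption: read-only
    # filesystem apart from /tmp, as on AWS Lambda).
    os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

    import requests  # imported after the override so any cache lookups honour it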
@@ -41,7 +48,7 @@ def is_s3_uri(s3_uri: str) -> bool:
     Returns:
         bool: True if the string starts with 's3://', False otherwise.
     """
-    logger.debug(
+    logger.debug("Checking if %s is an S3 URI.", s3_uri)
     return s3_uri.startswith("s3://")
 
 def get_filename(file_path: str) -> str:
@@ -55,7 +62,11 @@ def get_filename(file_path: str) -> str:
         str: The filename (with extension).
     """
     filename = file_path.split("/")[-1]
-    logger.debug(
+    logger.debug(
+        "Extracted filename '%s' from file_path '%s'.",
+        filename,
+        file_path,
+    )
     return filename
 
 def get_node_from_file_path(file_path: str) -> str:
@@ -70,7 +81,7 @@ def get_node_from_file_path(file_path: str) -> str:
     """
     filename = get_filename(file_path)
     node = filename.split(".")[0]
-    logger.debug(
+    logger.debug("Extracted node '%s' from filename '%s'.", node, filename)
     return node
 
 def list_metadata_jsons(metadata_dir: str) -> list:
@@ -87,11 +98,18 @@ def list_metadata_jsons(metadata_dir: str) -> list:
         Exception: If there is an error reading the directory.
     """
     try:
-        logger.info(
+        logger.info(
+            "Listing .json files in metadata directory: %s",
+            metadata_dir,
+        )
         files = os.listdir(metadata_dir)
-        return [
-
-
+        return [
+            os.path.abspath(os.path.join(metadata_dir, file_name))
+            for file_name in files
+            if file_name.endswith(".json")
+        ]
+    except OSError as e:
+        logger.error("Error listing metadata JSONs in %s: %s", metadata_dir, e)
         raise
 
 def find_data_import_order_file(metadata_dir: str) -> str:
@@ -108,16 +126,22 @@ def find_data_import_order_file(metadata_dir: str) -> str:
         FileNotFoundError: If no such file is found.
     """
     try:
-        logger.info(
+        logger.info("Searching for DataImportOrder.txt in %s", metadata_dir)
         files = [os.path.join(metadata_dir, f) for f in os.listdir(metadata_dir)]
        order_files = [f for f in files if "DataImportOrder.txt" in f]
         if not order_files:
             logger.error("No DataImportOrder.txt file found in the given directory.")
-            raise FileNotFoundError(
-
+            raise FileNotFoundError(
+                "No DataImportOrder.txt file found in the given directory."
+            )
+        logger.debug("Found DataImportOrder.txt file: %s", order_files[0])
         return order_files[0]
-    except
-        logger.error(
+    except OSError as e:
+        logger.error(
+            "Error finding DataImportOrder.txt in %s: %s",
+            metadata_dir,
+            e,
+        )
         raise
 
 def list_metadata_jsons_s3(s3_uri: str, session) -> list:
@@ -125,13 +149,14 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
     List all .json files in an S3 "directory" (prefix).
 
     Args:
-        s3_uri (str): S3 URI to the metadata directory
+        s3_uri (str): S3 URI to the metadata directory
+            (e.g. "s3://my-bucket/path/to/dir").
         session (boto3.Session): An active boto3 Session.
 
     Returns:
         list: List of S3 URIs for all .json files found under the prefix.
     """
-    logger.info(
+    logger.info("Listing .json files in S3 metadata directory: %s", s3_uri)
     s3 = session.client('s3')
     bucket = s3_uri.split("/")[2]
     prefix = "/".join(s3_uri.split("/")[3:])
@@ -144,7 +169,7 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
         for obj in objects.get('Contents', [])
         if obj['Key'].endswith(".json")
     ]
-    logger.debug(
+    logger.debug("Found %s .json files in S3 at %s", len(result), s3_uri)
     return result
 
 def find_data_import_order_file_s3(s3_uri: str, session) -> str:
@@ -161,16 +186,29 @@ def find_data_import_order_file_s3(s3_uri: str, session) -> str:
     Raises:
         FileNotFoundError: If the file does not exist in the specified prefix.
     """
-    logger.info(
+    logger.info(
+        "Searching for DataImportOrder.txt in S3 metadata directory: %s",
+        s3_uri,
+    )
     s3 = session.client('s3')
     bucket = s3_uri.split("/")[2]
     prefix = "/".join(s3_uri.split("/")[3:])
     objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
-    order_files = [
+    order_files = [
+        obj['Key']
+        for obj in objects.get('Contents', [])
+        if obj['Key'].endswith("DataImportOrder.txt")
+    ]
     if not order_files:
         logger.error("No DataImportOrder.txt file found in the given S3 directory.")
-        raise FileNotFoundError(
-
+        raise FileNotFoundError(
+            "No DataImportOrder.txt file found in the given directory."
+        )
+    logger.debug(
+        "Found DataImportOrder.txt file in S3: s3://%s/%s",
+        bucket,
+        order_files[0],
+    )
     return f"s3://{bucket}/{order_files[0]}"
 
 def read_metadata_json(file_path: str) -> dict:
@@ -183,10 +221,14 @@ def read_metadata_json(file_path: str) -> dict:
     Returns:
         dict or list: Parsed contents of the JSON file.
     """
-    logger.info(
-    with open(file_path, "r") as f:
+    logger.info("Reading metadata json from local file: %s", file_path)
+    with open(file_path, "r", encoding="utf-8") as f:
         data = json.load(f)
-    logger.debug(
+    logger.debug(
+        "Read %s objects from %s",
+        len(data) if isinstance(data, list) else 'object',
+        file_path,
+    )
     return data
 
 def read_metadata_json_s3(s3_uri: str, session) -> dict:
@@ -200,11 +242,18 @@ def read_metadata_json_s3(s3_uri: str, session) -> dict:
     Returns:
         dict or list: Parsed JSON object from S3 file.
     """
-    logger.info(
+    logger.info("Reading metadata json from S3 file: %s", s3_uri)
     s3 = session.client('s3')
-    obj = s3.get_object(
+    obj = s3.get_object(
+        Bucket=s3_uri.split("/")[2],
+        Key="/".join(s3_uri.split("/")[3:]),
+    )
     data = json.loads(obj['Body'].read().decode('utf-8'))
-    logger.debug(
+    logger.debug(
+        "Read %s objects from %s",
+        len(data) if isinstance(data, list) else 'object',
+        s3_uri,
+    )
     return data
 
 def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = None) -> list:
@@ -224,20 +273,41 @@ def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = No
     """
     filename = s3_uri.split("/")[-1]
     if 'DataImportOrder.txt' not in filename:
-        logger.error(
-        raise ValueError(
-
+        logger.error("File %s is not a DataImportOrder.txt file", filename)
+        raise ValueError(
+            f"File {filename} is not a DataImportOrder.txt file"
+        )
+    logger.info(
+        "Reading DataImportOrder.txt from S3 file: %s",
+        s3_uri,
+    )
     s3 = session.client('s3')
-    obj = s3.get_object(
+    obj = s3.get_object(
+        Bucket=s3_uri.split("/")[2],
+        Key="/".join(s3_uri.split("/")[3:]),
+    )
     content = obj['Body'].read().decode('utf-8')
-    import_order = [
-
+    import_order = [
+        line.rstrip()
+        for line in content.splitlines()
+        if line.strip()
+    ]
+    logger.debug("Raw import order from S3 file: %s", import_order)
     if exclude_nodes is not None:
         import_order = [node for node in import_order if node not in exclude_nodes]
-        logger.debug(
-
+        logger.debug(
+            "Import order after excluding nodes %s: %s",
+            exclude_nodes,
+            import_order,
+        )
+    logger.debug(
+        "Final import order from S3 file %s: %s",
+        s3_uri,
+        import_order,
+    )
     return import_order
 
+
 def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
     """
     Read DataImportOrder.txt from local file, optionally excluding some nodes.
@@ -253,17 +323,26 @@ def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
         FileNotFoundError: If the file is not found.
     """
     try:
-        logger.info(
-
+        logger.info(
+            "Reading DataImportOrder.txt from local file: %s",
+            file_path,
+        )
+        with open(file_path, "r", encoding="utf-8") as f:
             import_order = [line.rstrip() for line in f if line.strip()]
-        logger.debug(
+        logger.debug("Raw import order from file: %s", import_order)
         if exclude_nodes is not None:
-            import_order = [
-
-
+            import_order = [
+                node for node in import_order if node not in exclude_nodes
+            ]
+            logger.debug(
+                "Import order after excluding nodes %s: %s",
+                exclude_nodes,
+                import_order,
+            )
+        logger.debug("Final import order from %s: %s", file_path, import_order)
         return import_order
     except FileNotFoundError:
-        logger.error(
+        logger.error("Error: DataImportOrder.txt not found in %s", file_path)
         return []
 
 def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
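For orientation, a usage sketch of the import-order readers (the file path and contents below are illustrative, not taken from the package): they return node names in file order, with blank lines dropped and any excluded nodes filtered out.

    # DataImportOrder.txt (illustrative contents):
    #   program
    #   project
    #   subject
    #   sample
    order = read_data_import_order_txt(
        "metadata/DataImportOrder.txt",        # hypothetical local path
        exclude_nodes=["program", "project"],
    )
    # order == ["subject", "sample"]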
@@ -280,7 +359,12 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
     Returns:
         list: List of lists. Each sublist size (JSON-serialized) <= max_size_kb.
     """
-    logger.info(
+    logger.info(
+        "Splitting JSON objects into max %s KB chunks. Total items: %s",
+        max_size_kb,
+        len(json_list),
+    )
+
     def get_size_in_kb(obj):
         """
         Get the size in kilobytes of the JSON-serialized object.
@@ -291,12 +375,11 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
         Returns:
             float: Size of the object in kilobytes.
         """
-        import sys
         size_kb = sys.getsizeof(json.dumps(obj)) / 1024
-        logger.debug(
+        logger.debug("Calculated size: %.2f KB", size_kb)
         return size_kb
 
-    def split_list(
+    def split_list(items):
         """
         Recursively split the list so each chunk fits within max_size_kb.
 
@@ -306,20 +389,34 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
         Returns:
             list: List of sublists.
         """
-        if get_size_in_kb(
-            logger.debug(
-
-
-
-
-
+        if get_size_in_kb(items) <= max_size_kb:
+            logger.debug(
+                "Split length %s is within max size %s KB.",
+                len(items),
+                max_size_kb,
+            )
+            return [items]
+        mid = len(items) // 2
+        left_list = items[:mid]
+        right_list = items[mid:]
+        logger.debug(
+            "Splitting list at index %s: left %s, right %s",
+            mid,
+            len(left_list),
+            len(right_list),
+        )
         return split_list(left_list) + split_list(right_list)
 
     split_lists = split_list(json_list)
     if print_results:
         for i, lst in enumerate(split_lists):
-            logger.info(
-
+            logger.info(
+                "List %s size: %.2f KB, contains %s objects",
+                i + 1,
+                get_size_in_kb(lst),
+                len(lst),
+            )
+    logger.debug("Total splits: %s", len(split_lists))
     return split_lists
 
 def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) -> dict:
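As a behavioural sketch (the record contents are illustrative), the splitter halves the input list recursively until every sublist's JSON serialisation fits under max_size_kb, so each returned chunk can be submitted in a single request:

    records = [{"type": "subject", "submitter_id": f"subject_{i}"} for i in range(5000)]
    chunks = split_json_objects(records, max_size_kb=50)
    # Every chunk serialises to at most ~50 KB, measured the same way the
    # function does it, via sys.getsizeof(json.dumps(...)) / 1024.
    assert all(sys.getsizeof(json.dumps(c)) / 1024 <= 50 for c in chunks)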
@@ -337,29 +434,40 @@ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) ->
     Raises:
         Exception: On failure to retrieve or parse the secret.
     """
-    logger.info(
-
+    logger.info(
+        "Retrieving Gen3 API key from AWS Secrets Manager: "
+        "secret_name=%s, region=%s",
+        secret_name,
+        region_name,
+    )
+    client = session.client(
+        service_name='secretsmanager',
+        region_name=region_name,
+    )
     try:
         get_secret_value_response = client.get_secret_value(
-            SecretId=secret_name
+            SecretId=secret_name,
         )
-    except
-        logger.error(
-        raise
+    except (BotoCoreError, ClientError) as e:
+        logger.error("Error getting secret value from AWS Secrets Manager: %s", e)
+        raise
 
     secret = get_secret_value_response['SecretString']
 
     try:
         secret = json.loads(secret)
         api_key = secret
-        logger.debug(
+        logger.debug("Retrieved Gen3 API key from secret %s", secret_name)
         return api_key
-    except
-        logger.error(
-        raise
+    except (json.JSONDecodeError, TypeError) as e:
+        logger.error("Error parsing Gen3 API key from AWS Secrets Manager: %s", e)
+        raise
 
 
-def infer_api_endpoint_from_jwt(
+def infer_api_endpoint_from_jwt(
+    jwt_token: str,
+    api_version: str = 'v0',
+) -> str:
     """
     Extracts the URL from a JSON Web Token (JWT) credential.
 
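The reworked secret retrieval narrows the exception handling to botocore and JSON-decoding errors. For reference, a minimal standalone sketch of the same Secrets Manager pattern (the secret name and region are illustrative):

    import json
    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    client = boto3.Session().client("secretsmanager", region_name="ap-southeast-2")
    try:
        raw = client.get_secret_value(SecretId="gen3/api-key")["SecretString"]  # hypothetical secret name
        api_key = json.loads(raw)  # expected to hold the Gen3 credential, including an "api_key" JWT
    except (BotoCoreError, ClientError, json.JSONDecodeError):
        raise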
@@ -370,11 +478,14 @@ def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
         str: The extracted URL.
     """
     logger.info("Decoding JWT to extract API URL.")
-    url = jwt.decode(
+    url = jwt.decode(
+        jwt_token,
+        options={"verify_signature": False},
+    ).get('iss', '')
     if '/user' in url:
         url = url.split('/user')[0]
     url = f"{url}/api/{api_version}"
-    logger.info(
+    logger.info("Extracted API URL from JWT: %s", url)
     return url
 
 
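The endpoint inference reads the token's iss claim with PyJWT's unverified decode and appends the API prefix; a standalone sketch of that derivation (the token value is a placeholder, so the snippet only illustrates the shape of the logic):

    import jwt

    token = "<gen3 api_key JWT>"  # placeholder; decode requires a real encoded token
    claims = jwt.decode(token, options={"verify_signature": False})
    issuer = claims.get("iss", "")                      # e.g. "https://commons.example.org/user"
    base = issuer.split("/user")[0] if "/user" in issuer else issuer
    endpoint = f"{base}/api/v0"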
@@ -393,190 +504,16 @@ def create_gen3_submission_class(api_key: dict):
     jwt_token = api_key['api_key']
     logger.info("Inferring API endpoint from JWT token.")
     api_endpoint = infer_api_endpoint_from_jwt(jwt_token)
-    logger.debug(
-    logger.info(
+    logger.debug("Inferred API endpoint: %s", api_endpoint)
+    logger.info(
+        "Creating Gen3Submission class for endpoint: %s",
+        api_endpoint,
+    )
     auth = Gen3Auth(refresh_token=api_key)
     submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
     return submit
 
 
-def submit_data_chunks(
-    split_json_list: list,
-    node: str,
-    gen3_submitter,
-    project_id: str,
-    max_retries: int,
-    file_path: str,
-    program_id: str = "program1"
-) -> List[Dict]:
-    """
-    Submit each chunk of data (in split_json_list) for a given node to Gen3, using retry logic and logging on failures.
-
-    Args:
-        split_json_list (list): List of JSON-serializable chunked data to submit.
-        node (str): Name of the data node being submitted.
-        gen3_submitter: A Gen3Submission instance for making submissions.
-        project_id (str): The project identifier within Gen3.
-        max_retries (int): Maximum number of retry attempts per chunk on failure.
-        file_path (str): Path of the file that was submitted. Used only for data capture.
-        program_id (str, optional): The Gen3 program id (default: "program1").
-
-    Returns:
-        List[Dict]: List of response dictionaries for each submitted chunk.
-
-    Raises:
-        Exception: If submission fails after all retry attempts for any chunk.
-    """
-
-    n_json_data = len(split_json_list)
-    response_results = []
-
-    for index, jsn in enumerate(split_json_list):
-        progress_str = f"{index + 1}/{n_json_data}"
-
-        submission_success = False
-        last_exception = None
-
-        attempt = 0
-        while attempt <= max_retries:
-            try:
-                if attempt == 0:
-                    log_msg = (
-                        f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5}"
-                    )
-                    logger.info(log_msg)
-                else:
-                    log_msg = (
-                        f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
-                        f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
-                    )
-                    logger.warning(log_msg)
-
-                res = gen3_submitter.submit_record(program_id, project_id, jsn)
-                res.update({"file_path": file_path})
-                response_results.append(res)
-                submission_success = True
-                logger.info(
-                    f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
-                    f"Node: {node:<12} | Split: {progress_str:<5}"
-                )
-                break  # Success
-
-            except Exception as e:
-                last_exception = e
-                logger.error(
-                    f"Error submitting chunk {progress_str} for node '{node}': {e}"
-                )
-
-                if attempt < max_retries:
-                    import time
-                    time.sleep(0.2)
-                else:
-                    logger.critical(
-                        f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
-                        f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
-                    )
-            attempt += 1
-
-        if not submission_success:
-            # After retries, still failed
-            raise Exception(
-                f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
-                f"Last error: {last_exception}"
-            )
-
-    logger.info(f"Finished submitting node '{node}'.")
-    return response_results
-
-
-def flatten_submission_results(submission_results: List[Dict]) -> List[Dict]:
-    """
-    Flattens a list of Gen3 submission result dictionaries into a single list of entity dictionaries.
-
-    For each submission result, this function processes its entities (if any),
-    extracting the 'project_id' and 'submitter_id' from the 'unique_keys' field (if present)
-    into the top-level entity dictionary for easy access.
-
-    Any submission result that does not have a code of 200 or lacks entities is skipped, and a warning is logged.
-
-    Args:
-        submission_results (List[Dict]):
-            A list of Gen3 submission result dictionaries, each containing at least a "code" and "entities" entry.
-
-    Returns:
-        List[Dict]:
-            A flat list, where each element is an entity dictionary (with keys 'project_id' and 'submitter_id' added if available).
-    """
-    flat_list_dict = []
-    total = len(submission_results)
-    logger.info(f"Flattening {total} submission result(s)...")
-
-    for idx, obj in enumerate(submission_results, 1):
-        transaction_id = obj.get("transaction_id")
-        code = obj.get("code")
-        if code != 200:
-            logger.warning(f"Skipping submission result at index {idx-1} (code={code})")
-            continue
-
-        entities = obj.get("entities")
-
-        if entities is None:
-            logger.warning(f"No entities found in submission result at index {idx-1}")
-            continue
-
-        logger.info(f"Processing submission result {idx} of {total}, {len(entities)} entities")
-
-        for entity in entities:
-            unique_keys = entity.get("unique_keys", [{}])
-            if unique_keys and isinstance(unique_keys, list):
-                keys = unique_keys[0]
-                entity["project_id"] = keys.get("project_id")
-                entity["submitter_id"] = keys.get("submitter_id")
-            entity["transaction_id"] = transaction_id
-            entity["file_path"] = obj.get("file_path", '')
-            flat_list_dict.append(entity)
-
-    # renaming cols
-    for entity in flat_list_dict:
-        entity["gen3_guid"] = entity.pop("id", None)
-        entity["node"] = entity.pop("type", None)
-
-    logger.info(f"Finished flattening. Total entities: {len(flat_list_dict)}")
-    return flat_list_dict
-
-
-def find_version_from_path(path):
-    version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
-    found_versions = []
-
-    for segment in path.split('/'):
-        match = version_pattern.match(segment)
-        if match:
-            found_versions.append(match.group(1))
-
-    if not found_versions:
-        return None
-
-    if len(found_versions) > 1:
-        logger.warning("more than one match found in path for version string")
-
-    return found_versions[-1]
-
-
-def collect_versions_from_metadata_file_list(metadata_file_list):
-    versions = []
-    for file_path in metadata_file_list:
-        version = find_version_from_path(file_path)
-        if version:
-            versions.append(version)
-    versions = list(set(versions))
-    if len(versions) > 1:
-        logger.error(f"more than one version found in metadata file list: {metadata_file_list}")
-        raise
-    return versions[0]
-
-
 class MetadataSubmitter:
     def __init__(
         self,
@@ -584,49 +521,440 @@ class MetadataSubmitter:
         api_key: dict,
         project_id: str,
         data_import_order_path: str,
+        dataset_root: str,
+        database: str,
+        table: str,
         program_id: str = "program1",
         max_size_kb: int = 100,
-        exclude_nodes:
+        exclude_nodes: Optional[List[str]] = None,
         max_retries: int = 3,
-        aws_profile: str = None
+        aws_profile: str = None,
+        partition_cols: Optional[List[str]] = None,
+        upload_to_database: bool = True
     ):
         """
-        Initialises a MetadataSubmitter for submitting a set of metadata JSON
+        Initialises a MetadataSubmitter for submitting a set of metadata JSON
+        files to a Gen3 data commons endpoint, in order.
+
+        **Workflow Overview:**
+        1. **Node Traversal:** The submitter iterates through each node defined in the
+           `data_import_order` list.
+        2. **File Resolution:** For each node name, it locates the corresponding JSON file
+           (e.g., `node.json`) from the provided file list.
+        3. **Chunking:** The JSON file is read and split into manageable chunks based on size.
+        4. **Submission:** Each chunk is submitted to the Gen3 Sheepdog API via `gen3.submission`.
+        5. **Response Handling:** The API response, which includes the `submission_id` for
+           the records, is captured.
+        6. **Persistence:** The response data is flattened, converted into a DataFrame, and
+           written to Parquet files in S3. These records are also registered in a specific
+           upload table within the configured database for audit and tracking.
 
         Args:
-            metadata_file_list (list): List of local file paths or S3 URIs to
+            metadata_file_list (list): List of local file paths or S3 URIs to
+                metadata .json files, one per node type.
             api_key (dict): Gen3 API key as a parsed dictionary.
-            project_id (str): Gen3 project ID to submit data to.
-            data_import_order_path (str): Path or S3 URI to DataImportOrder.txt
+            project_id (str): Gen3 project ID to submit data to (e.g., "internal-project").
+            data_import_order_path (str): Path or S3 URI to DataImportOrder.txt
+                specifying node submission order.
+            dataset_root (str): S3 path where the parquet files will be stored.
+                Example: "s3://acdc-dataops-metadata/metadata_upload/"
+            database (str): Database name for storing the metadata upload.
+                Example: "acdc_dataops_metadata_db"
+            table (str): Table name for storing the metadata upload.
+                Example: "metadata_upload"
             program_id (str, optional): Gen3 program ID (default: "program1").
-            max_size_kb (int, optional): Maximum size per submission chunk,
-
-
-
+            max_size_kb (int, optional): Maximum size per submission chunk,
+                in KB (default: 100).
+            exclude_nodes (list, optional): List of node names to skip during
+                submission. Defaults to ["project", "program", "acknowledgement", "publication"].
+            max_retries (int, optional): Maximum number of retry attempts per
+                node chunk (default: 3).
+            aws_profile (str, optional): AWS CLI named profile to use for boto3
+                session (default: None).
+            partition_cols (list, optional): List of column names to partition the parquet table by.
+                Defaults to ["upload_datetime"].
+            upload_to_database (bool, optional): Whether to upload the metadata to a database.
+                Defaults to True. The database is defined by dataset_root, database, and table.
         """
         self.metadata_file_list = metadata_file_list
         self.api_key = api_key
         self.project_id = project_id
         self.data_import_order_path = data_import_order_path
+        self.dataset_root = dataset_root
+        self.database = database
+        self.table = table
         self.program_id = program_id
         self.max_size_kb = max_size_kb
-        self.exclude_nodes = exclude_nodes
+        self.exclude_nodes = exclude_nodes or [
+            "project",
+            "program",
+            "acknowledgement",
+            "publication",
+        ]
         self.max_retries = max_retries
         self.submission_results = []
         self.aws_profile = aws_profile
+        self.partition_cols = partition_cols or ["upload_datetime"]
+        self.upload_to_database = upload_to_database
         self.boto3_session = self._create_boto3_session()
         logger.info("MetadataSubmitter initialised.")
 
     def _create_gen3_submission_class(self):
+        """Helper to instantiate the Gen3Submission class using the provided API key."""
         return create_gen3_submission_class(self.api_key)
-
+
     def _create_boto3_session(self):
+        """Helper to create a boto3 session using the provided AWS profile."""
         return create_boto3_session(self.aws_profile)
 
-    def
+    def _flatten_submission_results(self, submission_results: List[Dict]) -> List[Dict]:
+        """
+        Flattens a list of Gen3 submission result dictionaries into a single
+        list of entity dictionaries.
+
+        For each submission result, this function processes its entities (if any),
+        extracting the 'project_id' and 'submitter_id' from the 'unique_keys'
+        field (if present) into the top-level entity dictionary for easy access.
+
+        Any submission result that does not have a code of 200 or lacks entities
+        is skipped, and a warning is logged.
+
+        Args:
+            submission_results (List[Dict]):
+                A list of Gen3 submission result dictionaries, each containing at
+                least a "code" and "entities" entry.
+
+        Returns:
+            List[Dict]:
+                A flat list, where each element is an entity dictionary
+                (with keys 'project_id' and 'submitter_id' added if available).
+        """
+        flat_list_dict = []
+        total = len(submission_results)
+        logger.info("Flattening %s submission result(s)...", total)
+
+        for idx, obj in enumerate(submission_results, 1):
+            transaction_id = obj.get("transaction_id")
+            code = obj.get("code")
+            if code != 200:
+                logger.warning(
+                    "Skipping submission result at index %s (code=%s)",
+                    idx - 1,
+                    code,
+                )
+                continue
+
+            entities = obj.get("entities")
+
+            if entities is None:
+                logger.warning("No entities found in submission result at index %s", idx - 1)
+                continue
+
+            logger.info(
+                "Processing submission result %s of %s, %s entities",
+                idx,
+                total,
+                len(entities),
+            )
+
+            for entity in entities:
+                unique_keys = entity.get("unique_keys", [{}])
+                if unique_keys and isinstance(unique_keys, list):
+                    keys = unique_keys[0]
+                    entity["project_id"] = keys.get("project_id")
+                    entity["submitter_id"] = keys.get("submitter_id")
+                entity["transaction_id"] = transaction_id
+                entity["file_path"] = obj.get("file_path", '')
+                flat_list_dict.append(entity)
+
+        # renaming cols
+        for entity in flat_list_dict:
+            entity["gen3_guid"] = entity.pop("id", None)
+            entity["node"] = entity.pop("type", None)
+
+        logger.info("Finished flattening. Total entities: %s", len(flat_list_dict))
+        return flat_list_dict
+
+    def _find_version_from_path(self, path: str) -> Optional[str]:
+        """
+        Extracts a semantic version string (e.g., '1.0.0' or 'v1.0.0') from a file path.
+
+        Args:
+            path (str): The file path to inspect.
+
+        Returns:
+            Optional[str]: The extracted version string if found, otherwise None.
+        """
+        version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
+        found_versions = []
+
+        for segment in path.split('/'):
+            match = version_pattern.match(segment)
+            if match:
+                found_versions.append(match.group(1))
+
+        if not found_versions:
+            return None
+
+        if len(found_versions) > 1:
+            logger.warning("more than one match found in path for version string")
+
+        return found_versions[-1]
+
+    def _collect_versions_from_metadata_file_list(self) -> str:
+        """
+        Extract and validate version information from the internal list of metadata
+        file paths (self.metadata_file_list).
+
+        Returns:
+            str: The single version found in the file list.
+
+        Raises:
+            ValueError: If more than one version is found across the files,
+                or if no version is found at all.
+        """
+        versions = []
+        for file_path in self.metadata_file_list:
+            version = self._find_version_from_path(file_path)
+            if version:
+                versions.append(version)
+        versions = list(set(versions))
+        if len(versions) > 1:
+            logger.error(
+                "more than one version found in metadata file list: %s",
+                self.metadata_file_list,
+            )
+            raise ValueError(
+                "More than one version found in metadata file list: %s"
+                % self.metadata_file_list
+            )
+        if not versions:
+            raise ValueError(
+                "No version found in metadata file list: %s" % self.metadata_file_list
+            )
+        return versions[0]
+
+    def _upload_submission_results(self, submission_results: list):
+        """
+        Uploads the submission results to S3 and a Parquet table.
+
+        This function performs the final step of the pipeline:
+        1. Flattens the submission response structure.
+        2. Prepares a DataFrame with metadata (upload_id, datetime, version).
+        3. Writes the DataFrame to Parquet files in S3 and registers them in the
+           database configured via `self.database` and `self.table`.
+
+        **Retry Mechanism:**
+        Uses the `tenacity` library to retry the upload if it fails.
+        - Stop: After `self.max_retries` attempts.
+        - Wait: Exponential backoff starting at 1s, doubling up to 10s.
+
+        Args:
+            submission_results (list): List of submission results to upload.
+
+        Configuration used (from __init__):
+            dataset_root (str): e.g. "s3://acdc-dataops-metadata/metadata_upload/"
+            database (str): e.g. "acdc_dataops_metadata_db"
+            table (str): e.g. "metadata_upload"
+            partition_cols (list): e.g. ["upload_datetime"]
+        """
+
+        @retry(
+            stop=stop_after_attempt(self.max_retries),
+            wait=wait_exponential(multiplier=1, max=10)
+        )
+        def inner_upload():
+            logger.debug("Collecting version from metadata file list.")
+            version = self._collect_versions_from_metadata_file_list()
+            logger.debug("Extracted version: %s", version)
+
+            logger.debug("Inferring API endpoint from JWT.")
+            api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
+            logger.debug("Using API endpoint: %s", api_endpoint)
+
+            upload_datetime = datetime.now().isoformat()
+            upload_id = str(uuid.uuid4())
+            logger.debug("Upload datetime: %s", upload_datetime)
+            logger.debug("Generated upload ID: %s", upload_id)
+
+            logger.debug("Flattening submission results for upload.")
+            flattened_results = self._flatten_submission_results(submission_results)
+            logger.debug(
+                "Flattened %s submission result entries.",
+                len(flattened_results),
+            )
+
+            logger.debug("Converting flattened results to DataFrame.")
+            flattened_results_df = pd.DataFrame(flattened_results)
+            flattened_results_df['upload_datetime'] = upload_datetime
+            flattened_results_df['upload_id'] = upload_id
+            flattened_results_df['api_endpoint'] = api_endpoint
+            flattened_results_df['version'] = version
+
+            logger.info(
+                "Writing DataFrame to parquet and S3/table: "
+                "dataset_root=%s, database=%s, table=%s, partition_cols=%s",
+                self.dataset_root,
+                self.database,
+                self.table,
+                self.partition_cols,
+            )
+            write_parquet_to_db(
+                df=flattened_results_df,
+                dataset_root=self.dataset_root,
+                database=self.database,
+                table=self.table,
+                partition_cols=self.partition_cols,
+            )
+            logger.info(
+                "\033[94m[SUCCESS]\033[0m Metadata submission results upload complete. "
+                "Uploaded to dataset_root=%s, database=%s, table=%s.",
+                self.dataset_root,
+                self.database,
+                self.table,
+            )
+
+        # Execute the decorated inner function
+        try:
+            inner_upload()
+        except Exception as e:
+            logger.critical("Failed to upload submission results after %s attempts.", self.max_retries)
+            raise e
+
+    def _submit_data_chunks(
+        self,
+        split_json_list: list,
+        node: str,
+        gen3_submitter,
+        file_path: str,
+        upload_to_database: bool = True
+    ) -> List[Dict]:
+        """
+        Submit each chunk of data (in split_json_list) for a given node to Gen3,
+        using retry logic and logging on failures.
+
+        Upon completion of each chunk (success or failure), the response is uploaded
+        to the configured S3 Parquet table using `_upload_submission_results`.
+
+        Args:
+            split_json_list (list): List of JSON-serializable chunked data to
+                submit.
+            node (str): Name of the data node being submitted (e.g., "program").
+            gen3_submitter: A Gen3Submission instance for making submissions.
+            file_path (str): Path of the file that was submitted.
+                Used only for data capture in the result logs.
+
+        Returns:
+            List[Dict]: List of response dictionaries for each submitted chunk.
+
+        Raises:
+            RuntimeError: If submission fails after all retry attempts for any chunk.
+        """
+        n_json_data = len(split_json_list)
+
+        for index, jsn in enumerate(split_json_list):
+            # Holds results for the current chunk
+            current_chunk_response: List[Dict[str, Any]] = []
+            progress_str = f"{index + 1}/{n_json_data}"
+
+            submission_success = False
+            last_exception: Optional[Exception] = None
+
+            attempt = 0
+            while attempt <= self.max_retries:
+                try:
+                    if attempt == 0:
+                        logger.info(
+                            "[SUBMIT] | Project: %-10s | Node: %-12s | "
+                            "Split: %-5s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                        )
+                    else:
+                        logger.warning(
+                            "[RETRY] | Project: %-10s | Node: %-12s | "
+                            "Split: %-5s | "
+                            "Attempt: %s/%s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                            attempt,
+                            self.max_retries,
+                        )
+
+                    res = gen3_submitter.submit_record(self.program_id, self.project_id, jsn)
+                    res.update({"file_path": file_path})
+                    current_chunk_response.append(res)
+                    submission_success = True
+                    logger.info(
+                        "\033[92m[SUCCESS]\033[0m | Project: %-10s | "
+                        "Node: %-12s | Split: %-5s",
+                        self.project_id,
+                        node,
+                        progress_str,
+                    )
+                    break  # Success
+
+                except (
+                    requests.exceptions.RequestException,
+                    ValueError,
+                    TypeError,
+                ) as e:
+                    last_exception = e
+                    logger.error(
+                        "Error submitting chunk %s for node '%s': %s",
+                        progress_str,
+                        node,
+                        e,
+                    )
+
+                    if attempt < self.max_retries:
+                        time.sleep(0.2)
+                    else:
+                        logger.critical(
+                            "\033[91m[FAILED]\033[0m | Project: %-10s | "
+                            "Node: %-12s | Split: %-5s | Error: %s",
+                            self.project_id,
+                            node,
+                            progress_str,
+                            e,
+                        )
+                attempt += 1
+
+
+            if upload_to_database:
+                # Also submitting data chunk response info to s3 and parquet table
+                logger.info("Submitting data chunk response info to S3 and Parquet table.")
+                self._upload_submission_results(submission_results=current_chunk_response)
+
+            if not submission_success:
+                # After retries, still failed
+                raise RuntimeError(
+                    (
+                        "Failed to submit chunk %s for node '%s' after %s attempts. "
+                        "Last error: %s"
+                    )
+                    % (progress_str, node, self.max_retries + 1, last_exception)
+                ) from last_exception
+
+        logger.info("Finished submitting node '%s'.", node)
+
+
+    def _read_data_import_order(
+        self,
+        data_import_order_path: str,
+        exclude_nodes: List[str],
+        boto3_session=None,
+    ):
+        """Helper to read the data import order from local disk or S3."""
         if is_s3_uri(data_import_order_path):
             session = boto3_session or self.boto3_session
-            return read_data_import_order_txt_s3(
+            return read_data_import_order_txt_s3(
+                data_import_order_path,
+                session,
+                exclude_nodes,
+            )
         else:
             return read_data_import_order_txt(data_import_order_path, exclude_nodes)
 
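Pulling the new constructor arguments together, a usage sketch of the reworked class (the bucket, database, secret, and directory names below are illustrative, not taken from the package):

    session = create_boto3_session()
    api_key = get_gen3_api_key_aws_secret(
        secret_name="gen3/api-key",              # hypothetical secret name
        region_name="ap-southeast-2",
        session=session,
    )

    submitter = MetadataSubmitter(
        metadata_file_list=list_metadata_jsons("metadata/v1.2.3"),   # hypothetical dir; path carries the version segment
        api_key=api_key,
        project_id="internal-project",
        data_import_order_path="metadata/v1.2.3/DataImportOrder.txt",
        dataset_root="s3://acdc-dataops-metadata/metadata_upload/",
        database="acdc_dataops_metadata_db",
        table="metadata_upload",
        upload_to_database=True,
    )
    submitter.submit_metadata()

Note that 0.7.1 removes the separate upload_metadata_submission_results() step: when upload_to_database is True, each chunk's response is written to the Parquet upload table from inside _submit_data_chunks.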
@@ -643,7 +971,7 @@ class MetadataSubmitter:
             list: A list of chunks, where each chunk is a list of dictionaries
                 containing JSON data.
         """
-        logger.info(
+        logger.info("Reading metadata json from %s", metadata_file_path)
         if is_s3_uri(metadata_file_path):
             session = self.boto3_session
             data = read_metadata_json_s3(metadata_file_path, session)
@@ -660,113 +988,62 @@ class MetadataSubmitter:
            are the corresponding file paths.
 
         Returns:
-            dict: Dictionary mapping node names (str) to their associated metadata file paths
+            dict: Dictionary mapping node names (str) to their associated metadata file paths.
         """
         file_map = {
-            get_node_from_file_path(
-            for
+            get_node_from_file_path(file_path): file_path
+            for file_path in self.metadata_file_list
         }
         return file_map
 
-    def submit_metadata(self) -> List[Dict]:
+    def submit_metadata(self) -> List[Dict[str, Any]]:
         """
         Submits metadata for each node defined in the data import order, except those in the exclude list.
-
-
-
-
+
+        **Detailed Process:**
+        1. **Order Resolution:** The function reads the import order to determine the sequence of nodes.
+        2. **File Mapping:** It finds the matching `node.json` file for each node in the order.
+        3. **Chunk & Submit:** For every file, the JSON content is split into chunks and submitted
+           to the Sheepdog API via `gen3.submission`.
+        4. **Audit Logging:** The API response (containing `submission_id`) is flattened and
+           converted to a DataFrame. This is then written to Parquet files in S3 and registered
+           in the configured upload table.
 
         Returns:
-            List[Dict]: A list of response dictionaries returned from the Gen3 metadata submissions.
+            List[Dict[str, Any]]: A list of response dictionaries returned from the Gen3 metadata submissions.
+                Each dictionary contains the response from submitting a chunk of metadata for a given node.
+                The keys in the dictionary are "node_name", "response", and "status_code".
         """
         gen3_submitter = self._create_gen3_submission_class()
-        data_import_order = self._read_data_import_order(
+        data_import_order = self._read_data_import_order(
+            self.data_import_order_path,
+            self.exclude_nodes,
+            self.boto3_session,
+        )
         file_map = self._create_file_map()
-        output_response_list_dict = []
 
         logger.info("Starting metadata submission.")
-        for node in data_import_order:
 
+        for node in data_import_order:
             if node in self.exclude_nodes:
-                logger.info(
+                logger.info("Skipping node '%s' (in exclude list).", node)
                 continue
             file_path = file_map.get(node)
             if not file_path:
-                logger.info(
+                logger.info("Skipping node '%s' (not present in file list).", node)
                 continue
 
-            logger.info(
+            logger.info("Processing file '%s' for node '%s'.", file_path, node)
             logger.info("Splitting JSON data into chunks.")
            json_chunks = self._prepare_json_chunks(file_path, self.max_size_kb)
 
             logger.info("Submitting chunks to Gen3.")
-
+            self._submit_data_chunks(
                 split_json_list=json_chunks,
                 node=node,
-                file_path=file_path,
                 gen3_submitter=gen3_submitter,
-
-
-                program_id=self.program_id
+                file_path=file_path,
+                upload_to_database=self.upload_to_database
             )
-
-
-        self.submission_results = output_response_list_dict
-        return output_response_list_dict
+
 
-    def upload_metadata_submission_results(
-        self,
-        dataset_root: str,
-        database: str,
-        table: str,
-        partition_cols: list = ["upload_datetime"],
-    ):
-        """
-        Uploads the submission results to s3 and parquet table.
-
-        Args:
-            dataset_root (str): S3 path where the parquet files will be stored
-                (e.g., "s3://acdc-dataops-metadata/metadata_upload/").
-            database (str): Database name for storing the metadata upload
-                (e.g., "acdc_dataops_metadata_db").
-            table (str): Table name for storing the metadata upload
-                (e.g., "metadata_upload").
-            partition_cols (list, optional): List of column names to partition the parquet table by.
-                Defaults to ["upload_datetime"].
-        """
-        logger.info("Collecting version from metadata file list.")
-        version = collect_versions_from_metadata_file_list(self.metadata_file_list)
-        logger.info(f"Extracted version: {version}")
-
-        logger.info("Inferring API endpoint from JWT.")
-        api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
-        logger.info(f"Using API endpoint: {api_endpoint}")
-
-        upload_datetime = datetime.now().isoformat()
-        upload_id = str(uuid.uuid4())
-        logger.info(f"Upload datetime: {upload_datetime}")
-        logger.info(f"Generated upload ID: {upload_id}")
-
-        logger.info("Flattening submission results for upload.")
-        flattened_results = flatten_submission_results(self.submission_results)
-        logger.info(f"Flattened {len(flattened_results)} submission result entries.")
-
-        logger.info("Converting flattened results to DataFrame.")
-        flattened_results_df = pd.DataFrame(flattened_results)
-        flattened_results_df['upload_datetime'] = upload_datetime
-        flattened_results_df['upload_id'] = upload_id
-        flattened_results_df['api_endpoint'] = api_endpoint
-        flattened_results_df['version'] = version
-
-        logger.info(
-            f"Writing DataFrame to parquet and S3/table: "
-            f"dataset_root={dataset_root}, database={database}, table={table}, partition_cols={partition_cols}"
-        )
-        write_parquet_to_db(
-            df=flattened_results_df,
-            dataset_root=dataset_root,
-            database=database,
-            table=table,
-            partition_cols=partition_cols
-        )
-        logger.info("Metadata submission results upload complete.")
--- a/acdc_aws_etl_pipeline-0.6.9.dist-info/METADATA
+++ b/acdc_aws_etl_pipeline-0.7.1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: acdc_aws_etl_pipeline
-Version: 0.
+Version: 0.7.1
 Summary: Tools for ACDC ETL pipeline
 Author: JoshuaHarris391
 Author-email: harjo391@gmail.com
@@ -23,6 +23,7 @@ Requires-Dist: python-dotenv
 Requires-Dist: pytz (>=2025.2,<2026.0)
 Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
 Requires-Dist: s3fs (==2025.10.0)
+Requires-Dist: tenacity (>=8.2,<10.0)
 Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
 Description-Content-Type: text/markdown
 
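The new tenacity requirement backs the retry decorator used in _upload_submission_results; a minimal sketch of that pattern (the function body is a placeholder):

    from tenacity import retry, stop_after_attempt, wait_exponential

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))
    def upload_with_retries():
        # Any exception raised here triggers another attempt, with exponential
        # backoff capped at 10 seconds, until three attempts have been made.
        ...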
--- a/acdc_aws_etl_pipeline-0.6.9.dist-info/RECORD
+++ b/acdc_aws_etl_pipeline-0.7.1.dist-info/RECORD
@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6
 acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
 acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
 acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
-acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=
+acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=2PVuv-mvjnO-FxVZHiYfTDlbioEo-JsTcvNZY6v2n40,38331
 acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
 acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
 acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
 acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
 acdc_aws_etl_pipeline/validate/validate.py,sha256=zLqK9i92FsRAaBOGdY-G7-vb0e6tmkoUXhY6zCfbjN8,24895
-acdc_aws_etl_pipeline-0.
-acdc_aws_etl_pipeline-0.
-acdc_aws_etl_pipeline-0.
+acdc_aws_etl_pipeline-0.7.1.dist-info/METADATA,sha256=WddwCKf3KV4-JsKtsegk5dxu6dWKvXx8YANvZZKbRGs,2964
+acdc_aws_etl_pipeline-0.7.1.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+acdc_aws_etl_pipeline-0.7.1.dist-info/RECORD,,
{acdc_aws_etl_pipeline-0.6.9.dist-info → acdc_aws_etl_pipeline-0.7.1.dist-info}/WHEEL: file without changes.