acdc_aws_etl_pipeline 0.6.9__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in the public registry.
@@ -1,34 +1,41 @@
1
1
  import os
2
- # redefine to use local cache in /tmp
3
- os.environ['XDG_CACHE_HOME'] = '/tmp/.cache'
4
-
2
+ import sys
3
+ import time
5
4
  import json
6
5
  import boto3
6
+ from botocore.exceptions import BotoCoreError, ClientError
7
7
  from gen3.auth import Gen3Auth
8
- from gen3.index import Gen3Index
9
8
  from gen3.submission import Gen3Submission
10
9
  import logging
11
10
  from datetime import datetime
12
11
  import jwt
13
- from typing import Dict, List
12
+ import requests
13
+ from typing import Any, Dict, List, Optional
14
14
  import re
15
15
  import pandas as pd
16
16
  import uuid
17
- from acdc_aws_etl_pipeline.validate.validate import write_parquet_to_db
17
+ from acdc_aws_etl_pipeline.validate.validate import (
18
+ write_parquet_to_db,
19
+ )
20
+ from tenacity import retry, stop_after_attempt, wait_exponential
21
+
22
+ # redefine to use local cache in /tmp
23
+ os.environ['XDG_CACHE_HOME'] = '/tmp/.cache'
18
24
 
19
25
  logger = logging.getLogger(__name__)
20
26
 
21
- def create_boto3_session(aws_profile: str = None):
27
+ def create_boto3_session(aws_profile: Optional[str] = None):
22
28
  """
23
29
  Create and return a boto3 Session object using an optional AWS profile.
24
30
 
25
31
  Args:
26
- aws_profile (str, optional): The AWS CLI named profile to use for credentials. If None, uses default credentials.
32
+ aws_profile (str, optional): The AWS CLI named profile to use.
33
+ If None, uses default credentials.
27
34
 
28
35
  Returns:
29
36
  boto3.Session: The created session instance.
30
37
  """
31
- logger.debug(f"Creating boto3 session with aws_profile={aws_profile}")
38
+ logger.debug("Creating boto3 session with aws_profile=%s", aws_profile)
32
39
  return boto3.Session(profile_name=aws_profile) if aws_profile else boto3.Session()
33
40
 
34
41
  def is_s3_uri(s3_uri: str) -> bool:
@@ -41,7 +48,7 @@ def is_s3_uri(s3_uri: str) -> bool:
41
48
  Returns:
42
49
  bool: True if the string starts with 's3://', False otherwise.
43
50
  """
44
- logger.debug(f"Checking if {s3_uri} is an S3 URI.")
51
+ logger.debug("Checking if %s is an S3 URI.", s3_uri)
45
52
  return s3_uri.startswith("s3://")
46
53
 
47
54
  def get_filename(file_path: str) -> str:
@@ -55,7 +62,11 @@ def get_filename(file_path: str) -> str:
55
62
  str: The filename (with extension).
56
63
  """
57
64
  filename = file_path.split("/")[-1]
58
- logger.debug(f"Extracted filename '{filename}' from file_path '{file_path}'.")
65
+ logger.debug(
66
+ "Extracted filename '%s' from file_path '%s'.",
67
+ filename,
68
+ file_path,
69
+ )
59
70
  return filename
60
71
 
61
72
  def get_node_from_file_path(file_path: str) -> str:
@@ -70,7 +81,7 @@ def get_node_from_file_path(file_path: str) -> str:
70
81
  """
71
82
  filename = get_filename(file_path)
72
83
  node = filename.split(".")[0]
73
- logger.debug(f"Extracted node '{node}' from filename '{filename}'.")
84
+ logger.debug("Extracted node '%s' from filename '%s'.", node, filename)
74
85
  return node
75
86
 
76
87
  def list_metadata_jsons(metadata_dir: str) -> list:
@@ -87,11 +98,18 @@ def list_metadata_jsons(metadata_dir: str) -> list:
87
98
  Exception: If there is an error reading the directory.
88
99
  """
89
100
  try:
90
- logger.info(f"Listing .json files in metadata directory: {metadata_dir}")
101
+ logger.info(
102
+ "Listing .json files in metadata directory: %s",
103
+ metadata_dir,
104
+ )
91
105
  files = os.listdir(metadata_dir)
92
- return [os.path.abspath(os.path.join(metadata_dir, f)) for f in files if f.endswith(".json")]
93
- except Exception as e:
94
- logger.error(f"Error listing metadata JSONs in {metadata_dir}: {e}")
106
+ return [
107
+ os.path.abspath(os.path.join(metadata_dir, file_name))
108
+ for file_name in files
109
+ if file_name.endswith(".json")
110
+ ]
111
+ except OSError as e:
112
+ logger.error("Error listing metadata JSONs in %s: %s", metadata_dir, e)
95
113
  raise
96
114
 
97
115
  def find_data_import_order_file(metadata_dir: str) -> str:
@@ -108,16 +126,22 @@ def find_data_import_order_file(metadata_dir: str) -> str:
108
126
  FileNotFoundError: If no such file is found.
109
127
  """
110
128
  try:
111
- logger.info(f"Searching for DataImportOrder.txt in {metadata_dir}")
129
+ logger.info("Searching for DataImportOrder.txt in %s", metadata_dir)
112
130
  files = [os.path.join(metadata_dir, f) for f in os.listdir(metadata_dir)]
113
131
  order_files = [f for f in files if "DataImportOrder.txt" in f]
114
132
  if not order_files:
115
133
  logger.error("No DataImportOrder.txt file found in the given directory.")
116
- raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
117
- logger.debug(f"Found DataImportOrder.txt file: {order_files[0]}")
134
+ raise FileNotFoundError(
135
+ "No DataImportOrder.txt file found in the given directory."
136
+ )
137
+ logger.debug("Found DataImportOrder.txt file: %s", order_files[0])
118
138
  return order_files[0]
119
- except Exception as e:
120
- logger.error(f"Error finding DataImportOrder.txt in {metadata_dir}: {e}")
139
+ except OSError as e:
140
+ logger.error(
141
+ "Error finding DataImportOrder.txt in %s: %s",
142
+ metadata_dir,
143
+ e,
144
+ )
121
145
  raise
122
146
 
123
147
  def list_metadata_jsons_s3(s3_uri: str, session) -> list:
@@ -125,13 +149,14 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
125
149
  List all .json files in an S3 "directory" (prefix).
126
150
 
127
151
  Args:
128
- s3_uri (str): S3 URI to the metadata directory (e.g. "s3://my-bucket/path/to/dir").
152
+ s3_uri (str): S3 URI to the metadata directory
153
+ (e.g. "s3://my-bucket/path/to/dir").
129
154
  session (boto3.Session): An active boto3 Session.
130
155
 
131
156
  Returns:
132
157
  list: List of S3 URIs for all .json files found under the prefix.
133
158
  """
134
- logger.info(f"Listing .json files in S3 metadata directory: {s3_uri}")
159
+ logger.info("Listing .json files in S3 metadata directory: %s", s3_uri)
135
160
  s3 = session.client('s3')
136
161
  bucket = s3_uri.split("/")[2]
137
162
  prefix = "/".join(s3_uri.split("/")[3:])
@@ -144,7 +169,7 @@ def list_metadata_jsons_s3(s3_uri: str, session) -> list:
144
169
  for obj in objects.get('Contents', [])
145
170
  if obj['Key'].endswith(".json")
146
171
  ]
147
- logger.debug(f"Found {len(result)} .json files in S3 at {s3_uri}")
172
+ logger.debug("Found %s .json files in S3 at %s", len(result), s3_uri)
148
173
  return result
149
174
 
150
175
  def find_data_import_order_file_s3(s3_uri: str, session) -> str:
@@ -161,16 +186,29 @@ def find_data_import_order_file_s3(s3_uri: str, session) -> str:
161
186
  Raises:
162
187
  FileNotFoundError: If the file does not exist in the specified prefix.
163
188
  """
164
- logger.info(f"Searching for DataImportOrder.txt in S3 metadata directory: {s3_uri}")
189
+ logger.info(
190
+ "Searching for DataImportOrder.txt in S3 metadata directory: %s",
191
+ s3_uri,
192
+ )
165
193
  s3 = session.client('s3')
166
194
  bucket = s3_uri.split("/")[2]
167
195
  prefix = "/".join(s3_uri.split("/")[3:])
168
196
  objects = s3.list_objects(Bucket=bucket, Prefix=prefix)
169
- order_files = [obj['Key'] for obj in objects.get('Contents', []) if obj['Key'].endswith("DataImportOrder.txt")]
197
+ order_files = [
198
+ obj['Key']
199
+ for obj in objects.get('Contents', [])
200
+ if obj['Key'].endswith("DataImportOrder.txt")
201
+ ]
170
202
  if not order_files:
171
203
  logger.error("No DataImportOrder.txt file found in the given S3 directory.")
172
- raise FileNotFoundError("No DataImportOrder.txt file found in the given directory.")
173
- logger.debug(f"Found DataImportOrder.txt file in S3: s3://{bucket}/{order_files[0]}")
204
+ raise FileNotFoundError(
205
+ "No DataImportOrder.txt file found in the given directory."
206
+ )
207
+ logger.debug(
208
+ "Found DataImportOrder.txt file in S3: s3://%s/%s",
209
+ bucket,
210
+ order_files[0],
211
+ )
174
212
  return f"s3://{bucket}/{order_files[0]}"
175
213
 
176
214
  def read_metadata_json(file_path: str) -> dict:
@@ -183,10 +221,14 @@ def read_metadata_json(file_path: str) -> dict:
183
221
  Returns:
184
222
  dict or list: Parsed contents of the JSON file.
185
223
  """
186
- logger.info(f"Reading metadata json from local file: {file_path}")
187
- with open(file_path, "r") as f:
224
+ logger.info("Reading metadata json from local file: %s", file_path)
225
+ with open(file_path, "r", encoding="utf-8") as f:
188
226
  data = json.load(f)
189
- logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {file_path}")
227
+ logger.debug(
228
+ "Read %s objects from %s",
229
+ len(data) if isinstance(data, list) else 'object',
230
+ file_path,
231
+ )
190
232
  return data
191
233
 
192
234
  def read_metadata_json_s3(s3_uri: str, session) -> dict:
@@ -200,11 +242,18 @@ def read_metadata_json_s3(s3_uri: str, session) -> dict:
200
242
  Returns:
201
243
  dict or list: Parsed JSON object from S3 file.
202
244
  """
203
- logger.info(f"Reading metadata json from S3 file: {s3_uri}")
245
+ logger.info("Reading metadata json from S3 file: %s", s3_uri)
204
246
  s3 = session.client('s3')
205
- obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
247
+ obj = s3.get_object(
248
+ Bucket=s3_uri.split("/")[2],
249
+ Key="/".join(s3_uri.split("/")[3:]),
250
+ )
206
251
  data = json.loads(obj['Body'].read().decode('utf-8'))
207
- logger.debug(f"Read {len(data) if isinstance(data, list) else 'object'} objects from {s3_uri}")
252
+ logger.debug(
253
+ "Read %s objects from %s",
254
+ len(data) if isinstance(data, list) else 'object',
255
+ s3_uri,
256
+ )
208
257
  return data
209
258
 
210
259
  def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = None) -> list:
@@ -224,20 +273,41 @@ def read_data_import_order_txt_s3(s3_uri: str, session, exclude_nodes: list = No
224
273
  """
225
274
  filename = s3_uri.split("/")[-1]
226
275
  if 'DataImportOrder.txt' not in filename:
227
- logger.error(f"File {filename} is not a DataImportOrder.txt file")
228
- raise ValueError(f"File {filename} is not a DataImportOrder.txt file")
229
- logger.info(f"Reading DataImportOrder.txt from S3 file: {s3_uri}")
276
+ logger.error("File %s is not a DataImportOrder.txt file", filename)
277
+ raise ValueError(
278
+ f"File {filename} is not a DataImportOrder.txt file"
279
+ )
280
+ logger.info(
281
+ "Reading DataImportOrder.txt from S3 file: %s",
282
+ s3_uri,
283
+ )
230
284
  s3 = session.client('s3')
231
- obj = s3.get_object(Bucket=s3_uri.split("/")[2], Key="/".join(s3_uri.split("/")[3:]))
285
+ obj = s3.get_object(
286
+ Bucket=s3_uri.split("/")[2],
287
+ Key="/".join(s3_uri.split("/")[3:]),
288
+ )
232
289
  content = obj['Body'].read().decode('utf-8')
233
- import_order = [line.rstrip() for line in content.splitlines() if line.strip()]
234
- logger.debug(f"Raw import order from S3 file: {import_order}")
290
+ import_order = [
291
+ line.rstrip()
292
+ for line in content.splitlines()
293
+ if line.strip()
294
+ ]
295
+ logger.debug("Raw import order from S3 file: %s", import_order)
235
296
  if exclude_nodes is not None:
236
297
  import_order = [node for node in import_order if node not in exclude_nodes]
237
- logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
238
- logger.debug(f"Final import order from S3 file {s3_uri}: {import_order}")
298
+ logger.debug(
299
+ "Import order after excluding nodes %s: %s",
300
+ exclude_nodes,
301
+ import_order,
302
+ )
303
+ logger.debug(
304
+ "Final import order from S3 file %s: %s",
305
+ s3_uri,
306
+ import_order,
307
+ )
239
308
  return import_order
240
309
 
310
+
241
311
  def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
242
312
  """
243
313
  Read DataImportOrder.txt from local file, optionally excluding some nodes.
@@ -253,17 +323,26 @@ def read_data_import_order_txt(file_path: str, exclude_nodes: list) -> list:
253
323
  FileNotFoundError: If the file is not found.
254
324
  """
255
325
  try:
256
- logger.info(f"Reading DataImportOrder.txt from local file: {file_path}")
257
- with open(file_path, "r") as f:
326
+ logger.info(
327
+ "Reading DataImportOrder.txt from local file: %s",
328
+ file_path,
329
+ )
330
+ with open(file_path, "r", encoding="utf-8") as f:
258
331
  import_order = [line.rstrip() for line in f if line.strip()]
259
- logger.debug(f"Raw import order from file: {import_order}")
332
+ logger.debug("Raw import order from file: %s", import_order)
260
333
  if exclude_nodes is not None:
261
- import_order = [node for node in import_order if node not in exclude_nodes]
262
- logger.debug(f"Import order after excluding nodes {exclude_nodes}: {import_order}")
263
- logger.debug(f"Final import order from {file_path}: {import_order}")
334
+ import_order = [
335
+ node for node in import_order if node not in exclude_nodes
336
+ ]
337
+ logger.debug(
338
+ "Import order after excluding nodes %s: %s",
339
+ exclude_nodes,
340
+ import_order,
341
+ )
342
+ logger.debug("Final import order from %s: %s", file_path, import_order)
264
343
  return import_order
265
344
  except FileNotFoundError:
266
- logger.error(f"Error: DataImportOrder.txt not found in {file_path}")
345
+ logger.error("Error: DataImportOrder.txt not found in %s", file_path)
267
346
  return []
268
347
 
269
348
  def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
@@ -280,7 +359,12 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
280
359
  Returns:
281
360
  list: List of lists. Each sublist size (JSON-serialized) <= max_size_kb.
282
361
  """
283
- logger.info(f"Splitting JSON objects into max {max_size_kb} KB chunks. Total items: {len(json_list)}")
362
+ logger.info(
363
+ "Splitting JSON objects into max %s KB chunks. Total items: %s",
364
+ max_size_kb,
365
+ len(json_list),
366
+ )
367
+
284
368
  def get_size_in_kb(obj):
285
369
  """
286
370
  Get the size in kilobytes of the JSON-serialized object.
@@ -291,12 +375,11 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
291
375
  Returns:
292
376
  float: Size of the object in kilobytes.
293
377
  """
294
- import sys
295
378
  size_kb = sys.getsizeof(json.dumps(obj)) / 1024
296
- logger.debug(f"Calculated size: {size_kb:.2f} KB")
379
+ logger.debug("Calculated size: %.2f KB", size_kb)
297
380
  return size_kb
298
381
 
299
- def split_list(json_list):
382
+ def split_list(items):
300
383
  """
301
384
  Recursively split the list so each chunk fits within max_size_kb.
302
385
 
@@ -306,20 +389,34 @@ def split_json_objects(json_list, max_size_kb=50, print_results=False) -> list:
306
389
  Returns:
307
390
  list: List of sublists.
308
391
  """
309
- if get_size_in_kb(json_list) <= max_size_kb:
310
- logger.debug(f"Split length {len(json_list)} is within max size {max_size_kb} KB.")
311
- return [json_list]
312
- mid = len(json_list) // 2
313
- left_list = json_list[:mid]
314
- right_list = json_list[mid:]
315
- logger.debug(f"Splitting list at index {mid}: left {len(left_list)}, right {len(right_list)}")
392
+ if get_size_in_kb(items) <= max_size_kb:
393
+ logger.debug(
394
+ "Split length %s is within max size %s KB.",
395
+ len(items),
396
+ max_size_kb,
397
+ )
398
+ return [items]
399
+ mid = len(items) // 2
400
+ left_list = items[:mid]
401
+ right_list = items[mid:]
402
+ logger.debug(
403
+ "Splitting list at index %s: left %s, right %s",
404
+ mid,
405
+ len(left_list),
406
+ len(right_list),
407
+ )
316
408
  return split_list(left_list) + split_list(right_list)
317
409
 
318
410
  split_lists = split_list(json_list)
319
411
  if print_results:
320
412
  for i, lst in enumerate(split_lists):
321
- logger.info(f"List {i+1} size: {get_size_in_kb(lst):.2f} KB, contains {len(lst)} objects")
322
- logger.debug(f"Total splits: {len(split_lists)}")
413
+ logger.info(
414
+ "List %s size: %.2f KB, contains %s objects",
415
+ i + 1,
416
+ get_size_in_kb(lst),
417
+ len(lst),
418
+ )
419
+ logger.debug("Total splits: %s", len(split_lists))
323
420
  return split_lists
324
421
 
325
422
  def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) -> dict:
@@ -337,29 +434,40 @@ def get_gen3_api_key_aws_secret(secret_name: str, region_name: str, session) ->
337
434
  Raises:
338
435
  Exception: On failure to retrieve or parse the secret.
339
436
  """
340
- logger.info(f"Retrieving Gen3 API key from AWS Secrets Manager: secret_name={secret_name}, region={region_name}")
341
- client = session.client(service_name='secretsmanager', region_name=region_name)
437
+ logger.info(
438
+ "Retrieving Gen3 API key from AWS Secrets Manager: "
439
+ "secret_name=%s, region=%s",
440
+ secret_name,
441
+ region_name,
442
+ )
443
+ client = session.client(
444
+ service_name='secretsmanager',
445
+ region_name=region_name,
446
+ )
342
447
  try:
343
448
  get_secret_value_response = client.get_secret_value(
344
- SecretId=secret_name
449
+ SecretId=secret_name,
345
450
  )
346
- except Exception as e:
347
- logger.error(f"Error getting secret value from AWS Secrets Manager: {e}")
348
- raise e
451
+ except (BotoCoreError, ClientError) as e:
452
+ logger.error("Error getting secret value from AWS Secrets Manager: %s", e)
453
+ raise
349
454
 
350
455
  secret = get_secret_value_response['SecretString']
351
456
 
352
457
  try:
353
458
  secret = json.loads(secret)
354
459
  api_key = secret
355
- logger.debug(f"Retrieved Gen3 API key from secret {secret_name}")
460
+ logger.debug("Retrieved Gen3 API key from secret %s", secret_name)
356
461
  return api_key
357
- except Exception as e:
358
- logger.error(f"Error parsing Gen3 API key from AWS Secrets Manager: {e}")
359
- raise e
462
+ except (json.JSONDecodeError, TypeError) as e:
463
+ logger.error("Error parsing Gen3 API key from AWS Secrets Manager: %s", e)
464
+ raise
360
465
 
361
466
 
362
- def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
467
+ def infer_api_endpoint_from_jwt(
468
+ jwt_token: str,
469
+ api_version: str = 'v0',
470
+ ) -> str:
363
471
  """
364
472
  Extracts the URL from a JSON Web Token (JWT) credential.
365
473
 
@@ -370,11 +478,14 @@ def infer_api_endpoint_from_jwt(jwt_token: str, api_version: str = 'v0') -> str:
370
478
  str: The extracted URL.
371
479
  """
372
480
  logger.info("Decoding JWT to extract API URL.")
373
- url = jwt.decode(jwt_token, options={"verify_signature": False}).get('iss', '')
481
+ url = jwt.decode(
482
+ jwt_token,
483
+ options={"verify_signature": False},
484
+ ).get('iss', '')
374
485
  if '/user' in url:
375
486
  url = url.split('/user')[0]
376
487
  url = f"{url}/api/{api_version}"
377
- logger.info(f"Extracted API URL from JWT: {url}")
488
+ logger.info("Extracted API URL from JWT: %s", url)
378
489
  return url
379
490
 
380
491
 
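
As a side note on the reformatted infer_api_endpoint_from_jwt above, here is a short self-contained sketch of the decode-and-derive step it performs. The issuer URL and signing secret are hypothetical, not values from the package:

import jwt  # PyJWT, the same library the module imports

# Hypothetical token whose 'iss' claim mimics a Gen3 Fence issuer.
token = jwt.encode(
    {"iss": "https://gen3.example.org/user"},
    "not-a-real-secret",
    algorithm="HS256",
)

# Decode without verifying the signature, strip the '/user' suffix,
# then append the API version path, roughly as infer_api_endpoint_from_jwt does.
iss = jwt.decode(token, options={"verify_signature": False}).get("iss", "")
base = iss.split("/user")[0] if "/user" in iss else iss
print(f"{base}/api/v0")  # https://gen3.example.org/api/v0
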
@@ -393,190 +504,16 @@ def create_gen3_submission_class(api_key: dict):
393
504
  jwt_token = api_key['api_key']
394
505
  logger.info("Inferring API endpoint from JWT token.")
395
506
  api_endpoint = infer_api_endpoint_from_jwt(jwt_token)
396
- logger.debug(f"Inferred API endpoint: {api_endpoint}")
397
- logger.info(f"Creating Gen3Submission class for endpoint: {api_endpoint}")
507
+ logger.debug("Inferred API endpoint: %s", api_endpoint)
508
+ logger.info(
509
+ "Creating Gen3Submission class for endpoint: %s",
510
+ api_endpoint,
511
+ )
398
512
  auth = Gen3Auth(refresh_token=api_key)
399
513
  submit = Gen3Submission(endpoint=api_endpoint, auth_provider=auth)
400
514
  return submit
401
515
 
402
516
 
403
- def submit_data_chunks(
404
- split_json_list: list,
405
- node: str,
406
- gen3_submitter,
407
- project_id: str,
408
- max_retries: int,
409
- file_path: str,
410
- program_id: str = "program1"
411
- ) -> List[Dict]:
412
- """
413
- Submit each chunk of data (in split_json_list) for a given node to Gen3, using retry logic and logging on failures.
414
-
415
- Args:
416
- split_json_list (list): List of JSON-serializable chunked data to submit.
417
- node (str): Name of the data node being submitted.
418
- gen3_submitter: A Gen3Submission instance for making submissions.
419
- project_id (str): The project identifier within Gen3.
420
- max_retries (int): Maximum number of retry attempts per chunk on failure.
421
- file_path (str): Path of the file that was submitted. Used only for data capture.
422
- program_id (str, optional): The Gen3 program id (default: "program1").
423
-
424
- Returns:
425
- List[Dict]: List of response dictionaries for each submitted chunk.
426
-
427
- Raises:
428
- Exception: If submission fails after all retry attempts for any chunk.
429
- """
430
-
431
- n_json_data = len(split_json_list)
432
- response_results = []
433
-
434
- for index, jsn in enumerate(split_json_list):
435
- progress_str = f"{index + 1}/{n_json_data}"
436
-
437
- submission_success = False
438
- last_exception = None
439
-
440
- attempt = 0
441
- while attempt <= max_retries:
442
- try:
443
- if attempt == 0:
444
- log_msg = (
445
- f"[SUBMIT] | Project: {project_id:<10} | Node: {node:<12} | "
446
- f"Split: {progress_str:<5}"
447
- )
448
- logger.info(log_msg)
449
- else:
450
- log_msg = (
451
- f"[RETRY] | Project: {project_id:<10} | Node: {node:<12} | "
452
- f"Split: {progress_str:<5} | Attempt: {attempt}/{max_retries}"
453
- )
454
- logger.warning(log_msg)
455
-
456
- res = gen3_submitter.submit_record(program_id, project_id, jsn)
457
- res.update({"file_path": file_path})
458
- response_results.append(res)
459
- submission_success = True
460
- logger.info(
461
- f"\033[92m[SUCCESS]\033[0m | Project: {project_id:<10} | "
462
- f"Node: {node:<12} | Split: {progress_str:<5}"
463
- )
464
- break # Success
465
-
466
- except Exception as e:
467
- last_exception = e
468
- logger.error(
469
- f"Error submitting chunk {progress_str} for node '{node}': {e}"
470
- )
471
-
472
- if attempt < max_retries:
473
- import time
474
- time.sleep(0.2)
475
- else:
476
- logger.critical(
477
- f"\033[91m[FAILED]\033[0m | Project: {project_id:<10} | "
478
- f"Node: {node:<12} | Split: {progress_str:<5} | Error: {e}"
479
- )
480
- attempt += 1
481
-
482
- if not submission_success:
483
- # After retries, still failed
484
- raise Exception(
485
- f"Failed to submit chunk {progress_str} for node '{node}' after {max_retries + 1} attempts. "
486
- f"Last error: {last_exception}"
487
- )
488
-
489
- logger.info(f"Finished submitting node '{node}'.")
490
- return response_results
491
-
492
-
493
- def flatten_submission_results(submission_results: List[Dict]) -> List[Dict]:
494
- """
495
- Flattens a list of Gen3 submission result dictionaries into a single list of entity dictionaries.
496
-
497
- For each submission result, this function processes its entities (if any),
498
- extracting the 'project_id' and 'submitter_id' from the 'unique_keys' field (if present)
499
- into the top-level entity dictionary for easy access.
500
-
501
- Any submission result that does not have a code of 200 or lacks entities is skipped, and a warning is logged.
502
-
503
- Args:
504
- submission_results (List[Dict]):
505
- A list of Gen3 submission result dictionaries, each containing at least a "code" and "entities" entry.
506
-
507
- Returns:
508
- List[Dict]:
509
- A flat list, where each element is an entity dictionary (with keys 'project_id' and 'submitter_id' added if available).
510
- """
511
- flat_list_dict = []
512
- total = len(submission_results)
513
- logger.info(f"Flattening {total} submission result(s)...")
514
-
515
- for idx, obj in enumerate(submission_results, 1):
516
- transaction_id = obj.get("transaction_id")
517
- code = obj.get("code")
518
- if code != 200:
519
- logger.warning(f"Skipping submission result at index {idx-1} (code={code})")
520
- continue
521
-
522
- entities = obj.get("entities")
523
-
524
- if entities is None:
525
- logger.warning(f"No entities found in submission result at index {idx-1}")
526
- continue
527
-
528
- logger.info(f"Processing submission result {idx} of {total}, {len(entities)} entities")
529
-
530
- for entity in entities:
531
- unique_keys = entity.get("unique_keys", [{}])
532
- if unique_keys and isinstance(unique_keys, list):
533
- keys = unique_keys[0]
534
- entity["project_id"] = keys.get("project_id")
535
- entity["submitter_id"] = keys.get("submitter_id")
536
- entity["transaction_id"] = transaction_id
537
- entity["file_path"] = obj.get("file_path", '')
538
- flat_list_dict.append(entity)
539
-
540
- # renaming cols
541
- for entity in flat_list_dict:
542
- entity["gen3_guid"] = entity.pop("id", None)
543
- entity["node"] = entity.pop("type", None)
544
-
545
- logger.info(f"Finished flattening. Total entities: {len(flat_list_dict)}")
546
- return flat_list_dict
547
-
548
-
549
- def find_version_from_path(path):
550
- version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
551
- found_versions = []
552
-
553
- for segment in path.split('/'):
554
- match = version_pattern.match(segment)
555
- if match:
556
- found_versions.append(match.group(1))
557
-
558
- if not found_versions:
559
- return None
560
-
561
- if len(found_versions) > 1:
562
- logger.warning("more than one match found in path for version string")
563
-
564
- return found_versions[-1]
565
-
566
-
567
- def collect_versions_from_metadata_file_list(metadata_file_list):
568
- versions = []
569
- for file_path in metadata_file_list:
570
- version = find_version_from_path(file_path)
571
- if version:
572
- versions.append(version)
573
- versions = list(set(versions))
574
- if len(versions) > 1:
575
- logger.error(f"more than one version found in metadata file list: {metadata_file_list}")
576
- raise
577
- return versions[0]
578
-
579
-
580
517
  class MetadataSubmitter:
581
518
  def __init__(
582
519
  self,
@@ -584,49 +521,440 @@ class MetadataSubmitter:
584
521
  api_key: dict,
585
522
  project_id: str,
586
523
  data_import_order_path: str,
524
+ dataset_root: str,
525
+ database: str,
526
+ table: str,
587
527
  program_id: str = "program1",
588
528
  max_size_kb: int = 100,
589
- exclude_nodes: list = ["project", "program", "acknowledgement", "publication"],
529
+ exclude_nodes: Optional[List[str]] = None,
590
530
  max_retries: int = 3,
591
- aws_profile: str = None
531
+ aws_profile: str = None,
532
+ partition_cols: Optional[List[str]] = None,
533
+ upload_to_database: bool = True
592
534
  ):
593
535
  """
594
- Initialises a MetadataSubmitter for submitting a set of metadata JSON files to a Gen3 data commons endpoint, in order.
536
+ Initialises a MetadataSubmitter for submitting a set of metadata JSON
537
+ files to a Gen3 data commons endpoint, in order.
538
+
539
+ **Workflow Overview:**
540
+ 1. **Node Traversal:** The submitter iterates through each node defined in the
541
+ `data_import_order` list.
542
+ 2. **File Resolution:** For each node name, it locates the corresponding JSON file
543
+ (e.g., `node.json`) from the provided file list.
544
+ 3. **Chunking:** The JSON file is read and split into manageable chunks based on size.
545
+ 4. **Submission:** Each chunk is submitted to the Gen3 Sheepdog API via `gen3.submission`.
546
+ 5. **Response Handling:** The API response, which includes the `submission_id` for
547
+ the records, is captured.
548
+ 6. **Persistence:** The response data is flattened, converted into a DataFrame, and
549
+ written to Parquet files in S3. These records are also registered in a specific
550
+ upload table within the configured database for audit and tracking.
595
551
 
596
552
  Args:
597
- metadata_file_list (list): List of local file paths or S3 URIs to metadata .json files, one per node type.
553
+ metadata_file_list (list): List of local file paths or S3 URIs to
554
+ metadata .json files, one per node type.
598
555
  api_key (dict): Gen3 API key as a parsed dictionary.
599
- project_id (str): Gen3 project ID to submit data to.
600
- data_import_order_path (str): Path or S3 URI to DataImportOrder.txt specifying node submission order.
556
+ project_id (str): Gen3 project ID to submit data to (e.g., "internal-project").
557
+ data_import_order_path (str): Path or S3 URI to DataImportOrder.txt
558
+ specifying node submission order.
559
+ dataset_root (str): S3 path where the parquet files will be stored.
560
+ Example: "s3://acdc-dataops-metadata/metadata_upload/"
561
+ database (str): Database name for storing the metadata upload.
562
+ Example: "acdc_dataops_metadata_db"
563
+ table (str): Table name for storing the metadata upload.
564
+ Example: "metadata_upload"
601
565
  program_id (str, optional): Gen3 program ID (default: "program1").
602
- max_size_kb (int, optional): Maximum size per submission chunk, in KB (default: 100).
603
- exclude_nodes (list, optional): List of node names to skip during submission (default: ["project", "program", "acknowledgement", "publication"]).
604
- max_retries (int, optional): Maximum number of retry attempts per node chunk (default: 3).
605
- aws_profile (str, optional): AWS CLI named profile to use for boto3 session (default: None).
566
+ max_size_kb (int, optional): Maximum size per submission chunk,
567
+ in KB (default: 100).
568
+ exclude_nodes (list, optional): List of node names to skip during
569
+ submission. Defaults to ["project", "program", "acknowledgement", "publication"].
570
+ max_retries (int, optional): Maximum number of retry attempts per
571
+ node chunk (default: 3).
572
+ aws_profile (str, optional): AWS CLI named profile to use for boto3
573
+ session (default: None).
574
+ partition_cols (list, optional): List of column names to partition the parquet table by.
575
+ Defaults to ["upload_datetime"].
576
+ upload_to_database (bool, optional): Whether to upload the metadata to a database.
577
+ Defaults to True. The database is defined by dataset_root, database, and table.
606
578
  """
607
579
  self.metadata_file_list = metadata_file_list
608
580
  self.api_key = api_key
609
581
  self.project_id = project_id
610
582
  self.data_import_order_path = data_import_order_path
583
+ self.dataset_root = dataset_root
584
+ self.database = database
585
+ self.table = table
611
586
  self.program_id = program_id
612
587
  self.max_size_kb = max_size_kb
613
- self.exclude_nodes = exclude_nodes
588
+ self.exclude_nodes = exclude_nodes or [
589
+ "project",
590
+ "program",
591
+ "acknowledgement",
592
+ "publication",
593
+ ]
614
594
  self.max_retries = max_retries
615
595
  self.submission_results = []
616
596
  self.aws_profile = aws_profile
597
+ self.partition_cols = partition_cols or ["upload_datetime"]
598
+ self.upload_to_database = upload_to_database
617
599
  self.boto3_session = self._create_boto3_session()
618
600
  logger.info("MetadataSubmitter initialised.")
619
601
 
620
602
  def _create_gen3_submission_class(self):
603
+ """Helper to instantiate the Gen3Submission class using the provided API key."""
621
604
  return create_gen3_submission_class(self.api_key)
622
-
605
+
623
606
  def _create_boto3_session(self):
607
+ """Helper to create a boto3 session using the provided AWS profile."""
624
608
  return create_boto3_session(self.aws_profile)
625
609
 
626
- def _read_data_import_order(self, data_import_order_path: str, exclude_nodes: list[str], boto3_session = None):
610
+ def _flatten_submission_results(self, submission_results: List[Dict]) -> List[Dict]:
611
+ """
612
+ Flattens a list of Gen3 submission result dictionaries into a single
613
+ list of entity dictionaries.
614
+
615
+ For each submission result, this function processes its entities (if any),
616
+ extracting the 'project_id' and 'submitter_id' from the 'unique_keys'
617
+ field (if present) into the top-level entity dictionary for easy access.
618
+
619
+ Any submission result that does not have a code of 200 or lacks entities
620
+ is skipped, and a warning is logged.
621
+
622
+ Args:
623
+ submission_results (List[Dict]):
624
+ A list of Gen3 submission result dictionaries, each containing at
625
+ least a "code" and "entities" entry.
626
+
627
+ Returns:
628
+ List[Dict]:
629
+ A flat list, where each element is an entity dictionary
630
+ (with keys 'project_id' and 'submitter_id' added if available).
631
+ """
632
+ flat_list_dict = []
633
+ total = len(submission_results)
634
+ logger.info("Flattening %s submission result(s)...", total)
635
+
636
+ for idx, obj in enumerate(submission_results, 1):
637
+ transaction_id = obj.get("transaction_id")
638
+ code = obj.get("code")
639
+ if code != 200:
640
+ logger.warning(
641
+ "Skipping submission result at index %s (code=%s)",
642
+ idx - 1,
643
+ code,
644
+ )
645
+ continue
646
+
647
+ entities = obj.get("entities")
648
+
649
+ if entities is None:
650
+ logger.warning("No entities found in submission result at index %s", idx - 1)
651
+ continue
652
+
653
+ logger.info(
654
+ "Processing submission result %s of %s, %s entities",
655
+ idx,
656
+ total,
657
+ len(entities),
658
+ )
659
+
660
+ for entity in entities:
661
+ unique_keys = entity.get("unique_keys", [{}])
662
+ if unique_keys and isinstance(unique_keys, list):
663
+ keys = unique_keys[0]
664
+ entity["project_id"] = keys.get("project_id")
665
+ entity["submitter_id"] = keys.get("submitter_id")
666
+ entity["transaction_id"] = transaction_id
667
+ entity["file_path"] = obj.get("file_path", '')
668
+ flat_list_dict.append(entity)
669
+
670
+ # renaming cols
671
+ for entity in flat_list_dict:
672
+ entity["gen3_guid"] = entity.pop("id", None)
673
+ entity["node"] = entity.pop("type", None)
674
+
675
+ logger.info("Finished flattening. Total entities: %s", len(flat_list_dict))
676
+ return flat_list_dict
677
+
678
+ def _find_version_from_path(self, path: str) -> Optional[str]:
679
+ """
680
+ Extracts a semantic version string (e.g., '1.0.0' or 'v1.0.0') from a file path.
681
+
682
+ Args:
683
+ path (str): The file path to inspect.
684
+
685
+ Returns:
686
+ Optional[str]: The extracted version string if found, otherwise None.
687
+ """
688
+ version_pattern = re.compile(r"^v?(\d+\.\d+\.\d+)$")
689
+ found_versions = []
690
+
691
+ for segment in path.split('/'):
692
+ match = version_pattern.match(segment)
693
+ if match:
694
+ found_versions.append(match.group(1))
695
+
696
+ if not found_versions:
697
+ return None
698
+
699
+ if len(found_versions) > 1:
700
+ logger.warning("more than one match found in path for version string")
701
+
702
+ return found_versions[-1]
703
+
704
+ def _collect_versions_from_metadata_file_list(self) -> str:
705
+ """
706
+ Extract and validate version information from the internal list of metadata
707
+ file paths (self.metadata_file_list).
708
+
709
+ Returns:
710
+ str: The single version found in the file list.
711
+
712
+ Raises:
713
+ ValueError: If more than one version is found across the files,
714
+ or if no version is found at all.
715
+ """
716
+ versions = []
717
+ for file_path in self.metadata_file_list:
718
+ version = self._find_version_from_path(file_path)
719
+ if version:
720
+ versions.append(version)
721
+ versions = list(set(versions))
722
+ if len(versions) > 1:
723
+ logger.error(
724
+ "more than one version found in metadata file list: %s",
725
+ self.metadata_file_list,
726
+ )
727
+ raise ValueError(
728
+ "More than one version found in metadata file list: %s"
729
+ % self.metadata_file_list
730
+ )
731
+ if not versions:
732
+ raise ValueError(
733
+ "No version found in metadata file list: %s" % self.metadata_file_list
734
+ )
735
+ return versions[0]
736
+
737
+ def _upload_submission_results(self, submission_results: list):
738
+ """
739
+ Uploads the submission results to S3 and a Parquet table.
740
+
741
+ This function performs the final step of the pipeline:
742
+ 1. Flattens the submission response structure.
743
+ 2. Prepares a DataFrame with metadata (upload_id, datetime, version).
744
+ 3. Writes the DataFrame to Parquet files in S3 and registers them in the
745
+ database configured via `self.database` and `self.table`.
746
+
747
+ **Retry Mechanism:**
748
+ Uses the `tenacity` library to retry the upload if it fails.
749
+ - Stop: After `self.max_retries` attempts.
750
+ - Wait: Exponential backoff starting at 1s, doubling up to 10s.
751
+
752
+ Args:
753
+ submission_results (list): List of submission results to upload.
754
+
755
+ Configuration used (from __init__):
756
+ dataset_root (str): e.g. "s3://acdc-dataops-metadata/metadata_upload/"
757
+ database (str): e.g. "acdc_dataops_metadata_db"
758
+ table (str): e.g. "metadata_upload"
759
+ partition_cols (list): e.g. ["upload_datetime"]
760
+ """
761
+
762
+ @retry(
763
+ stop=stop_after_attempt(self.max_retries),
764
+ wait=wait_exponential(multiplier=1, max=10)
765
+ )
766
+ def inner_upload():
767
+ logger.debug("Collecting version from metadata file list.")
768
+ version = self._collect_versions_from_metadata_file_list()
769
+ logger.debug("Extracted version: %s", version)
770
+
771
+ logger.debug("Inferring API endpoint from JWT.")
772
+ api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
773
+ logger.debug("Using API endpoint: %s", api_endpoint)
774
+
775
+ upload_datetime = datetime.now().isoformat()
776
+ upload_id = str(uuid.uuid4())
777
+ logger.debug("Upload datetime: %s", upload_datetime)
778
+ logger.debug("Generated upload ID: %s", upload_id)
779
+
780
+ logger.debug("Flattening submission results for upload.")
781
+ flattened_results = self._flatten_submission_results(submission_results)
782
+ logger.debug(
783
+ "Flattened %s submission result entries.",
784
+ len(flattened_results),
785
+ )
786
+
787
+ logger.debug("Converting flattened results to DataFrame.")
788
+ flattened_results_df = pd.DataFrame(flattened_results)
789
+ flattened_results_df['upload_datetime'] = upload_datetime
790
+ flattened_results_df['upload_id'] = upload_id
791
+ flattened_results_df['api_endpoint'] = api_endpoint
792
+ flattened_results_df['version'] = version
793
+
794
+ logger.info(
795
+ "Writing DataFrame to parquet and S3/table: "
796
+ "dataset_root=%s, database=%s, table=%s, partition_cols=%s",
797
+ self.dataset_root,
798
+ self.database,
799
+ self.table,
800
+ self.partition_cols,
801
+ )
802
+ write_parquet_to_db(
803
+ df=flattened_results_df,
804
+ dataset_root=self.dataset_root,
805
+ database=self.database,
806
+ table=self.table,
807
+ partition_cols=self.partition_cols,
808
+ )
809
+ logger.info(
810
+ "\033[94m[SUCCESS]\033[0m Metadata submission results upload complete. "
811
+ "Uploaded to dataset_root=%s, database=%s, table=%s.",
812
+ self.dataset_root,
813
+ self.database,
814
+ self.table,
815
+ )
816
+
817
+ # Execute the decorated inner function
818
+ try:
819
+ inner_upload()
820
+ except Exception as e:
821
+ logger.critical("Failed to upload submission results after %s attempts.", self.max_retries)
822
+ raise e
823
+
824
+ def _submit_data_chunks(
825
+ self,
826
+ split_json_list: list,
827
+ node: str,
828
+ gen3_submitter,
829
+ file_path: str,
830
+ upload_to_database: bool = True
831
+ ) -> List[Dict]:
832
+ """
833
+ Submit each chunk of data (in split_json_list) for a given node to Gen3,
834
+ using retry logic and logging on failures.
835
+
836
+ Upon completion of each chunk (success or failure), the response is uploaded
837
+ to the configured S3 Parquet table using `_upload_submission_results`.
838
+
839
+ Args:
840
+ split_json_list (list): List of JSON-serializable chunked data to
841
+ submit.
842
+ node (str): Name of the data node being submitted (e.g., "program").
843
+ gen3_submitter: A Gen3Submission instance for making submissions.
844
+ file_path (str): Path of the file that was submitted.
845
+ Used only for data capture in the result logs.
846
+
847
+ Returns:
848
+ List[Dict]: List of response dictionaries for each submitted chunk.
849
+
850
+ Raises:
851
+ RuntimeError: If submission fails after all retry attempts for any chunk.
852
+ """
853
+ n_json_data = len(split_json_list)
854
+
855
+ for index, jsn in enumerate(split_json_list):
856
+ # Holds results for the current chunk
857
+ current_chunk_response: List[Dict[str, Any]] = []
858
+ progress_str = f"{index + 1}/{n_json_data}"
859
+
860
+ submission_success = False
861
+ last_exception: Optional[Exception] = None
862
+
863
+ attempt = 0
864
+ while attempt <= self.max_retries:
865
+ try:
866
+ if attempt == 0:
867
+ logger.info(
868
+ "[SUBMIT] | Project: %-10s | Node: %-12s | "
869
+ "Split: %-5s",
870
+ self.project_id,
871
+ node,
872
+ progress_str,
873
+ )
874
+ else:
875
+ logger.warning(
876
+ "[RETRY] | Project: %-10s | Node: %-12s | "
877
+ "Split: %-5s | "
878
+ "Attempt: %s/%s",
879
+ self.project_id,
880
+ node,
881
+ progress_str,
882
+ attempt,
883
+ self.max_retries,
884
+ )
885
+
886
+ res = gen3_submitter.submit_record(self.program_id, self.project_id, jsn)
887
+ res.update({"file_path": file_path})
888
+ current_chunk_response.append(res)
889
+ submission_success = True
890
+ logger.info(
891
+ "\033[92m[SUCCESS]\033[0m | Project: %-10s | "
892
+ "Node: %-12s | Split: %-5s",
893
+ self.project_id,
894
+ node,
895
+ progress_str,
896
+ )
897
+ break # Success
898
+
899
+ except (
900
+ requests.exceptions.RequestException,
901
+ ValueError,
902
+ TypeError,
903
+ ) as e:
904
+ last_exception = e
905
+ logger.error(
906
+ "Error submitting chunk %s for node '%s': %s",
907
+ progress_str,
908
+ node,
909
+ e,
910
+ )
911
+
912
+ if attempt < self.max_retries:
913
+ time.sleep(0.2)
914
+ else:
915
+ logger.critical(
916
+ "\033[91m[FAILED]\033[0m | Project: %-10s | "
917
+ "Node: %-12s | Split: %-5s | Error: %s",
918
+ self.project_id,
919
+ node,
920
+ progress_str,
921
+ e,
922
+ )
923
+ attempt += 1
924
+
925
+
926
+ if upload_to_database:
927
+ # Also submitting data chunk response info to s3 and parquet table
928
+ logger.info("Submitting data chunk response info to S3 and Parquet table.")
929
+ self._upload_submission_results(submission_results=current_chunk_response)
930
+
931
+ if not submission_success:
932
+ # After retries, still failed
933
+ raise RuntimeError(
934
+ (
935
+ "Failed to submit chunk %s for node '%s' after %s attempts. "
936
+ "Last error: %s"
937
+ )
938
+ % (progress_str, node, self.max_retries + 1, last_exception)
939
+ ) from last_exception
940
+
941
+ logger.info("Finished submitting node '%s'.", node)
942
+
943
+
944
+ def _read_data_import_order(
945
+ self,
946
+ data_import_order_path: str,
947
+ exclude_nodes: List[str],
948
+ boto3_session=None,
949
+ ):
950
+ """Helper to read the data import order from local disk or S3."""
627
951
  if is_s3_uri(data_import_order_path):
628
952
  session = boto3_session or self.boto3_session
629
- return read_data_import_order_txt_s3(data_import_order_path, session, exclude_nodes)
953
+ return read_data_import_order_txt_s3(
954
+ data_import_order_path,
955
+ session,
956
+ exclude_nodes,
957
+ )
630
958
  else:
631
959
  return read_data_import_order_txt(data_import_order_path, exclude_nodes)
632
960
 
@@ -643,7 +971,7 @@ class MetadataSubmitter:
643
971
  list: A list of chunks, where each chunk is a list of dictionaries
644
972
  containing JSON data.
645
973
  """
646
- logger.info(f"Reading metadata json from {metadata_file_path}")
974
+ logger.info("Reading metadata json from %s", metadata_file_path)
647
975
  if is_s3_uri(metadata_file_path):
648
976
  session = self.boto3_session
649
977
  data = read_metadata_json_s3(metadata_file_path, session)
@@ -660,113 +988,62 @@ class MetadataSubmitter:
660
988
  are the corresponding file paths.
661
989
 
662
990
  Returns:
663
- dict: Dictionary mapping node names (str) to their associated metadata file paths (str).
991
+ dict: Dictionary mapping node names (str) to their associated metadata file paths.
664
992
  """
665
993
  file_map = {
666
- get_node_from_file_path(file): file
667
- for file in self.metadata_file_list
994
+ get_node_from_file_path(file_path): file_path
995
+ for file_path in self.metadata_file_list
668
996
  }
669
997
  return file_map
670
998
 
671
- def submit_metadata(self) -> List[Dict]:
999
+ def submit_metadata(self) -> List[Dict[str, Any]]:
672
1000
  """
673
1001
  Submits metadata for each node defined in the data import order, except those in the exclude list.
674
-
675
- For each node, this method retrieves the corresponding metadata file, splits the JSON data
676
- into size-constrained chunks, and submits each chunk to the Gen3 submission API. Responses
677
- from all submissions are gathered and returned as a list.
1002
+
1003
+ **Detailed Process:**
1004
+ 1. **Order Resolution:** The function reads the import order to determine the sequence of nodes.
1005
+ 2. **File Mapping:** It finds the matching `node.json` file for each node in the order.
1006
+ 3. **Chunk & Submit:** For every file, the JSON content is split into chunks and submitted
1007
+ to the Sheepdog API via `gen3.submission`.
1008
+ 4. **Audit Logging:** The API response (containing `submission_id`) is flattened and
1009
+ converted to a DataFrame. This is then written to Parquet files in S3 and registered
1010
+ in the configured upload table.
678
1011
 
679
1012
  Returns:
680
- List[Dict]: A list of response dictionaries returned from the Gen3 metadata submissions.
1013
+ List[Dict[str, Any]]: A list of response dictionaries returned from the Gen3 metadata submissions.
1014
+ Each dictionary contains the response from submitting a chunk of metadata for a given node.
1015
+ The keys in the dictionary are "node_name", "response", and "status_code".
681
1016
  """
682
1017
  gen3_submitter = self._create_gen3_submission_class()
683
- data_import_order = self._read_data_import_order(self.data_import_order_path, self.exclude_nodes, self.boto3_session)
1018
+ data_import_order = self._read_data_import_order(
1019
+ self.data_import_order_path,
1020
+ self.exclude_nodes,
1021
+ self.boto3_session,
1022
+ )
684
1023
  file_map = self._create_file_map()
685
- output_response_list_dict = []
686
1024
 
687
1025
  logger.info("Starting metadata submission.")
688
- for node in data_import_order:
689
1026
 
1027
+ for node in data_import_order:
690
1028
  if node in self.exclude_nodes:
691
- logger.info(f"Skipping node '{node}' (in exclude list).")
1029
+ logger.info("Skipping node '%s' (in exclude list).", node)
692
1030
  continue
693
1031
  file_path = file_map.get(node)
694
1032
  if not file_path:
695
- logger.info(f"Skipping node '{node}' (not present in file list).")
1033
+ logger.info("Skipping node '%s' (not present in file list).", node)
696
1034
  continue
697
1035
 
698
- logger.info(f"Processing file '{file_path}' for node '{node}'.")
1036
+ logger.info("Processing file '%s' for node '%s'.", file_path, node)
699
1037
  logger.info("Splitting JSON data into chunks.")
700
1038
  json_chunks = self._prepare_json_chunks(file_path, self.max_size_kb)
701
1039
 
702
1040
  logger.info("Submitting chunks to Gen3.")
703
- response_list = submit_data_chunks(
1041
+ self._submit_data_chunks(
704
1042
  split_json_list=json_chunks,
705
1043
  node=node,
706
- file_path=file_path,
707
1044
  gen3_submitter=gen3_submitter,
708
- project_id=self.project_id,
709
- max_retries=self.max_retries,
710
- program_id=self.program_id
1045
+ file_path=file_path,
1046
+ upload_to_database=self.upload_to_database
711
1047
  )
712
- output_response_list_dict.extend(response_list)
713
-
714
- self.submission_results = output_response_list_dict
715
- return output_response_list_dict
1048
+
716
1049
 
717
- def upload_metadata_submission_results(
718
- self,
719
- dataset_root: str,
720
- database: str,
721
- table: str,
722
- partition_cols: list = ["upload_datetime"],
723
- ):
724
- """
725
- Uploads the submission results to s3 and parquet table.
726
-
727
- Args:
728
- dataset_root (str): S3 path where the parquet files will be stored
729
- (e.g., "s3://acdc-dataops-metadata/metadata_upload/").
730
- database (str): Database name for storing the metadata upload
731
- (e.g., "acdc_dataops_metadata_db").
732
- table (str): Table name for storing the metadata upload
733
- (e.g., "metadata_upload").
734
- partition_cols (list, optional): List of column names to partition the parquet table by.
735
- Defaults to ["upload_datetime"].
736
- """
737
- logger.info("Collecting version from metadata file list.")
738
- version = collect_versions_from_metadata_file_list(self.metadata_file_list)
739
- logger.info(f"Extracted version: {version}")
740
-
741
- logger.info("Inferring API endpoint from JWT.")
742
- api_endpoint = infer_api_endpoint_from_jwt(self.api_key['api_key'])
743
- logger.info(f"Using API endpoint: {api_endpoint}")
744
-
745
- upload_datetime = datetime.now().isoformat()
746
- upload_id = str(uuid.uuid4())
747
- logger.info(f"Upload datetime: {upload_datetime}")
748
- logger.info(f"Generated upload ID: {upload_id}")
749
-
750
- logger.info("Flattening submission results for upload.")
751
- flattened_results = flatten_submission_results(self.submission_results)
752
- logger.info(f"Flattened {len(flattened_results)} submission result entries.")
753
-
754
- logger.info("Converting flattened results to DataFrame.")
755
- flattened_results_df = pd.DataFrame(flattened_results)
756
- flattened_results_df['upload_datetime'] = upload_datetime
757
- flattened_results_df['upload_id'] = upload_id
758
- flattened_results_df['api_endpoint'] = api_endpoint
759
- flattened_results_df['version'] = version
760
-
761
- logger.info(
762
- f"Writing DataFrame to parquet and S3/table: "
763
- f"dataset_root={dataset_root}, database={database}, table={table}, partition_cols={partition_cols}"
764
- )
765
- write_parquet_to_db(
766
- df=flattened_results_df,
767
- dataset_root=dataset_root,
768
- database=database,
769
- table=table,
770
- partition_cols=partition_cols
771
- )
772
- logger.info("Metadata submission results upload complete.")
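
Taken together, the class changes above fold the old standalone upload_metadata_submission_results step into the submitter itself: when upload_to_database is True, each chunk's response is flattened and written to S3/Parquet during submit_metadata(). A hedged usage sketch of the 0.7.0 interface follows; every bucket, secret, database, region, and project name in it is a placeholder, not a value taken from the package:

from acdc_aws_etl_pipeline.upload.metadata_submitter import (
    MetadataSubmitter,
    create_boto3_session,
    find_data_import_order_file_s3,
    get_gen3_api_key_aws_secret,
    list_metadata_jsons_s3,
)

# Illustrative only: all names below are placeholders.
session = create_boto3_session(aws_profile=None)
api_key = get_gen3_api_key_aws_secret(
    secret_name="example/gen3-api-key",  # hypothetical secret
    region_name="ap-southeast-2",        # hypothetical region
    session=session,
)

metadata_dir = "s3://example-bucket/metadata/v1.0.0/"  # hypothetical prefix
submitter = MetadataSubmitter(
    metadata_file_list=list_metadata_jsons_s3(metadata_dir, session),
    api_key=api_key,
    project_id="example-project",
    data_import_order_path=find_data_import_order_file_s3(metadata_dir, session),
    dataset_root="s3://example-bucket/metadata_upload/",  # new required argument in 0.7.0
    database="example_metadata_db",                       # new required argument in 0.7.0
    table="metadata_upload",                              # new required argument in 0.7.0
    upload_to_database=True,  # per-chunk responses are written via write_parquet_to_db
)

submitter.submit_metadata()

Under 0.6.9 the equivalent flow called submit_metadata() and then passed the collected results to a separate upload_metadata_submission_results(dataset_root, database, table) call, which 0.7.0 removes.
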
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: acdc_aws_etl_pipeline
3
- Version: 0.6.9
3
+ Version: 0.7.0
4
4
  Summary: Tools for ACDC ETL pipeline
5
5
  Author: JoshuaHarris391
6
6
  Author-email: harjo391@gmail.com
@@ -23,6 +23,7 @@ Requires-Dist: python-dotenv
23
23
  Requires-Dist: pytz (>=2025.2,<2026.0)
24
24
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
25
25
  Requires-Dist: s3fs (==2025.10.0)
26
+ Requires-Dist: tenacity (>=8.2,<10.0)
26
27
  Requires-Dist: tzlocal (>=5.3.1,<6.0.0)
27
28
  Description-Content-Type: text/markdown
28
29
 
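
The added tenacity requirement backs the @retry decorator wrapped around inner_upload() in _upload_submission_results above. A minimal self-contained sketch of that pattern, using a stand-in function rather than the real parquet/database write:

from tenacity import retry, stop_after_attempt, wait_exponential

calls = {"count": 0}

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, max=10))
def flaky_upload():
    # Stand-in for the write performed inside inner_upload().
    calls["count"] += 1
    if calls["count"] < 3:
        raise ConnectionError("transient failure")
    return "uploaded"

# Retries on exception with exponentially growing waits (capped at 10 s) and
# stops after three attempts; here the third attempt succeeds.
print(flaky_upload())
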
@@ -3,12 +3,12 @@ acdc_aws_etl_pipeline/ingest/ingest.py,sha256=5Q63PZfUVB5L1WxwElAxG6N-4GvqBuTNp6
3
3
  acdc_aws_etl_pipeline/upload/__init__.py,sha256=kRI1wozjK-b9YXMAPwzWHzm967ZiUAM6g8rRo4ONWtI,67
4
4
  acdc_aws_etl_pipeline/upload/gen3datasubmitter.py,sha256=bu5d8IOsKFIA1uvvzaxb7YIKwBZKdP-0QvBt-gZMyUc,8625
5
5
  acdc_aws_etl_pipeline/upload/metadata_deleter.py,sha256=T4q9xqSE2Beu3zluvAmKh7wJWcCFGz2AZ9h9ZcASfyA,63
6
- acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=k5q5hRkj-dWo25z9nVZI2eNh0xnmQU8TPDffSSnQlUY,29906
6
+ acdc_aws_etl_pipeline/upload/metadata_submitter.py,sha256=2PVuv-mvjnO-FxVZHiYfTDlbioEo-JsTcvNZY6v2n40,38331
7
7
  acdc_aws_etl_pipeline/upload/upload_synthdata_s3.py,sha256=Ge5TQzZkWnJNp-q0Ak-Yhv1h1eWLxg-PlWVHrd1m0B8,5155
8
8
  acdc_aws_etl_pipeline/utils/athena_utils.py,sha256=QJlBe-07Hkq-BqmcxBu6ZtAmVfZSHuSY4dijcysgPH8,29560
9
9
  acdc_aws_etl_pipeline/utils/dbt_utils.py,sha256=5XRFOwNNIeuW2sQuor3h_OZTuXGg6xv2AUYwj9bMAAM,2054
10
10
  acdc_aws_etl_pipeline/utils/release_writer.py,sha256=vsxHJ6l-UWPpzeyEPHurX5iFgeCEQ-9FbySAbPNfTTM,7555
11
11
  acdc_aws_etl_pipeline/validate/validate.py,sha256=zLqK9i92FsRAaBOGdY-G7-vb0e6tmkoUXhY6zCfbjN8,24895
12
- acdc_aws_etl_pipeline-0.6.9.dist-info/METADATA,sha256=L02r4oi2Xhtoet7a4HCfV8nGDlmmc2gSVsCN8sMNjTc,2926
13
- acdc_aws_etl_pipeline-0.6.9.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
14
- acdc_aws_etl_pipeline-0.6.9.dist-info/RECORD,,
12
+ acdc_aws_etl_pipeline-0.7.0.dist-info/METADATA,sha256=bBjSnhz3qpcycFzpVmv7SObXH7dCzXxOvIOeLEw0eJc,2964
13
+ acdc_aws_etl_pipeline-0.7.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
14
+ acdc_aws_etl_pipeline-0.7.0.dist-info/RECORD,,