nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/mongo.py +435 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +270 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +796 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +425 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +633 -13
- nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
- nmdc_runtime/site/graphs.py +8 -22
- nmdc_runtime/site/ops.py +147 -181
- nmdc_runtime/site/repository.py +2 -112
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +90 -48
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,8 +4,9 @@ import datetime
|
|
|
4
4
|
import xml.etree.ElementTree as ET
|
|
5
5
|
import xml.dom.minidom
|
|
6
6
|
|
|
7
|
-
from typing import Any, List
|
|
7
|
+
from typing import Any, List
|
|
8
8
|
from urllib.parse import urlparse
|
|
9
|
+
from unidecode import unidecode
|
|
9
10
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
11
|
handle_controlled_identified_term_value,
|
|
11
12
|
handle_controlled_term_value,
|
|
@@ -16,7 +17,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
|
16
17
|
handle_float_value,
|
|
17
18
|
handle_string_value,
|
|
18
19
|
load_mappings,
|
|
19
|
-
validate_xml,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
22
|
|
|
@@ -163,16 +163,53 @@ class NCBISubmissionXML:
|
|
|
163
163
|
org,
|
|
164
164
|
bioproject_id,
|
|
165
165
|
nmdc_biosamples,
|
|
166
|
+
pooled_biosamples_data=None,
|
|
166
167
|
):
|
|
167
168
|
attribute_mappings, slot_range_mappings = load_mappings(
|
|
168
169
|
self.nmdc_ncbi_attribute_mapping_file_url
|
|
169
170
|
)
|
|
170
171
|
|
|
172
|
+
# Use provided pooling data or empty dict
|
|
173
|
+
pooling_data = pooled_biosamples_data or {}
|
|
174
|
+
|
|
175
|
+
# Group biosamples by pooling process
|
|
176
|
+
pooling_groups = {}
|
|
177
|
+
individual_biosamples = []
|
|
178
|
+
|
|
171
179
|
for biosample in nmdc_biosamples:
|
|
180
|
+
pooling_info = pooling_data.get(biosample["id"], {})
|
|
181
|
+
if pooling_info and pooling_info.get("pooling_process_id"):
|
|
182
|
+
pooling_process_id = pooling_info["pooling_process_id"]
|
|
183
|
+
if pooling_process_id not in pooling_groups:
|
|
184
|
+
pooling_groups[pooling_process_id] = {
|
|
185
|
+
"biosamples": [],
|
|
186
|
+
"pooling_info": pooling_info,
|
|
187
|
+
}
|
|
188
|
+
pooling_groups[pooling_process_id]["biosamples"].append(biosample)
|
|
189
|
+
else:
|
|
190
|
+
individual_biosamples.append(biosample)
|
|
191
|
+
|
|
192
|
+
# Process pooled sample groups - create one <Action> block per pooling process
|
|
193
|
+
for pooling_process_id, group_data in pooling_groups.items():
|
|
194
|
+
self._create_pooled_biosample_action(
|
|
195
|
+
group_data["biosamples"],
|
|
196
|
+
group_data["pooling_info"],
|
|
197
|
+
organism_name,
|
|
198
|
+
org,
|
|
199
|
+
bioproject_id,
|
|
200
|
+
attribute_mappings,
|
|
201
|
+
slot_range_mappings,
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
# Process individual biosamples
|
|
205
|
+
for biosample in individual_biosamples:
|
|
172
206
|
attributes = {}
|
|
173
207
|
sample_id_value = None
|
|
174
208
|
env_package = None
|
|
175
209
|
|
|
210
|
+
# Get pooling info for this specific biosample
|
|
211
|
+
pooling_info = pooling_data.get(biosample["id"], {})
|
|
212
|
+
|
|
176
213
|
for json_key, value in biosample.items():
|
|
177
214
|
if isinstance(value, list):
|
|
178
215
|
for item in value:
|
|
@@ -191,15 +228,6 @@ class NCBISubmissionXML:
|
|
|
191
228
|
attributes[xml_key] = value
|
|
192
229
|
continue # Skip applying the handler to this key
|
|
193
230
|
|
|
194
|
-
# Special handling for "host_taxid"
|
|
195
|
-
if json_key == "host_taxid" and isinstance(value, dict):
|
|
196
|
-
if "term" in value and "id" in value["term"]:
|
|
197
|
-
value = re.findall(
|
|
198
|
-
r"\d+", value["term"]["id"].split(":")[1]
|
|
199
|
-
)[0]
|
|
200
|
-
attributes[xml_key] = value
|
|
201
|
-
continue # Skip applying the handler to this key
|
|
202
|
-
|
|
203
231
|
formatted_value = handler(item)
|
|
204
232
|
|
|
205
233
|
# Combine multiple values with a separator for list elements
|
|
@@ -214,7 +242,11 @@ class NCBISubmissionXML:
|
|
|
214
242
|
|
|
215
243
|
# Special handling for NMDC Biosample "id"
|
|
216
244
|
if json_key == "id":
|
|
217
|
-
|
|
245
|
+
# Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
|
|
246
|
+
if pooling_info and pooling_info.get("processed_sample_id"):
|
|
247
|
+
sample_id_value = pooling_info["processed_sample_id"]
|
|
248
|
+
else:
|
|
249
|
+
sample_id_value = value
|
|
218
250
|
continue
|
|
219
251
|
|
|
220
252
|
if json_key not in attribute_mappings:
|
|
@@ -237,10 +269,39 @@ class NCBISubmissionXML:
|
|
|
237
269
|
attributes[xml_key] = value
|
|
238
270
|
continue # Skip applying the handler to this key
|
|
239
271
|
|
|
272
|
+
# Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
|
|
273
|
+
if json_key == "geo_loc_name":
|
|
274
|
+
formatted_value = handler(value)
|
|
275
|
+
formatted_value_ascii = unidecode(formatted_value)
|
|
276
|
+
attributes[xml_key] = formatted_value_ascii
|
|
277
|
+
continue # Skip applying the handler to this key
|
|
278
|
+
|
|
240
279
|
# Default processing for other keys
|
|
241
280
|
formatted_value = handler(value)
|
|
242
281
|
attributes[xml_key] = formatted_value
|
|
243
282
|
|
|
283
|
+
# Override with aggregated values for pooled samples
|
|
284
|
+
if pooling_info:
|
|
285
|
+
if pooling_info.get("aggregated_collection_date"):
|
|
286
|
+
# Find the mapping for collection_date
|
|
287
|
+
collection_date_key = attribute_mappings.get(
|
|
288
|
+
"collection_date", "collection_date"
|
|
289
|
+
)
|
|
290
|
+
attributes[collection_date_key] = pooling_info[
|
|
291
|
+
"aggregated_collection_date"
|
|
292
|
+
]
|
|
293
|
+
|
|
294
|
+
if pooling_info.get("aggregated_depth"):
|
|
295
|
+
# Find the mapping for depth
|
|
296
|
+
depth_key = attribute_mappings.get("depth", "depth")
|
|
297
|
+
attributes[depth_key] = pooling_info["aggregated_depth"]
|
|
298
|
+
|
|
299
|
+
# Add samp_pooling attribute with semicolon-delimited biosample IDs
|
|
300
|
+
if pooling_info.get("pooled_biosample_ids"):
|
|
301
|
+
attributes["samp_pooling"] = ";".join(
|
|
302
|
+
pooling_info["pooled_biosample_ids"]
|
|
303
|
+
)
|
|
304
|
+
|
|
244
305
|
biosample_elements = [
|
|
245
306
|
self.set_element(
|
|
246
307
|
"SampleId",
|
|
@@ -261,7 +322,48 @@ class NCBISubmissionXML:
|
|
|
261
322
|
f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
|
|
262
323
|
),
|
|
263
324
|
),
|
|
264
|
-
]
|
|
325
|
+
]
|
|
326
|
+
+ (
|
|
327
|
+
# Add external links for pooled samples
|
|
328
|
+
[
|
|
329
|
+
self.set_element(
|
|
330
|
+
"ExternalLink",
|
|
331
|
+
attrib={"label": "NMDC Processed Sample"},
|
|
332
|
+
children=[
|
|
333
|
+
self.set_element(
|
|
334
|
+
"URL",
|
|
335
|
+
f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
|
|
336
|
+
)
|
|
337
|
+
],
|
|
338
|
+
),
|
|
339
|
+
self.set_element(
|
|
340
|
+
"ExternalLink",
|
|
341
|
+
attrib={"label": "NMDC Pooling Process"},
|
|
342
|
+
children=[
|
|
343
|
+
self.set_element(
|
|
344
|
+
"URL",
|
|
345
|
+
f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
|
|
346
|
+
)
|
|
347
|
+
],
|
|
348
|
+
),
|
|
349
|
+
]
|
|
350
|
+
if pooling_info
|
|
351
|
+
and pooling_info.get("processed_sample_id")
|
|
352
|
+
and pooling_info.get("pooling_process_id")
|
|
353
|
+
else [
|
|
354
|
+
# Add external link for individual biosamples
|
|
355
|
+
self.set_element(
|
|
356
|
+
"ExternalLink",
|
|
357
|
+
attrib={"label": sample_id_value},
|
|
358
|
+
children=[
|
|
359
|
+
self.set_element(
|
|
360
|
+
"URL",
|
|
361
|
+
f"https://bioregistry.io/{sample_id_value}",
|
|
362
|
+
)
|
|
363
|
+
],
|
|
364
|
+
),
|
|
365
|
+
]
|
|
366
|
+
),
|
|
265
367
|
),
|
|
266
368
|
self.set_element(
|
|
267
369
|
"Organism",
|
|
@@ -333,6 +435,248 @@ class NCBISubmissionXML:
|
|
|
333
435
|
)
|
|
334
436
|
self.root.append(action)
|
|
335
437
|
|
|
438
|
+
def _create_pooled_biosample_action(
    self,
    biosamples,
    pooling_info,
    organism_name,
    org,
    bioproject_id,
    attribute_mappings,
    slot_range_mappings,
):
    """Append one BioSample ``<Action>`` for a pool of biosamples to ``self.root``.

    The action is identified by ``pooling_info["processed_sample_id"]``; when
    that ID is missing or empty nothing is appended and the method returns.

    Args:
        biosamples: Iterable of biosample dicts belonging to one pooling process.
        pooling_info: Dict read with the keys ``processed_sample_id``,
            ``processed_sample_name``, ``pooling_process_id``,
            ``aggregated_collection_date``, ``aggregated_depth`` and
            ``pooled_biosample_ids``; every key except the first is optional.
        organism_name: Text of the ``OrganismName`` element.
        org: Value used for the ``spuid_namespace`` attribute.
        bioproject_id: Text of the BioProject ``PrimaryId`` element.
        attribute_mappings: Maps biosample keys to attribute names; keys that
            are not present in this mapping are ignored.
        slot_range_mappings: Maps biosample keys to a value-type name used to
            pick a formatter from ``self.type_handlers`` (default ``"string"``).

    Returns:
        None. The result is the element appended to ``self.root``.
    """
    sample_id_value = pooling_info.get("processed_sample_id")
    if not sample_id_value:
        return

    # Prefer the processed sample's own name; fall back to a generated title
    # when the name is absent or empty so the Title element is never blank.
    title = (
        pooling_info.get("processed_sample_name")
        or f"Pooled sample {sample_id_value}"
    )

    aggregated_attributes = {}
    env_package = None

    def collect(json_key, raw):
        """Fold one (already unwrapped) value into ``aggregated_attributes``."""
        xml_key = attribute_mappings[json_key]

        # "elev" bypasses the type handlers and is always overwritten, so the
        # last biosample in the pool wins for this attribute.
        if json_key == "elev":
            aggregated_attributes[xml_key] = f"{float(raw)} m"
            return

        # "host_taxid" dicts are reduced to the digits after the CURIE prefix
        # when a term ID is available; otherwise the dict is stored untouched.
        if json_key == "host_taxid" and isinstance(raw, dict):
            if "term" in raw and "id" in raw["term"]:
                raw = re.findall(r"\d+", raw["term"]["id"].split(":")[1])[0]
            aggregated_attributes[xml_key] = raw
            return

        value_type = slot_range_mappings.get(json_key, "string")
        handler = self.type_handlers.get(value_type, handle_string_value)
        # Every other attribute keeps the first formatted value seen.
        aggregated_attributes.setdefault(xml_key, handler(raw))

    for biosample in biosamples:
        for json_key, value in biosample.items():
            if json_key == "id":
                continue

            if json_key == "env_package":
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
                continue

            if json_key not in attribute_mappings:
                continue

            # List values are handled element by element. The special cases in
            # ``collect`` therefore see each item rather than the whole list
            # (calling ``float`` on a list would raise TypeError).
            items = value if isinstance(value, list) else [value]
            for raw in items:
                collect(json_key, raw)

    # Pool-level values take precedence over anything gathered per biosample.
    if pooling_info.get("aggregated_collection_date"):
        collection_date_key = attribute_mappings.get(
            "collection_date", "collection_date"
        )
        aggregated_attributes[collection_date_key] = pooling_info[
            "aggregated_collection_date"
        ]

    if pooling_info.get("aggregated_depth"):
        depth_key = attribute_mappings.get("depth", "depth")
        aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]

    # samp_pooling is the semicolon-delimited list of pooled biosample IDs.
    if pooling_info.get("pooled_biosample_ids"):
        aggregated_attributes["samp_pooling"] = ";".join(
            pooling_info["pooled_biosample_ids"]
        )

    # Only this fixed set of attribute names is emitted for pooled samples.
    allowed_attributes = {
        "collection_date",
        "depth",
        "elev",
        "geo_loc_name",
        "lat_lon",
        "env_broad_scale",
        "env_local_scale",
        "env_medium",
        "samp_pooling",
    }
    filtered_attributes = {
        k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
    }

    def external_link(identifier):
        """Build an ExternalLink labelled with, and pointing at, ``identifier``."""
        return self.set_element(
            "ExternalLink",
            attrib={"label": identifier},
            children=[
                self.set_element("URL", f"https://bioregistry.io/{identifier}")
            ],
        )

    # Link order: processed sample, pooling process, then each pooled biosample.
    linked_ids = [sample_id_value]
    pooling_process_id = pooling_info.get("pooling_process_id")
    if pooling_process_id:
        linked_ids.append(pooling_process_id)
    linked_ids.extend(pooling_info.get("pooled_biosample_ids", []))

    biosample_elements = [
        self.set_element(
            "SampleId",
            children=[
                self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
            ],
        ),
        self.set_element(
            "Descriptor",
            children=[self.set_element("Title", title)]
            + [external_link(identifier) for identifier in linked_ids],
        ),
        self.set_element(
            "Organism",
            children=[self.set_element("OrganismName", organism_name)],
        ),
        self.set_element(
            "BioProject",
            children=[
                self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
            ],
        ),
        self.set_element("Package", env_package),
        self.set_element(
            "Attributes",
            children=[
                self.set_element(
                    "Attribute", filtered_attributes[key], {"attribute_name": key}
                )
                for key in sorted(filtered_attributes)
            ]
            + [
                self.set_element(
                    "Attribute",
                    "National Microbiome Data Collaborative",
                    {"attribute_name": "broker name"},
                )
            ],
        ),
    ]

    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddData",
                attrib={"target_db": "BioSample"},
                children=[
                    self.set_element(
                        "Data",
                        attrib={"content_type": "XML"},
                        children=[
                            self.set_element(
                                "XmlContent",
                                children=[
                                    self.set_element(
                                        "BioSample",
                                        attrib={"schema_version": "2.0"},
                                        children=biosample_elements,
                                    ),
                                ],
                            ),
                        ],
                    ),
                    self.set_element(
                        "Identifier",
                        children=[
                            self.set_element(
                                "SPUID",
                                sample_id_value,
                                {"spuid_namespace": org},
                            ),
                        ],
                    ),
                ],
            ),
        ],
    )
    self.root.append(action)
|
|
679
|
+
|
|
336
680
|
def set_fastq(
|
|
337
681
|
self,
|
|
338
682
|
biosample_data_objects: list,
|
|
@@ -342,12 +686,57 @@ class NCBISubmissionXML:
|
|
|
342
686
|
nmdc_biosamples: list,
|
|
343
687
|
nmdc_library_preparation: list,
|
|
344
688
|
all_instruments: dict,
|
|
689
|
+
pooled_biosamples_data=None,
|
|
345
690
|
):
|
|
346
691
|
bsm_id_name_dict = {
|
|
347
692
|
biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
|
|
348
693
|
}
|
|
349
694
|
|
|
695
|
+
# Use provided pooling data or empty dict
|
|
696
|
+
pooling_data = pooled_biosamples_data or {}
|
|
697
|
+
|
|
698
|
+
# Group data objects by pooling process
|
|
699
|
+
pooling_groups = {}
|
|
700
|
+
individual_entries = []
|
|
701
|
+
|
|
350
702
|
for entry in biosample_data_objects:
|
|
703
|
+
pooling_process_id = None
|
|
704
|
+
# Check if any biosample in this entry belongs to a pooling process
|
|
705
|
+
for biosample_id in entry.keys():
|
|
706
|
+
pooling_info = pooling_data.get(biosample_id, {})
|
|
707
|
+
if pooling_info and pooling_info.get("pooling_process_id"):
|
|
708
|
+
pooling_process_id = pooling_info["pooling_process_id"]
|
|
709
|
+
break
|
|
710
|
+
|
|
711
|
+
if pooling_process_id:
|
|
712
|
+
if pooling_process_id not in pooling_groups:
|
|
713
|
+
pooling_groups[pooling_process_id] = {
|
|
714
|
+
"entries": [],
|
|
715
|
+
"processed_sample_id": pooling_info.get("processed_sample_id"),
|
|
716
|
+
"processed_sample_name": pooling_info.get(
|
|
717
|
+
"processed_sample_name", ""
|
|
718
|
+
),
|
|
719
|
+
}
|
|
720
|
+
pooling_groups[pooling_process_id]["entries"].append(entry)
|
|
721
|
+
else:
|
|
722
|
+
individual_entries.append(entry)
|
|
723
|
+
|
|
724
|
+
# Process pooled entries - create one SRA <Action> block per pooling process
|
|
725
|
+
for pooling_process_id, group_data in pooling_groups.items():
|
|
726
|
+
self._create_pooled_sra_action(
|
|
727
|
+
group_data["entries"],
|
|
728
|
+
group_data["processed_sample_id"],
|
|
729
|
+
group_data["processed_sample_name"],
|
|
730
|
+
bioproject_id,
|
|
731
|
+
org,
|
|
732
|
+
nmdc_nucleotide_sequencing,
|
|
733
|
+
nmdc_library_preparation,
|
|
734
|
+
all_instruments,
|
|
735
|
+
bsm_id_name_dict,
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
# Process individual entries
|
|
739
|
+
for entry in individual_entries:
|
|
351
740
|
fastq_files = []
|
|
352
741
|
biosample_ids = []
|
|
353
742
|
nucleotide_sequencing_ids = {}
|
|
@@ -532,6 +921,7 @@ class NCBISubmissionXML:
|
|
|
532
921
|
)
|
|
533
922
|
)
|
|
534
923
|
|
|
924
|
+
# Add library_name attribute
|
|
535
925
|
if library_name:
|
|
536
926
|
sra_attributes.append(
|
|
537
927
|
self.set_element(
|
|
@@ -577,6 +967,233 @@ class NCBISubmissionXML:
|
|
|
577
967
|
|
|
578
968
|
self.root.append(action)
|
|
579
969
|
|
|
970
|
+
def _create_pooled_sra_action(
    self,
    entries,
    processed_sample_id,
    processed_sample_name,
    bioproject_id,
    org,
    nmdc_nucleotide_sequencing,
    nmdc_library_preparation,
    all_instruments,
    bsm_id_name_dict,
):
    """Append one SRA ``AddFiles`` ``<Action>`` for a pooled sample to ``self.root``.

    Nothing is appended when ``processed_sample_id`` is empty, when no data
    object in ``entries`` carries a ``url``, or when no nucleotide sequencing
    ID can be found to serve as the action's ``Identifier``.

    Args:
        entries: Iterable of dicts mapping a biosample ID to a list of data
            object dicts; the basename of each ``url`` path becomes a file.
        processed_sample_id: SPUID the action's BioSample reference points to.
        processed_sample_name: Used as the ``library_name`` attribute if set.
        bioproject_id: Text of the BioProject ``PrimaryId`` reference.
        org: Value used for every ``spuid_namespace`` attribute.
        nmdc_nucleotide_sequencing: Iterable of dicts mapping a biosample ID to
            a list of sequencing records (``id``, ``instrument_used``,
            ``analyte_category`` are read).
        nmdc_library_preparation: Iterable of dicts mapping a biosample ID to a
            record whose ``protocol_link.name`` is read.
        all_instruments: Dict mapping instrument ID to a dict with ``vendor``
            and ``model``.
        bsm_id_name_dict: Accepted for parity with the caller; not read here.

    Returns:
        None. The result is the element appended to ``self.root``.
    """
    if not processed_sample_id:
        return

    fastq_names = set()
    nucleotide_sequencing_ids = {}
    lib_prep_protocol_names = {}
    analyte_category = ""
    instrument_vendor = ""
    instrument_model = ""

    for entry in entries:
        for biosample_id, data_objects in entry.items():
            for data_object in data_objects:
                if "url" in data_object:
                    url_path = urlparse(data_object["url"]).path
                    fastq_names.add(os.path.basename(url_path))

            # Later sequencing records overwrite earlier ones, so the instrument
            # and analyte category come from the last record that supplies them.
            for ntseq_dict in nmdc_nucleotide_sequencing:
                if biosample_id not in ntseq_dict:
                    continue
                for ntseq in ntseq_dict[biosample_id]:
                    nucleotide_sequencing_ids[biosample_id] = ntseq.get("id", "")
                    instrument_used = ntseq.get("instrument_used", [])
                    if instrument_used:
                        instrument = all_instruments.get(instrument_used[0], {})
                        instrument_vendor = instrument.get("vendor", "")
                        instrument_model = instrument.get("model", "")
                    analyte_category = ntseq.get("analyte_category", "")

            for lib_prep_dict in nmdc_library_preparation:
                if biosample_id in lib_prep_dict:
                    lib_prep_protocol_names[biosample_id] = (
                        lib_prep_dict[biosample_id]
                        .get("protocol_link", {})
                        .get("name", "")
                    )

    if not fastq_names:
        return

    # The first non-empty sequencing ID identifies the action. Bail out before
    # building anything if there is none, rather than referencing an
    # identifier element that was never created.
    omics_processing_id = next(
        (seq_id for seq_id in nucleotide_sequencing_ids.values() if seq_id), None
    )
    if not omics_processing_id:
        return

    files_elements = [
        self.set_element(
            "File",
            "",
            {"file_path": name},
            [
                self.set_element(
                    "DataType",
                    "sra-run-fastq" if ".fastq" in name else "generic-data",
                )
            ],
        )
        for name in sorted(fastq_names)
    ]

    attribute_elements = [
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioProject"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "PrimaryId",
                            bioproject_id,
                            {"db": "BioProject"},
                        )
                    ],
                )
            ],
        ),
        # Reference the processed sample, not the individual biosamples.
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioSample"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "SPUID",
                            processed_sample_id,
                            {"spuid_namespace": org},
                        )
                    ],
                )
            ],
        ),
    ]

    def sra_attribute(name, text):
        """Build one SRA ``Attribute`` element named ``name``."""
        return self.set_element("Attribute", text, {"name": name})

    sra_attributes = []

    # Only the Illumina vendor and these three models are translated; anything
    # else contributes no platform / instrument_model attribute.
    if instrument_vendor == "illumina":
        sra_attributes.append(sra_attribute("platform", "ILLUMINA"))
        model_labels = {
            "nextseq_550": "NextSeq 550",
            "novaseq_6000": "NovaSeq 6000",
            "hiseq": "HiSeq",
        }
        if instrument_model in model_labels:
            sra_attributes.append(
                sra_attribute("instrument_model", model_labels[instrument_model])
            )

    if analyte_category == "metagenome":
        sra_attributes.append(sra_attribute("library_strategy", "WGS"))
        sra_attributes.append(sra_attribute("library_source", "METAGENOMIC"))
        sra_attributes.append(sra_attribute("library_selection", "RANDOM"))
    elif analyte_category == "metatranscriptome":
        sra_attributes.append(sra_attribute("library_source", "METATRANSCRIPTOMIC"))

    # A pool is "paired" when some file containing "_R1" has a sibling whose
    # name is identical except for "_R2" in place of "_R1".
    has_paired_reads = any(
        name.replace("_R1", "_R2") in fastq_names
        for name in fastq_names
        if "_R1" in name
    )
    sra_attributes.append(
        sra_attribute("library_layout", "paired" if has_paired_reads else "single")
    )

    if processed_sample_name:
        sra_attributes.append(sra_attribute("library_name", processed_sample_name))

    # Emit at most one library construction protocol: the first non-empty name.
    lib_prep_name = next(
        (name for name in lib_prep_protocol_names.values() if name), None
    )
    if lib_prep_name:
        sra_attributes.append(
            sra_attribute("library_construction_protocol", lib_prep_name)
        )

    identifier_element = self.set_element(
        "Identifier",
        children=[
            self.set_element("SPUID", omics_processing_id, {"spuid_namespace": org})
        ],
    )

    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddFiles",
                attrib={"target_db": "SRA"},
                children=files_elements
                + attribute_elements
                + sra_attributes
                + [identifier_element],
            ),
        ],
    )

    self.root.append(action)
|
|
1196
|
+
|
|
580
1197
|
def get_submission_xml(
|
|
581
1198
|
self,
|
|
582
1199
|
biosamples_list: list,
|
|
@@ -584,6 +1201,7 @@ class NCBISubmissionXML:
|
|
|
584
1201
|
biosample_data_objects_list: list,
|
|
585
1202
|
biosample_library_preparation_list: list,
|
|
586
1203
|
instruments_dict: dict,
|
|
1204
|
+
pooled_biosamples_data=None,
|
|
587
1205
|
):
|
|
588
1206
|
# data_type = None
|
|
589
1207
|
|
|
@@ -646,6 +1264,7 @@ class NCBISubmissionXML:
|
|
|
646
1264
|
org=self.ncbi_submission_metadata.get("organization", ""),
|
|
647
1265
|
bioproject_id=self.ncbi_bioproject_id,
|
|
648
1266
|
nmdc_biosamples=filtered_biosamples_list,
|
|
1267
|
+
pooled_biosamples_data=pooled_biosamples_data,
|
|
649
1268
|
)
|
|
650
1269
|
|
|
651
1270
|
# Also filter biosample_data_objects_list
|
|
@@ -692,6 +1311,7 @@ class NCBISubmissionXML:
|
|
|
692
1311
|
nmdc_biosamples=filtered_biosamples_list,
|
|
693
1312
|
nmdc_library_preparation=filtered_library_preparation_list,
|
|
694
1313
|
all_instruments=instruments_dict,
|
|
1314
|
+
pooled_biosamples_data=pooled_biosamples_data,
|
|
695
1315
|
)
|
|
696
1316
|
|
|
697
1317
|
rough_string = ET.tostring(self.root, "unicode")
|