nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131) hide show
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,8 +4,9 @@ import datetime
4
4
  import xml.etree.ElementTree as ET
5
5
  import xml.dom.minidom
6
6
 
7
- from typing import Any, List, Union
7
+ from typing import Any, List
8
8
  from urllib.parse import urlparse
9
+ from unidecode import unidecode
9
10
  from nmdc_runtime.site.export.ncbi_xml_utils import (
10
11
  handle_controlled_identified_term_value,
11
12
  handle_controlled_term_value,
@@ -16,7 +17,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
16
17
  handle_float_value,
17
18
  handle_string_value,
18
19
  load_mappings,
19
- validate_xml,
20
20
  )
21
21
 
22
22
 
@@ -163,16 +163,53 @@ class NCBISubmissionXML:
163
163
  org,
164
164
  bioproject_id,
165
165
  nmdc_biosamples,
166
+ pooled_biosamples_data=None,
166
167
  ):
167
168
  attribute_mappings, slot_range_mappings = load_mappings(
168
169
  self.nmdc_ncbi_attribute_mapping_file_url
169
170
  )
170
171
 
172
+ # Use provided pooling data or empty dict
173
+ pooling_data = pooled_biosamples_data or {}
174
+
175
+ # Group biosamples by pooling process
176
+ pooling_groups = {}
177
+ individual_biosamples = []
178
+
171
179
  for biosample in nmdc_biosamples:
180
+ pooling_info = pooling_data.get(biosample["id"], {})
181
+ if pooling_info and pooling_info.get("pooling_process_id"):
182
+ pooling_process_id = pooling_info["pooling_process_id"]
183
+ if pooling_process_id not in pooling_groups:
184
+ pooling_groups[pooling_process_id] = {
185
+ "biosamples": [],
186
+ "pooling_info": pooling_info,
187
+ }
188
+ pooling_groups[pooling_process_id]["biosamples"].append(biosample)
189
+ else:
190
+ individual_biosamples.append(biosample)
191
+
192
+ # Process pooled sample groups - create one <Action> block per pooling process
193
+ for pooling_process_id, group_data in pooling_groups.items():
194
+ self._create_pooled_biosample_action(
195
+ group_data["biosamples"],
196
+ group_data["pooling_info"],
197
+ organism_name,
198
+ org,
199
+ bioproject_id,
200
+ attribute_mappings,
201
+ slot_range_mappings,
202
+ )
203
+
204
+ # Process individual biosamples
205
+ for biosample in individual_biosamples:
172
206
  attributes = {}
173
207
  sample_id_value = None
174
208
  env_package = None
175
209
 
210
+ # Get pooling info for this specific biosample
211
+ pooling_info = pooling_data.get(biosample["id"], {})
212
+
176
213
  for json_key, value in biosample.items():
177
214
  if isinstance(value, list):
178
215
  for item in value:
@@ -191,15 +228,6 @@ class NCBISubmissionXML:
191
228
  attributes[xml_key] = value
192
229
  continue # Skip applying the handler to this key
193
230
 
194
- # Special handling for "host_taxid"
195
- if json_key == "host_taxid" and isinstance(value, dict):
196
- if "term" in value and "id" in value["term"]:
197
- value = re.findall(
198
- r"\d+", value["term"]["id"].split(":")[1]
199
- )[0]
200
- attributes[xml_key] = value
201
- continue # Skip applying the handler to this key
202
-
203
231
  formatted_value = handler(item)
204
232
 
205
233
  # Combine multiple values with a separator for list elements
@@ -214,7 +242,11 @@ class NCBISubmissionXML:
214
242
 
215
243
  # Special handling for NMDC Biosample "id"
216
244
  if json_key == "id":
217
- sample_id_value = value
245
+ # Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
246
+ if pooling_info and pooling_info.get("processed_sample_id"):
247
+ sample_id_value = pooling_info["processed_sample_id"]
248
+ else:
249
+ sample_id_value = value
218
250
  continue
219
251
 
220
252
  if json_key not in attribute_mappings:
@@ -237,10 +269,39 @@ class NCBISubmissionXML:
237
269
  attributes[xml_key] = value
238
270
  continue # Skip applying the handler to this key
239
271
 
272
+ # Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
273
+ if json_key == "geo_loc_name":
274
+ formatted_value = handler(value)
275
+ formatted_value_ascii = unidecode(formatted_value)
276
+ attributes[xml_key] = formatted_value_ascii
277
+ continue # Skip applying the handler to this key
278
+
240
279
  # Default processing for other keys
241
280
  formatted_value = handler(value)
242
281
  attributes[xml_key] = formatted_value
243
282
 
283
+ # Override with aggregated values for pooled samples
284
+ if pooling_info:
285
+ if pooling_info.get("aggregated_collection_date"):
286
+ # Find the mapping for collection_date
287
+ collection_date_key = attribute_mappings.get(
288
+ "collection_date", "collection_date"
289
+ )
290
+ attributes[collection_date_key] = pooling_info[
291
+ "aggregated_collection_date"
292
+ ]
293
+
294
+ if pooling_info.get("aggregated_depth"):
295
+ # Find the mapping for depth
296
+ depth_key = attribute_mappings.get("depth", "depth")
297
+ attributes[depth_key] = pooling_info["aggregated_depth"]
298
+
299
+ # Add samp_pooling attribute with semicolon-delimited biosample IDs
300
+ if pooling_info.get("pooled_biosample_ids"):
301
+ attributes["samp_pooling"] = ";".join(
302
+ pooling_info["pooled_biosample_ids"]
303
+ )
304
+
244
305
  biosample_elements = [
245
306
  self.set_element(
246
307
  "SampleId",
@@ -261,7 +322,48 @@ class NCBISubmissionXML:
261
322
  f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
262
323
  ),
263
324
  ),
264
- ],
325
+ ]
326
+ + (
327
+ # Add external links for pooled samples
328
+ [
329
+ self.set_element(
330
+ "ExternalLink",
331
+ attrib={"label": "NMDC Processed Sample"},
332
+ children=[
333
+ self.set_element(
334
+ "URL",
335
+ f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
336
+ )
337
+ ],
338
+ ),
339
+ self.set_element(
340
+ "ExternalLink",
341
+ attrib={"label": "NMDC Pooling Process"},
342
+ children=[
343
+ self.set_element(
344
+ "URL",
345
+ f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
346
+ )
347
+ ],
348
+ ),
349
+ ]
350
+ if pooling_info
351
+ and pooling_info.get("processed_sample_id")
352
+ and pooling_info.get("pooling_process_id")
353
+ else [
354
+ # Add external link for individual biosamples
355
+ self.set_element(
356
+ "ExternalLink",
357
+ attrib={"label": sample_id_value},
358
+ children=[
359
+ self.set_element(
360
+ "URL",
361
+ f"https://bioregistry.io/{sample_id_value}",
362
+ )
363
+ ],
364
+ ),
365
+ ]
366
+ ),
265
367
  ),
266
368
  self.set_element(
267
369
  "Organism",
@@ -333,6 +435,248 @@ class NCBISubmissionXML:
333
435
  )
334
436
  self.root.append(action)
335
437
 
438
def _create_pooled_biosample_action(
    self,
    biosamples,
    pooling_info,
    organism_name,
    org,
    bioproject_id,
    attribute_mappings,
    slot_range_mappings,
):
    """Append one BioSample ``<Action>`` for a pooled sample to ``self.root``.

    The attributes of every biosample in the pool are merged into a single
    attribute set, the aggregated values carried by ``pooling_info`` are
    layered on top, and only an allow-listed subset of attribute names is
    written out.

    Args:
        biosamples: Iterable of biosample dicts that belong to the pool.
        pooling_info: Dict describing the pool. Keys read here are
            ``processed_sample_id``, ``processed_sample_name``,
            ``pooling_process_id``, ``aggregated_collection_date``,
            ``aggregated_depth`` and ``pooled_biosample_ids``.
        organism_name: Text of the ``OrganismName`` element.
        org: Value of the ``spuid_namespace`` attribute on ``SPUID``.
        bioproject_id: Text of the BioProject ``PrimaryId`` element.
        attribute_mappings: Maps a biosample key to its XML attribute name;
            keys absent from this mapping are ignored.
        slot_range_mappings: Maps a biosample key to a value-type name used
            to pick a handler from ``self.type_handlers`` (defaults to
            ``"string"``).

    Returns:
        None. Nothing is appended when ``pooling_info`` has no
        ``processed_sample_id``.

    Raises:
        KeyError: If ``pooling_info`` lacks ``pooling_process_id``.
    """
    # Use the processed sample ID as the primary identifier
    sample_id_value = pooling_info.get("processed_sample_id")
    if not sample_id_value:
        return

    aggregated_attributes = {}
    env_package = None

    # Prefer the processed sample name as title; fall back to a generic label
    title = pooling_info.get(
        "processed_sample_name", f"Pooled sample {sample_id_value}"
    )

    def merge_slot_value(json_key, raw_value):
        """Format one raw slot value and merge it into the aggregate."""
        xml_key = attribute_mappings[json_key]
        value_type = slot_range_mappings.get(json_key, "string")
        handler = self.type_handlers.get(value_type, handle_string_value)

        # "elev" bypasses the handler and always overwrites (last value wins)
        if json_key == "elev":
            aggregated_attributes[xml_key] = f"{float(raw_value)} m"
            return

        # "host_taxid" dicts are reduced to the numeric part of term.id
        if json_key == "host_taxid" and isinstance(raw_value, dict):
            if "term" in raw_value and "id" in raw_value["term"]:
                raw_value = re.findall(
                    r"\d+", raw_value["term"]["id"].split(":")[1]
                )[0]
            aggregated_attributes[xml_key] = raw_value
            return

        formatted_value = handler(raw_value)

        # For every other key the first value seen across the pool is kept
        if xml_key not in aggregated_attributes:
            aggregated_attributes[xml_key] = formatted_value

    # Process each biosample to collect and aggregate attributes
    for biosample in biosamples:
        for json_key, value in biosample.items():
            if json_key == "id":
                continue

            if json_key == "env_package":
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
                continue

            if json_key not in attribute_mappings:
                continue

            # List-valued slots are merged element by element. Each element
            # (not the enclosing list) is passed on, so the "elev" and
            # "host_taxid" special cases work for list values as well.
            items = value if isinstance(value, list) else [value]
            for item in items:
                merge_slot_value(json_key, item)

    # Override with aggregated values for pooled samples
    if pooling_info.get("aggregated_collection_date"):
        collection_date_key = attribute_mappings.get(
            "collection_date", "collection_date"
        )
        aggregated_attributes[collection_date_key] = pooling_info[
            "aggregated_collection_date"
        ]

    if pooling_info.get("aggregated_depth"):
        depth_key = attribute_mappings.get("depth", "depth")
        aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]

    # Add samp_pooling attribute with semicolon-delimited biosample IDs
    if pooling_info.get("pooled_biosample_ids"):
        aggregated_attributes["samp_pooling"] = ";".join(
            pooling_info["pooled_biosample_ids"]
        )

    # Only this subset of attribute names is emitted for pooled samples
    allowed_attributes = {
        "collection_date",
        "depth",
        "elev",
        "geo_loc_name",
        "lat_lon",
        "env_broad_scale",
        "env_local_scale",
        "env_medium",
        "samp_pooling",
    }
    filtered_attributes = {
        k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
    }

    def external_link(identifier):
        """Build an ``ExternalLink`` pointing at the bioregistry URL."""
        return self.set_element(
            "ExternalLink",
            attrib={"label": identifier},
            children=[
                self.set_element("URL", f"https://bioregistry.io/{identifier}")
            ],
        )

    # Links: the processed sample, the pooling process, then each member
    descriptor_children = [
        self.set_element("Title", title),
        external_link(sample_id_value),
        external_link(pooling_info["pooling_process_id"]),
    ] + [
        external_link(biosample_id)
        for biosample_id in pooling_info.get("pooled_biosample_ids", [])
    ]

    attribute_children = [
        self.set_element(
            "Attribute", filtered_attributes[key], {"attribute_name": key}
        )
        for key in sorted(filtered_attributes)
    ] + [
        self.set_element(
            "Attribute",
            "National Microbiome Data Collaborative",
            {"attribute_name": "broker name"},
        )
    ]

    biosample_elements = [
        self.set_element(
            "SampleId",
            children=[
                self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
            ],
        ),
        self.set_element("Descriptor", children=descriptor_children),
        self.set_element(
            "Organism",
            children=[self.set_element("OrganismName", organism_name)],
        ),
        self.set_element(
            "BioProject",
            children=[
                self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
            ],
        ),
        self.set_element("Package", env_package),
        self.set_element("Attributes", children=attribute_children),
    ]

    # Assemble Action > AddData > (Data > XmlContent > BioSample, Identifier)
    biosample_element = self.set_element(
        "BioSample",
        attrib={"schema_version": "2.0"},
        children=biosample_elements,
    )
    xml_content = self.set_element("XmlContent", children=[biosample_element])
    data_element = self.set_element(
        "Data", attrib={"content_type": "XML"}, children=[xml_content]
    )
    identifier_element = self.set_element(
        "Identifier",
        children=[
            self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
        ],
    )
    add_data = self.set_element(
        "AddData",
        attrib={"target_db": "BioSample"},
        children=[data_element, identifier_element],
    )
    action = self.set_element("Action", children=[add_data])
    self.root.append(action)
679
+
336
680
  def set_fastq(
337
681
  self,
338
682
  biosample_data_objects: list,
@@ -342,12 +686,57 @@ class NCBISubmissionXML:
342
686
  nmdc_biosamples: list,
343
687
  nmdc_library_preparation: list,
344
688
  all_instruments: dict,
689
+ pooled_biosamples_data=None,
345
690
  ):
346
691
  bsm_id_name_dict = {
347
692
  biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
348
693
  }
349
694
 
695
+ # Use provided pooling data or empty dict
696
+ pooling_data = pooled_biosamples_data or {}
697
+
698
+ # Group data objects by pooling process
699
+ pooling_groups = {}
700
+ individual_entries = []
701
+
350
702
  for entry in biosample_data_objects:
703
+ pooling_process_id = None
704
+ # Check if any biosample in this entry belongs to a pooling process
705
+ for biosample_id in entry.keys():
706
+ pooling_info = pooling_data.get(biosample_id, {})
707
+ if pooling_info and pooling_info.get("pooling_process_id"):
708
+ pooling_process_id = pooling_info["pooling_process_id"]
709
+ break
710
+
711
+ if pooling_process_id:
712
+ if pooling_process_id not in pooling_groups:
713
+ pooling_groups[pooling_process_id] = {
714
+ "entries": [],
715
+ "processed_sample_id": pooling_info.get("processed_sample_id"),
716
+ "processed_sample_name": pooling_info.get(
717
+ "processed_sample_name", ""
718
+ ),
719
+ }
720
+ pooling_groups[pooling_process_id]["entries"].append(entry)
721
+ else:
722
+ individual_entries.append(entry)
723
+
724
+ # Process pooled entries - create one SRA <Action> block per pooling process
725
+ for pooling_process_id, group_data in pooling_groups.items():
726
+ self._create_pooled_sra_action(
727
+ group_data["entries"],
728
+ group_data["processed_sample_id"],
729
+ group_data["processed_sample_name"],
730
+ bioproject_id,
731
+ org,
732
+ nmdc_nucleotide_sequencing,
733
+ nmdc_library_preparation,
734
+ all_instruments,
735
+ bsm_id_name_dict,
736
+ )
737
+
738
+ # Process individual entries
739
+ for entry in individual_entries:
351
740
  fastq_files = []
352
741
  biosample_ids = []
353
742
  nucleotide_sequencing_ids = {}
@@ -532,6 +921,7 @@ class NCBISubmissionXML:
532
921
  )
533
922
  )
534
923
 
924
+ # Add library_name attribute
535
925
  if library_name:
536
926
  sra_attributes.append(
537
927
  self.set_element(
@@ -577,6 +967,233 @@ class NCBISubmissionXML:
577
967
 
578
968
  self.root.append(action)
579
969
 
970
def _create_pooled_sra_action(
    self,
    entries,
    processed_sample_id,
    processed_sample_name,
    bioproject_id,
    org,
    nmdc_nucleotide_sequencing,
    nmdc_library_preparation,
    all_instruments,
    bsm_id_name_dict,
):
    """Append one SRA ``AddFiles`` ``<Action>`` for a pooled sample.

    File names, sequencing metadata and library-preparation metadata are
    gathered across every entry in the pool, and the resulting action
    references the processed sample rather than the individual biosamples.

    Args:
        entries: Iterable of dicts mapping a biosample id to a list of
            data-object dicts; a data object contributes a file when it has
            a ``url`` key.
        processed_sample_id: SPUID used for the BioSample reference.
        processed_sample_name: Emitted as ``library_name`` when non-empty.
        bioproject_id: Text of the BioProject ``PrimaryId`` reference.
        org: Value of the ``spuid_namespace`` attribute on ``SPUID``.
        nmdc_nucleotide_sequencing: Iterable of dicts mapping a biosample id
            to a list of nucleotide-sequencing dicts.
        nmdc_library_preparation: Iterable of dicts mapping a biosample id
            to a library-preparation dict.
        all_instruments: Dict mapping an instrument id to a dict with
            ``vendor`` and ``model`` keys.
        bsm_id_name_dict: Not used by this method; kept so the signature
            stays compatible with existing callers.

    Returns:
        None. Nothing is appended when ``processed_sample_id`` is falsy or
        when no data object carries a ``url``.
    """
    if not processed_sample_id:
        return

    # Collect file names and per-biosample metadata across all entries
    all_fastq_files = set()
    nucleotide_sequencing_ids = {}
    lib_prep_protocol_names = {}
    analyte_category = ""
    instrument_vendor = ""
    instrument_model = ""

    for entry in entries:
        for biosample_id, data_objects in entry.items():
            for data_object in data_objects:
                if "url" in data_object:
                    url = urlparse(data_object["url"])
                    all_fastq_files.add(os.path.basename(url.path))

            # Get nucleotide sequencing info (later records overwrite earlier)
            for ntseq_dict in nmdc_nucleotide_sequencing:
                if biosample_id in ntseq_dict:
                    for ntseq in ntseq_dict[biosample_id]:
                        nucleotide_sequencing_ids[biosample_id] = ntseq.get(
                            "id", ""
                        )
                        instrument_used = ntseq.get("instrument_used", [])
                        if instrument_used:
                            instrument = all_instruments.get(
                                instrument_used[0], {}
                            )
                            instrument_vendor = instrument.get("vendor", "")
                            instrument_model = instrument.get("model", "")
                        analyte_category = ntseq.get("analyte_category", "")

            # Get library preparation info
            for lib_prep_dict in nmdc_library_preparation:
                if biosample_id in lib_prep_dict:
                    lib_prep_protocol_names[biosample_id] = (
                        lib_prep_dict[biosample_id]
                        .get("protocol_link", {})
                        .get("name", "")
                    )

    # Without any files there is nothing to submit
    if not all_fastq_files:
        return

    def sra_attribute(name, text):
        """Build an SRA ``Attribute`` element named ``name``."""
        return self.set_element("Attribute", text, {"name": name})

    files_elements = [
        self.set_element(
            "File",
            "",
            {"file_path": f},
            [
                self.set_element(
                    "DataType",
                    "sra-run-fastq" if ".fastq" in f else "generic-data",
                )
            ],
        )
        for f in sorted(all_fastq_files)
    ]

    attribute_elements = [
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioProject"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "PrimaryId",
                            bioproject_id,
                            {"db": "BioProject"},
                        )
                    ],
                )
            ],
        ),
        # Reference the processed sample, not individual biosamples
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioSample"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "SPUID",
                            processed_sample_id,
                            {"spuid_namespace": org},
                        )
                    ],
                )
            ],
        ),
    ]

    sra_attributes = []
    if instrument_vendor == "illumina":
        sra_attributes.append(sra_attribute("platform", "ILLUMINA"))
        model_labels = {
            "nextseq_550": "NextSeq 550",
            "novaseq_6000": "NovaSeq 6000",
            "hiseq": "HiSeq",
        }
        if instrument_model in model_labels:
            sra_attributes.append(
                sra_attribute("instrument_model", model_labels[instrument_model])
            )

    if analyte_category == "metagenome":
        sra_attributes.append(sra_attribute("library_strategy", "WGS"))
        sra_attributes.append(sra_attribute("library_source", "METAGENOMIC"))
        sra_attributes.append(sra_attribute("library_selection", "RANDOM"))
    elif analyte_category == "metatranscriptome":
        sra_attributes.append(
            sra_attribute("library_source", "METATRANSCRIPTOMIC")
        )

    # Paired layout: some "_R1" file has its "_R2" counterpart in the set
    has_paired_reads = any(
        f.replace("_R1", "_R2") in all_fastq_files
        for f in all_fastq_files
        if "_R1" in f
    )
    sra_attributes.append(
        sra_attribute("library_layout", "paired" if has_paired_reads else "single")
    )

    # Add library_name attribute using ProcessedSample name
    if processed_sample_name:
        sra_attributes.append(sra_attribute("library_name", processed_sample_name))

    # Add the first non-empty library construction protocol name, if any
    lib_prep_name = next(
        (name for name in lib_prep_protocol_names.values() if name), None
    )
    if lib_prep_name:
        sra_attributes.append(
            sra_attribute("library_construction_protocol", lib_prep_name)
        )

    # Use the first non-empty nucleotide sequencing ID as the identifier.
    # When none is available the Identifier element is omitted instead of
    # failing on an unbound local variable.
    omics_processing_id = next(
        (seq_id for seq_id in nucleotide_sequencing_ids.values() if seq_id), None
    )
    identifier_elements = []
    if omics_processing_id:
        identifier_elements.append(
            self.set_element(
                "Identifier",
                children=[
                    self.set_element(
                        "SPUID", omics_processing_id, {"spuid_namespace": org}
                    )
                ],
            )
        )

    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddFiles",
                attrib={"target_db": "SRA"},
                children=files_elements
                + attribute_elements
                + sra_attributes
                + identifier_elements,
            ),
        ],
    )

    self.root.append(action)
1196
+
580
1197
  def get_submission_xml(
581
1198
  self,
582
1199
  biosamples_list: list,
@@ -584,6 +1201,7 @@ class NCBISubmissionXML:
584
1201
  biosample_data_objects_list: list,
585
1202
  biosample_library_preparation_list: list,
586
1203
  instruments_dict: dict,
1204
+ pooled_biosamples_data=None,
587
1205
  ):
588
1206
  # data_type = None
589
1207
 
@@ -646,6 +1264,7 @@ class NCBISubmissionXML:
646
1264
  org=self.ncbi_submission_metadata.get("organization", ""),
647
1265
  bioproject_id=self.ncbi_bioproject_id,
648
1266
  nmdc_biosamples=filtered_biosamples_list,
1267
+ pooled_biosamples_data=pooled_biosamples_data,
649
1268
  )
650
1269
 
651
1270
  # Also filter biosample_data_objects_list
@@ -692,6 +1311,7 @@ class NCBISubmissionXML:
692
1311
  nmdc_biosamples=filtered_biosamples_list,
693
1312
  nmdc_library_preparation=filtered_library_preparation_list,
694
1313
  all_instruments=instruments_dict,
1314
+ pooled_biosamples_data=pooled_biosamples_data,
695
1315
  )
696
1316
 
697
1317
  rough_string = ET.tostring(self.root, "unicode")