nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,1331 @@
1
+ import os
2
+ import re
3
+ import datetime
4
+ import xml.etree.ElementTree as ET
5
+ import xml.dom.minidom
6
+
7
+ from typing import Any, List
8
+ from urllib.parse import urlparse
9
+ from unidecode import unidecode
10
+ from nmdc_runtime.site.export.ncbi_xml_utils import (
11
+ handle_controlled_identified_term_value,
12
+ handle_controlled_term_value,
13
+ handle_geolocation_value,
14
+ handle_quantity_value,
15
+ handle_text_value,
16
+ handle_timestamp_value,
17
+ handle_float_value,
18
+ handle_string_value,
19
+ load_mappings,
20
+ )
21
+
22
+
23
+ class NCBISubmissionXML:
24
+ def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
25
+ self.root = ET.Element("Submission")
26
+
27
+ self.nmdc_study_id = nmdc_study.get("id")
28
+ self.nmdc_study_title = nmdc_study.get("title")
29
+ self.nmdc_study_description = nmdc_study.get("description")
30
+ # get the first INSDC BioProject ID from the NMDC study
31
+ self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
32
+ # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
33
+ # everything after the prefix and delimiter (":")
34
+ self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
35
+ self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
36
+ nmdc_study_pi_name = (
37
+ nmdc_study.get("principal_investigator", {}).get("name").split()
38
+ )
39
+ self.first_name = nmdc_study_pi_name[0]
40
+ self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None
41
+
42
+ self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
43
+ "nmdc_ncbi_attribute_mapping_file_url"
44
+ )
45
+ self.ncbi_submission_metadata = ncbi_submission_metadata.get(
46
+ "ncbi_submission_metadata", {}
47
+ )
48
+ self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
49
+ "ncbi_biosample_metadata", {}
50
+ )
51
+
52
+ # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
53
+ # type handlers
54
+ self.type_handlers = {
55
+ "QuantityValue": handle_quantity_value,
56
+ "TextValue": handle_text_value,
57
+ "TimestampValue": handle_timestamp_value,
58
+ "ControlledTermValue": handle_controlled_term_value,
59
+ "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
60
+ "GeolocationValue": handle_geolocation_value,
61
+ "float": handle_float_value,
62
+ "string": handle_string_value,
63
+ }
64
+
65
+ def set_element(self, tag, text="", attrib=None, children=None):
66
+ attrib = attrib or {}
67
+ children = children or []
68
+ element = ET.Element(tag, attrib=attrib)
69
+ element.text = text
70
+ for child in children:
71
+ element.append(child)
72
+ return element
73
+
74
+ def set_description(self, email, first, last, org, date=None):
75
+ date = date or datetime.datetime.now().strftime("%Y-%m-%d")
76
+ description = self.set_element(
77
+ "Description",
78
+ children=[
79
+ self.set_element(
80
+ "Comment", f"NMDC Submission for {self.nmdc_study_id}"
81
+ ),
82
+ self.set_element(
83
+ "Organization",
84
+ attrib={"role": "owner", "type": "center"},
85
+ children=[
86
+ self.set_element("Name", org),
87
+ self.set_element(
88
+ "Contact",
89
+ attrib={"email": email},
90
+ children=[
91
+ self.set_element(
92
+ "Name",
93
+ children=[
94
+ self.set_element("First", first),
95
+ self.set_element("Last", last),
96
+ ],
97
+ )
98
+ ],
99
+ ),
100
+ ],
101
+ ),
102
+ self.set_element("Hold", attrib={"release_date": date}),
103
+ ],
104
+ )
105
+ self.root.append(description)
106
+
107
+ def set_descriptor(self, title, description):
108
+ descriptor_elements = []
109
+ descriptor_elements.append(self.set_element("Title", title))
110
+ descriptor_elements.append(
111
+ self.set_element(
112
+ "Description", children=[self.set_element("p", description)]
113
+ )
114
+ )
115
+
116
+ return descriptor_elements
117
+
118
+ def set_bioproject(self, title, project_id, description, data_type, org):
119
+ action = self.set_element("Action")
120
+ add_data = self.set_element("AddData", attrib={"target_db": "BioProject"})
121
+
122
+ data_element = self.set_element("Data", attrib={"content_type": "XML"})
123
+ xml_content = self.set_element("XmlContent")
124
+ project = self.set_element("Project", attrib={"schema_version": "2.0"})
125
+
126
+ project_id_element = self.set_element("ProjectID")
127
+ spuid = self.set_element("SPUID", project_id, {"spuid_namespace": org})
128
+ project_id_element.append(spuid)
129
+
130
+ descriptor = self.set_descriptor(title, description)
131
+ project_type = self.set_element("ProjectType")
132
+ # "sample_scope" is a enumeration feild. Docs: https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd
133
+ # scope is "eEnvironment" when "Content of species in a sample is not known, i.e. microbiome,metagenome, etc.."
134
+ project_type_submission = self.set_element(
135
+ "ProjectTypeSubmission", attrib={"sample_scope": "eEnvironment"}
136
+ )
137
+ intended_data_type_set = self.set_element("IntendedDataTypeSet")
138
+ data_type_element = self.set_element("DataType", data_type)
139
+
140
+ intended_data_type_set.append(data_type_element)
141
+ project_type_submission.append(intended_data_type_set)
142
+ project_type.append(project_type_submission)
143
+
144
+ project.extend([project_id_element] + descriptor + [project_type])
145
+
146
+ xml_content.append(project)
147
+ data_element.append(xml_content)
148
+ add_data.append(data_element)
149
+
150
+ identifier = self.set_element("Identifier")
151
+ spuid_identifier = self.set_element(
152
+ "SPUID", project_id, {"spuid_namespace": org}
153
+ )
154
+ identifier.append(spuid_identifier)
155
+ add_data.append(identifier)
156
+
157
+ action.append(add_data)
158
+ self.root.append(action)
159
+
160
    def set_biosample(
        self,
        organism_name,
        org,
        bioproject_id,
        nmdc_biosamples,
        pooled_biosamples_data=None,
    ):
        """Append BioSample <Action> blocks for the given NMDC biosamples.

        Biosamples that belong to a pooling process (per
        ``pooled_biosamples_data``) are grouped and emitted as one pooled
        <Action> per pooling process via ``_create_pooled_biosample_action``;
        every other biosample gets its own <Action>.

        :param organism_name: value for each sample's <OrganismName>.
        :param org: SPUID namespace for sample identifiers.
        :param bioproject_id: BioProject accession the samples reference.
        :param nmdc_biosamples: list of NMDC biosample dicts.
        :param pooled_biosamples_data: optional mapping of biosample id ->
            pooling info (``pooling_process_id``, ``processed_sample_id``,
            aggregated values); ``None``/empty disables pooling handling.
        """
        attribute_mappings, slot_range_mappings = load_mappings(
            self.nmdc_ncbi_attribute_mapping_file_url
        )

        # Use provided pooling data or empty dict
        pooling_data = pooled_biosamples_data or {}

        # Group biosamples by pooling process
        pooling_groups = {}
        individual_biosamples = []

        for biosample in nmdc_biosamples:
            pooling_info = pooling_data.get(biosample["id"], {})
            if pooling_info and pooling_info.get("pooling_process_id"):
                pooling_process_id = pooling_info["pooling_process_id"]
                if pooling_process_id not in pooling_groups:
                    pooling_groups[pooling_process_id] = {
                        "biosamples": [],
                        "pooling_info": pooling_info,
                    }
                pooling_groups[pooling_process_id]["biosamples"].append(biosample)
            else:
                individual_biosamples.append(biosample)

        # Process pooled sample groups - create one <Action> block per pooling process
        for pooling_process_id, group_data in pooling_groups.items():
            self._create_pooled_biosample_action(
                group_data["biosamples"],
                group_data["pooling_info"],
                organism_name,
                org,
                bioproject_id,
                attribute_mappings,
                slot_range_mappings,
            )

        # Process individual biosamples
        for biosample in individual_biosamples:
            attributes = {}
            sample_id_value = None
            env_package = None

            # Get pooling info for this specific biosample
            pooling_info = pooling_data.get(biosample["id"], {})

            for json_key, value in biosample.items():
                if isinstance(value, list):
                    for item in value:
                        if json_key not in attribute_mappings:
                            continue

                        xml_key = attribute_mappings[json_key]
                        value_type = slot_range_mappings.get(json_key, "string")
                        handler = self.type_handlers.get(
                            value_type, handle_string_value
                        )

                        # Special handling for "elev" key
                        # NOTE(review): in this list branch "value" is the whole
                        # list, so float(value) would raise TypeError -- confirm
                        # "elev" is never list-valued in practice.
                        if json_key == "elev":
                            value = f"{float(value)} m"  # Convert to float if possible
                            attributes[xml_key] = value
                            continue  # Skip applying the handler to this key

                        formatted_value = handler(item)

                        # Combine multiple values with a separator for list elements
                        if xml_key in attributes:
                            attributes[xml_key] += f"| {formatted_value}"
                        else:
                            attributes[xml_key] = formatted_value
                    continue

                if json_key == "env_package":
                    env_package = f"MIMS.me.{handle_text_value(value)}.6.0"

                # Special handling for NMDC Biosample "id"
                if json_key == "id":
                    # Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
                    if pooling_info and pooling_info.get("processed_sample_id"):
                        sample_id_value = pooling_info["processed_sample_id"]
                    else:
                        sample_id_value = value
                    continue

                if json_key not in attribute_mappings:
                    continue

                xml_key = attribute_mappings[json_key]
                value_type = slot_range_mappings.get(json_key, "string")
                handler = self.type_handlers.get(value_type, handle_string_value)

                # Special handling for "elev" key
                # NOTE(review): float(value) raises ValueError for non-numeric
                # elevations -- presumably inputs are validated upstream; verify.
                if json_key == "elev":
                    value = f"{float(value)} m"  # Convert to float if possible
                    attributes[xml_key] = value
                    continue  # Skip applying the handler to this key

                # Special handling for "host_taxid": extract the numeric NCBI
                # taxon id from a CURIE like "NCBITaxon:9606"
                if json_key == "host_taxid" and isinstance(value, dict):
                    if "term" in value and "id" in value["term"]:
                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
                    attributes[xml_key] = value
                    continue  # Skip applying the handler to this key

                # Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
                if json_key == "geo_loc_name":
                    formatted_value = handler(value)
                    formatted_value_ascii = unidecode(formatted_value)
                    attributes[xml_key] = formatted_value_ascii
                    continue  # Skip applying the handler to this key

                # Default processing for other keys
                formatted_value = handler(value)
                attributes[xml_key] = formatted_value

            # Override with aggregated values for pooled samples
            if pooling_info:
                if pooling_info.get("aggregated_collection_date"):
                    # Find the mapping for collection_date
                    collection_date_key = attribute_mappings.get(
                        "collection_date", "collection_date"
                    )
                    attributes[collection_date_key] = pooling_info[
                        "aggregated_collection_date"
                    ]

                if pooling_info.get("aggregated_depth"):
                    # Find the mapping for depth
                    depth_key = attribute_mappings.get("depth", "depth")
                    attributes[depth_key] = pooling_info["aggregated_depth"]

                # Add samp_pooling attribute with semicolon-delimited biosample IDs
                if pooling_info.get("pooled_biosample_ids"):
                    attributes["samp_pooling"] = ";".join(
                        pooling_info["pooled_biosample_ids"]
                    )

            biosample_elements = [
                self.set_element(
                    "SampleId",
                    children=[
                        self.set_element(
                            "SPUID", sample_id_value, {"spuid_namespace": org}
                        )
                    ],
                ),
                self.set_element(
                    "Descriptor",
                    children=[
                        self.set_element(
                            "Title",
                            attributes.get(
                                "name",
                                # fallback title if "name" is not present
                                f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
                            ),
                        ),
                    ]
                    + (
                        # Add external links for pooled samples
                        [
                            self.set_element(
                                "ExternalLink",
                                attrib={"label": "NMDC Processed Sample"},
                                children=[
                                    self.set_element(
                                        "URL",
                                        f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
                                    )
                                ],
                            ),
                            self.set_element(
                                "ExternalLink",
                                attrib={"label": "NMDC Pooling Process"},
                                children=[
                                    self.set_element(
                                        "URL",
                                        f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
                                    )
                                ],
                            ),
                        ]
                        if pooling_info
                        and pooling_info.get("processed_sample_id")
                        and pooling_info.get("pooling_process_id")
                        else [
                            # Add external link for individual biosamples
                            self.set_element(
                                "ExternalLink",
                                attrib={"label": sample_id_value},
                                children=[
                                    self.set_element(
                                        "URL",
                                        f"https://bioregistry.io/{sample_id_value}",
                                    )
                                ],
                            ),
                        ]
                    ),
                ),
                self.set_element(
                    "Organism",
                    children=[self.set_element("OrganismName", organism_name)],
                ),
                self.set_element(
                    "BioProject",
                    children=[
                        self.set_element(
                            "PrimaryId", bioproject_id, {"db": "BioProject"}
                        )
                    ],
                ),
                self.set_element("Package", env_package),
                self.set_element(
                    "Attributes",
                    children=[
                        self.set_element(
                            "Attribute", attributes[key], {"attribute_name": key}
                        )
                        for key in sorted(attributes)
                    ]
                    + [
                        self.set_element(
                            "Attribute",
                            "National Microbiome Data Collaborative",
                            {"attribute_name": "broker name"},
                        )
                    ],
                ),
            ]

            action = self.set_element(
                "Action",
                children=[
                    self.set_element(
                        "AddData",
                        attrib={"target_db": "BioSample"},
                        children=[
                            self.set_element(
                                "Data",
                                attrib={"content_type": "XML"},
                                children=[
                                    self.set_element(
                                        "XmlContent",
                                        children=[
                                            self.set_element(
                                                "BioSample",
                                                attrib={"schema_version": "2.0"},
                                                children=biosample_elements,
                                            ),
                                        ],
                                    ),
                                ],
                            ),
                            self.set_element(
                                "Identifier",
                                children=[
                                    self.set_element(
                                        "SPUID",
                                        sample_id_value,
                                        {"spuid_namespace": org},
                                    ),
                                ],
                            ),
                        ],
                    ),
                ],
            )
            self.root.append(action)
437
+
438
    def _create_pooled_biosample_action(
        self,
        biosamples,
        pooling_info,
        organism_name,
        org,
        bioproject_id,
        attribute_mappings,
        slot_range_mappings,
    ):
        """Append a single BioSample <Action> representing one pooled sample.

        Aggregates attributes across all ``biosamples`` in the pool (first
        value wins per attribute), applies the pooling process's aggregated
        collection date/depth, filters the attributes down to a fixed allowed
        set, and identifies the sample by the pooling process's
        ProcessedSample ID. Returns without emitting anything when
        ``pooling_info`` has no ``processed_sample_id``.

        :param biosamples: NMDC biosample dicts that were pooled together.
        :param pooling_info: pooling metadata (processed_sample_id,
            pooling_process_id, pooled_biosample_ids, aggregated values).
        :param organism_name: value for the <OrganismName> element.
        :param org: SPUID namespace for the sample identifier.
        :param bioproject_id: BioProject accession the sample references.
        :param attribute_mappings: NMDC slot name -> NCBI attribute name map.
        :param slot_range_mappings: NMDC slot name -> value-type name map.
        """
        # Use the processed sample ID as the primary identifier
        sample_id_value = pooling_info.get("processed_sample_id")
        if not sample_id_value:
            return

        # Aggregate attributes from all biosamples in the pool
        aggregated_attributes = {}
        env_package = None

        # Get title from the first biosample or use processed sample name
        # NOTE(review): the fallback only applies when the key is absent; an
        # empty "processed_sample_name" yields an empty title -- confirm intent.
        title = pooling_info.get(
            "processed_sample_name", f"Pooled sample {sample_id_value}"
        )

        # Process each biosample to collect and aggregate attributes
        for biosample in biosamples:
            for json_key, value in biosample.items():
                if json_key == "id":
                    continue

                if json_key == "env_package":
                    env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
                    continue

                if isinstance(value, list):
                    for item in value:
                        if json_key not in attribute_mappings:
                            continue

                        xml_key = attribute_mappings[json_key]
                        value_type = slot_range_mappings.get(json_key, "string")
                        handler = self.type_handlers.get(
                            value_type, handle_string_value
                        )

                        # Special handling for "elev" key
                        # NOTE(review): "value" is the whole list here, so
                        # float(value) would raise TypeError -- confirm "elev"
                        # is never list-valued in practice.
                        if json_key == "elev":
                            value = f"{float(value)} m"
                            aggregated_attributes[xml_key] = value
                            continue

                        # Special handling for "host_taxid"
                        # NOTE(review): this branch is unreachable here because
                        # "value" is a list, never a dict, inside this loop.
                        if json_key == "host_taxid" and isinstance(value, dict):
                            if "term" in value and "id" in value["term"]:
                                value = re.findall(
                                    r"\d+", value["term"]["id"].split(":")[1]
                                )[0]
                            aggregated_attributes[xml_key] = value
                            continue

                        formatted_value = handler(item)

                        # For pooled samples, we typically want the first value or aggregate appropriately
                        if xml_key not in aggregated_attributes:
                            aggregated_attributes[xml_key] = formatted_value
                    continue

                if json_key not in attribute_mappings:
                    continue

                xml_key = attribute_mappings[json_key]
                value_type = slot_range_mappings.get(json_key, "string")
                handler = self.type_handlers.get(value_type, handle_string_value)

                # Special handling for "elev" key
                if json_key == "elev":
                    value = f"{float(value)} m"
                    aggregated_attributes[xml_key] = value
                    continue

                # Special handling for "host_taxid": extract the numeric NCBI
                # taxon id from a CURIE like "NCBITaxon:9606"
                if json_key == "host_taxid" and isinstance(value, dict):
                    if "term" in value and "id" in value["term"]:
                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
                    aggregated_attributes[xml_key] = value
                    continue

                formatted_value = handler(value)

                # For pooled samples, we typically want the first value or aggregate appropriately
                if xml_key not in aggregated_attributes:
                    aggregated_attributes[xml_key] = formatted_value

        # Override with aggregated values for pooled samples
        if pooling_info.get("aggregated_collection_date"):
            collection_date_key = attribute_mappings.get(
                "collection_date", "collection_date"
            )
            aggregated_attributes[collection_date_key] = pooling_info[
                "aggregated_collection_date"
            ]

        if pooling_info.get("aggregated_depth"):
            depth_key = attribute_mappings.get("depth", "depth")
            aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]

        # Add samp_pooling attribute with semicolon-delimited biosample IDs
        if pooling_info.get("pooled_biosample_ids"):
            aggregated_attributes["samp_pooling"] = ";".join(
                pooling_info["pooled_biosample_ids"]
            )

        # Filter attributes to only include the ones from neon_soil_example.xml for pooled samples
        allowed_attributes = {
            "collection_date",
            "depth",
            "elev",
            "geo_loc_name",
            "lat_lon",
            "env_broad_scale",
            "env_local_scale",
            "env_medium",
            "samp_pooling",
        }
        filtered_attributes = {
            k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
        }

        biosample_elements = [
            self.set_element(
                "SampleId",
                children=[
                    self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
                ],
            ),
            self.set_element(
                "Descriptor",
                children=[
                    self.set_element("Title", title),
                    self.set_element(
                        "ExternalLink",
                        attrib={"label": sample_id_value},
                        children=[
                            self.set_element(
                                "URL",
                                f"https://bioregistry.io/{sample_id_value}",
                            )
                        ],
                    ),
                    self.set_element(
                        "ExternalLink",
                        attrib={"label": pooling_info["pooling_process_id"]},
                        children=[
                            self.set_element(
                                "URL",
                                f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
                            )
                        ],
                    ),
                ]
                + [
                    self.set_element(
                        "ExternalLink",
                        attrib={"label": biosample_id},
                        children=[
                            self.set_element(
                                "URL",
                                f"https://bioregistry.io/{biosample_id}",
                            )
                        ],
                    )
                    for biosample_id in pooling_info.get("pooled_biosample_ids", [])
                ],
            ),
            self.set_element(
                "Organism",
                children=[self.set_element("OrganismName", organism_name)],
            ),
            self.set_element(
                "BioProject",
                children=[
                    self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
                ],
            ),
            self.set_element("Package", env_package),
            self.set_element(
                "Attributes",
                children=[
                    self.set_element(
                        "Attribute", filtered_attributes[key], {"attribute_name": key}
                    )
                    for key in sorted(filtered_attributes)
                ]
                + [
                    self.set_element(
                        "Attribute",
                        "National Microbiome Data Collaborative",
                        {"attribute_name": "broker name"},
                    )
                ],
            ),
        ]

        action = self.set_element(
            "Action",
            children=[
                self.set_element(
                    "AddData",
                    attrib={"target_db": "BioSample"},
                    children=[
                        self.set_element(
                            "Data",
                            attrib={"content_type": "XML"},
                            children=[
                                self.set_element(
                                    "XmlContent",
                                    children=[
                                        self.set_element(
                                            "BioSample",
                                            attrib={"schema_version": "2.0"},
                                            children=biosample_elements,
                                        ),
                                    ],
                                ),
                            ],
                        ),
                        self.set_element(
                            "Identifier",
                            children=[
                                self.set_element(
                                    "SPUID",
                                    sample_id_value,
                                    {"spuid_namespace": org},
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        )
        self.root.append(action)
679
+
680
+ def set_fastq(
681
+ self,
682
+ biosample_data_objects: list,
683
+ bioproject_id: str,
684
+ org: str,
685
+ nmdc_nucleotide_sequencing: list,
686
+ nmdc_biosamples: list,
687
+ nmdc_library_preparation: list,
688
+ all_instruments: dict,
689
+ pooled_biosamples_data=None,
690
+ ):
691
+ bsm_id_name_dict = {
692
+ biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
693
+ }
694
+
695
+ # Use provided pooling data or empty dict
696
+ pooling_data = pooled_biosamples_data or {}
697
+
698
+ # Group data objects by pooling process
699
+ pooling_groups = {}
700
+ individual_entries = []
701
+
702
+ for entry in biosample_data_objects:
703
+ pooling_process_id = None
704
+ # Check if any biosample in this entry belongs to a pooling process
705
+ for biosample_id in entry.keys():
706
+ pooling_info = pooling_data.get(biosample_id, {})
707
+ if pooling_info and pooling_info.get("pooling_process_id"):
708
+ pooling_process_id = pooling_info["pooling_process_id"]
709
+ break
710
+
711
+ if pooling_process_id:
712
+ if pooling_process_id not in pooling_groups:
713
+ pooling_groups[pooling_process_id] = {
714
+ "entries": [],
715
+ "processed_sample_id": pooling_info.get("processed_sample_id"),
716
+ "processed_sample_name": pooling_info.get(
717
+ "processed_sample_name", ""
718
+ ),
719
+ }
720
+ pooling_groups[pooling_process_id]["entries"].append(entry)
721
+ else:
722
+ individual_entries.append(entry)
723
+
724
+ # Process pooled entries - create one SRA <Action> block per pooling process
725
+ for pooling_process_id, group_data in pooling_groups.items():
726
+ self._create_pooled_sra_action(
727
+ group_data["entries"],
728
+ group_data["processed_sample_id"],
729
+ group_data["processed_sample_name"],
730
+ bioproject_id,
731
+ org,
732
+ nmdc_nucleotide_sequencing,
733
+ nmdc_library_preparation,
734
+ all_instruments,
735
+ bsm_id_name_dict,
736
+ )
737
+
738
+ # Process individual entries
739
+ for entry in individual_entries:
740
+ fastq_files = []
741
+ biosample_ids = []
742
+ nucleotide_sequencing_ids = {}
743
+ lib_prep_protocol_names = {}
744
+ analyte_category = ""
745
+ library_name = ""
746
+ instrument_vendor = ""
747
+ instrument_model = ""
748
+
749
+ for biosample_id, data_objects in entry.items():
750
+ biosample_ids.append(biosample_id)
751
+ for data_object in data_objects:
752
+ if "url" in data_object:
753
+ url = urlparse(data_object["url"])
754
+ file_path = os.path.basename(url.path)
755
+ fastq_files.append(file_path)
756
+
757
+ for ntseq_dict in nmdc_nucleotide_sequencing:
758
+ if biosample_id in ntseq_dict:
759
+ for ntseq in ntseq_dict[biosample_id]:
760
+ nucleotide_sequencing_ids[biosample_id] = ntseq.get(
761
+ "id", ""
762
+ )
763
+ # Currently, we are making the assumption that only one instrument
764
+ # is used to sequence a Biosample
765
+ instrument_used: List[str] = ntseq.get(
766
+ "instrument_used", []
767
+ )
768
+ if not instrument_used:
769
+ instrument_id = None
770
+ else:
771
+ instrument_id = instrument_used[0]
772
+
773
+ instrument = all_instruments.get(instrument_id, {})
774
+ instrument_vendor = instrument.get("vendor", "")
775
+ instrument_model = instrument.get("model", "")
776
+
777
+ analyte_category = ntseq.get("analyte_category", "")
778
+ library_name = bsm_id_name_dict.get(biosample_id, "")
779
+
780
+ for lib_prep_dict in nmdc_library_preparation:
781
+ if biosample_id in lib_prep_dict:
782
+ lib_prep_protocol_names[biosample_id] = (
783
+ lib_prep_dict[biosample_id]
784
+ .get("protocol_link", {})
785
+ .get("name", "")
786
+ )
787
+
788
+ if fastq_files:
789
+ files_elements = [
790
+ self.set_element(
791
+ "File",
792
+ "",
793
+ {"file_path": f},
794
+ [
795
+ self.set_element(
796
+ "DataType",
797
+ "sra-run-fastq" if ".fastq" in f else "generic-data",
798
+ )
799
+ ],
800
+ )
801
+ for f in fastq_files
802
+ ]
803
+
804
+ attribute_elements = [
805
+ self.set_element(
806
+ "AttributeRefId",
807
+ attrib={"name": "BioProject"},
808
+ children=[
809
+ self.set_element(
810
+ "RefId",
811
+ children=[
812
+ self.set_element(
813
+ "PrimaryId",
814
+ bioproject_id,
815
+ {"db": "BioProject"},
816
+ )
817
+ ],
818
+ )
819
+ ],
820
+ )
821
+ ]
822
+
823
+ for biosample_id in biosample_ids:
824
+ attribute_elements.append(
825
+ self.set_element(
826
+ "AttributeRefId",
827
+ attrib={"name": "BioSample"},
828
+ children=[
829
+ self.set_element(
830
+ "RefId",
831
+ children=[
832
+ self.set_element(
833
+ "SPUID",
834
+ biosample_id,
835
+ {"spuid_namespace": org},
836
+ )
837
+ ],
838
+ )
839
+ ],
840
+ )
841
+ )
842
+
843
+ sra_attributes = []
844
+ if instrument_vendor == "illumina":
845
+ sra_attributes.append(
846
+ self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
847
+ )
848
+ if instrument_model == "nextseq_550":
849
+ sra_attributes.append(
850
+ self.set_element(
851
+ "Attribute", "NextSeq 550", {"name": "instrument_model"}
852
+ )
853
+ )
854
+ elif instrument_model == "novaseq_6000":
855
+ sra_attributes.append(
856
+ self.set_element(
857
+ "Attribute",
858
+ "NovaSeq 6000",
859
+ {"name": "instrument_model"},
860
+ )
861
+ )
862
+ elif instrument_model == "hiseq":
863
+ sra_attributes.append(
864
+ self.set_element(
865
+ "Attribute", "HiSeq", {"name": "instrument_model"}
866
+ )
867
+ )
868
+
869
+ if analyte_category == "metagenome":
870
+ sra_attributes.append(
871
+ self.set_element(
872
+ "Attribute", "WGS", {"name": "library_strategy"}
873
+ )
874
+ )
875
+ sra_attributes.append(
876
+ self.set_element(
877
+ "Attribute", "METAGENOMIC", {"name": "library_source"}
878
+ )
879
+ )
880
+ sra_attributes.append(
881
+ self.set_element(
882
+ "Attribute", "RANDOM", {"name": "library_selection"}
883
+ )
884
+ )
885
+ elif analyte_category == "metatranscriptome":
886
+ sra_attributes.append(
887
+ self.set_element(
888
+ "Attribute",
889
+ "METATRANSCRIPTOMIC",
890
+ {"name": "library_source"},
891
+ )
892
+ )
893
+
894
+ has_paired_reads = any(
895
+ data_object.get("data_object_type", "").lower()
896
+ == "metagenome raw reads"
897
+ for data_object in data_objects
898
+ ) or (
899
+ any(
900
+ data_object.get("data_object_type", "").lower()
901
+ == "metagenome raw read 1"
902
+ for data_object in data_objects
903
+ )
904
+ and any(
905
+ data_object.get("data_object_type", "").lower()
906
+ == "metagenome raw read 2"
907
+ for data_object in data_objects
908
+ )
909
+ )
910
+
911
+ if has_paired_reads:
912
+ sra_attributes.append(
913
+ self.set_element(
914
+ "Attribute", "paired", {"name": "library_layout"}
915
+ )
916
+ )
917
+ else:
918
+ sra_attributes.append(
919
+ self.set_element(
920
+ "Attribute", "single", {"name": "library_layout"}
921
+ )
922
+ )
923
+
924
+ # Add library_name attribute
925
+ if library_name:
926
+ sra_attributes.append(
927
+ self.set_element(
928
+ "Attribute", library_name, {"name": "library_name"}
929
+ )
930
+ )
931
+
932
+ for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
933
+ sra_attributes.append(
934
+ self.set_element(
935
+ "Attribute",
936
+ lib_prep_name,
937
+ {"name": "library_construction_protocol"},
938
+ )
939
+ )
940
+
941
+ for (
942
+ biosample_id,
943
+ omics_processing_id,
944
+ ) in nucleotide_sequencing_ids.items():
945
+ identifier_element = self.set_element(
946
+ "Identifier",
947
+ children=[
948
+ self.set_element(
949
+ "SPUID", omics_processing_id, {"spuid_namespace": org}
950
+ )
951
+ ],
952
+ )
953
+
954
+ action = self.set_element(
955
+ "Action",
956
+ children=[
957
+ self.set_element(
958
+ "AddFiles",
959
+ attrib={"target_db": "SRA"},
960
+ children=files_elements
961
+ + attribute_elements
962
+ + sra_attributes
963
+ + [identifier_element],
964
+ ),
965
+ ],
966
+ )
967
+
968
+ self.root.append(action)
969
+
970
+ def _create_pooled_sra_action(
971
+ self,
972
+ entries,
973
+ processed_sample_id,
974
+ processed_sample_name,
975
+ bioproject_id,
976
+ org,
977
+ nmdc_nucleotide_sequencing,
978
+ nmdc_library_preparation,
979
+ all_instruments,
980
+ bsm_id_name_dict,
981
+ ):
982
+ if not processed_sample_id:
983
+ return
984
+
985
+ # Collect all fastq files from all entries
986
+ all_fastq_files = set()
987
+ all_biosample_ids = set()
988
+ nucleotide_sequencing_ids = {}
989
+ lib_prep_protocol_names = {}
990
+ analyte_category = ""
991
+ instrument_vendor = ""
992
+ instrument_model = ""
993
+
994
+ for entry in entries:
995
+ for biosample_id, data_objects in entry.items():
996
+ all_biosample_ids.add(biosample_id)
997
+ for data_object in data_objects:
998
+ if "url" in data_object:
999
+ url = urlparse(data_object["url"])
1000
+ file_path = os.path.basename(url.path)
1001
+ all_fastq_files.add(file_path)
1002
+
1003
+ # Get nucleotide sequencing info
1004
+ for ntseq_dict in nmdc_nucleotide_sequencing:
1005
+ if biosample_id in ntseq_dict:
1006
+ for ntseq in ntseq_dict[biosample_id]:
1007
+ nucleotide_sequencing_ids[biosample_id] = ntseq.get(
1008
+ "id", ""
1009
+ )
1010
+ instrument_used = ntseq.get("instrument_used", [])
1011
+ if instrument_used:
1012
+ instrument_id = instrument_used[0]
1013
+ instrument = all_instruments.get(instrument_id, {})
1014
+ instrument_vendor = instrument.get("vendor", "")
1015
+ instrument_model = instrument.get("model", "")
1016
+ analyte_category = ntseq.get("analyte_category", "")
1017
+
1018
+ # Get library preparation info
1019
+ for lib_prep_dict in nmdc_library_preparation:
1020
+ if biosample_id in lib_prep_dict:
1021
+ lib_prep_protocol_names[biosample_id] = (
1022
+ lib_prep_dict[biosample_id]
1023
+ .get("protocol_link", {})
1024
+ .get("name", "")
1025
+ )
1026
+
1027
+ if all_fastq_files:
1028
+ files_elements = [
1029
+ self.set_element(
1030
+ "File",
1031
+ "",
1032
+ {"file_path": f},
1033
+ [
1034
+ self.set_element(
1035
+ "DataType",
1036
+ "sra-run-fastq" if ".fastq" in f else "generic-data",
1037
+ )
1038
+ ],
1039
+ )
1040
+ for f in sorted(all_fastq_files)
1041
+ ]
1042
+
1043
+ attribute_elements = [
1044
+ self.set_element(
1045
+ "AttributeRefId",
1046
+ attrib={"name": "BioProject"},
1047
+ children=[
1048
+ self.set_element(
1049
+ "RefId",
1050
+ children=[
1051
+ self.set_element(
1052
+ "PrimaryId",
1053
+ bioproject_id,
1054
+ {"db": "BioProject"},
1055
+ )
1056
+ ],
1057
+ )
1058
+ ],
1059
+ ),
1060
+ # Reference the processed sample, not individual biosamples
1061
+ self.set_element(
1062
+ "AttributeRefId",
1063
+ attrib={"name": "BioSample"},
1064
+ children=[
1065
+ self.set_element(
1066
+ "RefId",
1067
+ children=[
1068
+ self.set_element(
1069
+ "SPUID",
1070
+ processed_sample_id,
1071
+ {"spuid_namespace": org},
1072
+ )
1073
+ ],
1074
+ )
1075
+ ],
1076
+ ),
1077
+ ]
1078
+
1079
+ sra_attributes = []
1080
+ if instrument_vendor == "illumina":
1081
+ sra_attributes.append(
1082
+ self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
1083
+ )
1084
+ if instrument_model == "nextseq_550":
1085
+ sra_attributes.append(
1086
+ self.set_element(
1087
+ "Attribute", "NextSeq 550", {"name": "instrument_model"}
1088
+ )
1089
+ )
1090
+ elif instrument_model == "novaseq_6000":
1091
+ sra_attributes.append(
1092
+ self.set_element(
1093
+ "Attribute",
1094
+ "NovaSeq 6000",
1095
+ {"name": "instrument_model"},
1096
+ )
1097
+ )
1098
+ elif instrument_model == "hiseq":
1099
+ sra_attributes.append(
1100
+ self.set_element(
1101
+ "Attribute", "HiSeq", {"name": "instrument_model"}
1102
+ )
1103
+ )
1104
+
1105
+ if analyte_category == "metagenome":
1106
+ sra_attributes.append(
1107
+ self.set_element("Attribute", "WGS", {"name": "library_strategy"})
1108
+ )
1109
+ sra_attributes.append(
1110
+ self.set_element(
1111
+ "Attribute", "METAGENOMIC", {"name": "library_source"}
1112
+ )
1113
+ )
1114
+ sra_attributes.append(
1115
+ self.set_element(
1116
+ "Attribute", "RANDOM", {"name": "library_selection"}
1117
+ )
1118
+ )
1119
+ elif analyte_category == "metatranscriptome":
1120
+ sra_attributes.append(
1121
+ self.set_element(
1122
+ "Attribute",
1123
+ "METATRANSCRIPTOMIC",
1124
+ {"name": "library_source"},
1125
+ )
1126
+ )
1127
+
1128
+ # Determine library layout based on file patterns
1129
+ has_paired_reads = any(
1130
+ "_R1" in f and "_R2" in f.replace("_R1", "_R2") in all_fastq_files
1131
+ for f in all_fastq_files
1132
+ if "_R1" in f
1133
+ )
1134
+
1135
+ if has_paired_reads:
1136
+ sra_attributes.append(
1137
+ self.set_element("Attribute", "paired", {"name": "library_layout"})
1138
+ )
1139
+ else:
1140
+ sra_attributes.append(
1141
+ self.set_element("Attribute", "single", {"name": "library_layout"})
1142
+ )
1143
+
1144
+ # Add library_name attribute using ProcessedSample name
1145
+ if processed_sample_name:
1146
+ sra_attributes.append(
1147
+ self.set_element(
1148
+ "Attribute", processed_sample_name, {"name": "library_name"}
1149
+ )
1150
+ )
1151
+
1152
+ # Add library construction protocol from any of the biosamples
1153
+ for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
1154
+ if lib_prep_name:
1155
+ sra_attributes.append(
1156
+ self.set_element(
1157
+ "Attribute",
1158
+ lib_prep_name,
1159
+ {"name": "library_construction_protocol"},
1160
+ )
1161
+ )
1162
+ break # Only add one protocol name
1163
+
1164
+ # Use the first nucleotide sequencing ID as the identifier
1165
+ omics_processing_id = None
1166
+ for biosample_id, seq_id in nucleotide_sequencing_ids.items():
1167
+ if seq_id:
1168
+ omics_processing_id = seq_id
1169
+ break
1170
+
1171
+ if omics_processing_id:
1172
+ identifier_element = self.set_element(
1173
+ "Identifier",
1174
+ children=[
1175
+ self.set_element(
1176
+ "SPUID", omics_processing_id, {"spuid_namespace": org}
1177
+ )
1178
+ ],
1179
+ )
1180
+
1181
+ action = self.set_element(
1182
+ "Action",
1183
+ children=[
1184
+ self.set_element(
1185
+ "AddFiles",
1186
+ attrib={"target_db": "SRA"},
1187
+ children=files_elements
1188
+ + attribute_elements
1189
+ + sra_attributes
1190
+ + [identifier_element],
1191
+ ),
1192
+ ],
1193
+ )
1194
+
1195
+ self.root.append(action)
1196
+
1197
+ def get_submission_xml(
1198
+ self,
1199
+ biosamples_list: list,
1200
+ biosample_nucleotide_sequencing_list: list,
1201
+ biosample_data_objects_list: list,
1202
+ biosample_library_preparation_list: list,
1203
+ instruments_dict: dict,
1204
+ pooled_biosamples_data=None,
1205
+ ):
1206
+ # data_type = None
1207
+
1208
+ biosamples_to_exclude = set()
1209
+ for bsm_ntseq in biosample_nucleotide_sequencing_list:
1210
+ for bsm_id, ntseq_list in bsm_ntseq.items():
1211
+ # Check if any processing_institution is "JGI"
1212
+ for ntseq in ntseq_list:
1213
+ if (
1214
+ "processing_institution" in ntseq
1215
+ and ntseq["processing_institution"] == "JGI"
1216
+ ):
1217
+ biosamples_to_exclude.add(bsm_id)
1218
+ break
1219
+
1220
+ # Filter biosample_nucleotide_sequencing_list to exclude JGI records
1221
+ filtered_nucleotide_sequencing_list = []
1222
+ for bsm_ntseq in biosample_nucleotide_sequencing_list:
1223
+ filtered_dict = {}
1224
+ for bsm_id, ntseq_list in bsm_ntseq.items():
1225
+ if bsm_id not in biosamples_to_exclude:
1226
+ filtered_dict[bsm_id] = ntseq_list
1227
+ if filtered_dict: # Only add non-empty dictionaries
1228
+ filtered_nucleotide_sequencing_list.append(filtered_dict)
1229
+
1230
+ # Filter biosamples_list to exclude JGI-processed biosamples
1231
+ filtered_biosamples_list = [
1232
+ biosample
1233
+ for biosample in biosamples_list
1234
+ if biosample.get("id") not in biosamples_to_exclude
1235
+ ]
1236
+
1237
+ # Get data_type from filtered list
1238
+ # for bsm_ntseq in filtered_nucleotide_sequencing_list:
1239
+ # for _, ntseq_list in bsm_ntseq.items():
1240
+ # for ntseq in ntseq_list:
1241
+ # if "analyte_category" in ntseq:
1242
+ # data_type = handle_string_value(
1243
+ # ntseq["analyte_category"]
1244
+ # ).capitalize()
1245
+
1246
+ self.set_description(
1247
+ email=self.nmdc_pi_email,
1248
+ first=self.first_name,
1249
+ last=self.last_name,
1250
+ org=self.ncbi_submission_metadata.get("organization", ""),
1251
+ )
1252
+
1253
+ # if not self.ncbi_bioproject_id:
1254
+ # self.set_bioproject(
1255
+ # title=self.nmdc_study_title,
1256
+ # project_id=self.ncbi_bioproject_id,
1257
+ # description=self.nmdc_study_description,
1258
+ # data_type=data_type,
1259
+ # org=self.ncbi_submission_metadata.get("organization", ""),
1260
+ # )
1261
+
1262
+ self.set_biosample(
1263
+ organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
1264
+ org=self.ncbi_submission_metadata.get("organization", ""),
1265
+ bioproject_id=self.ncbi_bioproject_id,
1266
+ nmdc_biosamples=filtered_biosamples_list,
1267
+ pooled_biosamples_data=pooled_biosamples_data,
1268
+ )
1269
+
1270
+ # Also filter biosample_data_objects_list
1271
+ filtered_data_objects_list = []
1272
+ acceptable_extensions = [".fastq.gz", ".fastq"]
1273
+
1274
+ for entry in biosample_data_objects_list:
1275
+ filtered_entry = {}
1276
+ for biosample_id, data_objects in entry.items():
1277
+ if biosample_id not in biosamples_to_exclude:
1278
+ # filter data_objects based on acceptable/allowed extensions
1279
+ # for "url" key in data_object
1280
+ filtered_objects = []
1281
+ for data_object in data_objects:
1282
+ if "url" in data_object:
1283
+ url = urlparse(data_object["url"])
1284
+ file_path = os.path.basename(url.path)
1285
+ if any(
1286
+ file_path.endswith(ext) for ext in acceptable_extensions
1287
+ ):
1288
+ filtered_objects.append(data_object)
1289
+
1290
+ if filtered_objects:
1291
+ filtered_entry[biosample_id] = filtered_objects
1292
+
1293
+ if filtered_entry: # Only add non-empty entries
1294
+ filtered_data_objects_list.append(filtered_entry)
1295
+
1296
+ # Filter library preparation list as well
1297
+ filtered_library_preparation_list = []
1298
+ for lib_prep_dict in biosample_library_preparation_list:
1299
+ filtered_lib_prep = {}
1300
+ for biosample_id, lib_prep in lib_prep_dict.items():
1301
+ if biosample_id not in biosamples_to_exclude:
1302
+ filtered_lib_prep[biosample_id] = lib_prep
1303
+ if filtered_lib_prep: # Only add non-empty entries
1304
+ filtered_library_preparation_list.append(filtered_lib_prep)
1305
+
1306
+ self.set_fastq(
1307
+ biosample_data_objects=filtered_data_objects_list,
1308
+ bioproject_id=self.ncbi_bioproject_id,
1309
+ org=self.ncbi_submission_metadata.get("organization", ""),
1310
+ nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
1311
+ nmdc_biosamples=filtered_biosamples_list,
1312
+ nmdc_library_preparation=filtered_library_preparation_list,
1313
+ all_instruments=instruments_dict,
1314
+ pooled_biosamples_data=pooled_biosamples_data,
1315
+ )
1316
+
1317
+ rough_string = ET.tostring(self.root, "unicode")
1318
+ reparsed = xml.dom.minidom.parseString(rough_string)
1319
+ submission_xml = reparsed.toprettyxml(indent=" ", newl="\n")
1320
+
1321
+ # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
1322
+ # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
1323
+ # validate_xml(submission_xml, submission_xsd_url)
1324
+
1325
+ # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co"
1326
+ # validate_xml(submission_xml, bioproject_xsd_url)
1327
+
1328
+ # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"
1329
+ # validate_xml(submission_xml, biosample_xsd_url)
1330
+
1331
+ return submission_xml