nmdc-runtime 1.6.0__py3-none-any.whl → 1.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

@@ -0,0 +1,529 @@
1
+ import os
2
+ import datetime
3
+ import xml.etree.ElementTree as ET
4
+ import xml.dom.minidom
5
+
6
+ from typing import Any
7
+ from urllib.parse import urlparse
8
+ from nmdc_runtime.site.export.ncbi_xml_utils import (
9
+ handle_controlled_identified_term_value,
10
+ handle_controlled_term_value,
11
+ handle_geolocation_value,
12
+ handle_quantity_value,
13
+ handle_text_value,
14
+ handle_timestamp_value,
15
+ handle_float_value,
16
+ handle_string_value,
17
+ load_mappings,
18
+ validate_xml,
19
+ )
20
+
21
+
22
class NCBISubmissionXML:
    """Assemble an NCBI submission XML document for one NMDC study.

    All content is accumulated under a single ``<Submission>`` root element
    (``self.root``). The ``set_*`` methods each append one or more children to
    that root; ``get_submission_xml`` calls them in order and returns the
    pretty-printed document as a string.
    """

    def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
        """Capture study-level fields and submission settings.

        Args:
            nmdc_study: Mapping-like study record. The keys read here are
                ``id``, ``title``, ``description``,
                ``insdc_bioproject_identifiers`` and ``principal_investigator``
                (itself a mapping with ``name`` and ``email``).
            ncbi_submission_metadata: Settings dict. The keys read here are
                ``nmdc_ncbi_attribute_mapping_file_url``,
                ``ncbi_submission_metadata`` and ``ncbi_biosample_metadata``.
        """
        self.root = ET.Element("Submission")

        self.nmdc_study_id = nmdc_study.get("id")
        self.nmdc_study_title = nmdc_study.get("title")
        self.nmdc_study_description = nmdc_study.get("description")
        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")

        # A study without a principal investigator, or a PI record without a
        # name, must not abort construction: fall back to None for the
        # missing name parts instead of raising AttributeError / IndexError.
        principal_investigator = nmdc_study.get("principal_investigator") or {}
        self.nmdc_pi_email = principal_investigator.get("email")
        name_parts = (principal_investigator.get("name") or "").split()
        self.first_name = name_parts[0] if name_parts else None
        # NOTE(review): the *second* whitespace-separated token is used as the
        # last name, so "Mary Ann Smith" yields "Ann" — confirm this is intended.
        self.last_name = name_parts[1] if len(name_parts) > 1 else None

        self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
            "nmdc_ncbi_attribute_mapping_file_url"
        )
        self.ncbi_submission_metadata = ncbi_submission_metadata.get(
            "ncbi_submission_metadata", {}
        )
        self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
            "ncbi_biosample_metadata", {}
        )

        # Dispatch table: NMDC slot range name -> function that flattens a
        # value of that range into the text of an NCBI <Attribute>.
        self.type_handlers = {
            "QuantityValue": handle_quantity_value,
            "TextValue": handle_text_value,
            "TimestampValue": handle_timestamp_value,
            "ControlledTermValue": handle_controlled_term_value,
            "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
            "GeolocationValue": handle_geolocation_value,
            "float": handle_float_value,
            "string": handle_string_value,
        }

    def set_element(self, tag, text="", attrib=None, children=None):
        """Create and return a detached XML element.

        Args:
            tag: Element tag name.
            text: Text content assigned to the element.
            attrib: Optional dict of XML attributes.
            children: Optional iterable of elements appended in order.

        Returns:
            The new ``xml.etree.ElementTree.Element`` (not attached to
            ``self.root``).
        """
        element = ET.Element(tag, attrib=attrib or {})
        element.text = text
        element.extend(children or [])
        return element

    def set_description(self, email, user, first, last, org, date=None):
        """Append the ``<Description>`` section to the submission root.

        Args:
            email: Contact e-mail, written as the ``email`` attribute.
            user: Value of the ``<Submitter user_name=...>`` attribute.
            first: Contact first name.
            last: Contact last name.
            org: Organization name.
            date: Release date for ``<Hold>``; defaults to today's local date
                formatted as ``YYYY-MM-DD``.
        """
        release_date = date or datetime.datetime.now().strftime("%Y-%m-%d")

        contact_name = self.set_element(
            "Name",
            children=[
                self.set_element("First", first),
                self.set_element("Last", last),
            ],
        )
        contact = self.set_element(
            "Contact", attrib={"email": email}, children=[contact_name]
        )
        organization = self.set_element(
            "Organization",
            attrib={"role": "owner", "type": "center"},
            children=[self.set_element("Name", org), contact],
        )

        self.root.append(
            self.set_element(
                "Description",
                children=[
                    self.set_element(
                        "Comment", f"NMDC Submission for {self.nmdc_study_id}"
                    ),
                    self.set_element("Submitter", attrib={"user_name": user}),
                    organization,
                    self.set_element("Hold", attrib={"release_date": release_date}),
                ],
            )
        )

    def set_descriptor(self, title, description):
        """Return the ``<Title>`` and ``<Description>`` elements of a project.

        The description text is wrapped in a ``<p>`` child. Nothing is
        attached to ``self.root``; the caller places the returned list.
        """
        return [
            self.set_element("Title", title),
            self.set_element(
                "Description", children=[self.set_element("p", description)]
            ),
        ]

    def set_bioproject(self, title, project_id, description, data_type, org):
        """Append an ``<Action>`` that adds a BioProject to the submission.

        Args:
            title: Project title.
            project_id: Text of the project SPUID elements.
            description: Project description text.
            data_type: Text of the ``<DataType>`` element.
            org: Value of the ``spuid_namespace`` attribute.
        """
        project_id_element = self.set_element(
            "ProjectID",
            children=[self.set_element("SPUID", project_id, {"spuid_namespace": org})],
        )

        # "sample_scope" is an enumeration field. Docs:
        # https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd
        # The scope is "eEnvironment" when "Content of species in a sample is
        # not known, i.e. microbiome,metagenome, etc.."
        project_type = self.set_element(
            "ProjectType",
            children=[
                self.set_element(
                    "ProjectTypeSubmission",
                    attrib={"sample_scope": "eEnvironment"},
                    children=[
                        self.set_element(
                            "IntendedDataTypeSet",
                            children=[self.set_element("DataType", data_type)],
                        )
                    ],
                )
            ],
        )

        project = self.set_element(
            "Project",
            attrib={"schema_version": "2.0"},
            children=[project_id_element]
            + self.set_descriptor(title, description)
            + [project_type],
        )
        data_element = self.set_element(
            "Data",
            attrib={"content_type": "XML"},
            children=[self.set_element("XmlContent", children=[project])],
        )
        identifier = self.set_element(
            "Identifier",
            children=[self.set_element("SPUID", project_id, {"spuid_namespace": org})],
        )

        add_data = self.set_element(
            "AddData",
            attrib={"target_db": "BioProject"},
            children=[data_element, identifier],
        )
        self.root.append(self.set_element("Action", children=[add_data]))

    def set_biosample(
        self,
        organism_name,
        org,
        bioproject_id,
        nmdc_biosamples,
    ):
        """Append one BioSample ``<Action>`` per NMDC biosample.

        Attribute names and value types come from ``load_mappings`` applied to
        ``self.nmdc_ncbi_attribute_mapping_file_url``. Biosample keys that are
        absent from the attribute mapping, and keys whose value is a list, are
        not exported.

        Args:
            organism_name: Text of ``<OrganismName>`` (also used in the title).
            org: Value of the ``spuid_namespace`` attribute.
            bioproject_id: Text of the BioProject ``<PrimaryId>``.
            nmdc_biosamples: Iterable of biosample dicts.
        """
        attribute_mappings, slot_range_mappings = load_mappings(
            self.nmdc_ncbi_attribute_mapping_file_url
        )

        for biosample in nmdc_biosamples:
            attributes = {}
            sample_id_value = None
            env_package = None

            for json_key, value in biosample.items():
                if isinstance(value, list):
                    continue  # list-valued slots are not exported

                if json_key == "env_package":
                    # No `continue`: env_package may also be a mapped attribute.
                    env_package = f"MIMS.me.{handle_text_value(value)}.6.0"

                if json_key == "id":
                    # The NMDC biosample id becomes the SPUID, not an Attribute.
                    sample_id_value = value
                    continue

                if json_key not in attribute_mappings:
                    continue

                value_type = slot_range_mappings.get(json_key, "string")
                handler = self.type_handlers.get(value_type, handle_string_value)
                attributes[attribute_mappings[json_key]] = handler(value)

            biosample_elements = [
                self.set_element(
                    "SampleId",
                    children=[
                        self.set_element(
                            "SPUID", sample_id_value, {"spuid_namespace": org}
                        )
                    ],
                ),
                self.set_element(
                    "Descriptor",
                    children=[
                        self.set_element(
                            "Title",
                            f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study",
                        ),
                    ],
                ),
                self.set_element(
                    "Organism",
                    children=[self.set_element("OrganismName", organism_name)],
                ),
                self.set_element(
                    "BioProject",
                    children=[
                        self.set_element(
                            "PrimaryId", bioproject_id, {"db": "BioProject"}
                        )
                    ],
                ),
                self.set_element("Package", env_package),
                self.set_element(
                    "Attributes",
                    children=[
                        self.set_element(
                            "Attribute", attributes[key], {"attribute_name": key}
                        )
                        for key in sorted(attributes)
                    ],
                ),
            ]

            data_element = self.set_element(
                "Data",
                attrib={"content_type": "XML"},
                children=[
                    self.set_element(
                        "XmlContent",
                        children=[
                            self.set_element(
                                "BioSample",
                                attrib={"schema_version": "2.0"},
                                children=biosample_elements,
                            ),
                        ],
                    ),
                ],
            )
            identifier = self.set_element(
                "Identifier",
                children=[
                    self.set_element(
                        "SPUID", sample_id_value, {"spuid_namespace": org}
                    ),
                ],
            )
            add_data = self.set_element(
                "AddData",
                attrib={"target_db": "BioSample"},
                children=[data_element, identifier],
            )
            self.root.append(self.set_element("Action", children=[add_data]))

    def _reference_attribute(self, name, spuid, org):
        """Return ``<AttributeRefId name=...>`` wrapping a ``RefId/SPUID``."""
        return self.set_element(
            "AttributeRefId",
            attrib={"name": name},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element("SPUID", spuid, {"spuid_namespace": org})
                    ],
                )
            ],
        )

    @staticmethod
    def _has_paired_reads(data_objects):
        """Tell whether the data objects describe a paired-end layout.

        True when any object is typed "metagenome raw reads", or when both a
        "metagenome raw read 1" and a "metagenome raw read 2" object are
        present (type names compared case-insensitively).
        """
        object_types = {
            data_object.get("data_object_type", "").lower()
            for data_object in data_objects
        }
        return "metagenome raw reads" in object_types or (
            "metagenome raw read 1" in object_types
            and "metagenome raw read 2" in object_types
        )

    def _sra_attributes(self, instrument_name, omics_type, paired, library_name):
        """Return the SRA ``<Attribute>`` elements describing a library."""

        def attribute(name, value):
            return self.set_element("Attribute", value, {"name": name})

        instrument = instrument_name.lower()
        sra_attributes = []

        if instrument.startswith("illumina"):
            sra_attributes.append(attribute("platform", "ILLUMINA"))
            if "nextseq550" in instrument:
                sra_attributes.append(attribute("instrument_model", "NextSeq 550"))

        if omics_type == "metagenome":
            sra_attributes.append(attribute("library_strategy", "WGS"))
            sra_attributes.append(attribute("library_source", "METAGENOMIC"))
            sra_attributes.append(attribute("library_selection", "RANDOM"))

        if omics_type == "metatranscriptome":
            sra_attributes.append(attribute("library_source", "METATRANSCRIPTOMIC"))

        sra_attributes.append(
            attribute("library_layout", "paired" if paired else "single")
        )

        if library_name:
            sra_attributes.append(attribute("library_name", library_name))

        return sra_attributes

    def set_fastq(
        self,
        biosample_data_objects: list,
        bioproject_id: str,
        org: str,
        nmdc_omics_processing: list,
        nmdc_biosamples: list,
    ):
        """Append SRA ``AddFiles`` actions for the sequencing files.

        For each entry of ``biosample_data_objects`` that yields at least one
        file, one ``<Action>`` is appended per biosample of that entry for
        which an omics processing record was found.

        Args:
            biosample_data_objects: List of dicts mapping a biosample id to a
                list of data object dicts. A data object contributes a file
                when it has a ``url``; only the URL path's base name is used.
            bioproject_id: SPUID text of the BioProject reference.
            org: Value of the ``spuid_namespace`` attribute.
            nmdc_omics_processing: List of dicts mapping a biosample id to a
                list of omics processing dicts (``id``, ``instrument_name``,
                ``omics_type.has_raw_value`` are read).
            nmdc_biosamples: List of biosample dicts; ``id`` is required,
                ``name`` is optional and supplies the library name.
        """
        # A biosample without a name simply gets no library_name attribute
        # instead of aborting the whole export with a KeyError.
        bsm_id_name_dict = {
            biosample["id"]: biosample.get("name", "")
            for biosample in nmdc_biosamples
        }

        for entry in biosample_data_objects:
            fastq_files = []
            biosample_ids = []
            entry_data_objects = []
            omics_processing_ids = {}
            instrument_name = ""
            omics_type = ""
            library_name = ""

            for biosample_id, data_objects in entry.items():
                biosample_ids.append(biosample_id)
                entry_data_objects.extend(data_objects)
                for data_object in data_objects:
                    if "url" in data_object:
                        url = urlparse(data_object["url"])
                        fastq_files.append(os.path.basename(url.path))

                # When several records match, the last one seen wins for the
                # instrument / omics type / library name of the entry.
                for omprc_dict in nmdc_omics_processing:
                    if biosample_id not in omprc_dict:
                        continue
                    for omprc in omprc_dict[biosample_id]:
                        omics_processing_ids[biosample_id] = omprc.get("id", "")
                        instrument_name = omprc.get("instrument_name", "")
                        omics_type = (
                            omprc.get("omics_type", {})
                            .get("has_raw_value", "")
                            .lower()
                        )
                        library_name = bsm_id_name_dict.get(biosample_id, "")

            if not fastq_files:
                continue

            files_elements = [
                self.set_element(
                    "File",
                    "",
                    {"file_path": f},
                    [self.set_element("DataType", "generic-data")],
                )
                for f in fastq_files
            ]

            attribute_elements = [
                self._reference_attribute("BioProject", bioproject_id, org)
            ]
            attribute_elements.extend(
                self._reference_attribute("BioSample", biosample_id, org)
                for biosample_id in biosample_ids
            )

            # Layout is decided from every data object of the entry, matching
            # the pooled file list above (not just the last biosample's).
            sra_attributes = self._sra_attributes(
                instrument_name,
                omics_type,
                self._has_paired_reads(entry_data_objects),
                library_name,
            )

            # The same file/attribute element objects are shared by every
            # Action built below; only the Identifier differs.
            for omics_processing_id in omics_processing_ids.values():
                identifier_element = self.set_element(
                    "Identifier",
                    children=[
                        self.set_element(
                            "SPUID", omics_processing_id, {"spuid_namespace": org}
                        )
                    ],
                )
                add_files = self.set_element(
                    "AddFiles",
                    attrib={"target_db": "SRA"},
                    children=files_elements
                    + attribute_elements
                    + sra_attributes
                    + [identifier_element],
                )
                self.root.append(self.set_element("Action", children=[add_files]))

    def get_submission_xml(
        self,
        biosamples_list: list,
        biosample_omics_processing_list: list,
        biosample_data_objects_list: list,
    ):
        """Build the full submission and return it as pretty-printed XML.

        Appends the description, an optional BioProject action, the BioSample
        actions and the SRA file actions to ``self.root``, then serializes the
        root.

        Args:
            biosamples_list: Biosample dicts, passed to ``set_biosample`` and
                ``set_fastq``.
            biosample_omics_processing_list: List of dicts mapping a biosample
                id to its omics processing dicts.
            biosample_data_objects_list: List of dicts mapping a biosample id
                to its data object dicts.

        Returns:
            The submission document as a string.
        """
        data_type = None
        ncbi_project_id = None
        # The last omics processing record carrying each key wins.
        for bsm_omprc in biosample_omics_processing_list:
            for omprc_list in bsm_omprc.values():
                for omprc in omprc_list:
                    if "omics_type" in omprc:
                        data_type = handle_text_value(omprc["omics_type"]).capitalize()

                    if "ncbi_project_name" in omprc:
                        ncbi_project_id = omprc["ncbi_project_name"]

        org = self.ncbi_submission_metadata.get("organization", "")

        self.set_description(
            email=self.nmdc_pi_email,
            user="National Microbiome Data Collaborative (NMDC)",
            first=self.first_name,
            last=self.last_name,
            org=org,
        )

        # A BioProject is only created when no record names an existing one.
        # NOTE(review): in this branch project_id is empty/None, so the SPUID
        # elements written by set_bioproject carry no text — confirm intent.
        if not ncbi_project_id:
            self.set_bioproject(
                title=self.nmdc_study_title,
                project_id=ncbi_project_id,
                description=self.nmdc_study_description,
                data_type=data_type,
                org=org,
            )

        self.set_biosample(
            organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
            org=org,
            bioproject_id=ncbi_project_id,
            nmdc_biosamples=biosamples_list,
        )

        self.set_fastq(
            biosample_data_objects=biosample_data_objects_list,
            bioproject_id=ncbi_project_id,
            org=org,
            nmdc_omics_processing=biosample_omics_processing_list,
            nmdc_biosamples=biosamples_list,
        )

        # Round-trip through minidom purely to get indented output.
        rough_string = ET.tostring(self.root, "unicode")
        reparsed = xml.dom.minidom.parseString(rough_string)
        submission_xml = reparsed.toprettyxml(indent=" ", newl="\n")

        # ============= Uncomment the following code to validate the XML against NCBI XSDs ============ #
        # submission_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co"
        # validate_xml(submission_xml, submission_xsd_url)

        # bioproject_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co"
        # validate_xml(submission_xml, bioproject_xsd_url)

        # biosample_xsd_url = "https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co"
        # validate_xml(submission_xml, biosample_xsd_url)

        return submission_xml