nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +59 -3
- nmdc_runtime/site/export/ncbi_xml.py +29 -25
- nmdc_runtime/site/export/ncbi_xml_utils.py +5 -5
- nmdc_runtime/site/export/study_metadata.py +3 -1
- nmdc_runtime/site/graphs.py +71 -15
- nmdc_runtime/site/ops.py +135 -42
- nmdc_runtime/site/repository.py +16 -4
- nmdc_runtime/site/translation/gold_translator.py +112 -43
- nmdc_runtime/site/translation/neon_benthic_translator.py +59 -34
- nmdc_runtime/site/translation/neon_soil_translator.py +72 -48
- nmdc_runtime/site/translation/neon_surface_water_translator.py +61 -32
- nmdc_runtime/site/translation/neon_utils.py +19 -6
- nmdc_runtime/site/translation/submission_portal_translator.py +67 -36
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/METADATA +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/RECORD +19 -19
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
import collections
|
|
2
|
+
import csv
|
|
2
3
|
import re
|
|
3
4
|
from typing import List, Tuple, Union
|
|
4
5
|
from nmdc_schema import nmdc
|
|
6
|
+
import pandas as pd
|
|
5
7
|
|
|
6
8
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
7
9
|
|
|
@@ -10,18 +12,22 @@ class GoldStudyTranslator(Translator):
|
|
|
10
12
|
def __init__(
|
|
11
13
|
self,
|
|
12
14
|
study: JSON_OBJECT = {},
|
|
15
|
+
study_type: str = "research_study",
|
|
13
16
|
biosamples: List[JSON_OBJECT] = [],
|
|
14
17
|
projects: List[JSON_OBJECT] = [],
|
|
15
18
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
19
|
+
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
16
20
|
*args,
|
|
17
21
|
**kwargs,
|
|
18
22
|
) -> None:
|
|
19
23
|
super().__init__(*args, **kwargs)
|
|
20
24
|
|
|
21
25
|
self.study = study
|
|
26
|
+
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
22
27
|
self.biosamples = biosamples
|
|
23
28
|
self.projects = projects
|
|
24
29
|
self.analysis_projects = analysis_projects
|
|
30
|
+
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
25
31
|
|
|
26
32
|
self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
|
|
27
33
|
self._analysis_projects_by_id = self._index_by_id(
|
|
@@ -69,6 +75,7 @@ class GoldStudyTranslator(Translator):
|
|
|
69
75
|
has_raw_value=pi_dict.get("name"),
|
|
70
76
|
name=pi_dict.get("name"),
|
|
71
77
|
email=pi_dict.get("email"),
|
|
78
|
+
type="nmdc:PersonValue",
|
|
72
79
|
)
|
|
73
80
|
|
|
74
81
|
def _get_mod_date(self, gold_entity: JSON_OBJECT) -> Union[str, None]:
|
|
@@ -108,22 +115,58 @@ class GoldStudyTranslator(Translator):
|
|
|
108
115
|
|
|
109
116
|
def _get_samp_taxon_id(
|
|
110
117
|
self, gold_biosample: JSON_OBJECT
|
|
111
|
-
) -> Union[nmdc.
|
|
112
|
-
"""Get a
|
|
118
|
+
) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
|
|
119
|
+
"""Get a ControlledIdentifiedTermValue representing the NCBI taxon
|
|
120
|
+
for a GOLD biosample
|
|
113
121
|
|
|
114
122
|
This method gets the `ncbiTaxName` and `ncbiTaxId` from a GOLD biosample object.
|
|
115
|
-
If both are not `None`, it constructs a
|
|
123
|
+
If both are not `None`, it constructs a ControlledIdentifiedTermValue of the format
|
|
116
124
|
`{ncbiTaxName} [NCBITaxon:{ncbiTaxId}]`. Otherwise, it returns `None`
|
|
117
125
|
|
|
118
126
|
:param gold_biosample: GOLD biosample object
|
|
119
|
-
:return:
|
|
127
|
+
:return: ControlledIdentifiedTermValue object
|
|
120
128
|
"""
|
|
121
129
|
ncbi_tax_name = gold_biosample.get("ncbiTaxName")
|
|
122
130
|
ncbi_tax_id = gold_biosample.get("ncbiTaxId")
|
|
123
131
|
if ncbi_tax_name is None or ncbi_tax_id is None:
|
|
124
132
|
return None
|
|
125
133
|
|
|
126
|
-
|
|
134
|
+
raw_value = f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]"
|
|
135
|
+
|
|
136
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
137
|
+
has_raw_value=raw_value,
|
|
138
|
+
term=nmdc.OntologyClass(
|
|
139
|
+
id=f"NCBITaxon:{ncbi_tax_id}",
|
|
140
|
+
name=ncbi_tax_name,
|
|
141
|
+
type="nmdc:OntologyClass",
|
|
142
|
+
),
|
|
143
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
def _get_host_taxid(
|
|
147
|
+
self, gold_biosample: JSON_OBJECT
|
|
148
|
+
) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
|
|
149
|
+
"""Get a ControlledIdentifiedTermValue representing the NCBI host taxon id
|
|
150
|
+
for a GOLD biosample
|
|
151
|
+
|
|
152
|
+
This method gets the `hostNcbiTaxid` from a GOLD biosample object.
|
|
153
|
+
It constructs a ControlledIdentifiedTermValue of the format
|
|
154
|
+
`NCBITaxon:{hostNcbiTaxid}`. If `hostNcbiTaxid` is `None`, it returns `None`
|
|
155
|
+
|
|
156
|
+
:param gold_biosample: GOLD biosample object
|
|
157
|
+
:return: ControlledIdentifiedTermValue object
|
|
158
|
+
"""
|
|
159
|
+
host_taxid = gold_biosample.get("hostNcbiTaxid")
|
|
160
|
+
if host_taxid is None:
|
|
161
|
+
return None
|
|
162
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
163
|
+
has_raw_value=f"NCBITaxon:{host_taxid}",
|
|
164
|
+
term=nmdc.OntologyClass(
|
|
165
|
+
id=f"NCBITaxon:{host_taxid}",
|
|
166
|
+
type="nmdc:OntologyClass",
|
|
167
|
+
),
|
|
168
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
169
|
+
)
|
|
127
170
|
|
|
128
171
|
def _get_samp_name(self, gold_biosample: JSON_OBJECT) -> Union[str, None]:
|
|
129
172
|
"""Get a sample name for a GOLD biosample object
|
|
@@ -183,7 +226,9 @@ class GoldStudyTranslator(Translator):
|
|
|
183
226
|
date_collected = gold_biosample.get("dateCollected")
|
|
184
227
|
if date_collected is None:
|
|
185
228
|
return None
|
|
186
|
-
return nmdc.TimestampValue(
|
|
229
|
+
return nmdc.TimestampValue(
|
|
230
|
+
has_raw_value=date_collected, type="nmdc:TimestampValue"
|
|
231
|
+
)
|
|
187
232
|
|
|
188
233
|
def _get_quantity_value(
|
|
189
234
|
self,
|
|
@@ -215,12 +260,14 @@ class GoldStudyTranslator(Translator):
|
|
|
215
260
|
has_raw_value=minimum_numeric_value,
|
|
216
261
|
has_numeric_value=nmdc.Double(minimum_numeric_value),
|
|
217
262
|
has_unit=unit,
|
|
263
|
+
type="nmdc:QuantityValue",
|
|
218
264
|
)
|
|
219
265
|
else:
|
|
220
266
|
return nmdc.QuantityValue(
|
|
221
267
|
has_minimum_numeric_value=nmdc.Double(minimum_numeric_value),
|
|
222
268
|
has_maximum_numeric_value=nmdc.Double(maximum_numeric_value),
|
|
223
269
|
has_unit=unit,
|
|
270
|
+
type="nmdc:QuantityValue",
|
|
224
271
|
)
|
|
225
272
|
|
|
226
273
|
field_value = gold_entity.get(gold_field)
|
|
@@ -231,6 +278,7 @@ class GoldStudyTranslator(Translator):
|
|
|
231
278
|
has_raw_value=field_value,
|
|
232
279
|
has_numeric_value=nmdc.Double(field_value),
|
|
233
280
|
has_unit=unit,
|
|
281
|
+
type="nmdc:QuantityValue",
|
|
234
282
|
)
|
|
235
283
|
|
|
236
284
|
def _get_text_value(
|
|
@@ -249,7 +297,7 @@ class GoldStudyTranslator(Translator):
|
|
|
249
297
|
field_value = gold_entity.get(gold_field)
|
|
250
298
|
if field_value is None:
|
|
251
299
|
return None
|
|
252
|
-
return nmdc.TextValue(has_raw_value=field_value)
|
|
300
|
+
return nmdc.TextValue(has_raw_value=field_value, type="nmdc:TextValue")
|
|
253
301
|
|
|
254
302
|
def _get_controlled_term_value(
|
|
255
303
|
self, gold_entity: JSON_OBJECT, gold_field: str
|
|
@@ -267,7 +315,9 @@ class GoldStudyTranslator(Translator):
|
|
|
267
315
|
field_value = gold_entity.get(gold_field)
|
|
268
316
|
if field_value is None:
|
|
269
317
|
return None
|
|
270
|
-
return nmdc.ControlledTermValue(
|
|
318
|
+
return nmdc.ControlledTermValue(
|
|
319
|
+
has_raw_value=field_value, type="nmdc:ControlledTermValue"
|
|
320
|
+
)
|
|
271
321
|
|
|
272
322
|
def _get_env_term_value(
|
|
273
323
|
self, gold_biosample: JSON_OBJECT, gold_field: str
|
|
@@ -277,8 +327,8 @@ class GoldStudyTranslator(Translator):
|
|
|
277
327
|
In GOLD entities ENVO terms are represented as a nested object with `id` and `label`
|
|
278
328
|
fields. This method extracts this type of nested object by the given field name, and
|
|
279
329
|
returns it as an `nmdc:ControlledIdentifiedTermValue` object. The `id` in the original
|
|
280
|
-
GOLD object be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
|
|
281
|
-
`ENVO:00005801`). If the value of the given field is `None` or if does not contain
|
|
330
|
+
GOLD object should be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
|
|
331
|
+
`ENVO:00005801`). If the value of the given field is `None` or if it does not contain
|
|
282
332
|
a nested object with an `id` field, `None` is returned.
|
|
283
333
|
|
|
284
334
|
:param gold_biosample: GOLD biosample object
|
|
@@ -292,8 +342,10 @@ class GoldStudyTranslator(Translator):
|
|
|
292
342
|
term=nmdc.OntologyClass(
|
|
293
343
|
id=env_field["id"].replace("_", ":"),
|
|
294
344
|
name=env_field.get("label"),
|
|
345
|
+
type="nmdc:OntologyClass",
|
|
295
346
|
),
|
|
296
347
|
has_raw_value=env_field["id"],
|
|
348
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
297
349
|
)
|
|
298
350
|
|
|
299
351
|
def _get_lat_lon(
|
|
@@ -316,22 +368,40 @@ class GoldStudyTranslator(Translator):
|
|
|
316
368
|
has_raw_value=f"{latitude} {longitude}",
|
|
317
369
|
latitude=nmdc.DecimalDegree(latitude),
|
|
318
370
|
longitude=nmdc.DecimalDegree(longitude),
|
|
371
|
+
type="nmdc:GeolocationValue",
|
|
319
372
|
)
|
|
320
373
|
|
|
321
|
-
def
|
|
322
|
-
"""Get instrument
|
|
374
|
+
def _get_instrument(self, gold_project: JSON_OBJECT) -> Union[str, None]:
|
|
375
|
+
"""Get instrument id referenced in instrument_set collection in Mongo.
|
|
376
|
+
Note: The instrument id is not retrieved by making a call to the database,
|
|
377
|
+
but rather parsed out from a TSV file in the nmdc-schema repo stored at
|
|
378
|
+
self.gold_nmdc_instrument_map_df (the TSV contents loaded as a DataFrame).
|
|
323
379
|
|
|
324
|
-
This method gets the
|
|
325
|
-
that value is not
|
|
326
|
-
|
|
380
|
+
This method gets the seqMethod field from a GOLD project object. If
|
|
381
|
+
that value is not None and is in the self.gold_nmdc_instrument_map_df mapping
|
|
382
|
+
file's GOLD SeqMethod column, the corresponding instrument id from
|
|
383
|
+
NMDC instrument_set id column is returned. If the value of the field
|
|
384
|
+
is None, None is returned. If no row matches the value, a ValueError is raised.
|
|
327
385
|
|
|
328
386
|
:param gold_project: GOLD project object
|
|
329
|
-
:return: Instrument
|
|
387
|
+
:return: id corresponding to an Instrument from instrument_set collection
|
|
330
388
|
"""
|
|
331
389
|
seq_method = gold_project.get("seqMethod")
|
|
332
390
|
if not seq_method:
|
|
333
391
|
return None
|
|
334
|
-
|
|
392
|
+
|
|
393
|
+
seq_method = seq_method[0].strip()
|
|
394
|
+
df = self.gold_nmdc_instrument_map_df
|
|
395
|
+
|
|
396
|
+
matching_row = df[df["GOLD SeqMethod"] == seq_method]
|
|
397
|
+
|
|
398
|
+
if not matching_row.empty:
|
|
399
|
+
instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
400
|
+
return instrument_id
|
|
401
|
+
|
|
402
|
+
raise ValueError(
|
|
403
|
+
f"seqMethod '{seq_method}' could not be found in the GOLD-NMDC instrument mapping TSV file."
|
|
404
|
+
)
|
|
335
405
|
|
|
336
406
|
def _get_processing_institution(
|
|
337
407
|
self, gold_project: JSON_OBJECT
|
|
@@ -407,6 +477,7 @@ class GoldStudyTranslator(Translator):
|
|
|
407
477
|
principal_investigator=self._get_pi(gold_study),
|
|
408
478
|
title=gold_study.get("studyName"),
|
|
409
479
|
type="nmdc:Study",
|
|
480
|
+
study_category=self.study_type,
|
|
410
481
|
)
|
|
411
482
|
|
|
412
483
|
def _translate_biosample(
|
|
@@ -454,7 +525,7 @@ class GoldStudyTranslator(Translator):
|
|
|
454
525
|
gold_biosample_identifiers=self._get_curie("gold", gold_biosample_id),
|
|
455
526
|
habitat=gold_biosample.get("habitat"),
|
|
456
527
|
host_name=gold_biosample.get("hostName"),
|
|
457
|
-
host_taxid=self.
|
|
528
|
+
host_taxid=self._get_host_taxid(gold_biosample),
|
|
458
529
|
id=nmdc_biosample_id,
|
|
459
530
|
img_identifiers=self._get_img_identifiers(gold_biosample_id),
|
|
460
531
|
insdc_biosample_identifiers=self._get_insdc_biosample_identifiers(
|
|
@@ -466,7 +537,6 @@ class GoldStudyTranslator(Translator):
|
|
|
466
537
|
name=gold_biosample.get("biosampleName"),
|
|
467
538
|
ncbi_taxonomy_name=gold_biosample.get("ncbiTaxName"),
|
|
468
539
|
nitrite=self._get_quantity_value(gold_biosample, "nitrateConcentration"),
|
|
469
|
-
part_of=nmdc_study_id,
|
|
470
540
|
ph=gold_biosample.get("ph"),
|
|
471
541
|
pressure=self._get_quantity_value(gold_biosample, "pressure"),
|
|
472
542
|
samp_name=self._get_samp_name(gold_biosample),
|
|
@@ -482,47 +552,46 @@ class GoldStudyTranslator(Translator):
|
|
|
482
552
|
gold_biosample, "sampleCollectionTemperature"
|
|
483
553
|
),
|
|
484
554
|
type="nmdc:Biosample",
|
|
555
|
+
associated_studies=[nmdc_study_id],
|
|
485
556
|
)
|
|
486
557
|
|
|
487
|
-
def
|
|
558
|
+
def _translate_nucleotide_sequencing(
|
|
488
559
|
self,
|
|
489
560
|
gold_project: JSON_OBJECT,
|
|
490
|
-
|
|
561
|
+
nmdc_nucleotide_sequencing_id: str,
|
|
491
562
|
nmdc_biosample_id: str,
|
|
492
563
|
nmdc_study_id: str,
|
|
493
|
-
)
|
|
494
|
-
"""Translate a GOLD project object into an `nmdc:
|
|
564
|
+
):
|
|
565
|
+
"""Translate a GOLD project object into an `nmdc:NucleotideSequencing` object.
|
|
495
566
|
|
|
496
|
-
This method translates a GOLD project object into an equivalent `nmdc:
|
|
567
|
+
This method translates a GOLD project object into an equivalent `nmdc:NucleotideSequencing`
|
|
497
568
|
object. Any minted NMDC IDs must be passed to this method. Internally, each
|
|
498
|
-
slot of the `nmdc:
|
|
569
|
+
slot of the `nmdc:NucleotideSequencing` is either directly pulled from the GOLD object or
|
|
499
570
|
one of the `_get_*` methods is used.
|
|
500
571
|
|
|
501
572
|
:param gold_project: GOLD project object
|
|
502
|
-
:param nmdc_omics_processing_id: Minted nmdc:
|
|
573
|
+
:param nmdc_nucleotide_sequencing_id: Minted nmdc:NucleotideSequencing identifier for the translated object
|
|
503
574
|
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the related Biosample
|
|
504
575
|
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
|
|
505
|
-
:return: nmdc:
|
|
576
|
+
:return: nmdc:NucleotideSequencing object
|
|
506
577
|
"""
|
|
507
578
|
gold_project_id = gold_project["projectGoldId"]
|
|
508
|
-
return nmdc.
|
|
509
|
-
id=
|
|
579
|
+
return nmdc.NucleotideSequencing(
|
|
580
|
+
id=nmdc_nucleotide_sequencing_id,
|
|
510
581
|
name=gold_project.get("projectName"),
|
|
511
582
|
gold_sequencing_project_identifiers=self._get_curie(
|
|
512
583
|
"gold", gold_project_id
|
|
513
584
|
),
|
|
514
585
|
ncbi_project_name=gold_project.get("projectName"),
|
|
515
|
-
type="nmdc:
|
|
586
|
+
type="nmdc:NucleotideSequencing",
|
|
516
587
|
has_input=nmdc_biosample_id,
|
|
517
|
-
part_of=nmdc_study_id,
|
|
518
588
|
add_date=gold_project.get("addDate"),
|
|
519
589
|
mod_date=self._get_mod_date(gold_project),
|
|
520
590
|
principal_investigator=self._get_pi(gold_project),
|
|
521
|
-
omics_type=self._get_controlled_term_value(
|
|
522
|
-
gold_project, "sequencingStrategy"
|
|
523
|
-
),
|
|
524
|
-
instrument_name=self._get_instrument_name(gold_project),
|
|
525
591
|
processing_institution=self._get_processing_institution(gold_project),
|
|
592
|
+
instrument_used=self._get_instrument(gold_project),
|
|
593
|
+
analyte_category="metagenome",
|
|
594
|
+
associated_studies=[nmdc_study_id],
|
|
526
595
|
)
|
|
527
596
|
|
|
528
597
|
def get_database(self) -> nmdc.Database:
|
|
@@ -563,11 +632,11 @@ class GoldStudyTranslator(Translator):
|
|
|
563
632
|
}
|
|
564
633
|
|
|
565
634
|
gold_project_ids = [project["projectGoldId"] for project in self.projects]
|
|
566
|
-
|
|
567
|
-
"nmdc:
|
|
635
|
+
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
636
|
+
"nmdc:NucleotideSequencing", len(gold_project_ids)
|
|
568
637
|
)
|
|
569
|
-
|
|
570
|
-
zip(gold_project_ids,
|
|
638
|
+
gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
639
|
+
zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
|
|
571
640
|
)
|
|
572
641
|
|
|
573
642
|
database.study_set = [self._translate_study(self.study, nmdc_study_id)]
|
|
@@ -585,13 +654,13 @@ class GoldStudyTranslator(Translator):
|
|
|
585
654
|
for biosample in self.biosamples
|
|
586
655
|
]
|
|
587
656
|
database.field_research_site_set = [
|
|
588
|
-
nmdc.FieldResearchSite(id=id, name=name)
|
|
657
|
+
nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
|
|
589
658
|
for name, id in gold_name_to_nmdc_field_site_ids.items()
|
|
590
659
|
]
|
|
591
|
-
database.
|
|
592
|
-
self.
|
|
660
|
+
database.data_generation_set = [
|
|
661
|
+
self._translate_nucleotide_sequencing(
|
|
593
662
|
project,
|
|
594
|
-
|
|
663
|
+
nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
|
|
595
664
|
project["projectGoldId"]
|
|
596
665
|
],
|
|
597
666
|
nmdc_biosample_id=gold_to_nmdc_biosample_ids[
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
+
from typing import Union
|
|
3
4
|
|
|
4
5
|
import pandas as pd
|
|
5
6
|
import requests_cache
|
|
@@ -47,6 +48,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
47
48
|
site_code_mapping: dict,
|
|
48
49
|
neon_envo_mappings_file: pd.DataFrame,
|
|
49
50
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
51
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
50
52
|
*args,
|
|
51
53
|
**kwargs,
|
|
52
54
|
) -> None:
|
|
@@ -92,13 +94,13 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
92
94
|
)
|
|
93
95
|
|
|
94
96
|
self.site_code_mapping = site_code_mapping
|
|
97
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
95
98
|
|
|
96
99
|
def _translate_biosample(
|
|
97
100
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
98
101
|
) -> nmdc.Biosample:
|
|
99
102
|
return nmdc.Biosample(
|
|
100
103
|
id=nmdc_id,
|
|
101
|
-
part_of="nmdc:sty-11-pzmd0x14",
|
|
102
104
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
103
105
|
BENTHIC_BROAD_SCALE_MAPPINGS.get(
|
|
104
106
|
biosample_row["aquaticSiteType"].values[0]
|
|
@@ -146,8 +148,10 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
146
148
|
depth=nmdc.QuantityValue(
|
|
147
149
|
has_minimum_numeric_value=nmdc.Float("0"),
|
|
148
150
|
has_maximum_numeric_value=nmdc.Float("1"),
|
|
149
|
-
has_unit="
|
|
151
|
+
has_unit="m",
|
|
152
|
+
type="nmdc:QuantityValue",
|
|
150
153
|
),
|
|
154
|
+
associated_studies=["nmdc:sty-11-pzmd0x14"],
|
|
151
155
|
)
|
|
152
156
|
|
|
153
157
|
def _translate_extraction_process(
|
|
@@ -185,10 +189,9 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
185
189
|
input_mass=_create_quantity_value(
|
|
186
190
|
_get_value_or_none(extraction_row, "sampleMass"), "g"
|
|
187
191
|
),
|
|
188
|
-
|
|
189
|
-
status=_get_value_or_none(extraction_row, "qaqcStatus")
|
|
190
|
-
),
|
|
192
|
+
qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
|
|
191
193
|
processing_institution=processing_institution,
|
|
194
|
+
type="nmdc:Extraction",
|
|
192
195
|
)
|
|
193
196
|
|
|
194
197
|
def _translate_library_preparation(
|
|
@@ -201,13 +204,13 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
201
204
|
"""
|
|
202
205
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
203
206
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
204
|
-
process is fed as input to an
|
|
207
|
+
process is fed as input to a NucleotideSequencing object.
|
|
205
208
|
|
|
206
209
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
207
210
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
208
211
|
Extraction process.
|
|
209
212
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
210
|
-
is also input to
|
|
213
|
+
is also input to NucleotideSequencing.
|
|
211
214
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
212
215
|
:return: Object that uses the LibraryPreparation process model.
|
|
213
216
|
"""
|
|
@@ -226,31 +229,47 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
226
229
|
start_date=_get_value_or_none(library_preparation_row, "collectDate"),
|
|
227
230
|
end_date=_get_value_or_none(library_preparation_row, "processedDate"),
|
|
228
231
|
processing_institution=processing_institution,
|
|
232
|
+
type="nmdc:LibraryPreparation",
|
|
229
233
|
)
|
|
230
234
|
|
|
231
|
-
def
|
|
235
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
236
|
+
if not instrument_model:
|
|
237
|
+
raise ValueError(
|
|
238
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
239
|
+
)
|
|
240
|
+
|
|
241
|
+
df = self.neon_nmdc_instrument_map_df
|
|
242
|
+
matching_row = df[
|
|
243
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
244
|
+
]
|
|
245
|
+
|
|
246
|
+
if not matching_row.empty:
|
|
247
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
248
|
+
return nmdc_instrument_id
|
|
249
|
+
|
|
250
|
+
def _translate_nucleotide_sequencing(
|
|
232
251
|
self,
|
|
233
|
-
|
|
252
|
+
nucleotide_sequencing_id: str,
|
|
234
253
|
processed_sample_id: str,
|
|
235
254
|
raw_data_file_data: str,
|
|
236
|
-
|
|
237
|
-
)
|
|
238
|
-
"""Create nmdc
|
|
239
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
240
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
255
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
256
|
+
):
|
|
257
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
258
|
+
Bioinformatics workflow on sequence data from a biosample. The input to a NucleotideSequencing
|
|
259
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
241
260
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
242
261
|
|
|
243
|
-
:param
|
|
262
|
+
:param nucleotide_sequencing_id: Minted id for a NucleotideSequencing process.
|
|
244
263
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
245
264
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
246
265
|
files embedded in them.
|
|
247
|
-
:param
|
|
266
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for a NucleotideSequencing workflow
|
|
248
267
|
process/run.
|
|
249
|
-
:return:
|
|
268
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
250
269
|
"""
|
|
251
270
|
processing_institution = None
|
|
252
271
|
sequencing_facility = _get_value_or_none(
|
|
253
|
-
|
|
272
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
254
273
|
)
|
|
255
274
|
if sequencing_facility is not None:
|
|
256
275
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -258,19 +277,21 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
258
277
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
259
278
|
processing_institution = "ANL"
|
|
260
279
|
|
|
261
|
-
return nmdc.
|
|
262
|
-
id=
|
|
280
|
+
return nmdc.NucleotideSequencing(
|
|
281
|
+
id=nucleotide_sequencing_id,
|
|
263
282
|
has_input=processed_sample_id,
|
|
264
283
|
has_output=raw_data_file_data,
|
|
265
284
|
processing_institution=processing_institution,
|
|
266
|
-
ncbi_project_name=_get_value_or_none(
|
|
267
|
-
|
|
268
|
-
|
|
285
|
+
ncbi_project_name=_get_value_or_none(
|
|
286
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
287
|
+
),
|
|
288
|
+
instrument_used=self._get_instrument_id(
|
|
289
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
269
290
|
),
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
291
|
+
name=f"Benthic microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
292
|
+
type="nmdc:NucleotideSequencing",
|
|
293
|
+
associated_studies=["nmdc:sty-11-pzmd0x14"],
|
|
294
|
+
analyte_category="metagenome",
|
|
274
295
|
)
|
|
275
296
|
|
|
276
297
|
def _translate_processed_sample(
|
|
@@ -287,12 +308,14 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
287
308
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
288
309
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
289
310
|
"""
|
|
290
|
-
return nmdc.ProcessedSample(
|
|
311
|
+
return nmdc.ProcessedSample(
|
|
312
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
313
|
+
)
|
|
291
314
|
|
|
292
315
|
def _translate_data_object(
|
|
293
316
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
294
317
|
) -> nmdc.DataObject:
|
|
295
|
-
"""Create nmdc DataObject which is the output of
|
|
318
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
296
319
|
object mainly contains information about the sequencing file that was generated as
|
|
297
320
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
298
321
|
is the result of a LibraryPreparation process.
|
|
@@ -419,7 +442,9 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
419
442
|
)
|
|
420
443
|
|
|
421
444
|
neon_omprc_ids = benthic_samples["sampleID"]
|
|
422
|
-
nmdc_omprc_ids = self._id_minter(
|
|
445
|
+
nmdc_omprc_ids = self._id_minter(
|
|
446
|
+
"nmdc:NucleotideSequencing", len(neon_omprc_ids)
|
|
447
|
+
)
|
|
423
448
|
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
424
449
|
|
|
425
450
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -445,7 +470,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
445
470
|
processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
446
471
|
|
|
447
472
|
if extraction_input is not None and processed_sample_id is not None:
|
|
448
|
-
database.
|
|
473
|
+
database.material_processing_set.append(
|
|
449
474
|
self._translate_extraction_process(
|
|
450
475
|
nmdc_id,
|
|
451
476
|
extraction_input,
|
|
@@ -489,7 +514,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
489
514
|
processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
|
|
490
515
|
|
|
491
516
|
if lib_prep_input is not None and processed_sample_id is not None:
|
|
492
|
-
database.
|
|
517
|
+
database.material_processing_set.append(
|
|
493
518
|
self._translate_library_preparation(
|
|
494
519
|
nmdc_id,
|
|
495
520
|
lib_prep_input,
|
|
@@ -536,8 +561,8 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
536
561
|
)
|
|
537
562
|
)
|
|
538
563
|
|
|
539
|
-
database.
|
|
540
|
-
self.
|
|
564
|
+
database.data_generation_set.append(
|
|
565
|
+
self._translate_nucleotide_sequencing(
|
|
541
566
|
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
542
567
|
processed_sample_id,
|
|
543
568
|
has_output_do_ids,
|