nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +59 -3
- nmdc_runtime/site/export/ncbi_xml.py +29 -25
- nmdc_runtime/site/export/ncbi_xml_utils.py +5 -5
- nmdc_runtime/site/export/study_metadata.py +3 -1
- nmdc_runtime/site/graphs.py +71 -15
- nmdc_runtime/site/ops.py +135 -42
- nmdc_runtime/site/repository.py +16 -4
- nmdc_runtime/site/translation/gold_translator.py +112 -43
- nmdc_runtime/site/translation/neon_benthic_translator.py +59 -34
- nmdc_runtime/site/translation/neon_soil_translator.py +72 -48
- nmdc_runtime/site/translation/neon_surface_water_translator.py +61 -32
- nmdc_runtime/site/translation/neon_utils.py +19 -6
- nmdc_runtime/site/translation/submission_portal_translator.py +67 -36
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/METADATA +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/RECORD +19 -19
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-1.9.0.dist-info → nmdc_runtime-2.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import List
|
|
3
|
+
from typing import List, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -26,6 +26,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
26
26
|
sls_data: dict,
|
|
27
27
|
neon_envo_mappings_file: pd.DataFrame,
|
|
28
28
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
29
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
29
30
|
*args,
|
|
30
31
|
**kwargs,
|
|
31
32
|
) -> None:
|
|
@@ -99,6 +100,23 @@ class NeonSoilDataTranslator(Translator):
|
|
|
99
100
|
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
100
101
|
)
|
|
101
102
|
|
|
103
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
104
|
+
|
|
105
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
106
|
+
if not instrument_model:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
df = self.neon_nmdc_instrument_map_df
|
|
112
|
+
matching_row = df[
|
|
113
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
if not matching_row.empty:
|
|
117
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
118
|
+
return nmdc_instrument_id
|
|
119
|
+
|
|
102
120
|
def _translate_biosample(
|
|
103
121
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
104
122
|
) -> nmdc.Biosample:
|
|
@@ -116,7 +134,6 @@ class NeonSoilDataTranslator(Translator):
|
|
|
116
134
|
"""
|
|
117
135
|
return nmdc.Biosample(
|
|
118
136
|
id=nmdc_id,
|
|
119
|
-
part_of="nmdc:sty-11-34xj1150",
|
|
120
137
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
121
138
|
"ENVO:00000446", "terrestrial biome"
|
|
122
139
|
),
|
|
@@ -145,6 +162,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
145
162
|
biosample_row, "sampleBottomDepth"
|
|
146
163
|
),
|
|
147
164
|
has_unit="m",
|
|
165
|
+
type="nmdc:QuantityValue",
|
|
148
166
|
),
|
|
149
167
|
samp_collec_device=_get_value_or_none(biosample_row, "soilSamplingDevice"),
|
|
150
168
|
soil_horizon=_get_value_or_none(biosample_row, "horizon"),
|
|
@@ -172,6 +190,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
172
190
|
biosample_row["kclNitrateNitriteNConc"].values[0], "mg/L"
|
|
173
191
|
),
|
|
174
192
|
type="nmdc:Biosample",
|
|
193
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
175
194
|
)
|
|
176
195
|
|
|
177
196
|
def _translate_pooling_process(
|
|
@@ -198,6 +217,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
198
217
|
has_input=bsm_input_values_list,
|
|
199
218
|
start_date=_get_value_or_none(pooling_row, "startDate"),
|
|
200
219
|
end_date=_get_value_or_none(pooling_row, "collectDate"),
|
|
220
|
+
type="nmdc:Pooling",
|
|
201
221
|
)
|
|
202
222
|
|
|
203
223
|
def _translate_processed_sample(
|
|
@@ -214,12 +234,14 @@ class NeonSoilDataTranslator(Translator):
|
|
|
214
234
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
215
235
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
216
236
|
"""
|
|
217
|
-
return nmdc.ProcessedSample(
|
|
237
|
+
return nmdc.ProcessedSample(
|
|
238
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
239
|
+
)
|
|
218
240
|
|
|
219
241
|
def _translate_data_object(
|
|
220
242
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
221
243
|
) -> nmdc.DataObject:
|
|
222
|
-
"""Create nmdc DataObject which is the output of
|
|
244
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
223
245
|
object mainly contains information about the sequencing file that was generated as
|
|
224
246
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
225
247
|
is the result of a LibraryPreparation process.
|
|
@@ -280,10 +302,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
280
302
|
input_mass=_create_quantity_value(
|
|
281
303
|
_get_value_or_none(extraction_row, "sampleMass"), "g"
|
|
282
304
|
),
|
|
283
|
-
|
|
284
|
-
status=_get_value_or_none(extraction_row, "qaqcStatus")
|
|
285
|
-
),
|
|
305
|
+
qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
|
|
286
306
|
processing_institution=processing_institution,
|
|
307
|
+
type="nmdc:Extraction",
|
|
287
308
|
)
|
|
288
309
|
|
|
289
310
|
def _translate_library_preparation(
|
|
@@ -296,13 +317,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
296
317
|
"""
|
|
297
318
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
298
319
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
299
|
-
process is fed as input to an
|
|
320
|
+
process is fed as input to an NucleotideSequencing object.
|
|
300
321
|
|
|
301
322
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
302
323
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
303
324
|
Extraction process.
|
|
304
325
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
305
|
-
is also input to
|
|
326
|
+
is also input to NucleotideSequencing.
|
|
306
327
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
307
328
|
:return: Object that using LibraryPreparation process model.
|
|
308
329
|
"""
|
|
@@ -321,31 +342,32 @@ class NeonSoilDataTranslator(Translator):
|
|
|
321
342
|
start_date=_get_value_or_none(library_preparation_row, "collectDate"),
|
|
322
343
|
end_date=_get_value_or_none(library_preparation_row, "processedDate"),
|
|
323
344
|
processing_institution=processing_institution,
|
|
345
|
+
type="nmdc:LibraryPreparation",
|
|
324
346
|
)
|
|
325
347
|
|
|
326
|
-
def
|
|
348
|
+
def _translate_nucleotide_sequencing(
|
|
327
349
|
self,
|
|
328
|
-
|
|
350
|
+
nucleotide_sequencing_id: str,
|
|
329
351
|
processed_sample_id: str,
|
|
330
352
|
raw_data_file_data: str,
|
|
331
|
-
|
|
332
|
-
)
|
|
333
|
-
"""Create nmdc
|
|
334
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
335
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
353
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
354
|
+
):
|
|
355
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
356
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
357
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
336
358
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
337
359
|
|
|
338
|
-
:param
|
|
360
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
339
361
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
340
362
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
341
363
|
files embedded in them.
|
|
342
|
-
:param
|
|
364
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
343
365
|
process/run.
|
|
344
|
-
:return:
|
|
366
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
345
367
|
"""
|
|
346
368
|
processing_institution = None
|
|
347
369
|
sequencing_facility = _get_value_or_none(
|
|
348
|
-
|
|
370
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
349
371
|
)
|
|
350
372
|
if sequencing_facility is not None:
|
|
351
373
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -353,19 +375,21 @@ class NeonSoilDataTranslator(Translator):
|
|
|
353
375
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
354
376
|
processing_institution = "ANL"
|
|
355
377
|
|
|
356
|
-
return nmdc.
|
|
357
|
-
id=
|
|
378
|
+
return nmdc.NucleotideSequencing(
|
|
379
|
+
id=nucleotide_sequencing_id,
|
|
358
380
|
has_input=processed_sample_id,
|
|
359
381
|
has_output=raw_data_file_data,
|
|
360
382
|
processing_institution=processing_institution,
|
|
361
|
-
ncbi_project_name=_get_value_or_none(
|
|
362
|
-
|
|
363
|
-
omics_processing_row["investigation_type"].values[0]
|
|
383
|
+
ncbi_project_name=_get_value_or_none(
|
|
384
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
364
385
|
),
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
386
|
+
instrument_used=self._get_instrument_id(
|
|
387
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
388
|
+
),
|
|
389
|
+
name=f"Terrestrial soil microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
390
|
+
type="nmdc:NucleotideSequencing",
|
|
391
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
392
|
+
analyte_category="metagenome",
|
|
369
393
|
)
|
|
370
394
|
|
|
371
395
|
def get_database(self) -> nmdc.Database:
|
|
@@ -373,10 +397,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
373
397
|
nmdc object creation methods as well as the nmdc type (QuantityValue, GeolocationValue, etc.)
|
|
374
398
|
creation methods, to make an nmdc Database object. It populates multiple sets in the Mongo database -
|
|
375
399
|
* `biosample_set`: uses `_translate_biosample()`
|
|
376
|
-
* `
|
|
377
|
-
|
|
378
|
-
* `
|
|
379
|
-
* `omics_processing_set`: uses `_translate_omics_processing()`
|
|
400
|
+
* `material_processing_set`: uses `_translate_pooling_process()`, `_translate_extraction_process()`,
|
|
401
|
+
`_translate_library_preparation()`
|
|
402
|
+
* `data_generation_set`: uses `_translate_nucleotide_sequencing()`
|
|
380
403
|
* `processed_sample_set`: uses `_translate_processed_sample()`
|
|
381
404
|
* `data_object_set`: uses `_translate_data_object()`
|
|
382
405
|
The core Biosample information is in the `sls_soilCoreCollection` table. However, we
|
|
@@ -607,14 +630,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
607
630
|
mms_metagenomeDnaExtraction.processedDate,
|
|
608
631
|
mms_metagenomeSequencing.sequencingFacilityID,
|
|
609
632
|
mms_metagenomeSequencing.ncbiProjectID,
|
|
610
|
-
mms_metagenomeSequencing.investigation_type,
|
|
611
633
|
mms_metagenomeSequencing.sequencingMethod,
|
|
612
634
|
mms_metagenomeSequencing.instrument_model
|
|
613
635
|
FROM mms_metagenomeSequencing
|
|
614
636
|
LEFT JOIN mms_metagenomeDnaExtraction ON mms_metagenomeDnaExtraction.dnaSampleID = mms_metagenomeSequencing.dnaSampleID
|
|
615
637
|
"""
|
|
616
638
|
library_preparation_table = pd.read_sql_query(query, self.conn)
|
|
617
|
-
|
|
639
|
+
nucleotide_sequencing_table = pd.read_sql_query(query, self.conn)
|
|
618
640
|
|
|
619
641
|
nmdc_pooling_ids = self._id_minter("nmdc:Pooling", len(pooling_ids_dict))
|
|
620
642
|
neon_to_nmdc_pooling_ids = dict(
|
|
@@ -653,12 +675,12 @@ class NeonSoilDataTranslator(Translator):
|
|
|
653
675
|
zip(library_prepration_ids, nmdc_library_preparation_processed_sample_ids)
|
|
654
676
|
)
|
|
655
677
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
"nmdc:
|
|
678
|
+
nucleotide_sequencing_ids = nucleotide_sequencing_table["dnaSampleID"]
|
|
679
|
+
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
680
|
+
"nmdc:NucleotideSequencing", len(nucleotide_sequencing_ids)
|
|
659
681
|
)
|
|
660
|
-
|
|
661
|
-
zip(
|
|
682
|
+
neon_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
683
|
+
zip(nucleotide_sequencing_ids, nmdc_nucleotide_sequencing_ids)
|
|
662
684
|
)
|
|
663
685
|
|
|
664
686
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -701,7 +723,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
701
723
|
# if the number of biosamples that are input to a pooling process
|
|
702
724
|
# is one or less, then ignore it and go straight to extraction
|
|
703
725
|
if len(bsm_values_list) > 1:
|
|
704
|
-
database.
|
|
726
|
+
database.material_processing_set.append(
|
|
705
727
|
self._translate_pooling_process(
|
|
706
728
|
pooling_process_id,
|
|
707
729
|
processed_sample_id,
|
|
@@ -734,7 +756,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
734
756
|
# handler for creating extraction process records
|
|
735
757
|
# for both pooled and non-pooled samples
|
|
736
758
|
if "|" in genomics_pooled_id_list:
|
|
737
|
-
database.
|
|
759
|
+
database.material_processing_set.append(
|
|
738
760
|
self._translate_extraction_process(
|
|
739
761
|
extraction_id,
|
|
740
762
|
extraction_input,
|
|
@@ -755,7 +777,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
755
777
|
|
|
756
778
|
extraction_input = neon_to_nmdc_biosample_ids[neon_biosample_id]
|
|
757
779
|
|
|
758
|
-
database.
|
|
780
|
+
database.material_processing_set.append(
|
|
759
781
|
self._translate_extraction_process(
|
|
760
782
|
extraction_id,
|
|
761
783
|
extraction_input,
|
|
@@ -772,7 +794,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
772
794
|
dna_sample_id
|
|
773
795
|
]
|
|
774
796
|
|
|
775
|
-
|
|
797
|
+
nucleotide_sequencing_id = neon_to_nmdc_nucleotide_sequencing_ids[
|
|
798
|
+
dna_sample_id
|
|
799
|
+
]
|
|
776
800
|
|
|
777
801
|
genomics_sample_id = library_preparation_table[
|
|
778
802
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
@@ -787,7 +811,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
787
811
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
788
812
|
]
|
|
789
813
|
|
|
790
|
-
database.
|
|
814
|
+
database.material_processing_set.append(
|
|
791
815
|
self._translate_library_preparation(
|
|
792
816
|
library_preparation_id,
|
|
793
817
|
library_preparation_input,
|
|
@@ -809,9 +833,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
809
833
|
if item in neon_to_nmdc_data_object_ids:
|
|
810
834
|
has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
|
|
811
835
|
|
|
812
|
-
database.
|
|
813
|
-
self.
|
|
814
|
-
|
|
836
|
+
database.data_generation_set.append(
|
|
837
|
+
self._translate_nucleotide_sequencing(
|
|
838
|
+
nucleotide_sequencing_id,
|
|
815
839
|
processed_sample_id,
|
|
816
840
|
has_output_do_ids,
|
|
817
841
|
library_preparation_row,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import Dict, Optional
|
|
3
|
+
from typing import Dict, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import requests
|
|
@@ -36,6 +36,7 @@ SURFACE_WATER_LOCAL_SCALE_MAPPINGS = {
|
|
|
36
36
|
"term_id": "ENVO:01000409",
|
|
37
37
|
"term_name": "freshwater littoral zone",
|
|
38
38
|
},
|
|
39
|
+
"inflow": {"term_id": "ENVO:00000476", "term_name": "lake inlet"},
|
|
39
40
|
},
|
|
40
41
|
"river": {"term_id": "ENVO:01000297", "term_name": "freshwater river"},
|
|
41
42
|
"stream": {"term_id": "ENVO:03605007", "term_name": "freshwater stream"},
|
|
@@ -58,6 +59,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
58
59
|
site_code_mapping: dict,
|
|
59
60
|
neon_envo_mappings_file: pd.DataFrame,
|
|
60
61
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
62
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
61
63
|
*args,
|
|
62
64
|
**kwargs,
|
|
63
65
|
) -> None:
|
|
@@ -108,6 +110,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
108
110
|
|
|
109
111
|
self.site_code_mapping = site_code_mapping
|
|
110
112
|
|
|
113
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
114
|
+
|
|
111
115
|
def _translate_biosample(
|
|
112
116
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
113
117
|
) -> nmdc.Biosample:
|
|
@@ -136,16 +140,17 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
136
140
|
has_minimum_numeric_value=nmdc.Float(minimum_depth),
|
|
137
141
|
has_maximum_numeric_value=nmdc.Float(maximum_depth),
|
|
138
142
|
has_unit="m",
|
|
143
|
+
type="nmdc:QuantityValue",
|
|
139
144
|
)
|
|
140
145
|
else:
|
|
141
146
|
depth = nmdc.QuantityValue(
|
|
142
147
|
has_numeric_value=nmdc.Float(minimum_depth),
|
|
143
148
|
has_unit="m",
|
|
149
|
+
type="nmdc:QuantityValue",
|
|
144
150
|
)
|
|
145
151
|
|
|
146
152
|
return nmdc.Biosample(
|
|
147
153
|
id=nmdc_id,
|
|
148
|
-
part_of="nmdc:sty-11-hht5sb92",
|
|
149
154
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
150
155
|
SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
|
|
151
156
|
biosample_row["aquaticSiteType"].values[0]
|
|
@@ -201,7 +206,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
201
206
|
samp_size=_create_quantity_value(
|
|
202
207
|
biosample_row["geneticFilteredSampleVolume"].values[0], "mL"
|
|
203
208
|
),
|
|
204
|
-
env_package=nmdc.TextValue(has_raw_value="water"),
|
|
209
|
+
env_package=nmdc.TextValue(has_raw_value="water", type="nmdc:TextValue"),
|
|
210
|
+
associated_studies=["nmdc:sty-11-hht5sb92"],
|
|
205
211
|
)
|
|
206
212
|
|
|
207
213
|
def _translate_extraction_process(
|
|
@@ -243,6 +249,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
243
249
|
_get_value_or_none(extraction_row, "extrQaqcStatus")
|
|
244
250
|
),
|
|
245
251
|
processing_institution=processing_institution,
|
|
252
|
+
type="nmdc:Extraction",
|
|
246
253
|
)
|
|
247
254
|
|
|
248
255
|
def _translate_library_preparation(
|
|
@@ -255,13 +262,13 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
255
262
|
"""
|
|
256
263
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
257
264
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
258
|
-
process is fed as input to an
|
|
265
|
+
process is fed as input to an NucleotideSequencing object.
|
|
259
266
|
|
|
260
267
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
261
268
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
262
269
|
Extraction process.
|
|
263
270
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
264
|
-
is also input to
|
|
271
|
+
is also input to NucleotideSequencing.
|
|
265
272
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
266
273
|
:return: Object that using LibraryPreparation process model.
|
|
267
274
|
"""
|
|
@@ -280,31 +287,47 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
280
287
|
start_date=_get_value_or_none(library_preparation_row, "seqCollectDate"),
|
|
281
288
|
end_date=_get_value_or_none(library_preparation_row, "seqProcessedDate"),
|
|
282
289
|
processing_institution=processing_institution,
|
|
290
|
+
type="nmdc:LibraryPreparation",
|
|
283
291
|
)
|
|
284
292
|
|
|
285
|
-
def
|
|
293
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
294
|
+
if not instrument_model:
|
|
295
|
+
raise ValueError(
|
|
296
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
df = self.neon_nmdc_instrument_map_df
|
|
300
|
+
matching_row = df[
|
|
301
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
302
|
+
]
|
|
303
|
+
|
|
304
|
+
if not matching_row.empty:
|
|
305
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
306
|
+
return nmdc_instrument_id
|
|
307
|
+
|
|
308
|
+
def _translate_nucleotide_sequencing(
|
|
286
309
|
self,
|
|
287
|
-
|
|
310
|
+
nucleotide_sequencing_id: str,
|
|
288
311
|
processed_sample_id: str,
|
|
289
312
|
raw_data_file_data: str,
|
|
290
|
-
|
|
291
|
-
)
|
|
292
|
-
"""Create nmdc
|
|
293
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
294
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
313
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
314
|
+
):
|
|
315
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
316
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
317
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
295
318
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
296
319
|
|
|
297
|
-
:param
|
|
320
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
298
321
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
299
322
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
300
323
|
files embedded in them.
|
|
301
|
-
:param
|
|
324
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
302
325
|
process/run.
|
|
303
|
-
:return:
|
|
326
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
304
327
|
"""
|
|
305
328
|
processing_institution = None
|
|
306
329
|
sequencing_facility = _get_value_or_none(
|
|
307
|
-
|
|
330
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
308
331
|
)
|
|
309
332
|
if sequencing_facility is not None:
|
|
310
333
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -312,19 +335,21 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
312
335
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
313
336
|
processing_institution = "ANL"
|
|
314
337
|
|
|
315
|
-
return nmdc.
|
|
316
|
-
id=
|
|
338
|
+
return nmdc.NucleotideSequencing(
|
|
339
|
+
id=nucleotide_sequencing_id,
|
|
317
340
|
has_input=processed_sample_id,
|
|
318
341
|
has_output=raw_data_file_data,
|
|
319
342
|
processing_institution=processing_institution,
|
|
320
|
-
ncbi_project_name=_get_value_or_none(
|
|
321
|
-
|
|
322
|
-
|
|
343
|
+
ncbi_project_name=_get_value_or_none(
|
|
344
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
345
|
+
),
|
|
346
|
+
instrument_used=self._get_instrument_id(
|
|
347
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
323
348
|
),
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
349
|
+
name=f"Surface water microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
350
|
+
type="nmdc:NucleotideSequencing",
|
|
351
|
+
associated_studies=["nmdc:sty-11-hht5sb92"],
|
|
352
|
+
analyte_category="metagenome",
|
|
328
353
|
)
|
|
329
354
|
|
|
330
355
|
def _translate_processed_sample(
|
|
@@ -341,12 +366,14 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
341
366
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
342
367
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
343
368
|
"""
|
|
344
|
-
return nmdc.ProcessedSample(
|
|
369
|
+
return nmdc.ProcessedSample(
|
|
370
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
371
|
+
)
|
|
345
372
|
|
|
346
373
|
def _translate_data_object(
|
|
347
374
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
348
375
|
) -> nmdc.DataObject:
|
|
349
|
-
"""Create nmdc DataObject which is the output of
|
|
376
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
350
377
|
object mainly contains information about the sequencing file that was generated as
|
|
351
378
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
352
379
|
is the result of a LibraryPreparation process.
|
|
@@ -485,7 +512,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
485
512
|
)
|
|
486
513
|
|
|
487
514
|
neon_omprc_ids = surface_water_samples["parentSampleID"]
|
|
488
|
-
nmdc_omprc_ids = self._id_minter(
|
|
515
|
+
nmdc_omprc_ids = self._id_minter(
|
|
516
|
+
"nmdc:NucleotideSequencing", len(neon_omprc_ids)
|
|
517
|
+
)
|
|
489
518
|
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
490
519
|
|
|
491
520
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -515,7 +544,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
515
544
|
processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
516
545
|
|
|
517
546
|
if extraction_input is not None and processed_sample_id is not None:
|
|
518
|
-
database.
|
|
547
|
+
database.material_processing_set.append(
|
|
519
548
|
self._translate_extraction_process(
|
|
520
549
|
nmdc_id,
|
|
521
550
|
extraction_input,
|
|
@@ -561,7 +590,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
561
590
|
processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
|
|
562
591
|
|
|
563
592
|
if lib_prep_input is not None and processed_sample_id is not None:
|
|
564
|
-
database.
|
|
593
|
+
database.material_processing_set.append(
|
|
565
594
|
self._translate_library_preparation(
|
|
566
595
|
nmdc_id,
|
|
567
596
|
lib_prep_input,
|
|
@@ -608,8 +637,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
608
637
|
)
|
|
609
638
|
)
|
|
610
639
|
|
|
611
|
-
database.
|
|
612
|
-
self.
|
|
640
|
+
database.data_generation_set.append(
|
|
641
|
+
self._translate_nucleotide_sequencing(
|
|
613
642
|
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
614
643
|
processed_sample_id,
|
|
615
644
|
has_output_do_ids,
|
|
@@ -50,7 +50,14 @@ def _create_controlled_identified_term_value(
|
|
|
50
50
|
"""
|
|
51
51
|
if id is None or name is None:
|
|
52
52
|
return None
|
|
53
|
-
return nmdc.ControlledIdentifiedTermValue(
|
|
53
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
54
|
+
term=nmdc.OntologyClass(
|
|
55
|
+
id=id,
|
|
56
|
+
name=name,
|
|
57
|
+
type="nmdc:OntologyClass",
|
|
58
|
+
),
|
|
59
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
60
|
+
)
|
|
54
61
|
|
|
55
62
|
|
|
56
63
|
def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
|
|
@@ -64,7 +71,10 @@ def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
|
|
|
64
71
|
"""
|
|
65
72
|
if name is None:
|
|
66
73
|
return None
|
|
67
|
-
return nmdc.ControlledTermValue(
|
|
74
|
+
return nmdc.ControlledTermValue(
|
|
75
|
+
has_raw_value=name,
|
|
76
|
+
type="nmdc:ControlledTermValue",
|
|
77
|
+
)
|
|
68
78
|
|
|
69
79
|
|
|
70
80
|
def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
|
|
@@ -77,7 +87,7 @@ def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
|
|
|
77
87
|
"""
|
|
78
88
|
if value is None:
|
|
79
89
|
return None
|
|
80
|
-
return nmdc.TimestampValue(has_raw_value=value)
|
|
90
|
+
return nmdc.TimestampValue(has_raw_value=value, type="nmdc:TimestampValue")
|
|
81
91
|
|
|
82
92
|
|
|
83
93
|
def _create_quantity_value(
|
|
@@ -94,7 +104,9 @@ def _create_quantity_value(
|
|
|
94
104
|
"""
|
|
95
105
|
if numeric_value is None or math.isnan(numeric_value):
|
|
96
106
|
return None
|
|
97
|
-
return nmdc.QuantityValue(
|
|
107
|
+
return nmdc.QuantityValue(
|
|
108
|
+
has_numeric_value=float(numeric_value), has_unit=unit, type="nmdc:QuantityValue"
|
|
109
|
+
)
|
|
98
110
|
|
|
99
111
|
|
|
100
112
|
def _create_text_value(value: str = None) -> nmdc.TextValue:
|
|
@@ -106,7 +118,7 @@ def _create_text_value(value: str = None) -> nmdc.TextValue:
|
|
|
106
118
|
"""
|
|
107
119
|
if value is None:
|
|
108
120
|
return None
|
|
109
|
-
return nmdc.TextValue(has_raw_value=value)
|
|
121
|
+
return nmdc.TextValue(has_raw_value=value, type="nmdc:TextValue")
|
|
110
122
|
|
|
111
123
|
|
|
112
124
|
def _create_double_value(value: str = None) -> nmdc.Double:
|
|
@@ -119,7 +131,7 @@ def _create_double_value(value: str = None) -> nmdc.Double:
|
|
|
119
131
|
"""
|
|
120
132
|
if value is None or math.isnan(value):
|
|
121
133
|
return None
|
|
122
|
-
return nmdc.Double(value)
|
|
134
|
+
return nmdc.Double(value, type="nmdc:Double")
|
|
123
135
|
|
|
124
136
|
|
|
125
137
|
def _create_geolocation_value(
|
|
@@ -147,4 +159,5 @@ def _create_geolocation_value(
|
|
|
147
159
|
return nmdc.GeolocationValue(
|
|
148
160
|
latitude=nmdc.DecimalDegree(latitude),
|
|
149
161
|
longitude=nmdc.DecimalDegree(longitude),
|
|
162
|
+
type="nmdc:GeolocationValue",
|
|
150
163
|
)
|