nmdc-runtime 1.10.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/minter/config.py +59 -3
- nmdc_runtime/site/export/ncbi_xml.py +29 -25
- nmdc_runtime/site/export/ncbi_xml_utils.py +5 -5
- nmdc_runtime/site/export/study_metadata.py +3 -1
- nmdc_runtime/site/graphs.py +71 -15
- nmdc_runtime/site/ops.py +131 -42
- nmdc_runtime/site/repository.py +16 -4
- nmdc_runtime/site/translation/gold_translator.py +112 -43
- nmdc_runtime/site/translation/neon_benthic_translator.py +58 -31
- nmdc_runtime/site/translation/neon_soil_translator.py +71 -45
- nmdc_runtime/site/translation/neon_surface_water_translator.py +61 -32
- nmdc_runtime/site/translation/neon_utils.py +19 -6
- nmdc_runtime/site/translation/submission_portal_translator.py +67 -36
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/METADATA +1 -1
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/RECORD +19 -19
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-1.10.0.dist-info → nmdc_runtime-2.0.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import List
|
|
3
|
+
from typing import List, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
|
|
@@ -26,6 +26,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
26
26
|
sls_data: dict,
|
|
27
27
|
neon_envo_mappings_file: pd.DataFrame,
|
|
28
28
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
29
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
29
30
|
*args,
|
|
30
31
|
**kwargs,
|
|
31
32
|
) -> None:
|
|
@@ -99,6 +100,23 @@ class NeonSoilDataTranslator(Translator):
|
|
|
99
100
|
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
100
101
|
)
|
|
101
102
|
|
|
103
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
104
|
+
|
|
105
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
106
|
+
if not instrument_model:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
df = self.neon_nmdc_instrument_map_df
|
|
112
|
+
matching_row = df[
|
|
113
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
if not matching_row.empty:
|
|
117
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
118
|
+
return nmdc_instrument_id
|
|
119
|
+
|
|
102
120
|
def _translate_biosample(
|
|
103
121
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
104
122
|
) -> nmdc.Biosample:
|
|
@@ -116,7 +134,6 @@ class NeonSoilDataTranslator(Translator):
|
|
|
116
134
|
"""
|
|
117
135
|
return nmdc.Biosample(
|
|
118
136
|
id=nmdc_id,
|
|
119
|
-
part_of="nmdc:sty-11-34xj1150",
|
|
120
137
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
121
138
|
"ENVO:00000446", "terrestrial biome"
|
|
122
139
|
),
|
|
@@ -145,6 +162,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
145
162
|
biosample_row, "sampleBottomDepth"
|
|
146
163
|
),
|
|
147
164
|
has_unit="m",
|
|
165
|
+
type="nmdc:QuantityValue",
|
|
148
166
|
),
|
|
149
167
|
samp_collec_device=_get_value_or_none(biosample_row, "soilSamplingDevice"),
|
|
150
168
|
soil_horizon=_get_value_or_none(biosample_row, "horizon"),
|
|
@@ -172,6 +190,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
172
190
|
biosample_row["kclNitrateNitriteNConc"].values[0], "mg/L"
|
|
173
191
|
),
|
|
174
192
|
type="nmdc:Biosample",
|
|
193
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
175
194
|
)
|
|
176
195
|
|
|
177
196
|
def _translate_pooling_process(
|
|
@@ -198,6 +217,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
198
217
|
has_input=bsm_input_values_list,
|
|
199
218
|
start_date=_get_value_or_none(pooling_row, "startDate"),
|
|
200
219
|
end_date=_get_value_or_none(pooling_row, "collectDate"),
|
|
220
|
+
type="nmdc:Pooling",
|
|
201
221
|
)
|
|
202
222
|
|
|
203
223
|
def _translate_processed_sample(
|
|
@@ -214,12 +234,14 @@ class NeonSoilDataTranslator(Translator):
|
|
|
214
234
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
215
235
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
216
236
|
"""
|
|
217
|
-
return nmdc.ProcessedSample(
|
|
237
|
+
return nmdc.ProcessedSample(
|
|
238
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
239
|
+
)
|
|
218
240
|
|
|
219
241
|
def _translate_data_object(
|
|
220
242
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
221
243
|
) -> nmdc.DataObject:
|
|
222
|
-
"""Create nmdc DataObject which is the output of
|
|
244
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
223
245
|
object mainly contains information about the sequencing file that was generated as
|
|
224
246
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
225
247
|
is the result of a LibraryPreparation process.
|
|
@@ -282,6 +304,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
282
304
|
),
|
|
283
305
|
qc_status=_get_value_or_none(extraction_row, "qaqcStatus"),
|
|
284
306
|
processing_institution=processing_institution,
|
|
307
|
+
type="nmdc:Extraction",
|
|
285
308
|
)
|
|
286
309
|
|
|
287
310
|
def _translate_library_preparation(
|
|
@@ -294,13 +317,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
294
317
|
"""
|
|
295
318
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
296
319
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
297
|
-
process is fed as input to an
|
|
320
|
+
process is fed as input to an NucleotideSequencing object.
|
|
298
321
|
|
|
299
322
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
300
323
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
301
324
|
Extraction process.
|
|
302
325
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
303
|
-
is also input to
|
|
326
|
+
is also input to NucleotideSequencing.
|
|
304
327
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
305
328
|
:return: Object that using LibraryPreparation process model.
|
|
306
329
|
"""
|
|
@@ -319,31 +342,32 @@ class NeonSoilDataTranslator(Translator):
|
|
|
319
342
|
start_date=_get_value_or_none(library_preparation_row, "collectDate"),
|
|
320
343
|
end_date=_get_value_or_none(library_preparation_row, "processedDate"),
|
|
321
344
|
processing_institution=processing_institution,
|
|
345
|
+
type="nmdc:LibraryPreparation",
|
|
322
346
|
)
|
|
323
347
|
|
|
324
|
-
def
|
|
348
|
+
def _translate_nucleotide_sequencing(
|
|
325
349
|
self,
|
|
326
|
-
|
|
350
|
+
nucleotide_sequencing_id: str,
|
|
327
351
|
processed_sample_id: str,
|
|
328
352
|
raw_data_file_data: str,
|
|
329
|
-
|
|
330
|
-
)
|
|
331
|
-
"""Create nmdc
|
|
332
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
333
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
353
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
354
|
+
):
|
|
355
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
356
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
357
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
334
358
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
335
359
|
|
|
336
|
-
:param
|
|
360
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
337
361
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
338
362
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
339
363
|
files embedded in them.
|
|
340
|
-
:param
|
|
364
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
341
365
|
process/run.
|
|
342
|
-
:return:
|
|
366
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
343
367
|
"""
|
|
344
368
|
processing_institution = None
|
|
345
369
|
sequencing_facility = _get_value_or_none(
|
|
346
|
-
|
|
370
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
347
371
|
)
|
|
348
372
|
if sequencing_facility is not None:
|
|
349
373
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -351,19 +375,21 @@ class NeonSoilDataTranslator(Translator):
|
|
|
351
375
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
352
376
|
processing_institution = "ANL"
|
|
353
377
|
|
|
354
|
-
return nmdc.
|
|
355
|
-
id=
|
|
378
|
+
return nmdc.NucleotideSequencing(
|
|
379
|
+
id=nucleotide_sequencing_id,
|
|
356
380
|
has_input=processed_sample_id,
|
|
357
381
|
has_output=raw_data_file_data,
|
|
358
382
|
processing_institution=processing_institution,
|
|
359
|
-
ncbi_project_name=_get_value_or_none(
|
|
360
|
-
|
|
361
|
-
|
|
383
|
+
ncbi_project_name=_get_value_or_none(
|
|
384
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
385
|
+
),
|
|
386
|
+
instrument_used=self._get_instrument_id(
|
|
387
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
362
388
|
),
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
389
|
+
name=f"Terrestrial soil microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
390
|
+
type="nmdc:NucleotideSequencing",
|
|
391
|
+
associated_studies=["nmdc:sty-11-34xj1150"],
|
|
392
|
+
analyte_category="metagenome",
|
|
367
393
|
)
|
|
368
394
|
|
|
369
395
|
def get_database(self) -> nmdc.Database:
|
|
@@ -371,10 +397,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
371
397
|
nmdc object creation methods as well as the nmdc type (QuantityValue, GeolocationValue, etc.)
|
|
372
398
|
creation methods, to make an nmdc Database object. It populates multiple sets in the Mongo database -
|
|
373
399
|
* `biosample_set`: uses `_translate_biosample()`
|
|
374
|
-
* `
|
|
375
|
-
|
|
376
|
-
* `
|
|
377
|
-
* `omics_processing_set`: uses `_translate_omics_processing()`
|
|
400
|
+
* `material_processing_set`: uses `_translate_pooling_process()`, `_translate_extraction_process()`,
|
|
401
|
+
`_translate_library_preparation()`
|
|
402
|
+
* `data_generation_set`: uses `_translate_nucleotide_sequencing()`
|
|
378
403
|
* `processed_sample_set`: uses `_translate_processed_sample()`
|
|
379
404
|
* `data_object_set`: uses `_translate_data_object()`
|
|
380
405
|
The core Biosample information is in the `sls_soilCoreCollection` table. However, we
|
|
@@ -605,14 +630,13 @@ class NeonSoilDataTranslator(Translator):
|
|
|
605
630
|
mms_metagenomeDnaExtraction.processedDate,
|
|
606
631
|
mms_metagenomeSequencing.sequencingFacilityID,
|
|
607
632
|
mms_metagenomeSequencing.ncbiProjectID,
|
|
608
|
-
mms_metagenomeSequencing.investigation_type,
|
|
609
633
|
mms_metagenomeSequencing.sequencingMethod,
|
|
610
634
|
mms_metagenomeSequencing.instrument_model
|
|
611
635
|
FROM mms_metagenomeSequencing
|
|
612
636
|
LEFT JOIN mms_metagenomeDnaExtraction ON mms_metagenomeDnaExtraction.dnaSampleID = mms_metagenomeSequencing.dnaSampleID
|
|
613
637
|
"""
|
|
614
638
|
library_preparation_table = pd.read_sql_query(query, self.conn)
|
|
615
|
-
|
|
639
|
+
nucleotide_sequencing_table = pd.read_sql_query(query, self.conn)
|
|
616
640
|
|
|
617
641
|
nmdc_pooling_ids = self._id_minter("nmdc:Pooling", len(pooling_ids_dict))
|
|
618
642
|
neon_to_nmdc_pooling_ids = dict(
|
|
@@ -651,12 +675,12 @@ class NeonSoilDataTranslator(Translator):
|
|
|
651
675
|
zip(library_prepration_ids, nmdc_library_preparation_processed_sample_ids)
|
|
652
676
|
)
|
|
653
677
|
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
"nmdc:
|
|
678
|
+
nucleotide_sequencing_ids = nucleotide_sequencing_table["dnaSampleID"]
|
|
679
|
+
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
680
|
+
"nmdc:NucleotideSequencing", len(nucleotide_sequencing_ids)
|
|
657
681
|
)
|
|
658
|
-
|
|
659
|
-
zip(
|
|
682
|
+
neon_to_nmdc_nucleotide_sequencing_ids = dict(
|
|
683
|
+
zip(nucleotide_sequencing_ids, nmdc_nucleotide_sequencing_ids)
|
|
660
684
|
)
|
|
661
685
|
|
|
662
686
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -699,7 +723,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
699
723
|
# if the number of biosamples that are input to a pooling process
|
|
700
724
|
# is one or less, then ignore it and go straight to extraction
|
|
701
725
|
if len(bsm_values_list) > 1:
|
|
702
|
-
database.
|
|
726
|
+
database.material_processing_set.append(
|
|
703
727
|
self._translate_pooling_process(
|
|
704
728
|
pooling_process_id,
|
|
705
729
|
processed_sample_id,
|
|
@@ -732,7 +756,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
732
756
|
# handler for creating extraction process records
|
|
733
757
|
# for both pooled and non-pooled samples
|
|
734
758
|
if "|" in genomics_pooled_id_list:
|
|
735
|
-
database.
|
|
759
|
+
database.material_processing_set.append(
|
|
736
760
|
self._translate_extraction_process(
|
|
737
761
|
extraction_id,
|
|
738
762
|
extraction_input,
|
|
@@ -753,7 +777,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
753
777
|
|
|
754
778
|
extraction_input = neon_to_nmdc_biosample_ids[neon_biosample_id]
|
|
755
779
|
|
|
756
|
-
database.
|
|
780
|
+
database.material_processing_set.append(
|
|
757
781
|
self._translate_extraction_process(
|
|
758
782
|
extraction_id,
|
|
759
783
|
extraction_input,
|
|
@@ -770,7 +794,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
770
794
|
dna_sample_id
|
|
771
795
|
]
|
|
772
796
|
|
|
773
|
-
|
|
797
|
+
nucleotide_sequencing_id = neon_to_nmdc_nucleotide_sequencing_ids[
|
|
798
|
+
dna_sample_id
|
|
799
|
+
]
|
|
774
800
|
|
|
775
801
|
genomics_sample_id = library_preparation_table[
|
|
776
802
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
@@ -785,7 +811,7 @@ class NeonSoilDataTranslator(Translator):
|
|
|
785
811
|
library_preparation_table["dnaSampleID"] == dna_sample_id
|
|
786
812
|
]
|
|
787
813
|
|
|
788
|
-
database.
|
|
814
|
+
database.material_processing_set.append(
|
|
789
815
|
self._translate_library_preparation(
|
|
790
816
|
library_preparation_id,
|
|
791
817
|
library_preparation_input,
|
|
@@ -807,9 +833,9 @@ class NeonSoilDataTranslator(Translator):
|
|
|
807
833
|
if item in neon_to_nmdc_data_object_ids:
|
|
808
834
|
has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
|
|
809
835
|
|
|
810
|
-
database.
|
|
811
|
-
self.
|
|
812
|
-
|
|
836
|
+
database.data_generation_set.append(
|
|
837
|
+
self._translate_nucleotide_sequencing(
|
|
838
|
+
nucleotide_sequencing_id,
|
|
813
839
|
processed_sample_id,
|
|
814
840
|
has_output_do_ids,
|
|
815
841
|
library_preparation_row,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import Dict, Optional
|
|
3
|
+
from typing import Dict, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import requests
|
|
@@ -36,6 +36,7 @@ SURFACE_WATER_LOCAL_SCALE_MAPPINGS = {
|
|
|
36
36
|
"term_id": "ENVO:01000409",
|
|
37
37
|
"term_name": "freshwater littoral zone",
|
|
38
38
|
},
|
|
39
|
+
"inflow": {"term_id": "ENVO:00000476", "term_name": "lake inlet"},
|
|
39
40
|
},
|
|
40
41
|
"river": {"term_id": "ENVO:01000297", "term_name": "freshwater river"},
|
|
41
42
|
"stream": {"term_id": "ENVO:03605007", "term_name": "freshwater stream"},
|
|
@@ -58,6 +59,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
58
59
|
site_code_mapping: dict,
|
|
59
60
|
neon_envo_mappings_file: pd.DataFrame,
|
|
60
61
|
neon_raw_data_file_mappings_file: pd.DataFrame,
|
|
62
|
+
neon_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
61
63
|
*args,
|
|
62
64
|
**kwargs,
|
|
63
65
|
) -> None:
|
|
@@ -108,6 +110,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
108
110
|
|
|
109
111
|
self.site_code_mapping = site_code_mapping
|
|
110
112
|
|
|
113
|
+
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
114
|
+
|
|
111
115
|
def _translate_biosample(
|
|
112
116
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
113
117
|
) -> nmdc.Biosample:
|
|
@@ -136,16 +140,17 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
136
140
|
has_minimum_numeric_value=nmdc.Float(minimum_depth),
|
|
137
141
|
has_maximum_numeric_value=nmdc.Float(maximum_depth),
|
|
138
142
|
has_unit="m",
|
|
143
|
+
type="nmdc:QuantityValue",
|
|
139
144
|
)
|
|
140
145
|
else:
|
|
141
146
|
depth = nmdc.QuantityValue(
|
|
142
147
|
has_numeric_value=nmdc.Float(minimum_depth),
|
|
143
148
|
has_unit="m",
|
|
149
|
+
type="nmdc:QuantityValue",
|
|
144
150
|
)
|
|
145
151
|
|
|
146
152
|
return nmdc.Biosample(
|
|
147
153
|
id=nmdc_id,
|
|
148
|
-
part_of="nmdc:sty-11-hht5sb92",
|
|
149
154
|
env_broad_scale=_create_controlled_identified_term_value(
|
|
150
155
|
SURFACE_WATER_BROAD_SCALE_MAPPINGS.get(
|
|
151
156
|
biosample_row["aquaticSiteType"].values[0]
|
|
@@ -201,7 +206,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
201
206
|
samp_size=_create_quantity_value(
|
|
202
207
|
biosample_row["geneticFilteredSampleVolume"].values[0], "mL"
|
|
203
208
|
),
|
|
204
|
-
env_package=nmdc.TextValue(has_raw_value="water"),
|
|
209
|
+
env_package=nmdc.TextValue(has_raw_value="water", type="nmdc:TextValue"),
|
|
210
|
+
associated_studies=["nmdc:sty-11-hht5sb92"],
|
|
205
211
|
)
|
|
206
212
|
|
|
207
213
|
def _translate_extraction_process(
|
|
@@ -243,6 +249,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
243
249
|
_get_value_or_none(extraction_row, "extrQaqcStatus")
|
|
244
250
|
),
|
|
245
251
|
processing_institution=processing_institution,
|
|
252
|
+
type="nmdc:Extraction",
|
|
246
253
|
)
|
|
247
254
|
|
|
248
255
|
def _translate_library_preparation(
|
|
@@ -255,13 +262,13 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
255
262
|
"""
|
|
256
263
|
Create LibraryPreparation process object. The input to LibraryPreparation process
|
|
257
264
|
is the output ProcessedSample from an Extraction process. The output of LibraryPreparation
|
|
258
|
-
process is fed as input to an
|
|
265
|
+
process is fed as input to an NucleotideSequencing object.
|
|
259
266
|
|
|
260
267
|
:param library_preparation_id: Minted id for LibraryPreparation process.
|
|
261
268
|
:param library_preparation_input: Input to LibraryPreparation process is output from
|
|
262
269
|
Extraction process.
|
|
263
270
|
:param processed_sample_id: Minted ProcessedSample id which is output of LibraryPreparation
|
|
264
|
-
is also input to
|
|
271
|
+
is also input to NucleotideSequencing.
|
|
265
272
|
:param library_preparation_row: Metadata required to populate LibraryPreparation.
|
|
266
273
|
:return: Object that using LibraryPreparation process model.
|
|
267
274
|
"""
|
|
@@ -280,31 +287,47 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
280
287
|
start_date=_get_value_or_none(library_preparation_row, "seqCollectDate"),
|
|
281
288
|
end_date=_get_value_or_none(library_preparation_row, "seqProcessedDate"),
|
|
282
289
|
processing_institution=processing_institution,
|
|
290
|
+
type="nmdc:LibraryPreparation",
|
|
283
291
|
)
|
|
284
292
|
|
|
285
|
-
def
|
|
293
|
+
def _get_instrument_id(self, instrument_model: Union[str | None]) -> str:
|
|
294
|
+
if not instrument_model:
|
|
295
|
+
raise ValueError(
|
|
296
|
+
f"instrument_model '{instrument_model}' could not be found in the NEON-NMDC instrument mapping TSV file."
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
df = self.neon_nmdc_instrument_map_df
|
|
300
|
+
matching_row = df[
|
|
301
|
+
df["NEON sequencingMethod"].str.contains(instrument_model, case=False)
|
|
302
|
+
]
|
|
303
|
+
|
|
304
|
+
if not matching_row.empty:
|
|
305
|
+
nmdc_instrument_id = matching_row["NMDC instrument_set id"].values[0]
|
|
306
|
+
return nmdc_instrument_id
|
|
307
|
+
|
|
308
|
+
def _translate_nucleotide_sequencing(
|
|
286
309
|
self,
|
|
287
|
-
|
|
310
|
+
nucleotide_sequencing_id: str,
|
|
288
311
|
processed_sample_id: str,
|
|
289
312
|
raw_data_file_data: str,
|
|
290
|
-
|
|
291
|
-
)
|
|
292
|
-
"""Create nmdc
|
|
293
|
-
Bioinformatics workflow on sequence data from a biosample. The input to an
|
|
294
|
-
process is the output from a LibraryPreparation process, and the output of
|
|
313
|
+
nucleotide_sequencing_row: pd.DataFrame,
|
|
314
|
+
):
|
|
315
|
+
"""Create nmdc NucleotideSequencing object. This class typically models the run of a
|
|
316
|
+
Bioinformatics workflow on sequence data from a biosample. The input to an NucleotideSequencing
|
|
317
|
+
process is the output from a LibraryPreparation process, and the output of NucleotideSequencing
|
|
295
318
|
is a DataObject which has the FASTQ sequence file URLs embedded in them.
|
|
296
319
|
|
|
297
|
-
:param
|
|
320
|
+
:param nucleotide_sequencing_id: Minted id for an NucleotideSequencing process.
|
|
298
321
|
:param processed_sample_id: ProcessedSample that is the output of LibraryPreparation.
|
|
299
322
|
:param raw_data_file_data: R1/R2 DataObjects which have links to workflow processed output
|
|
300
323
|
files embedded in them.
|
|
301
|
-
:param
|
|
324
|
+
:param nucleotide_sequencing_row: DataFrame with metadata for an NucleotideSequencing workflow
|
|
302
325
|
process/run.
|
|
303
|
-
:return:
|
|
326
|
+
:return: NucleotideSequencing object that models a Bioinformatics workflow process/run.
|
|
304
327
|
"""
|
|
305
328
|
processing_institution = None
|
|
306
329
|
sequencing_facility = _get_value_or_none(
|
|
307
|
-
|
|
330
|
+
nucleotide_sequencing_row, "sequencingFacilityID"
|
|
308
331
|
)
|
|
309
332
|
if sequencing_facility is not None:
|
|
310
333
|
if re.search("Battelle", sequencing_facility, re.IGNORECASE):
|
|
@@ -312,19 +335,21 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
312
335
|
elif re.search("Argonne", sequencing_facility, re.IGNORECASE):
|
|
313
336
|
processing_institution = "ANL"
|
|
314
337
|
|
|
315
|
-
return nmdc.
|
|
316
|
-
id=
|
|
338
|
+
return nmdc.NucleotideSequencing(
|
|
339
|
+
id=nucleotide_sequencing_id,
|
|
317
340
|
has_input=processed_sample_id,
|
|
318
341
|
has_output=raw_data_file_data,
|
|
319
342
|
processing_institution=processing_institution,
|
|
320
|
-
ncbi_project_name=_get_value_or_none(
|
|
321
|
-
|
|
322
|
-
|
|
343
|
+
ncbi_project_name=_get_value_or_none(
|
|
344
|
+
nucleotide_sequencing_row, "ncbiProjectID"
|
|
345
|
+
),
|
|
346
|
+
instrument_used=self._get_instrument_id(
|
|
347
|
+
_get_value_or_none(nucleotide_sequencing_row, "instrument_model")
|
|
323
348
|
),
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
349
|
+
name=f"Surface water microbial communities - {_get_value_or_none(nucleotide_sequencing_row, 'dnaSampleID')}",
|
|
350
|
+
type="nmdc:NucleotideSequencing",
|
|
351
|
+
associated_studies=["nmdc:sty-11-hht5sb92"],
|
|
352
|
+
analyte_category="metagenome",
|
|
328
353
|
)
|
|
329
354
|
|
|
330
355
|
def _translate_processed_sample(
|
|
@@ -341,12 +366,14 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
341
366
|
:param sample_id: Value from `genomicsSampleID` or `dnaSampleID` column.
|
|
342
367
|
:return: ProcessedSample objects to be stored in `processed_sample_set`.
|
|
343
368
|
"""
|
|
344
|
-
return nmdc.ProcessedSample(
|
|
369
|
+
return nmdc.ProcessedSample(
|
|
370
|
+
id=processed_sample_id, name=sample_id, type="nmdc:ProcessedSample"
|
|
371
|
+
)
|
|
345
372
|
|
|
346
373
|
def _translate_data_object(
|
|
347
374
|
self, do_id: str, url: str, do_type: str, checksum: str
|
|
348
375
|
) -> nmdc.DataObject:
|
|
349
|
-
"""Create nmdc DataObject which is the output of
|
|
376
|
+
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
350
377
|
object mainly contains information about the sequencing file that was generated as
|
|
351
378
|
the result of running a Bioinformatics workflow on a certain ProcessedSample, which
|
|
352
379
|
is the result of a LibraryPreparation process.
|
|
@@ -485,7 +512,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
485
512
|
)
|
|
486
513
|
|
|
487
514
|
neon_omprc_ids = surface_water_samples["parentSampleID"]
|
|
488
|
-
nmdc_omprc_ids = self._id_minter(
|
|
515
|
+
nmdc_omprc_ids = self._id_minter(
|
|
516
|
+
"nmdc:NucleotideSequencing", len(neon_omprc_ids)
|
|
517
|
+
)
|
|
489
518
|
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
490
519
|
|
|
491
520
|
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
@@ -515,7 +544,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
515
544
|
processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
516
545
|
|
|
517
546
|
if extraction_input is not None and processed_sample_id is not None:
|
|
518
|
-
database.
|
|
547
|
+
database.material_processing_set.append(
|
|
519
548
|
self._translate_extraction_process(
|
|
520
549
|
nmdc_id,
|
|
521
550
|
extraction_input,
|
|
@@ -561,7 +590,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
561
590
|
processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
|
|
562
591
|
|
|
563
592
|
if lib_prep_input is not None and processed_sample_id is not None:
|
|
564
|
-
database.
|
|
593
|
+
database.material_processing_set.append(
|
|
565
594
|
self._translate_library_preparation(
|
|
566
595
|
nmdc_id,
|
|
567
596
|
lib_prep_input,
|
|
@@ -608,8 +637,8 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
608
637
|
)
|
|
609
638
|
)
|
|
610
639
|
|
|
611
|
-
database.
|
|
612
|
-
self.
|
|
640
|
+
database.data_generation_set.append(
|
|
641
|
+
self._translate_nucleotide_sequencing(
|
|
613
642
|
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
614
643
|
processed_sample_id,
|
|
615
644
|
has_output_do_ids,
|
|
@@ -50,7 +50,14 @@ def _create_controlled_identified_term_value(
|
|
|
50
50
|
"""
|
|
51
51
|
if id is None or name is None:
|
|
52
52
|
return None
|
|
53
|
-
return nmdc.ControlledIdentifiedTermValue(
|
|
53
|
+
return nmdc.ControlledIdentifiedTermValue(
|
|
54
|
+
term=nmdc.OntologyClass(
|
|
55
|
+
id=id,
|
|
56
|
+
name=name,
|
|
57
|
+
type="nmdc:OntologyClass",
|
|
58
|
+
),
|
|
59
|
+
type="nmdc:ControlledIdentifiedTermValue",
|
|
60
|
+
)
|
|
54
61
|
|
|
55
62
|
|
|
56
63
|
def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
|
|
@@ -64,7 +71,10 @@ def _create_controlled_term_value(name: str = None) -> nmdc.ControlledTermValue:
|
|
|
64
71
|
"""
|
|
65
72
|
if name is None:
|
|
66
73
|
return None
|
|
67
|
-
return nmdc.ControlledTermValue(
|
|
74
|
+
return nmdc.ControlledTermValue(
|
|
75
|
+
has_raw_value=name,
|
|
76
|
+
type="nmdc:ControlledTermValue",
|
|
77
|
+
)
|
|
68
78
|
|
|
69
79
|
|
|
70
80
|
def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
|
|
@@ -77,7 +87,7 @@ def _create_timestamp_value(value: str = None) -> nmdc.TimestampValue:
|
|
|
77
87
|
"""
|
|
78
88
|
if value is None:
|
|
79
89
|
return None
|
|
80
|
-
return nmdc.TimestampValue(has_raw_value=value)
|
|
90
|
+
return nmdc.TimestampValue(has_raw_value=value, type="nmdc:TimestampValue")
|
|
81
91
|
|
|
82
92
|
|
|
83
93
|
def _create_quantity_value(
|
|
@@ -94,7 +104,9 @@ def _create_quantity_value(
|
|
|
94
104
|
"""
|
|
95
105
|
if numeric_value is None or math.isnan(numeric_value):
|
|
96
106
|
return None
|
|
97
|
-
return nmdc.QuantityValue(
|
|
107
|
+
return nmdc.QuantityValue(
|
|
108
|
+
has_numeric_value=float(numeric_value), has_unit=unit, type="nmdc:QuantityValue"
|
|
109
|
+
)
|
|
98
110
|
|
|
99
111
|
|
|
100
112
|
def _create_text_value(value: str = None) -> nmdc.TextValue:
|
|
@@ -106,7 +118,7 @@ def _create_text_value(value: str = None) -> nmdc.TextValue:
|
|
|
106
118
|
"""
|
|
107
119
|
if value is None:
|
|
108
120
|
return None
|
|
109
|
-
return nmdc.TextValue(has_raw_value=value)
|
|
121
|
+
return nmdc.TextValue(has_raw_value=value, type="nmdc:TextValue")
|
|
110
122
|
|
|
111
123
|
|
|
112
124
|
def _create_double_value(value: str = None) -> nmdc.Double:
|
|
@@ -119,7 +131,7 @@ def _create_double_value(value: str = None) -> nmdc.Double:
|
|
|
119
131
|
"""
|
|
120
132
|
if value is None or math.isnan(value):
|
|
121
133
|
return None
|
|
122
|
-
return nmdc.Double(value)
|
|
134
|
+
return nmdc.Double(value, type="nmdc:Double")
|
|
123
135
|
|
|
124
136
|
|
|
125
137
|
def _create_geolocation_value(
|
|
@@ -147,4 +159,5 @@ def _create_geolocation_value(
|
|
|
147
159
|
return nmdc.GeolocationValue(
|
|
148
160
|
latitude=nmdc.DecimalDegree(latitude),
|
|
149
161
|
longitude=nmdc.DecimalDegree(longitude),
|
|
162
|
+
type="nmdc:GeolocationValue",
|
|
150
163
|
)
|