nmdc-runtime 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/site/export/ncbi_xml.py +23 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
- nmdc_runtime/site/ops.py +6 -3
- nmdc_runtime/site/repository.py +8 -8
- nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA +2 -2
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD +11 -11
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ import datetime
|
|
|
4
4
|
import xml.etree.ElementTree as ET
|
|
5
5
|
import xml.dom.minidom
|
|
6
6
|
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, List, Union
|
|
8
8
|
from urllib.parse import urlparse
|
|
9
9
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
10
|
get_instruments,
|
|
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
|
|
|
366
366
|
)
|
|
367
367
|
# Currently, we are making the assumption that only one instrument
|
|
368
368
|
# is used to sequence a Biosample
|
|
369
|
-
|
|
369
|
+
instrument_used: List[str] = ntseq.get(
|
|
370
|
+
"instrument_used", []
|
|
371
|
+
)
|
|
372
|
+
if not instrument_used:
|
|
373
|
+
instrument_id = None
|
|
374
|
+
else:
|
|
375
|
+
instrument_id = instrument_used[0]
|
|
376
|
+
|
|
370
377
|
instrument = all_instruments.get(instrument_id, {})
|
|
371
378
|
instrument_vendor = instrument.get("vendor", "")
|
|
372
379
|
instrument_model = instrument.get("model", "")
|
|
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
|
|
|
448
455
|
"Attribute", "NextSeq 550", {"name": "instrument_model"}
|
|
449
456
|
)
|
|
450
457
|
)
|
|
458
|
+
elif instrument_model == "novaseq_6000":
|
|
459
|
+
sra_attributes.append(
|
|
460
|
+
self.set_element(
|
|
461
|
+
"Attribute",
|
|
462
|
+
"NovaSeq 6000",
|
|
463
|
+
{"name": "instrument_model"},
|
|
464
|
+
)
|
|
465
|
+
)
|
|
466
|
+
elif instrument_model == "hiseq":
|
|
467
|
+
sra_attributes.append(
|
|
468
|
+
self.set_element(
|
|
469
|
+
"Attribute", "HiSeq", {"name": "instrument_model"}
|
|
470
|
+
)
|
|
471
|
+
)
|
|
451
472
|
|
|
452
473
|
if analyte_category == "metagenome":
|
|
453
474
|
sra_attributes.append(
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
from io import BytesIO, StringIO
|
|
2
|
+
from typing import Any, Dict, List, Union
|
|
3
|
+
|
|
4
|
+
from nmdc_runtime.api.endpoints.util import strip_oid
|
|
2
5
|
from nmdc_runtime.minter.config import typecodes
|
|
3
6
|
from lxml import etree
|
|
7
|
+
from pymongo.collection import Collection
|
|
4
8
|
|
|
5
9
|
import csv
|
|
6
10
|
import requests
|
|
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
|
|
|
45
49
|
raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
|
|
46
50
|
|
|
47
51
|
|
|
48
|
-
def fetch_data_objects_from_biosamples(
|
|
52
|
+
def fetch_data_objects_from_biosamples(
|
|
53
|
+
all_docs_collection: Collection,
|
|
54
|
+
data_object_set: Collection,
|
|
55
|
+
biosamples_list: List[Dict[str, Any]],
|
|
56
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
57
|
+
"""This method fetches the data objects that are "associated" (derived from/products of)
|
|
58
|
+
with their respective biosamples by iterating over the alldocs collection recursively.
|
|
59
|
+
The method returns a list of dictionaries with biosample ids as keys and the associated list of
|
|
60
|
+
data objects as values.
|
|
61
|
+
|
|
62
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
63
|
+
:param data_object_set: reference to the data_object_set collection
|
|
64
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
65
|
+
:return: list of dictionaries with biosample ids as keys and associated data objects as values
|
|
66
|
+
"""
|
|
67
|
+
biosample_data_objects = []
|
|
68
|
+
|
|
69
|
+
def collect_data_objects(doc_ids, collected_objects, unique_ids):
|
|
70
|
+
for doc_id in doc_ids:
|
|
71
|
+
if (
|
|
72
|
+
get_classname_from_typecode(doc_id) == "DataObject"
|
|
73
|
+
and doc_id not in unique_ids
|
|
74
|
+
):
|
|
75
|
+
data_obj = data_object_set.find_one({"id": doc_id})
|
|
76
|
+
if data_obj:
|
|
77
|
+
collected_objects.append(strip_oid(data_obj))
|
|
78
|
+
unique_ids.add(doc_id)
|
|
79
|
+
|
|
49
80
|
biosample_data_objects = []
|
|
50
81
|
|
|
51
82
|
for biosample in biosamples_list:
|
|
52
83
|
current_ids = [biosample["id"]]
|
|
53
84
|
collected_data_objects = []
|
|
85
|
+
unique_ids = set()
|
|
54
86
|
|
|
55
87
|
while current_ids:
|
|
56
88
|
new_current_ids = []
|
|
57
89
|
for current_id in current_ids:
|
|
58
|
-
|
|
59
|
-
|
|
90
|
+
for doc in all_docs_collection.find({"has_input": current_id}):
|
|
91
|
+
has_output = doc.get("has_output", [])
|
|
60
92
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
for output_id in has_output:
|
|
69
|
-
if get_classname_from_typecode(output_id) == "DataObject":
|
|
70
|
-
data_object_doc = all_docs_collection.find_one(
|
|
71
|
-
{"id": output_id}
|
|
72
|
-
)
|
|
73
|
-
if data_object_doc:
|
|
74
|
-
collected_data_objects.append(data_object_doc)
|
|
75
|
-
else:
|
|
76
|
-
new_current_ids.append(output_id)
|
|
93
|
+
collect_data_objects(has_output, collected_data_objects, unique_ids)
|
|
94
|
+
new_current_ids.extend(
|
|
95
|
+
op
|
|
96
|
+
for op in has_output
|
|
97
|
+
if get_classname_from_typecode(op) != "DataObject"
|
|
98
|
+
)
|
|
77
99
|
|
|
78
100
|
current_ids = new_current_ids
|
|
79
101
|
|
|
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
|
|
|
83
105
|
return biosample_data_objects
|
|
84
106
|
|
|
85
107
|
|
|
86
|
-
def fetch_nucleotide_sequencing_from_biosamples(
|
|
87
|
-
|
|
108
|
+
def fetch_nucleotide_sequencing_from_biosamples(
|
|
109
|
+
all_docs_collection: Collection,
|
|
110
|
+
data_generation_set: Collection,
|
|
111
|
+
biosamples_list: List[Dict[str, Any]],
|
|
112
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
113
|
+
"""This method fetches the nucleotide sequencing process records that create data objects
|
|
114
|
+
for biosamples by iterating over the alldocs collection recursively.
|
|
115
|
+
|
|
116
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
117
|
+
:param data_generation_set: reference to the data_generation_set collection
|
|
118
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
119
|
+
:return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
|
|
120
|
+
process objects as values
|
|
121
|
+
"""
|
|
122
|
+
biosample_ntseq_objects = []
|
|
88
123
|
|
|
89
124
|
for biosample in biosamples_list:
|
|
90
125
|
current_ids = [biosample["id"]]
|
|
91
|
-
|
|
126
|
+
collected_ntseq_objects = []
|
|
92
127
|
|
|
93
128
|
while current_ids:
|
|
94
129
|
new_current_ids = []
|
|
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
|
|
|
105
140
|
|
|
106
141
|
for output_id in has_output:
|
|
107
142
|
if get_classname_from_typecode(output_id) == "DataObject":
|
|
108
|
-
nucleotide_sequencing_doc =
|
|
143
|
+
nucleotide_sequencing_doc = data_generation_set.find_one(
|
|
109
144
|
{"id": document["id"]}
|
|
110
145
|
)
|
|
111
146
|
if nucleotide_sequencing_doc:
|
|
112
|
-
|
|
147
|
+
collected_ntseq_objects.append(
|
|
148
|
+
strip_oid(nucleotide_sequencing_doc)
|
|
149
|
+
)
|
|
113
150
|
else:
|
|
114
151
|
new_current_ids.append(output_id)
|
|
115
152
|
|
|
116
153
|
current_ids = new_current_ids
|
|
117
154
|
|
|
118
|
-
if
|
|
119
|
-
|
|
155
|
+
if collected_ntseq_objects:
|
|
156
|
+
biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
|
|
157
|
+
|
|
158
|
+
return biosample_ntseq_objects
|
|
120
159
|
|
|
121
|
-
return biosample_data_objects
|
|
122
160
|
|
|
161
|
+
def fetch_library_preparation_from_biosamples(
|
|
162
|
+
all_docs_collection: Collection,
|
|
163
|
+
material_processing_set: Collection,
|
|
164
|
+
biosamples_list: List[Dict[str, Any]],
|
|
165
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
166
|
+
"""This method fetches the library preparation process records that create processed samples,
|
|
167
|
+
which are then used as input (via the `has_input` slot) to a nucleotide sequencing process
|
|
168
|
+
for biosamples by iterating over the alldocs collection recursively.
|
|
123
169
|
|
|
124
|
-
|
|
170
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
171
|
+
:param material_processing_set: reference to the material_processing_set collection
|
|
172
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
173
|
+
:return: list of dictionaries with biosample ids as keys and associated library preparation process
|
|
174
|
+
objects as values
|
|
175
|
+
"""
|
|
125
176
|
biosample_lib_prep = []
|
|
126
177
|
|
|
127
178
|
for biosample in biosamples_list:
|
|
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
|
|
|
144
195
|
"has_input": output_id,
|
|
145
196
|
"type": {"$in": ["LibraryPreparation"]},
|
|
146
197
|
}
|
|
147
|
-
lib_prep_doc =
|
|
198
|
+
lib_prep_doc = material_processing_set.find_one(lib_prep_query)
|
|
148
199
|
|
|
149
200
|
if lib_prep_doc:
|
|
150
|
-
biosample_lib_prep.append({biosample_id: lib_prep_doc})
|
|
201
|
+
biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
|
|
151
202
|
break # Stop at the first document that meets the criteria
|
|
152
203
|
|
|
153
204
|
return biosample_lib_prep
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -1188,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
|
|
|
1188
1188
|
def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
|
|
1189
1189
|
mdb = context.resources.mongo.db
|
|
1190
1190
|
alldocs_collection = mdb["alldocs"]
|
|
1191
|
+
data_object_set = mdb["data_object_set"]
|
|
1191
1192
|
biosample_data_objects = fetch_data_objects_from_biosamples(
|
|
1192
|
-
alldocs_collection, biosamples
|
|
1193
|
+
alldocs_collection, data_object_set, biosamples
|
|
1193
1194
|
)
|
|
1194
1195
|
return biosample_data_objects
|
|
1195
1196
|
|
|
@@ -1200,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
|
|
|
1200
1201
|
):
|
|
1201
1202
|
mdb = context.resources.mongo.db
|
|
1202
1203
|
alldocs_collection = mdb["alldocs"]
|
|
1204
|
+
data_generation_set = mdb["data_generation_set"]
|
|
1203
1205
|
biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
|
|
1204
|
-
alldocs_collection, biosamples
|
|
1206
|
+
alldocs_collection, data_generation_set, biosamples
|
|
1205
1207
|
)
|
|
1206
1208
|
return biosample_omics_processing
|
|
1207
1209
|
|
|
@@ -1212,8 +1214,9 @@ def get_library_preparation_from_biosamples(
|
|
|
1212
1214
|
):
|
|
1213
1215
|
mdb = context.resources.mongo.db
|
|
1214
1216
|
alldocs_collection = mdb["alldocs"]
|
|
1217
|
+
material_processing_set = mdb["material_processing_set"]
|
|
1215
1218
|
biosample_lib_prep = fetch_library_preparation_from_biosamples(
|
|
1216
|
-
alldocs_collection, biosamples
|
|
1219
|
+
alldocs_collection, material_processing_set, biosamples
|
|
1217
1220
|
)
|
|
1218
1221
|
return biosample_lib_prep
|
|
1219
1222
|
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -652,7 +652,7 @@ def biosample_submission_ingest():
|
|
|
652
652
|
"inputs": {
|
|
653
653
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
654
654
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
655
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
655
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
656
656
|
}
|
|
657
657
|
},
|
|
658
658
|
},
|
|
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
|
|
|
694
694
|
"inputs": {
|
|
695
695
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
696
696
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
697
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
697
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
698
698
|
}
|
|
699
699
|
},
|
|
700
700
|
},
|
|
@@ -737,7 +737,7 @@ def biosample_submission_ingest():
|
|
|
737
737
|
"inputs": {
|
|
738
738
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
739
739
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
740
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
740
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
741
741
|
}
|
|
742
742
|
},
|
|
743
743
|
"get_neon_pipeline_benthic_data_product": {
|
|
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
|
|
|
779
779
|
"inputs": {
|
|
780
780
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
781
781
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
782
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
782
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
783
783
|
}
|
|
784
784
|
},
|
|
785
785
|
},
|
|
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
|
|
|
822
822
|
"inputs": {
|
|
823
823
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
824
824
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
825
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
825
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
826
826
|
}
|
|
827
827
|
},
|
|
828
828
|
"get_neon_pipeline_surface_water_data_product": {
|
|
829
829
|
"config": {
|
|
830
830
|
"surface_water_data_product": {
|
|
831
831
|
"product_id": "DP1.20281.001",
|
|
832
|
-
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
|
|
832
|
+
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
|
|
833
833
|
}
|
|
834
834
|
}
|
|
835
835
|
},
|
|
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
|
|
|
856
856
|
"config": {
|
|
857
857
|
"surface_water_data_product": {
|
|
858
858
|
"product_id": "DP1.20281.001",
|
|
859
|
-
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
|
|
859
|
+
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
|
|
860
860
|
}
|
|
861
861
|
}
|
|
862
862
|
},
|
|
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
|
|
|
864
864
|
"inputs": {
|
|
865
865
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
866
866
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
867
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
867
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
868
868
|
}
|
|
869
869
|
},
|
|
870
870
|
},
|
|
@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
71
71
|
neon_amb_data_tables = (
|
|
72
72
|
"mms_swMetagenomeSequencing",
|
|
73
73
|
"mms_swMetagenomeDnaExtraction",
|
|
74
|
+
"mms_swRawDataFiles",
|
|
74
75
|
"amc_fieldGenetic",
|
|
75
76
|
"amc_fieldSuperParent",
|
|
76
77
|
)
|
|
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
88
89
|
if_exists="replace",
|
|
89
90
|
index=False,
|
|
90
91
|
)
|
|
92
|
+
surface_water_data["mms_swRawDataFiles"].to_sql(
|
|
93
|
+
"mms_swRawDataFiles", self.conn, if_exists="replace", index=False
|
|
94
|
+
)
|
|
91
95
|
surface_water_data["amc_fieldGenetic"].to_sql(
|
|
92
96
|
"amc_fieldGenetic", self.conn, if_exists="replace", index=False
|
|
93
97
|
)
|
|
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
103
107
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
104
108
|
)
|
|
105
109
|
|
|
106
|
-
self.neon_raw_data_file_mappings_df =
|
|
107
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
108
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
109
|
-
)
|
|
110
|
+
self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
|
|
110
111
|
|
|
111
112
|
self.site_code_mapping = site_code_mapping
|
|
112
113
|
|
|
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
371
372
|
)
|
|
372
373
|
|
|
373
374
|
def _translate_data_object(
|
|
374
|
-
self, do_id: str, url: str, do_type: str,
|
|
375
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
375
376
|
) -> nmdc.DataObject:
|
|
376
377
|
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
377
378
|
object mainly contains information about the sequencing file that was generated as
|
|
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
395
396
|
url=url,
|
|
396
397
|
description=f"sequencing results for {basename}",
|
|
397
398
|
type="nmdc:DataObject",
|
|
398
|
-
md5_checksum=checksum,
|
|
399
399
|
data_object_type=do_type,
|
|
400
|
+
in_manifest=manifest_id,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
404
|
+
return nmdc.Manifest(
|
|
405
|
+
id=manifest_id,
|
|
406
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
407
|
+
type="nmdc:Manifest",
|
|
400
408
|
)
|
|
401
409
|
|
|
402
410
|
def get_database(self):
|
|
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
477
485
|
"""
|
|
478
486
|
surface_water_samples = pd.read_sql_query(query, self.conn)
|
|
479
487
|
|
|
488
|
+
# --------------------------------------------------
|
|
489
|
+
# Create mappings for minted NMDC IDs
|
|
490
|
+
# --------------------------------------------------
|
|
480
491
|
neon_biosample_ids = surface_water_samples["parentSampleID"]
|
|
481
492
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
|
|
482
493
|
neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
|
|
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
511
522
|
zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
|
|
512
523
|
)
|
|
513
524
|
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
)
|
|
518
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
519
|
-
|
|
520
|
-
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
521
|
-
neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
|
|
522
|
-
nmdc_data_object_ids = self._id_minter(
|
|
523
|
-
"nmdc:DataObject", len(neon_raw_file_paths)
|
|
524
|
-
)
|
|
525
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
526
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
527
|
-
)
|
|
528
|
-
|
|
525
|
+
# --------------------------------------------------
|
|
526
|
+
# STEP 1: Insert Biosamples
|
|
527
|
+
# --------------------------------------------------
|
|
529
528
|
for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
|
|
530
529
|
biosample_row = surface_water_samples[
|
|
531
530
|
surface_water_samples["parentSampleID"] == neon_id
|
|
532
531
|
]
|
|
532
|
+
# database.biosample_set.append(
|
|
533
|
+
# self._translate_biosample(neon_id, nmdc_id, biosample_row)
|
|
534
|
+
# )
|
|
533
535
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
536
|
+
# --------------------------------------------------
|
|
537
|
+
# STEP 2: Insert Extraction Processes
|
|
538
|
+
# --------------------------------------------------
|
|
538
539
|
for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
|
|
539
540
|
extraction_row = surface_water_samples[
|
|
540
541
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
557
558
|
extraction_row, "genomicsSampleID"
|
|
558
559
|
)
|
|
559
560
|
|
|
561
|
+
# Each Extraction process output => ProcessedSample
|
|
560
562
|
database.processed_sample_set.append(
|
|
561
563
|
self._translate_processed_sample(
|
|
562
564
|
processed_sample_id,
|
|
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
564
566
|
)
|
|
565
567
|
)
|
|
566
568
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
GROUP BY dnaSampleID
|
|
571
|
-
"""
|
|
572
|
-
neon_raw_data_files = pd.read_sql_query(query, self.conn)
|
|
573
|
-
neon_raw_data_files_dict = (
|
|
574
|
-
neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
575
|
-
.str.split("|")
|
|
576
|
-
.to_dict()
|
|
577
|
-
)
|
|
578
|
-
filtered_neon_raw_data_files_dict = {
|
|
579
|
-
key: value
|
|
580
|
-
for key, value in neon_raw_data_files_dict.items()
|
|
581
|
-
if len(value) <= 2
|
|
582
|
-
}
|
|
583
|
-
|
|
569
|
+
# --------------------------------------------------
|
|
570
|
+
# STEP 3: Insert LibraryPreparation Processes
|
|
571
|
+
# --------------------------------------------------
|
|
584
572
|
for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
|
|
585
573
|
lib_prep_row = surface_water_samples[
|
|
586
574
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
601
589
|
|
|
602
590
|
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
603
591
|
|
|
592
|
+
# Each LibraryPreparation process output => ProcessedSample
|
|
604
593
|
database.processed_sample_set.append(
|
|
605
594
|
self._translate_processed_sample(
|
|
606
595
|
processed_sample_id,
|
|
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
608
597
|
)
|
|
609
598
|
)
|
|
610
599
|
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
600
|
+
# --------------------------------------------------
|
|
601
|
+
# STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
|
|
602
|
+
# and insert DataObjects + DataGeneration processes
|
|
603
|
+
# --------------------------------------------------
|
|
604
|
+
raw_query = """
|
|
605
|
+
SELECT dnaSampleID, sequencerRunID, rawDataFilePath
|
|
606
|
+
FROM mms_swRawDataFiles
|
|
607
|
+
"""
|
|
608
|
+
neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
|
|
609
|
+
|
|
610
|
+
for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
|
|
611
|
+
# 1) Pull out the row that corresponds to this parentSampleID
|
|
612
|
+
lib_prep_row = surface_water_samples[
|
|
613
|
+
surface_water_samples["parentSampleID"] == neon_id
|
|
614
|
+
]
|
|
615
|
+
|
|
616
|
+
# 2) Grab the dnaSampleID from that row
|
|
617
|
+
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
618
|
+
if not dna_sample_id:
|
|
619
|
+
# No dnaSampleID => skip
|
|
620
|
+
continue
|
|
621
|
+
|
|
622
|
+
# 3) Find all raw files for that dnaSampleID
|
|
623
|
+
dna_files = neon_raw_data_files_df[
|
|
624
|
+
neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
|
|
625
|
+
]
|
|
626
|
+
if dna_files.empty:
|
|
627
|
+
# No raw files => skip
|
|
628
|
+
continue
|
|
629
|
+
|
|
630
|
+
# -----------------------------------------
|
|
631
|
+
# LOOKUP DICT: get "has_input" for this neon_id
|
|
632
|
+
# -----------------------------------------
|
|
633
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
634
|
+
# If some neon_id isn't in the dictionary, handle it as needed
|
|
635
|
+
if not has_input_value:
|
|
636
|
+
# Could skip, or raise an error, or set a default
|
|
637
|
+
continue
|
|
638
|
+
|
|
639
|
+
# -------------------------------------------
|
|
640
|
+
# 4) CREATE A MANIFEST IF MORE THAN TWO RAW FILES
|
|
641
|
+
# for this row's dnaSampleID
|
|
642
|
+
# -------------------------------------------
|
|
643
|
+
manifest_id = None
|
|
644
|
+
if len(dna_files) > 2:
|
|
645
|
+
# For each row that references a dnaSampleID with more than two raw files,
|
|
646
|
+
# mint exactly one new manifest record
|
|
647
|
+
manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
648
|
+
new_manifest = self._translate_manifest(manifest_id)
|
|
649
|
+
# Add to the database
|
|
650
|
+
database.manifest_set.append(new_manifest)
|
|
651
|
+
|
|
652
|
+
# -------------------------------------------
|
|
653
|
+
# 5) NOW GROUP FILES BY sequencerRunID
|
|
654
|
+
# => one data_generation record per run
|
|
655
|
+
# -------------------------------------------
|
|
656
|
+
lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
|
|
657
|
+
neon_id
|
|
658
|
+
)
|
|
659
|
+
if not lib_prep_processed_sample_id:
|
|
660
|
+
# If we don't have a ProcessedSample for some reason, skip
|
|
661
|
+
continue
|
|
662
|
+
|
|
663
|
+
for run_id, group_df in dna_files.groupby("sequencerRunID"):
|
|
664
|
+
# a) Mint new data_generation (NucleotideSequencing) ID for this run
|
|
665
|
+
data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
|
|
666
|
+
|
|
667
|
+
# b) Create DataObjects for each raw file in this run
|
|
668
|
+
data_object_ids = []
|
|
669
|
+
for raw_fp in group_df["rawDataFilePath"]:
|
|
670
|
+
do_id = self._id_minter("nmdc:DataObject", 1)[0]
|
|
671
|
+
|
|
672
|
+
# Distinguish read type
|
|
673
|
+
do_type = None
|
|
674
|
+
if "_R1.fastq.gz" in raw_fp:
|
|
675
|
+
do_type = "Metagenome Raw Read 1"
|
|
676
|
+
elif "_R2.fastq.gz" in raw_fp:
|
|
677
|
+
do_type = "Metagenome Raw Read 2"
|
|
678
|
+
|
|
679
|
+
# Create the DataObject
|
|
680
|
+
data_obj = self._translate_data_object(
|
|
681
|
+
do_id=do_id,
|
|
682
|
+
url=raw_fp,
|
|
683
|
+
do_type=do_type,
|
|
684
|
+
manifest_id=manifest_id, # link to the new Manifest if it exists
|
|
685
|
+
)
|
|
686
|
+
database.data_object_set.append(data_obj)
|
|
687
|
+
data_object_ids.append(do_id)
|
|
688
|
+
|
|
689
|
+
# c) Finally, create the data generation record for this run
|
|
690
|
+
database.data_generation_set.append(
|
|
691
|
+
self._translate_nucleotide_sequencing(
|
|
692
|
+
nucleotide_sequencing_id=data_generation_id,
|
|
693
|
+
processed_sample_id=has_input_value,
|
|
694
|
+
raw_data_file_data=data_object_ids,
|
|
695
|
+
nucleotide_sequencing_row=lib_prep_row,
|
|
647
696
|
)
|
|
697
|
+
)
|
|
648
698
|
|
|
649
699
|
return database
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.4.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -37,7 +37,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
|
|
|
37
37
|
* [nmdc-server](https://github.com/microbiomedata/nmdc-server)
|
|
38
38
|
houses code specific to the data portal -- its database, back-end API, and front-end application.
|
|
39
39
|
|
|
40
|
-
* [workflow_documentation](https://
|
|
40
|
+
* [workflow_documentation](https://docs.microbiomedata.org/workflows/)
|
|
41
41
|
references workflow code spread across several repositories, that take source data and produce computed data.
|
|
42
42
|
|
|
43
43
|
* This repo (nmdc-runtime)
|
|
@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
39
|
nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
|
|
42
42
|
nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
|
|
43
43
|
nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -51,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
51
51
|
nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
|
|
52
52
|
nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
|
|
53
53
|
nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
nmdc_runtime/site/export/ncbi_xml.py,sha256=
|
|
55
|
-
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=
|
|
54
|
+
nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
|
|
55
|
+
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
@@ -65,7 +65,7 @@ nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-N
|
|
|
65
65
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
66
66
|
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
|
|
67
67
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
68
|
-
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=
|
|
68
|
+
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
|
|
69
69
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
70
70
|
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
|
|
71
71
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
75
75
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
76
76
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
77
77
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
82
|
-
nmdc_runtime-2.
|
|
83
|
-
nmdc_runtime-2.
|
|
78
|
+
nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
79
|
+
nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
|
|
80
|
+
nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
81
|
+
nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
82
|
+
nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
83
|
+
nmdc_runtime-2.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|