nmdc-runtime 2.3.0__py3-none-any.whl → 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/site/export/ncbi_xml.py +23 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
- nmdc_runtime/site/ops.py +12 -4
- nmdc_runtime/site/repair/database_updater.py +12 -0
- nmdc_runtime/site/repository.py +10 -10
- nmdc_runtime/site/translation/neon_benthic_translator.py +156 -157
- nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/METADATA +17 -4
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/RECORD +13 -13
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info/licenses}/LICENSE +0 -0
- {nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ import datetime
|
|
|
4
4
|
import xml.etree.ElementTree as ET
|
|
5
5
|
import xml.dom.minidom
|
|
6
6
|
|
|
7
|
-
from typing import Any
|
|
7
|
+
from typing import Any, List, Union
|
|
8
8
|
from urllib.parse import urlparse
|
|
9
9
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
10
|
get_instruments,
|
|
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
|
|
|
366
366
|
)
|
|
367
367
|
# Currently, we are making the assumption that only one instrument
|
|
368
368
|
# is used to sequence a Biosample
|
|
369
|
-
|
|
369
|
+
instrument_used: List[str] = ntseq.get(
|
|
370
|
+
"instrument_used", []
|
|
371
|
+
)
|
|
372
|
+
if not instrument_used:
|
|
373
|
+
instrument_id = None
|
|
374
|
+
else:
|
|
375
|
+
instrument_id = instrument_used[0]
|
|
376
|
+
|
|
370
377
|
instrument = all_instruments.get(instrument_id, {})
|
|
371
378
|
instrument_vendor = instrument.get("vendor", "")
|
|
372
379
|
instrument_model = instrument.get("model", "")
|
|
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
|
|
|
448
455
|
"Attribute", "NextSeq 550", {"name": "instrument_model"}
|
|
449
456
|
)
|
|
450
457
|
)
|
|
458
|
+
elif instrument_model == "novaseq_6000":
|
|
459
|
+
sra_attributes.append(
|
|
460
|
+
self.set_element(
|
|
461
|
+
"Attribute",
|
|
462
|
+
"NovaSeq 6000",
|
|
463
|
+
{"name": "instrument_model"},
|
|
464
|
+
)
|
|
465
|
+
)
|
|
466
|
+
elif instrument_model == "hiseq":
|
|
467
|
+
sra_attributes.append(
|
|
468
|
+
self.set_element(
|
|
469
|
+
"Attribute", "HiSeq", {"name": "instrument_model"}
|
|
470
|
+
)
|
|
471
|
+
)
|
|
451
472
|
|
|
452
473
|
if analyte_category == "metagenome":
|
|
453
474
|
sra_attributes.append(
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
from io import BytesIO, StringIO
|
|
2
|
+
from typing import Any, Dict, List, Union
|
|
3
|
+
|
|
4
|
+
from nmdc_runtime.api.endpoints.util import strip_oid
|
|
2
5
|
from nmdc_runtime.minter.config import typecodes
|
|
3
6
|
from lxml import etree
|
|
7
|
+
from pymongo.collection import Collection
|
|
4
8
|
|
|
5
9
|
import csv
|
|
6
10
|
import requests
|
|
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
|
|
|
45
49
|
raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
|
|
46
50
|
|
|
47
51
|
|
|
48
|
-
def fetch_data_objects_from_biosamples(
|
|
52
|
+
def fetch_data_objects_from_biosamples(
|
|
53
|
+
all_docs_collection: Collection,
|
|
54
|
+
data_object_set: Collection,
|
|
55
|
+
biosamples_list: List[Dict[str, Any]],
|
|
56
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
57
|
+
"""This method fetches the data objects that are "associated" (derived from/products of)
|
|
58
|
+
with their respective biosamples by iterating over the alldocs collection recursively.
|
|
59
|
+
The method returns a list of dictionaries, each with a biosample id as key and the associated list of
|
|
60
|
+
data objects as values.
|
|
61
|
+
|
|
62
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
63
|
+
:param data_object_set: reference to the data_object_set collection
|
|
64
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
65
|
+
:return: list of dictionaries with biosample ids as keys and associated data objects as values
|
|
66
|
+
"""
|
|
67
|
+
biosample_data_objects = []
|
|
68
|
+
|
|
69
|
+
def collect_data_objects(doc_ids, collected_objects, unique_ids):
|
|
70
|
+
for doc_id in doc_ids:
|
|
71
|
+
if (
|
|
72
|
+
get_classname_from_typecode(doc_id) == "DataObject"
|
|
73
|
+
and doc_id not in unique_ids
|
|
74
|
+
):
|
|
75
|
+
data_obj = data_object_set.find_one({"id": doc_id})
|
|
76
|
+
if data_obj:
|
|
77
|
+
collected_objects.append(strip_oid(data_obj))
|
|
78
|
+
unique_ids.add(doc_id)
|
|
79
|
+
|
|
49
80
|
biosample_data_objects = []
|
|
50
81
|
|
|
51
82
|
for biosample in biosamples_list:
|
|
52
83
|
current_ids = [biosample["id"]]
|
|
53
84
|
collected_data_objects = []
|
|
85
|
+
unique_ids = set()
|
|
54
86
|
|
|
55
87
|
while current_ids:
|
|
56
88
|
new_current_ids = []
|
|
57
89
|
for current_id in current_ids:
|
|
58
|
-
|
|
59
|
-
|
|
90
|
+
for doc in all_docs_collection.find({"has_input": current_id}):
|
|
91
|
+
has_output = doc.get("has_output", [])
|
|
60
92
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
for output_id in has_output:
|
|
69
|
-
if get_classname_from_typecode(output_id) == "DataObject":
|
|
70
|
-
data_object_doc = all_docs_collection.find_one(
|
|
71
|
-
{"id": output_id}
|
|
72
|
-
)
|
|
73
|
-
if data_object_doc:
|
|
74
|
-
collected_data_objects.append(data_object_doc)
|
|
75
|
-
else:
|
|
76
|
-
new_current_ids.append(output_id)
|
|
93
|
+
collect_data_objects(has_output, collected_data_objects, unique_ids)
|
|
94
|
+
new_current_ids.extend(
|
|
95
|
+
op
|
|
96
|
+
for op in has_output
|
|
97
|
+
if get_classname_from_typecode(op) != "DataObject"
|
|
98
|
+
)
|
|
77
99
|
|
|
78
100
|
current_ids = new_current_ids
|
|
79
101
|
|
|
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
|
|
|
83
105
|
return biosample_data_objects
|
|
84
106
|
|
|
85
107
|
|
|
86
|
-
def fetch_nucleotide_sequencing_from_biosamples(
|
|
87
|
-
|
|
108
|
+
def fetch_nucleotide_sequencing_from_biosamples(
|
|
109
|
+
all_docs_collection: Collection,
|
|
110
|
+
data_generation_set: Collection,
|
|
111
|
+
biosamples_list: List[Dict[str, Any]],
|
|
112
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
113
|
+
"""This method fetches the nucleotide sequencing process records that create data objects
|
|
114
|
+
for biosamples by iterating over the alldocs collection recursively.
|
|
115
|
+
|
|
116
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
117
|
+
:param data_generation_set: reference to the data_generation_set collection
|
|
118
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
119
|
+
:return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
|
|
120
|
+
process objects as values
|
|
121
|
+
"""
|
|
122
|
+
biosample_ntseq_objects = []
|
|
88
123
|
|
|
89
124
|
for biosample in biosamples_list:
|
|
90
125
|
current_ids = [biosample["id"]]
|
|
91
|
-
|
|
126
|
+
collected_ntseq_objects = []
|
|
92
127
|
|
|
93
128
|
while current_ids:
|
|
94
129
|
new_current_ids = []
|
|
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
|
|
|
105
140
|
|
|
106
141
|
for output_id in has_output:
|
|
107
142
|
if get_classname_from_typecode(output_id) == "DataObject":
|
|
108
|
-
nucleotide_sequencing_doc =
|
|
143
|
+
nucleotide_sequencing_doc = data_generation_set.find_one(
|
|
109
144
|
{"id": document["id"]}
|
|
110
145
|
)
|
|
111
146
|
if nucleotide_sequencing_doc:
|
|
112
|
-
|
|
147
|
+
collected_ntseq_objects.append(
|
|
148
|
+
strip_oid(nucleotide_sequencing_doc)
|
|
149
|
+
)
|
|
113
150
|
else:
|
|
114
151
|
new_current_ids.append(output_id)
|
|
115
152
|
|
|
116
153
|
current_ids = new_current_ids
|
|
117
154
|
|
|
118
|
-
if
|
|
119
|
-
|
|
155
|
+
if collected_ntseq_objects:
|
|
156
|
+
biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
|
|
157
|
+
|
|
158
|
+
return biosample_ntseq_objects
|
|
120
159
|
|
|
121
|
-
return biosample_data_objects
|
|
122
160
|
|
|
161
|
+
def fetch_library_preparation_from_biosamples(
|
|
162
|
+
all_docs_collection: Collection,
|
|
163
|
+
material_processing_set: Collection,
|
|
164
|
+
biosamples_list: List[Dict[str, Any]],
|
|
165
|
+
) -> List[Dict[str, Dict[str, Any]]]:
|
|
166
|
+
"""This method fetches the library preparation process records that create processed samples,
|
|
167
|
+
which are then fed (via the `has_input` slot) into a nucleotide sequencing process
|
|
168
|
+
for biosamples by iterating over the alldocs collection recursively.
|
|
123
169
|
|
|
124
|
-
|
|
170
|
+
:param all_docs_collection: reference to the alldocs collection
|
|
171
|
+
:param material_processing_set: reference to the material_processing_set collection
|
|
172
|
+
:param biosamples_list: list of biosamples as JSON documents
|
|
173
|
+
:return: list of dictionaries with biosample ids as keys and associated library preparation process
|
|
174
|
+
objects as values
|
|
175
|
+
"""
|
|
125
176
|
biosample_lib_prep = []
|
|
126
177
|
|
|
127
178
|
for biosample in biosamples_list:
|
|
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
|
|
|
144
195
|
"has_input": output_id,
|
|
145
196
|
"type": {"$in": ["LibraryPreparation"]},
|
|
146
197
|
}
|
|
147
|
-
lib_prep_doc =
|
|
198
|
+
lib_prep_doc = material_processing_set.find_one(lib_prep_query)
|
|
148
199
|
|
|
149
200
|
if lib_prep_doc:
|
|
150
|
-
biosample_lib_prep.append({biosample_id: lib_prep_doc})
|
|
201
|
+
biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
|
|
151
202
|
break # Stop at the first document that meets the criteria
|
|
152
203
|
|
|
153
204
|
return biosample_lib_prep
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -1100,7 +1100,12 @@ def materialize_alldocs(context) -> int:
|
|
|
1100
1100
|
write_operations = []
|
|
1101
1101
|
documents_processed_counter = 0
|
|
1102
1102
|
for doc in mdb[coll_name].find():
|
|
1103
|
-
|
|
1103
|
+
try:
|
|
1104
|
+
doc_type = doc["type"][5:] # lop off "nmdc:" prefix
|
|
1105
|
+
except KeyError:
|
|
1106
|
+
raise Exception(
|
|
1107
|
+
f"doc {doc['id']} in collection {coll_name} has no 'type'!"
|
|
1108
|
+
)
|
|
1104
1109
|
slots_to_include = ["id", "type"] + document_reference_ranged_slots[
|
|
1105
1110
|
doc_type
|
|
1106
1111
|
]
|
|
@@ -1188,8 +1193,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
|
|
|
1188
1193
|
def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
|
|
1189
1194
|
mdb = context.resources.mongo.db
|
|
1190
1195
|
alldocs_collection = mdb["alldocs"]
|
|
1196
|
+
data_object_set = mdb["data_object_set"]
|
|
1191
1197
|
biosample_data_objects = fetch_data_objects_from_biosamples(
|
|
1192
|
-
alldocs_collection, biosamples
|
|
1198
|
+
alldocs_collection, data_object_set, biosamples
|
|
1193
1199
|
)
|
|
1194
1200
|
return biosample_data_objects
|
|
1195
1201
|
|
|
@@ -1200,8 +1206,9 @@ def get_nucleotide_sequencing_from_biosamples(
|
|
|
1200
1206
|
):
|
|
1201
1207
|
mdb = context.resources.mongo.db
|
|
1202
1208
|
alldocs_collection = mdb["alldocs"]
|
|
1209
|
+
data_generation_set = mdb["data_generation_set"]
|
|
1203
1210
|
biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
|
|
1204
|
-
alldocs_collection, biosamples
|
|
1211
|
+
alldocs_collection, data_generation_set, biosamples
|
|
1205
1212
|
)
|
|
1206
1213
|
return biosample_omics_processing
|
|
1207
1214
|
|
|
@@ -1212,8 +1219,9 @@ def get_library_preparation_from_biosamples(
|
|
|
1212
1219
|
):
|
|
1213
1220
|
mdb = context.resources.mongo.db
|
|
1214
1221
|
alldocs_collection = mdb["alldocs"]
|
|
1222
|
+
material_processing_set = mdb["material_processing_set"]
|
|
1215
1223
|
biosample_lib_prep = fetch_library_preparation_from_biosamples(
|
|
1216
|
-
alldocs_collection, biosamples
|
|
1224
|
+
alldocs_collection, material_processing_set, biosamples
|
|
1217
1225
|
)
|
|
1218
1226
|
return biosample_lib_prep
|
|
1219
1227
|
|
|
@@ -199,8 +199,20 @@ class DatabaseUpdater:
|
|
|
199
199
|
if gbs.get("biosampleGoldId") not in nmdc_gold_ids
|
|
200
200
|
]
|
|
201
201
|
|
|
202
|
+
# use the GOLD study id to fetch all sequencing project records associated with the study
|
|
203
|
+
gold_sequencing_projects_for_study = (
|
|
204
|
+
self.gold_api_client.fetch_projects_by_study(gold_study_id)
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
# use the GOLD study id to fetch all analysis project records associated with the study
|
|
208
|
+
gold_analysis_projects_for_study = (
|
|
209
|
+
self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
|
|
210
|
+
)
|
|
211
|
+
|
|
202
212
|
gold_study_translator = GoldStudyTranslator(
|
|
203
213
|
biosamples=missing_gold_biosamples,
|
|
214
|
+
projects=gold_sequencing_projects_for_study,
|
|
215
|
+
analysis_projects=gold_analysis_projects_for_study,
|
|
204
216
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
205
217
|
)
|
|
206
218
|
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -652,7 +652,7 @@ def biosample_submission_ingest():
|
|
|
652
652
|
"inputs": {
|
|
653
653
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
654
654
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
655
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
655
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
656
656
|
}
|
|
657
657
|
},
|
|
658
658
|
},
|
|
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
|
|
|
694
694
|
"inputs": {
|
|
695
695
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
696
696
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
697
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
697
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
698
698
|
}
|
|
699
699
|
},
|
|
700
700
|
},
|
|
@@ -737,14 +737,14 @@ def biosample_submission_ingest():
|
|
|
737
737
|
"inputs": {
|
|
738
738
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
739
739
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
740
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
740
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
741
741
|
}
|
|
742
742
|
},
|
|
743
743
|
"get_neon_pipeline_benthic_data_product": {
|
|
744
744
|
"config": {
|
|
745
745
|
"benthic_data_product": {
|
|
746
746
|
"product_id": "DP1.20279.001",
|
|
747
|
-
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
|
|
747
|
+
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
|
|
748
748
|
}
|
|
749
749
|
}
|
|
750
750
|
},
|
|
@@ -771,7 +771,7 @@ def biosample_submission_ingest():
|
|
|
771
771
|
"config": {
|
|
772
772
|
"benthic_data_product": {
|
|
773
773
|
"product_id": "DP1.20279.001",
|
|
774
|
-
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
|
|
774
|
+
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
|
|
775
775
|
}
|
|
776
776
|
}
|
|
777
777
|
},
|
|
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
|
|
|
779
779
|
"inputs": {
|
|
780
780
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
781
781
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
782
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
782
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
783
783
|
}
|
|
784
784
|
},
|
|
785
785
|
},
|
|
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
|
|
|
822
822
|
"inputs": {
|
|
823
823
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
824
824
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
825
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
825
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
826
826
|
}
|
|
827
827
|
},
|
|
828
828
|
"get_neon_pipeline_surface_water_data_product": {
|
|
829
829
|
"config": {
|
|
830
830
|
"surface_water_data_product": {
|
|
831
831
|
"product_id": "DP1.20281.001",
|
|
832
|
-
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
|
|
832
|
+
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
|
|
833
833
|
}
|
|
834
834
|
}
|
|
835
835
|
},
|
|
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
|
|
|
856
856
|
"config": {
|
|
857
857
|
"surface_water_data_product": {
|
|
858
858
|
"product_id": "DP1.20281.001",
|
|
859
|
-
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
|
|
859
|
+
"product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
|
|
860
860
|
}
|
|
861
861
|
}
|
|
862
862
|
},
|
|
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
|
|
|
864
864
|
"inputs": {
|
|
865
865
|
"neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
|
|
866
866
|
"neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
|
|
867
|
-
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/
|
|
867
|
+
"neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
|
|
868
868
|
}
|
|
869
869
|
},
|
|
870
870
|
},
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import sqlite3
|
|
3
|
-
from typing import Union
|
|
3
|
+
from typing import Optional, Union
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import requests_cache
|
|
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
61
61
|
"mms_benthicMetagenomeSequencing",
|
|
62
62
|
"mms_benthicMetagenomeDnaExtraction",
|
|
63
63
|
"amb_fieldParent",
|
|
64
|
+
"mms_benthicRawDataFiles", # <--- ensure this is present
|
|
64
65
|
)
|
|
65
66
|
|
|
66
67
|
if all(k in benthic_data for k in neon_amb_data_tables):
|
|
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
79
80
|
benthic_data["amb_fieldParent"].to_sql(
|
|
80
81
|
"amb_fieldParent", self.conn, if_exists="replace", index=False
|
|
81
82
|
)
|
|
83
|
+
benthic_data["mms_benthicRawDataFiles"].to_sql(
|
|
84
|
+
"mms_benthicRawDataFiles",
|
|
85
|
+
self.conn,
|
|
86
|
+
if_exists="replace",
|
|
87
|
+
index=False,
|
|
88
|
+
)
|
|
82
89
|
else:
|
|
83
90
|
raise ValueError(
|
|
84
91
|
f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
|
|
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
88
95
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
89
96
|
)
|
|
90
97
|
|
|
91
|
-
self.neon_raw_data_file_mappings_df =
|
|
92
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
93
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
94
|
-
)
|
|
98
|
+
self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
|
|
95
99
|
|
|
96
100
|
self.site_code_mapping = site_code_mapping
|
|
101
|
+
|
|
97
102
|
self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
|
|
98
103
|
|
|
104
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
105
|
+
return nmdc.Manifest(
|
|
106
|
+
id=manifest_id,
|
|
107
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
108
|
+
type="nmdc:Manifest",
|
|
109
|
+
)
|
|
110
|
+
|
|
99
111
|
def _translate_biosample(
|
|
100
112
|
self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
|
|
101
113
|
) -> nmdc.Biosample:
|
|
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
313
325
|
)
|
|
314
326
|
|
|
315
327
|
def _translate_data_object(
|
|
316
|
-
self, do_id: str, url: str, do_type: str,
|
|
328
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
317
329
|
) -> nmdc.DataObject:
|
|
318
330
|
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
319
331
|
object mainly contains information about the sequencing file that was generated as
|
|
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
324
336
|
:param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
|
|
325
337
|
by Hugh Cross at NEON.
|
|
326
338
|
:param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
|
|
327
|
-
:param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
|
|
328
339
|
at NEON.
|
|
329
340
|
:return: DataObject with all the sequencing file metadata.
|
|
330
341
|
"""
|
|
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
337
348
|
url=url,
|
|
338
349
|
description=f"sequencing results for {basename}",
|
|
339
350
|
type="nmdc:DataObject",
|
|
340
|
-
md5_checksum=checksum,
|
|
341
351
|
data_object_type=do_type,
|
|
352
|
+
in_manifest=manifest_id,
|
|
342
353
|
)
|
|
343
354
|
|
|
344
|
-
def get_database(self):
|
|
355
|
+
def get_database(self) -> nmdc.Database:
|
|
345
356
|
database = nmdc.Database()
|
|
346
357
|
|
|
347
|
-
|
|
358
|
+
join_query = """
|
|
348
359
|
SELECT
|
|
349
360
|
merged.laboratoryName,
|
|
350
361
|
merged.sequencingFacilityID,
|
|
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
|
|
|
372
383
|
afp.siteID,
|
|
373
384
|
afp.sampleID,
|
|
374
385
|
afp.collectDate
|
|
375
|
-
FROM
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
mms_benthicMetagenomeDnaExtraction AS bd
|
|
398
|
-
ON
|
|
399
|
-
bs.dnaSampleID = bd.dnaSampleID
|
|
400
|
-
) AS merged
|
|
386
|
+
FROM (
|
|
387
|
+
SELECT
|
|
388
|
+
bs.collectDate,
|
|
389
|
+
bs.laboratoryName,
|
|
390
|
+
bs.sequencingFacilityID,
|
|
391
|
+
bs.processedDate,
|
|
392
|
+
bs.dnaSampleID,
|
|
393
|
+
bs.dnaSampleCode,
|
|
394
|
+
bs.internalLabID,
|
|
395
|
+
bs.instrument_model,
|
|
396
|
+
bs.sequencingMethod,
|
|
397
|
+
bs.investigation_type,
|
|
398
|
+
bs.qaqcStatus,
|
|
399
|
+
bs.ncbiProjectID,
|
|
400
|
+
bd.genomicsSampleID,
|
|
401
|
+
bd.sequenceAnalysisType,
|
|
402
|
+
bd.sampleMass,
|
|
403
|
+
bd.nucleicAcidConcentration
|
|
404
|
+
FROM mms_benthicMetagenomeSequencing AS bs
|
|
405
|
+
JOIN mms_benthicMetagenomeDnaExtraction AS bd
|
|
406
|
+
ON bs.dnaSampleID = bd.dnaSampleID
|
|
407
|
+
) AS merged
|
|
401
408
|
LEFT JOIN amb_fieldParent AS afp
|
|
402
|
-
ON
|
|
403
|
-
merged.genomicsSampleID = afp.geneticSampleID
|
|
409
|
+
ON merged.genomicsSampleID = afp.geneticSampleID
|
|
404
410
|
"""
|
|
405
|
-
benthic_samples = pd.read_sql_query(
|
|
411
|
+
benthic_samples = pd.read_sql_query(join_query, self.conn)
|
|
406
412
|
benthic_samples.to_sql(
|
|
407
413
|
"benthicSamples", self.conn, if_exists="replace", index=False
|
|
408
414
|
)
|
|
409
415
|
|
|
410
|
-
|
|
411
|
-
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(
|
|
412
|
-
neon_to_nmdc_biosample_ids = dict(zip(
|
|
416
|
+
sample_ids = benthic_samples["sampleID"]
|
|
417
|
+
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
|
|
418
|
+
neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
|
|
413
419
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
"nmdc:Extraction", len(neon_extraction_ids)
|
|
417
|
-
)
|
|
418
|
-
neon_to_nmdc_extraction_ids = dict(
|
|
419
|
-
zip(neon_extraction_ids, nmdc_extraction_ids)
|
|
420
|
-
)
|
|
420
|
+
nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
|
|
421
|
+
neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
|
|
421
422
|
|
|
422
|
-
neon_extraction_processed_ids = benthic_samples["sampleID"]
|
|
423
423
|
nmdc_extraction_processed_ids = self._id_minter(
|
|
424
|
-
"nmdc:ProcessedSample", len(
|
|
424
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
425
425
|
)
|
|
426
426
|
neon_to_nmdc_extraction_processed_ids = dict(
|
|
427
|
-
zip(
|
|
427
|
+
zip(sample_ids, nmdc_extraction_processed_ids)
|
|
428
428
|
)
|
|
429
429
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
"nmdc:LibraryPreparation", len(neon_lib_prep_ids)
|
|
433
|
-
)
|
|
434
|
-
neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
|
|
430
|
+
nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
|
|
431
|
+
neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
|
|
435
432
|
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
"nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
|
|
433
|
+
nmdc_libprep_processed_ids = self._id_minter(
|
|
434
|
+
"nmdc:ProcessedSample", len(sample_ids)
|
|
439
435
|
)
|
|
440
|
-
|
|
441
|
-
zip(
|
|
436
|
+
neon_to_nmdc_libprep_processed_ids = dict(
|
|
437
|
+
zip(sample_ids, nmdc_libprep_processed_ids)
|
|
442
438
|
)
|
|
443
439
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
"nmdc:NucleotideSequencing", len(neon_omprc_ids)
|
|
447
|
-
)
|
|
448
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
440
|
+
nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
|
|
441
|
+
neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
|
|
449
442
|
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
)
|
|
455
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
456
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
457
|
-
)
|
|
443
|
+
raw_df = self.neon_raw_data_file_mappings_df
|
|
444
|
+
raw_file_paths = raw_df["rawDataFilePath"]
|
|
445
|
+
dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
|
|
446
|
+
neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
|
|
458
447
|
|
|
459
|
-
for neon_id,
|
|
460
|
-
|
|
448
|
+
for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
|
|
449
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
450
|
+
if row.empty:
|
|
451
|
+
continue
|
|
461
452
|
|
|
453
|
+
# Example of how you might call _translate_biosample:
|
|
462
454
|
database.biosample_set.append(
|
|
463
|
-
self._translate_biosample(neon_id,
|
|
455
|
+
self._translate_biosample(neon_id, biosample_id, row)
|
|
464
456
|
)
|
|
465
457
|
|
|
466
|
-
for neon_id,
|
|
467
|
-
|
|
458
|
+
for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
|
|
459
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
460
|
+
if row.empty:
|
|
461
|
+
continue
|
|
468
462
|
|
|
469
|
-
|
|
470
|
-
|
|
463
|
+
biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
|
|
464
|
+
extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
471
465
|
|
|
472
|
-
if
|
|
466
|
+
if biosample_id and extraction_ps_id:
|
|
473
467
|
database.material_processing_set.append(
|
|
474
468
|
self._translate_extraction_process(
|
|
475
|
-
|
|
476
|
-
extraction_input,
|
|
477
|
-
processed_sample_id,
|
|
478
|
-
extraction_row,
|
|
469
|
+
extraction_id, biosample_id, extraction_ps_id, row
|
|
479
470
|
)
|
|
480
471
|
)
|
|
481
|
-
|
|
482
|
-
genomics_sample_id = _get_value_or_none(
|
|
483
|
-
extraction_row, "genomicsSampleID"
|
|
484
|
-
)
|
|
485
|
-
|
|
472
|
+
genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
|
|
486
473
|
database.processed_sample_set.append(
|
|
487
474
|
self._translate_processed_sample(
|
|
488
|
-
|
|
475
|
+
extraction_ps_id,
|
|
489
476
|
f"Extracted DNA from {genomics_sample_id}",
|
|
490
477
|
)
|
|
491
478
|
)
|
|
492
479
|
|
|
493
|
-
|
|
480
|
+
query2 = """
|
|
494
481
|
SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
|
|
495
|
-
FROM
|
|
482
|
+
FROM mms_benthicRawDataFiles
|
|
496
483
|
GROUP BY dnaSampleID
|
|
497
484
|
"""
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
485
|
+
raw_data_files_df = pd.read_sql_query(query2, self.conn)
|
|
486
|
+
dna_files_dict = (
|
|
487
|
+
raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
501
488
|
.str.split("|")
|
|
502
489
|
.to_dict()
|
|
503
490
|
)
|
|
504
|
-
filtered_neon_raw_data_files_dict = {
|
|
505
|
-
key: value
|
|
506
|
-
for key, value in neon_raw_data_files_dict.items()
|
|
507
|
-
if len(value) <= 2
|
|
508
|
-
}
|
|
509
491
|
|
|
510
|
-
|
|
511
|
-
lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
492
|
+
dna_sample_to_manifest_id: dict[str, str] = {}
|
|
512
493
|
|
|
513
|
-
|
|
514
|
-
|
|
494
|
+
for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
|
|
495
|
+
row = benthic_samples[benthic_samples["sampleID"] == neon_id]
|
|
496
|
+
if row.empty:
|
|
497
|
+
continue
|
|
515
498
|
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
499
|
+
extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
|
|
500
|
+
libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
|
|
501
|
+
if not extr_ps_id or not libprep_ps_id:
|
|
502
|
+
continue
|
|
503
|
+
|
|
504
|
+
database.material_processing_set.append(
|
|
505
|
+
self._translate_library_preparation(
|
|
506
|
+
libprep_id, extr_ps_id, libprep_ps_id, row
|
|
524
507
|
)
|
|
508
|
+
)
|
|
525
509
|
|
|
526
|
-
|
|
510
|
+
dna_sample_id = _get_value_or_none(row, "dnaSampleID")
|
|
511
|
+
database.processed_sample_set.append(
|
|
512
|
+
self._translate_processed_sample(
|
|
513
|
+
libprep_ps_id,
|
|
514
|
+
f"Library preparation for {dna_sample_id}",
|
|
515
|
+
)
|
|
516
|
+
)
|
|
527
517
|
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
518
|
+
filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
|
|
519
|
+
if not filepaths_for_dna:
|
|
520
|
+
# no raw files => skip
|
|
521
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
522
|
+
if ntseq_id:
|
|
523
|
+
continue
|
|
524
|
+
continue
|
|
525
|
+
|
|
526
|
+
# If more than two raw files => we create a Manifest
|
|
527
|
+
manifest_id: Optional[str] = None
|
|
528
|
+
if len(filepaths_for_dna) > 2:
|
|
529
|
+
if dna_sample_id not in dna_sample_to_manifest_id:
|
|
530
|
+
new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
531
|
+
dna_sample_to_manifest_id[dna_sample_id] = new_man_id
|
|
532
|
+
database.manifest_set.append(self._translate_manifest(new_man_id))
|
|
533
|
+
manifest_id = dna_sample_to_manifest_id[dna_sample_id]
|
|
534
|
+
|
|
535
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
536
|
+
if not has_input_value:
|
|
537
|
+
continue
|
|
538
|
+
|
|
539
|
+
dataobject_ids_for_run: list[str] = []
|
|
540
|
+
for fp in filepaths_for_dna:
|
|
541
|
+
if fp not in neon_to_nmdc_dataobject_ids:
|
|
542
|
+
continue
|
|
543
|
+
do_id = neon_to_nmdc_dataobject_ids[fp]
|
|
544
|
+
|
|
545
|
+
do_type = None
|
|
546
|
+
if "_R1.fastq.gz" in fp:
|
|
547
|
+
do_type = "Metagenome Raw Read 1"
|
|
548
|
+
elif "_R2.fastq.gz" in fp:
|
|
549
|
+
do_type = "Metagenome Raw Read 2"
|
|
550
|
+
|
|
551
|
+
database.data_object_set.append(
|
|
552
|
+
self._translate_data_object(
|
|
553
|
+
do_id=do_id,
|
|
554
|
+
url=fp,
|
|
555
|
+
do_type=do_type,
|
|
556
|
+
manifest_id=manifest_id,
|
|
532
557
|
)
|
|
533
558
|
)
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
checksum = None
|
|
545
|
-
do_type = None
|
|
546
|
-
|
|
547
|
-
checksum = neon_raw_data_file_mappings_df[
|
|
548
|
-
neon_raw_data_file_mappings_df["rawDataFilePath"] == item
|
|
549
|
-
]["checkSum"].values[0]
|
|
550
|
-
if "_R1.fastq.gz" in item:
|
|
551
|
-
do_type = "Metagenome Raw Read 1"
|
|
552
|
-
elif "_R2.fastq.gz" in item:
|
|
553
|
-
do_type = "Metagenome Raw Read 2"
|
|
554
|
-
|
|
555
|
-
database.data_object_set.append(
|
|
556
|
-
self._translate_data_object(
|
|
557
|
-
neon_to_nmdc_data_object_ids.get(item),
|
|
558
|
-
item,
|
|
559
|
-
do_type,
|
|
560
|
-
checksum,
|
|
561
|
-
)
|
|
562
|
-
)
|
|
563
|
-
|
|
564
|
-
database.data_generation_set.append(
|
|
565
|
-
self._translate_nucleotide_sequencing(
|
|
566
|
-
neon_to_nmdc_omprc_ids.get(neon_id),
|
|
567
|
-
processed_sample_id,
|
|
568
|
-
has_output_do_ids,
|
|
569
|
-
lib_prep_row,
|
|
570
|
-
)
|
|
559
|
+
dataobject_ids_for_run.append(do_id)
|
|
560
|
+
|
|
561
|
+
ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
|
|
562
|
+
if ntseq_id:
|
|
563
|
+
database.data_generation_set.append(
|
|
564
|
+
self._translate_nucleotide_sequencing(
|
|
565
|
+
ntseq_id,
|
|
566
|
+
has_input_value, # <--- from self.samp_procsm_dict
|
|
567
|
+
dataobject_ids_for_run,
|
|
568
|
+
row,
|
|
571
569
|
)
|
|
570
|
+
)
|
|
572
571
|
|
|
573
572
|
return database
|
|
@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
71
71
|
neon_amb_data_tables = (
|
|
72
72
|
"mms_swMetagenomeSequencing",
|
|
73
73
|
"mms_swMetagenomeDnaExtraction",
|
|
74
|
+
"mms_swRawDataFiles",
|
|
74
75
|
"amc_fieldGenetic",
|
|
75
76
|
"amc_fieldSuperParent",
|
|
76
77
|
)
|
|
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
88
89
|
if_exists="replace",
|
|
89
90
|
index=False,
|
|
90
91
|
)
|
|
92
|
+
surface_water_data["mms_swRawDataFiles"].to_sql(
|
|
93
|
+
"mms_swRawDataFiles", self.conn, if_exists="replace", index=False
|
|
94
|
+
)
|
|
91
95
|
surface_water_data["amc_fieldGenetic"].to_sql(
|
|
92
96
|
"amc_fieldGenetic", self.conn, if_exists="replace", index=False
|
|
93
97
|
)
|
|
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
103
107
|
"neonEnvoTerms", self.conn, if_exists="replace", index=False
|
|
104
108
|
)
|
|
105
109
|
|
|
106
|
-
self.neon_raw_data_file_mappings_df =
|
|
107
|
-
self.neon_raw_data_file_mappings_df.to_sql(
|
|
108
|
-
"neonRawDataFile", self.conn, if_exists="replace", index=False
|
|
109
|
-
)
|
|
110
|
+
self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
|
|
110
111
|
|
|
111
112
|
self.site_code_mapping = site_code_mapping
|
|
112
113
|
|
|
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
371
372
|
)
|
|
372
373
|
|
|
373
374
|
def _translate_data_object(
|
|
374
|
-
self, do_id: str, url: str, do_type: str,
|
|
375
|
+
self, do_id: str, url: str, do_type: str, manifest_id: str
|
|
375
376
|
) -> nmdc.DataObject:
|
|
376
377
|
"""Create nmdc DataObject which is the output of a NucleotideSequencing process. This
|
|
377
378
|
object mainly contains information about the sequencing file that was generated as
|
|
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
395
396
|
url=url,
|
|
396
397
|
description=f"sequencing results for {basename}",
|
|
397
398
|
type="nmdc:DataObject",
|
|
398
|
-
md5_checksum=checksum,
|
|
399
399
|
data_object_type=do_type,
|
|
400
|
+
in_manifest=manifest_id,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
|
|
404
|
+
return nmdc.Manifest(
|
|
405
|
+
id=manifest_id,
|
|
406
|
+
manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
|
|
407
|
+
type="nmdc:Manifest",
|
|
400
408
|
)
|
|
401
409
|
|
|
402
410
|
def get_database(self):
|
|
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
477
485
|
"""
|
|
478
486
|
surface_water_samples = pd.read_sql_query(query, self.conn)
|
|
479
487
|
|
|
488
|
+
# --------------------------------------------------
|
|
489
|
+
# Create mappings for minted NMDC IDs
|
|
490
|
+
# --------------------------------------------------
|
|
480
491
|
neon_biosample_ids = surface_water_samples["parentSampleID"]
|
|
481
492
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
|
|
482
493
|
neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
|
|
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
511
522
|
zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
|
|
512
523
|
)
|
|
513
524
|
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
)
|
|
518
|
-
neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
|
|
519
|
-
|
|
520
|
-
neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
|
|
521
|
-
neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
|
|
522
|
-
nmdc_data_object_ids = self._id_minter(
|
|
523
|
-
"nmdc:DataObject", len(neon_raw_file_paths)
|
|
524
|
-
)
|
|
525
|
-
neon_to_nmdc_data_object_ids = dict(
|
|
526
|
-
zip(neon_raw_file_paths, nmdc_data_object_ids)
|
|
527
|
-
)
|
|
528
|
-
|
|
525
|
+
# --------------------------------------------------
|
|
526
|
+
# STEP 1: Insert Biosamples
|
|
527
|
+
# --------------------------------------------------
|
|
529
528
|
for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
|
|
530
529
|
biosample_row = surface_water_samples[
|
|
531
530
|
surface_water_samples["parentSampleID"] == neon_id
|
|
532
531
|
]
|
|
532
|
+
# database.biosample_set.append(
|
|
533
|
+
# self._translate_biosample(neon_id, nmdc_id, biosample_row)
|
|
534
|
+
# )
|
|
533
535
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
536
|
+
# --------------------------------------------------
|
|
537
|
+
# STEP 2: Insert Extraction Processes
|
|
538
|
+
# --------------------------------------------------
|
|
538
539
|
for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
|
|
539
540
|
extraction_row = surface_water_samples[
|
|
540
541
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
557
558
|
extraction_row, "genomicsSampleID"
|
|
558
559
|
)
|
|
559
560
|
|
|
561
|
+
# Each Extraction process output => ProcessedSample
|
|
560
562
|
database.processed_sample_set.append(
|
|
561
563
|
self._translate_processed_sample(
|
|
562
564
|
processed_sample_id,
|
|
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
564
566
|
)
|
|
565
567
|
)
|
|
566
568
|
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
GROUP BY dnaSampleID
|
|
571
|
-
"""
|
|
572
|
-
neon_raw_data_files = pd.read_sql_query(query, self.conn)
|
|
573
|
-
neon_raw_data_files_dict = (
|
|
574
|
-
neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
|
|
575
|
-
.str.split("|")
|
|
576
|
-
.to_dict()
|
|
577
|
-
)
|
|
578
|
-
filtered_neon_raw_data_files_dict = {
|
|
579
|
-
key: value
|
|
580
|
-
for key, value in neon_raw_data_files_dict.items()
|
|
581
|
-
if len(value) <= 2
|
|
582
|
-
}
|
|
583
|
-
|
|
569
|
+
# --------------------------------------------------
|
|
570
|
+
# STEP 3: Insert LibraryPreparation Processes
|
|
571
|
+
# --------------------------------------------------
|
|
584
572
|
for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
|
|
585
573
|
lib_prep_row = surface_water_samples[
|
|
586
574
|
surface_water_samples["parentSampleID"] == neon_id
|
|
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
601
589
|
|
|
602
590
|
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
603
591
|
|
|
592
|
+
# Each LibraryPreparation process output => ProcessedSample
|
|
604
593
|
database.processed_sample_set.append(
|
|
605
594
|
self._translate_processed_sample(
|
|
606
595
|
processed_sample_id,
|
|
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
|
|
|
608
597
|
)
|
|
609
598
|
)
|
|
610
599
|
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
600
|
+
# --------------------------------------------------
|
|
601
|
+
# STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
|
|
602
|
+
# and insert DataObjects + DataGeneration processes
|
|
603
|
+
# --------------------------------------------------
|
|
604
|
+
raw_query = """
|
|
605
|
+
SELECT dnaSampleID, sequencerRunID, rawDataFilePath
|
|
606
|
+
FROM mms_swRawDataFiles
|
|
607
|
+
"""
|
|
608
|
+
neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
|
|
609
|
+
|
|
610
|
+
for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
|
|
611
|
+
# 1) Pull out the row that corresponds to this parentSampleID
|
|
612
|
+
lib_prep_row = surface_water_samples[
|
|
613
|
+
surface_water_samples["parentSampleID"] == neon_id
|
|
614
|
+
]
|
|
615
|
+
|
|
616
|
+
# 2) Grab the dnaSampleID from that row
|
|
617
|
+
dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
|
|
618
|
+
if not dna_sample_id:
|
|
619
|
+
# No dnaSampleID => skip
|
|
620
|
+
continue
|
|
621
|
+
|
|
622
|
+
# 3) Find all raw files for that dnaSampleID
|
|
623
|
+
dna_files = neon_raw_data_files_df[
|
|
624
|
+
neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
|
|
625
|
+
]
|
|
626
|
+
if dna_files.empty:
|
|
627
|
+
# No raw files => skip
|
|
628
|
+
continue
|
|
629
|
+
|
|
630
|
+
# -----------------------------------------
|
|
631
|
+
# LOOKUP DICT: get "has_input" for this neon_id
|
|
632
|
+
# -----------------------------------------
|
|
633
|
+
has_input_value = self.samp_procsm_dict.get(neon_id)
|
|
634
|
+
# If some neon_id isn't in the dictionary, handle it as needed
|
|
635
|
+
if not has_input_value:
|
|
636
|
+
# Could skip, or raise an error, or set a default
|
|
637
|
+
continue
|
|
638
|
+
|
|
639
|
+
# -------------------------------------------
|
|
640
|
+
# 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
|
|
641
|
+
# for this row's dnaSampleID
|
|
642
|
+
# -------------------------------------------
|
|
643
|
+
manifest_id = None
|
|
644
|
+
if len(dna_files) > 2:
|
|
645
|
+
# For each row that references a dnaSampleID with multiple raw files,
|
|
646
|
+
# mint exactly one new manifest record
|
|
647
|
+
manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
|
|
648
|
+
new_manifest = self._translate_manifest(manifest_id)
|
|
649
|
+
# Add to the database
|
|
650
|
+
database.manifest_set.append(new_manifest)
|
|
651
|
+
|
|
652
|
+
# -------------------------------------------
|
|
653
|
+
# 5) NOW GROUP FILES BY sequencerRunID
|
|
654
|
+
# => one data_generation record per run
|
|
655
|
+
# -------------------------------------------
|
|
656
|
+
lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
|
|
657
|
+
neon_id
|
|
658
|
+
)
|
|
659
|
+
if not lib_prep_processed_sample_id:
|
|
660
|
+
# If we don't have a ProcessedSample for some reason, skip
|
|
661
|
+
continue
|
|
662
|
+
|
|
663
|
+
for run_id, group_df in dna_files.groupby("sequencerRunID"):
|
|
664
|
+
# a) Mint new data_generation (NucleotideSequencing) ID for this run
|
|
665
|
+
data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
|
|
666
|
+
|
|
667
|
+
# b) Create DataObjects for each raw file in this run
|
|
668
|
+
data_object_ids = []
|
|
669
|
+
for raw_fp in group_df["rawDataFilePath"]:
|
|
670
|
+
do_id = self._id_minter("nmdc:DataObject", 1)[0]
|
|
671
|
+
|
|
672
|
+
# Distinguish read type
|
|
673
|
+
do_type = None
|
|
674
|
+
if "_R1.fastq.gz" in raw_fp:
|
|
675
|
+
do_type = "Metagenome Raw Read 1"
|
|
676
|
+
elif "_R2.fastq.gz" in raw_fp:
|
|
677
|
+
do_type = "Metagenome Raw Read 2"
|
|
678
|
+
|
|
679
|
+
# Create the DataObject
|
|
680
|
+
data_obj = self._translate_data_object(
|
|
681
|
+
do_id=do_id,
|
|
682
|
+
url=raw_fp,
|
|
683
|
+
do_type=do_type,
|
|
684
|
+
manifest_id=manifest_id, # link to the new Manifest if it exists
|
|
685
|
+
)
|
|
686
|
+
database.data_object_set.append(data_obj)
|
|
687
|
+
data_object_ids.append(do_id)
|
|
688
|
+
|
|
689
|
+
# c) Finally, create the data generation record for this run
|
|
690
|
+
database.data_generation_set.append(
|
|
691
|
+
self._translate_nucleotide_sequencing(
|
|
692
|
+
nucleotide_sequencing_id=data_generation_id,
|
|
693
|
+
processed_sample_id=has_input_value,
|
|
694
|
+
raw_data_file_data=data_object_ids,
|
|
695
|
+
nucleotide_sequencing_row=lib_prep_row,
|
|
647
696
|
)
|
|
697
|
+
)
|
|
648
698
|
|
|
649
699
|
return database
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.3.0
|
|
3
|
+
Version: 2.5.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -17,6 +17,7 @@ Dynamic: classifier
|
|
|
17
17
|
Dynamic: description
|
|
18
18
|
Dynamic: description-content-type
|
|
19
19
|
Dynamic: home-page
|
|
20
|
+
Dynamic: license-file
|
|
20
21
|
Dynamic: requires-python
|
|
21
22
|
Dynamic: summary
|
|
22
23
|
|
|
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
|
|
|
37
38
|
* [nmdc-server](https://github.com/microbiomedata/nmdc-server)
|
|
38
39
|
houses code specific to the data portal -- its database, back-end API, and front-end application.
|
|
39
40
|
|
|
40
|
-
* [
|
|
41
|
-
references workflow code spread across several repositories, that take source data and produce computed data.
|
|
41
|
+
* Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
|
|
42
42
|
|
|
43
43
|
* This repo (nmdc-runtime)
|
|
44
44
|
* houses code that takes source data and computed data, and transforms it
|
|
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
|
|
|
156
156
|
```bash
|
|
157
157
|
make up-test
|
|
158
158
|
make test
|
|
159
|
+
|
|
160
|
+
# Run a specific test file, e.g. tests/test_api/test_endpoints.py
|
|
161
|
+
make test ARGS="tests/test_api/test_endpoints.py"
|
|
159
162
|
```
|
|
160
163
|
|
|
161
164
|
As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
|
|
@@ -164,6 +167,16 @@ desired and does not break over time.
|
|
|
164
167
|
[For hints on how to write tests for solids and pipelines in Dagster, see their documentation
|
|
165
168
|
tutorial on Testing](https://docs.dagster.io/tutorial/testable).
|
|
166
169
|
|
|
170
|
+
### RAM usage
|
|
171
|
+
|
|
172
|
+
The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
|
|
173
|
+
the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
|
|
174
|
+
"Settings > Resources > Advanced," and increase the memory limit. One of our team members has
|
|
175
|
+
found **12 GB** to be sufficient for running the tests.
|
|
176
|
+
|
|
177
|
+
> Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
|
|
178
|
+
> There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
|
|
179
|
+
|
|
167
180
|
## Publish to PyPI
|
|
168
181
|
|
|
169
182
|
This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).
|
|
@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
39
|
nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
|
|
42
42
|
nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
|
|
43
43
|
nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -51,21 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
51
51
|
nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
|
|
52
52
|
nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
|
|
53
53
|
nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
nmdc_runtime/site/export/ncbi_xml.py,sha256=
|
|
55
|
-
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=
|
|
54
|
+
nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
|
|
55
|
+
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
59
59
|
nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
|
-
nmdc_runtime/site/repair/database_updater.py,sha256=
|
|
60
|
+
nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
|
|
61
61
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
62
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
63
63
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
64
64
|
nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
|
|
65
65
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
66
|
-
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=
|
|
66
|
+
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
|
|
67
67
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
68
|
-
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=
|
|
68
|
+
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
|
|
69
69
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
70
70
|
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
|
|
71
71
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
75
75
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
76
76
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
77
77
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
82
|
-
nmdc_runtime-2.
|
|
83
|
-
nmdc_runtime-2.
|
|
78
|
+
nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
79
|
+
nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
|
|
80
|
+
nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
|
|
81
|
+
nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
82
|
+
nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
83
|
+
nmdc_runtime-2.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|