nmdc-runtime 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/site/export/ncbi_xml.py +0 -1
- nmdc_runtime/site/export/ncbi_xml_utils.py +0 -25
- nmdc_runtime/site/graphs.py +11 -0
- nmdc_runtime/site/ops.py +54 -12
- nmdc_runtime/site/repair/database_updater.py +12 -0
- nmdc_runtime/site/repository.py +2 -6
- nmdc_runtime/site/translation/gold_translator.py +11 -0
- nmdc_runtime/site/translation/neon_benthic_translator.py +156 -157
- nmdc_runtime/site/translation/submission_portal_translator.py +269 -51
- nmdc_runtime/site/util.py +8 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/METADATA +19 -6
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/RECORD +16 -16
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info/licenses}/LICENSE +0 -0
- {nmdc_runtime-2.4.0.dist-info → nmdc_runtime-2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -7,7 +7,6 @@ import xml.dom.minidom
|
|
|
7
7
|
from typing import Any, List, Union
|
|
8
8
|
from urllib.parse import urlparse
|
|
9
9
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
|
-
get_instruments,
|
|
11
10
|
handle_controlled_identified_term_value,
|
|
12
11
|
handle_controlled_term_value,
|
|
13
12
|
handle_geolocation_value,
|
|
@@ -24,31 +24,6 @@ def get_classname_from_typecode(doc_id):
|
|
|
24
24
|
return class_map.get(typecode)
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def get_instruments(instrument_set_collection):
|
|
28
|
-
# dictionary to capture a list of all instruments
|
|
29
|
-
# Structure of dict:
|
|
30
|
-
# {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
|
|
31
|
-
all_instruments = {}
|
|
32
|
-
|
|
33
|
-
try:
|
|
34
|
-
query = {"type": "nmdc:Instrument"}
|
|
35
|
-
cursor = instrument_set_collection.find(query)
|
|
36
|
-
|
|
37
|
-
for document in cursor:
|
|
38
|
-
instrument_id = document.get("id")
|
|
39
|
-
vendor = document.get("vendor")
|
|
40
|
-
model = document.get("model")
|
|
41
|
-
|
|
42
|
-
if not instrument_id or not vendor or not model:
|
|
43
|
-
continue
|
|
44
|
-
|
|
45
|
-
all_instruments[instrument_id] = {"vendor": vendor, "model": model}
|
|
46
|
-
|
|
47
|
-
return all_instruments
|
|
48
|
-
except Exception as e:
|
|
49
|
-
raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
|
|
50
|
-
|
|
51
|
-
|
|
52
27
|
def fetch_data_objects_from_biosamples(
|
|
53
28
|
all_docs_collection: Collection,
|
|
54
29
|
data_object_set: Collection,
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -61,6 +61,8 @@ from nmdc_runtime.site.ops import (
|
|
|
61
61
|
get_database_updater_inputs,
|
|
62
62
|
post_submission_portal_biosample_ingest_record_stitching_filename,
|
|
63
63
|
generate_data_generation_set_post_biosample_ingest,
|
|
64
|
+
get_instrument_ids_by_model,
|
|
65
|
+
log_database_ids,
|
|
64
66
|
)
|
|
65
67
|
from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
|
|
66
68
|
|
|
@@ -181,6 +183,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
181
183
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
182
184
|
biosample_extras_slot_mapping_file_url
|
|
183
185
|
)
|
|
186
|
+
instrument_mapping = get_instrument_ids_by_model()
|
|
184
187
|
|
|
185
188
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
186
189
|
metadata_submission,
|
|
@@ -188,10 +191,13 @@ def translate_metadata_submission_to_nmdc_schema_database():
|
|
|
188
191
|
data_object_mapping=data_object_mapping,
|
|
189
192
|
biosample_extras=biosample_extras,
|
|
190
193
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
194
|
+
instrument_mapping=instrument_mapping,
|
|
191
195
|
)
|
|
192
196
|
|
|
193
197
|
validate_metadata(database)
|
|
194
198
|
|
|
199
|
+
log_database_ids(database)
|
|
200
|
+
|
|
195
201
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
196
202
|
filename = nmdc_schema_database_export_filename(metadata_submission)
|
|
197
203
|
outputs = export_json_to_drs(database_dict, filename)
|
|
@@ -217,6 +223,7 @@ def ingest_metadata_submission():
|
|
|
217
223
|
biosample_extras_slot_mapping = get_csv_rows_from_url(
|
|
218
224
|
biosample_extras_slot_mapping_file_url
|
|
219
225
|
)
|
|
226
|
+
instrument_mapping = get_instrument_ids_by_model()
|
|
220
227
|
|
|
221
228
|
database = translate_portal_submission_to_nmdc_schema_database(
|
|
222
229
|
metadata_submission,
|
|
@@ -224,7 +231,11 @@ def ingest_metadata_submission():
|
|
|
224
231
|
data_object_mapping=data_object_mapping,
|
|
225
232
|
biosample_extras=biosample_extras,
|
|
226
233
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
234
|
+
instrument_mapping=instrument_mapping,
|
|
227
235
|
)
|
|
236
|
+
|
|
237
|
+
log_database_ids(database)
|
|
238
|
+
|
|
228
239
|
run_id = submit_metadata_to_db(database)
|
|
229
240
|
poll_for_run_completion(run_id)
|
|
230
241
|
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -7,6 +7,7 @@ import tempfile
|
|
|
7
7
|
from collections import defaultdict
|
|
8
8
|
from datetime import datetime, timezone
|
|
9
9
|
from io import BytesIO, StringIO
|
|
10
|
+
from pprint import pformat
|
|
10
11
|
from toolz.dicttoolz import keyfilter
|
|
11
12
|
from typing import Tuple
|
|
12
13
|
from zipfile import ZipFile
|
|
@@ -38,7 +39,7 @@ from dagster import (
|
|
|
38
39
|
Bool,
|
|
39
40
|
)
|
|
40
41
|
from gridfs import GridFS
|
|
41
|
-
from linkml_runtime.
|
|
42
|
+
from linkml_runtime.utils.dictutils import as_simple_dict
|
|
42
43
|
from linkml_runtime.utils.yamlutils import YAMLRoot
|
|
43
44
|
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
44
45
|
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
@@ -69,7 +70,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
|
69
70
|
fetch_data_objects_from_biosamples,
|
|
70
71
|
fetch_nucleotide_sequencing_from_biosamples,
|
|
71
72
|
fetch_library_preparation_from_biosamples,
|
|
72
|
-
get_instruments,
|
|
73
73
|
)
|
|
74
74
|
from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
|
|
75
75
|
from nmdc_runtime.site.resources import (
|
|
@@ -96,6 +96,7 @@ from nmdc_runtime.site.util import (
|
|
|
96
96
|
run_and_log,
|
|
97
97
|
schema_collection_has_index_on_id,
|
|
98
98
|
nmdc_study_id_to_filename,
|
|
99
|
+
get_instruments_by_id,
|
|
99
100
|
)
|
|
100
101
|
from nmdc_runtime.util import (
|
|
101
102
|
drs_object_in_for,
|
|
@@ -720,9 +721,8 @@ def translate_portal_submission_to_nmdc_schema_database(
|
|
|
720
721
|
metadata_submission: Dict[str, Any],
|
|
721
722
|
nucleotide_sequencing_mapping: List,
|
|
722
723
|
data_object_mapping: List,
|
|
724
|
+
instrument_mapping: Dict[str, str],
|
|
723
725
|
study_category: Optional[str],
|
|
724
|
-
study_doi_category: Optional[str],
|
|
725
|
-
study_doi_provider: Optional[str],
|
|
726
726
|
study_pi_image_url: Optional[str],
|
|
727
727
|
biosample_extras: Optional[list[dict]],
|
|
728
728
|
biosample_extras_slot_mapping: Optional[list[dict]],
|
|
@@ -739,11 +739,10 @@ def translate_portal_submission_to_nmdc_schema_database(
|
|
|
739
739
|
data_object_mapping=data_object_mapping,
|
|
740
740
|
id_minter=id_minter,
|
|
741
741
|
study_category=study_category,
|
|
742
|
-
study_doi_category=study_doi_category,
|
|
743
|
-
study_doi_provider=study_doi_provider,
|
|
744
742
|
study_pi_image_url=study_pi_image_url,
|
|
745
743
|
biosample_extras=biosample_extras,
|
|
746
744
|
biosample_extras_slot_mapping=biosample_extras_slot_mapping,
|
|
745
|
+
illumina_instrument_mapping=instrument_mapping,
|
|
747
746
|
)
|
|
748
747
|
database = translator.get_database()
|
|
749
748
|
return database
|
|
@@ -761,7 +760,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
|
|
|
761
760
|
|
|
762
761
|
@op
|
|
763
762
|
def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
|
|
764
|
-
return
|
|
763
|
+
return as_simple_dict(object)
|
|
765
764
|
|
|
766
765
|
|
|
767
766
|
@op(required_resource_keys={"mongo"}, config_schema={"username": str})
|
|
@@ -1100,7 +1099,12 @@ def materialize_alldocs(context) -> int:
|
|
|
1100
1099
|
write_operations = []
|
|
1101
1100
|
documents_processed_counter = 0
|
|
1102
1101
|
for doc in mdb[coll_name].find():
|
|
1103
|
-
|
|
1102
|
+
try:
|
|
1103
|
+
doc_type = doc["type"][5:] # lop off "nmdc:" prefix
|
|
1104
|
+
except KeyError:
|
|
1105
|
+
raise Exception(
|
|
1106
|
+
f"doc {doc['id']} in collection {coll_name} has no 'type'!"
|
|
1107
|
+
)
|
|
1104
1108
|
slots_to_include = ["id", "type"] + document_reference_ranged_slots[
|
|
1105
1109
|
doc_type
|
|
1106
1110
|
]
|
|
@@ -1222,11 +1226,26 @@ def get_library_preparation_from_biosamples(
|
|
|
1222
1226
|
|
|
1223
1227
|
|
|
1224
1228
|
@op(required_resource_keys={"mongo"})
|
|
1225
|
-
def get_all_instruments(context: OpExecutionContext):
|
|
1229
|
+
def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
|
|
1230
|
+
mdb = context.resources.mongo.db
|
|
1231
|
+
return get_instruments_by_id(mdb)
|
|
1232
|
+
|
|
1233
|
+
|
|
1234
|
+
@op(required_resource_keys={"mongo"})
|
|
1235
|
+
def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
|
|
1226
1236
|
mdb = context.resources.mongo.db
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1237
|
+
instruments_by_id = get_instruments_by_id(mdb)
|
|
1238
|
+
instruments_by_model: dict[str, str] = {}
|
|
1239
|
+
for inst_id, instrument in instruments_by_id.items():
|
|
1240
|
+
model = instrument.get("model")
|
|
1241
|
+
if model is None:
|
|
1242
|
+
context.log.warning(f"Instrument {inst_id} has no model.")
|
|
1243
|
+
continue
|
|
1244
|
+
if model in instruments_by_model:
|
|
1245
|
+
context.log.warning(f"Instrument model {model} is not unique.")
|
|
1246
|
+
instruments_by_model[model] = inst_id
|
|
1247
|
+
context.log.info("Instrument models: %s", pformat(instruments_by_model))
|
|
1248
|
+
return instruments_by_model
|
|
1230
1249
|
|
|
1231
1250
|
|
|
1232
1251
|
@op
|
|
@@ -1340,3 +1359,26 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1340
1359
|
database = database_updater.generate_biosample_set_from_gold_api_for_study()
|
|
1341
1360
|
|
|
1342
1361
|
return database
|
|
1362
|
+
|
|
1363
|
+
|
|
1364
|
+
@op
|
|
1365
|
+
def log_database_ids(
|
|
1366
|
+
context: OpExecutionContext,
|
|
1367
|
+
database: nmdc.Database,
|
|
1368
|
+
) -> None:
|
|
1369
|
+
"""Log the IDs of the database."""
|
|
1370
|
+
database_dict = as_simple_dict(database)
|
|
1371
|
+
message = ""
|
|
1372
|
+
for collection_name, collection in database_dict.items():
|
|
1373
|
+
if not isinstance(collection, list):
|
|
1374
|
+
continue
|
|
1375
|
+
message += f"{collection_name} ({len(collection)}):\n"
|
|
1376
|
+
if len(collection) < 10:
|
|
1377
|
+
message += "\n".join(f" {doc['id']}" for doc in collection)
|
|
1378
|
+
else:
|
|
1379
|
+
message += "\n".join(f" {doc['id']}" for doc in collection[:4])
|
|
1380
|
+
message += f"\n ... {len(collection) - 8} more\n"
|
|
1381
|
+
message += "\n".join(f" {doc['id']}" for doc in collection[-4:])
|
|
1382
|
+
message += "\n"
|
|
1383
|
+
if message:
|
|
1384
|
+
context.log.info(message)
|
|
@@ -199,8 +199,20 @@ class DatabaseUpdater:
|
|
|
199
199
|
if gbs.get("biosampleGoldId") not in nmdc_gold_ids
|
|
200
200
|
]
|
|
201
201
|
|
|
202
|
+
# use the GOLD study id to fetch all sequencing project records associated with the study
|
|
203
|
+
gold_sequencing_projects_for_study = (
|
|
204
|
+
self.gold_api_client.fetch_projects_by_study(gold_study_id)
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
# use the GOLD study id to fetch all analysis project records associated with the study
|
|
208
|
+
gold_analysis_projects_for_study = (
|
|
209
|
+
self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
|
|
210
|
+
)
|
|
211
|
+
|
|
202
212
|
gold_study_translator = GoldStudyTranslator(
|
|
203
213
|
biosamples=missing_gold_biosamples,
|
|
214
|
+
projects=gold_sequencing_projects_for_study,
|
|
215
|
+
analysis_projects=gold_analysis_projects_for_study,
|
|
204
216
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
205
217
|
)
|
|
206
218
|
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -553,8 +553,6 @@ def biosample_submission_ingest():
|
|
|
553
553
|
"translate_portal_submission_to_nmdc_schema_database": {
|
|
554
554
|
"inputs": {
|
|
555
555
|
"study_category": "research_study",
|
|
556
|
-
"study_doi_category": None,
|
|
557
|
-
"study_doi_provider": None,
|
|
558
556
|
"study_pi_image_url": None,
|
|
559
557
|
}
|
|
560
558
|
},
|
|
@@ -591,8 +589,6 @@ def biosample_submission_ingest():
|
|
|
591
589
|
"translate_portal_submission_to_nmdc_schema_database": {
|
|
592
590
|
"inputs": {
|
|
593
591
|
"study_category": None,
|
|
594
|
-
"study_doi_category": None,
|
|
595
|
-
"study_doi_provider": None,
|
|
596
592
|
"study_pi_image_url": None,
|
|
597
593
|
}
|
|
598
594
|
},
|
|
@@ -744,7 +740,7 @@ def biosample_submission_ingest():
|
|
|
744
740
|
"config": {
|
|
745
741
|
"benthic_data_product": {
|
|
746
742
|
"product_id": "DP1.20279.001",
|
|
747
|
-
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
|
|
743
|
+
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
|
|
748
744
|
}
|
|
749
745
|
}
|
|
750
746
|
},
|
|
@@ -771,7 +767,7 @@ def biosample_submission_ingest():
|
|
|
771
767
|
"config": {
|
|
772
768
|
"benthic_data_product": {
|
|
773
769
|
"product_id": "DP1.20279.001",
|
|
774
|
-
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
|
|
770
|
+
"product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
|
|
775
771
|
}
|
|
776
772
|
}
|
|
777
773
|
},
|
|
@@ -639,6 +639,16 @@ class GoldStudyTranslator(Translator):
|
|
|
639
639
|
:return: nmdc:NucleotideSequencing object
|
|
640
640
|
"""
|
|
641
641
|
gold_project_id = gold_project["projectGoldId"]
|
|
642
|
+
ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
|
|
643
|
+
insdc_bioproject_identifiers = []
|
|
644
|
+
if ncbi_bioproject_identifier:
|
|
645
|
+
insdc_bioproject_identifiers.append(
|
|
646
|
+
self._ensure_curie(
|
|
647
|
+
ncbi_bioproject_identifier,
|
|
648
|
+
default_prefix="bioproject",
|
|
649
|
+
)
|
|
650
|
+
)
|
|
651
|
+
|
|
642
652
|
return nmdc.NucleotideSequencing(
|
|
643
653
|
id=nmdc_nucleotide_sequencing_id,
|
|
644
654
|
name=gold_project.get("projectName"),
|
|
@@ -650,6 +660,7 @@ class GoldStudyTranslator(Translator):
|
|
|
650
660
|
has_input=nmdc_biosample_id,
|
|
651
661
|
add_date=gold_project.get("addDate"),
|
|
652
662
|
mod_date=self._get_mod_date(gold_project),
|
|
663
|
+
insdc_bioproject_identifiers=insdc_bioproject_identifiers,
|
|
653
664
|
principal_investigator=self._get_pi(gold_project),
|
|
654
665
|
processing_institution=self._get_processing_institution(gold_project),
|
|
655
666
|
instrument_used=self._get_instrument(gold_project),
|