nmdc-runtime 2.1.0__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/site/export/ncbi_xml.py +63 -7
- nmdc_runtime/site/export/ncbi_xml_utils.py +25 -0
- nmdc_runtime/site/graphs.py +10 -3
- nmdc_runtime/site/ops.py +90 -89
- nmdc_runtime/site/repository.py +1 -0
- nmdc_runtime/site/resources.py +18 -1
- nmdc_runtime/site/translation/gold_translator.py +69 -26
- nmdc_runtime/site/translation/submission_portal_translator.py +47 -12
- nmdc_runtime/util.py +44 -0
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/METADATA +1 -3
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/RECORD +15 -15
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.1.0.dist-info → nmdc_runtime-2.2.0.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import re
|
|
2
3
|
import datetime
|
|
3
4
|
import xml.etree.ElementTree as ET
|
|
4
5
|
import xml.dom.minidom
|
|
@@ -6,6 +7,7 @@ import xml.dom.minidom
|
|
|
6
7
|
from typing import Any
|
|
7
8
|
from urllib.parse import urlparse
|
|
8
9
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
|
+
get_instruments,
|
|
9
11
|
handle_controlled_identified_term_value,
|
|
10
12
|
handle_controlled_term_value,
|
|
11
13
|
handle_geolocation_value,
|
|
@@ -170,7 +172,39 @@ class NCBISubmissionXML:
|
|
|
170
172
|
|
|
171
173
|
for json_key, value in biosample.items():
|
|
172
174
|
if isinstance(value, list):
|
|
173
|
-
|
|
175
|
+
for item in value:
|
|
176
|
+
if json_key not in attribute_mappings:
|
|
177
|
+
continue
|
|
178
|
+
|
|
179
|
+
xml_key = attribute_mappings[json_key]
|
|
180
|
+
value_type = slot_range_mappings.get(json_key, "string")
|
|
181
|
+
handler = self.type_handlers.get(
|
|
182
|
+
value_type, handle_string_value
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
# Special handling for "elev" key
|
|
186
|
+
if json_key == "elev":
|
|
187
|
+
value = f"{float(value)} m" # Convert to float if possible
|
|
188
|
+
attributes[xml_key] = value
|
|
189
|
+
continue # Skip applying the handler to this key
|
|
190
|
+
|
|
191
|
+
# Special handling for "host_taxid"
|
|
192
|
+
if json_key == "host_taxid" and isinstance(value, dict):
|
|
193
|
+
if "term" in value and "id" in value["term"]:
|
|
194
|
+
value = re.findall(
|
|
195
|
+
r"\d+", value["term"]["id"].split(":")[1]
|
|
196
|
+
)[0]
|
|
197
|
+
attributes[xml_key] = value
|
|
198
|
+
continue # Skip applying the handler to this key
|
|
199
|
+
|
|
200
|
+
formatted_value = handler(item)
|
|
201
|
+
|
|
202
|
+
# Combine multiple values with a separator for list elements
|
|
203
|
+
if xml_key in attributes:
|
|
204
|
+
attributes[xml_key] += f"| {formatted_value}"
|
|
205
|
+
else:
|
|
206
|
+
attributes[xml_key] = formatted_value
|
|
207
|
+
continue
|
|
174
208
|
|
|
175
209
|
if json_key == "env_package":
|
|
176
210
|
env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
|
|
@@ -187,6 +221,20 @@ class NCBISubmissionXML:
|
|
|
187
221
|
value_type = slot_range_mappings.get(json_key, "string")
|
|
188
222
|
handler = self.type_handlers.get(value_type, handle_string_value)
|
|
189
223
|
|
|
224
|
+
# Special handling for "elev" key
|
|
225
|
+
if json_key == "elev":
|
|
226
|
+
value = f"{float(value)} m" # Convert to float if possible
|
|
227
|
+
attributes[xml_key] = value
|
|
228
|
+
continue # Skip applying the handler to this key
|
|
229
|
+
|
|
230
|
+
# Special handling for "host_taxid"
|
|
231
|
+
if json_key == "host_taxid" and isinstance(value, dict):
|
|
232
|
+
if "term" in value and "id" in value["term"]:
|
|
233
|
+
value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
|
|
234
|
+
attributes[xml_key] = value
|
|
235
|
+
continue # Skip applying the handler to this key
|
|
236
|
+
|
|
237
|
+
# Default processing for other keys
|
|
190
238
|
formatted_value = handler(value)
|
|
191
239
|
attributes[xml_key] = formatted_value
|
|
192
240
|
|
|
@@ -286,6 +334,7 @@ class NCBISubmissionXML:
|
|
|
286
334
|
nmdc_nucleotide_sequencing: list,
|
|
287
335
|
nmdc_biosamples: list,
|
|
288
336
|
nmdc_library_preparation: list,
|
|
337
|
+
all_instruments: dict,
|
|
289
338
|
):
|
|
290
339
|
bsm_id_name_dict = {
|
|
291
340
|
biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
|
|
@@ -296,9 +345,10 @@ class NCBISubmissionXML:
|
|
|
296
345
|
biosample_ids = []
|
|
297
346
|
nucleotide_sequencing_ids = {}
|
|
298
347
|
lib_prep_protocol_names = {}
|
|
299
|
-
instrument_name = ""
|
|
300
348
|
analyte_category = ""
|
|
301
349
|
library_name = ""
|
|
350
|
+
instrument_vendor = ""
|
|
351
|
+
instrument_model = ""
|
|
302
352
|
|
|
303
353
|
for biosample_id, data_objects in entry.items():
|
|
304
354
|
biosample_ids.append(biosample_id)
|
|
@@ -316,7 +366,11 @@ class NCBISubmissionXML:
|
|
|
316
366
|
)
|
|
317
367
|
# Currently, we are making the assumption that only one instrument
|
|
318
368
|
# is used to sequence a Biosample
|
|
319
|
-
|
|
369
|
+
instrument_id = ntseq.get("instrument_used", "")[0]
|
|
370
|
+
instrument = all_instruments.get(instrument_id, {})
|
|
371
|
+
instrument_vendor = instrument.get("vendor", "")
|
|
372
|
+
instrument_model = instrument.get("model", "")
|
|
373
|
+
|
|
320
374
|
analyte_category = ntseq.get("analyte_category", "")
|
|
321
375
|
library_name = bsm_id_name_dict.get(biosample_id, "")
|
|
322
376
|
|
|
@@ -353,9 +407,9 @@ class NCBISubmissionXML:
|
|
|
353
407
|
"RefId",
|
|
354
408
|
children=[
|
|
355
409
|
self.set_element(
|
|
356
|
-
"
|
|
410
|
+
"PrimaryId",
|
|
357
411
|
bioproject_id,
|
|
358
|
-
{"
|
|
412
|
+
{"db": "BioProject"},
|
|
359
413
|
)
|
|
360
414
|
],
|
|
361
415
|
)
|
|
@@ -384,11 +438,11 @@ class NCBISubmissionXML:
|
|
|
384
438
|
)
|
|
385
439
|
|
|
386
440
|
sra_attributes = []
|
|
387
|
-
if
|
|
441
|
+
if instrument_vendor == "illumina":
|
|
388
442
|
sra_attributes.append(
|
|
389
443
|
self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
|
|
390
444
|
)
|
|
391
|
-
if "
|
|
445
|
+
if instrument_model == "nextseq_550":
|
|
392
446
|
sra_attributes.append(
|
|
393
447
|
self.set_element(
|
|
394
448
|
"Attribute", "NextSeq 550", {"name": "instrument_model"}
|
|
@@ -501,6 +555,7 @@ class NCBISubmissionXML:
|
|
|
501
555
|
biosample_nucleotide_sequencing_list: list,
|
|
502
556
|
biosample_data_objects_list: list,
|
|
503
557
|
biosample_library_preparation_list: list,
|
|
558
|
+
instruments_dict: dict,
|
|
504
559
|
):
|
|
505
560
|
data_type = None
|
|
506
561
|
ncbi_project_id = None
|
|
@@ -545,6 +600,7 @@ class NCBISubmissionXML:
|
|
|
545
600
|
nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
|
|
546
601
|
nmdc_biosamples=biosamples_list,
|
|
547
602
|
nmdc_library_preparation=biosample_library_preparation_list,
|
|
603
|
+
all_instruments=instruments_dict,
|
|
548
604
|
)
|
|
549
605
|
|
|
550
606
|
rough_string = ET.tostring(self.root, "unicode")
|
|
@@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id):
|
|
|
20
20
|
return class_map.get(typecode)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def get_instruments(instrument_set_collection):
    """Build a lookup of instrument vendor/model keyed by instrument id.

    Queries ``instrument_set_collection`` for documents whose ``type`` is
    ``"nmdc:Instrument"`` and records every document that carries a truthy
    ``id``, ``vendor`` and ``model``. Documents missing any of those three
    fields are skipped.

    :param instrument_set_collection: object exposing ``find(query)`` that
        returns an iterable of dict-like documents (the caller in ``ops.py``
        passes the ``instrument_set`` Mongo collection).
    :return: ``{"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}``
    :raises RuntimeError: if querying or iterating the collection fails; the
        underlying exception is attached as ``__cause__``.
    """
    all_instruments = {}

    try:
        cursor = instrument_set_collection.find({"type": "nmdc:Instrument"})

        for document in cursor:
            instrument_id = document.get("id")
            vendor = document.get("vendor")
            model = document.get("model")

            # An entry is only useful when all three fields are present.
            if not instrument_id or not vendor or not model:
                continue

            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
    except Exception as e:
        # Chain explicitly so the original traceback is kept as the cause.
        raise RuntimeError(
            f"An error occurred while fetching instrument data: {e}"
        ) from e

    return all_instruments
|
|
46
|
+
|
|
47
|
+
|
|
23
48
|
def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
|
|
24
49
|
biosample_data_objects = []
|
|
25
50
|
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -53,6 +53,7 @@ from nmdc_runtime.site.ops import (
|
|
|
53
53
|
get_data_objects_from_biosamples,
|
|
54
54
|
get_nucleotide_sequencing_from_biosamples,
|
|
55
55
|
get_library_preparation_from_biosamples,
|
|
56
|
+
get_all_instruments,
|
|
56
57
|
get_ncbi_export_pipeline_inputs,
|
|
57
58
|
ncbi_submission_xml_from_nmdc_study,
|
|
58
59
|
ncbi_submission_xml_asset,
|
|
@@ -126,9 +127,12 @@ def apply_metadata_in():
|
|
|
126
127
|
|
|
127
128
|
@graph
|
|
128
129
|
def gold_study_to_database():
|
|
129
|
-
(
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
(
|
|
131
|
+
study_id,
|
|
132
|
+
study_type,
|
|
133
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
134
|
+
include_field_site_info,
|
|
135
|
+
) = get_gold_study_pipeline_inputs()
|
|
132
136
|
|
|
133
137
|
projects = gold_projects_by_study(study_id)
|
|
134
138
|
biosamples = gold_biosamples_by_study(study_id)
|
|
@@ -143,6 +147,7 @@ def gold_study_to_database():
|
|
|
143
147
|
biosamples,
|
|
144
148
|
analysis_projects,
|
|
145
149
|
gold_nmdc_instrument_map_df,
|
|
150
|
+
include_field_site_info,
|
|
146
151
|
)
|
|
147
152
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
148
153
|
filename = nmdc_schema_database_export_filename(study)
|
|
@@ -449,6 +454,7 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
449
454
|
)
|
|
450
455
|
data_object_records = get_data_objects_from_biosamples(biosamples)
|
|
451
456
|
library_preparation_records = get_library_preparation_from_biosamples(biosamples)
|
|
457
|
+
all_instruments = get_all_instruments()
|
|
452
458
|
xml_data = ncbi_submission_xml_from_nmdc_study(
|
|
453
459
|
nmdc_study,
|
|
454
460
|
ncbi_submission_metadata,
|
|
@@ -456,5 +462,6 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
456
462
|
nucleotide_sequencing_records,
|
|
457
463
|
data_object_records,
|
|
458
464
|
library_preparation_records,
|
|
465
|
+
all_instruments,
|
|
459
466
|
)
|
|
460
467
|
ncbi_submission_xml_asset(xml_data)
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -7,6 +7,7 @@ import tempfile
|
|
|
7
7
|
from collections import defaultdict
|
|
8
8
|
from datetime import datetime, timezone
|
|
9
9
|
from io import BytesIO, StringIO
|
|
10
|
+
from toolz.dicttoolz import keyfilter
|
|
10
11
|
from typing import Tuple
|
|
11
12
|
from zipfile import ZipFile
|
|
12
13
|
from itertools import chain
|
|
@@ -68,6 +69,7 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
|
68
69
|
fetch_data_objects_from_biosamples,
|
|
69
70
|
fetch_nucleotide_sequencing_from_biosamples,
|
|
70
71
|
fetch_library_preparation_from_biosamples,
|
|
72
|
+
get_instruments,
|
|
71
73
|
)
|
|
72
74
|
from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
|
|
73
75
|
from nmdc_runtime.site.resources import (
|
|
@@ -92,17 +94,20 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
|
|
|
92
94
|
from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
|
|
93
95
|
from nmdc_runtime.util import (
|
|
94
96
|
drs_object_in_for,
|
|
97
|
+
get_names_of_classes_in_effective_range_of_slot,
|
|
95
98
|
pluralize,
|
|
96
99
|
put_object,
|
|
97
100
|
validate_json,
|
|
98
101
|
specialize_activity_set_docs,
|
|
99
102
|
collection_name_to_class_names,
|
|
100
103
|
class_hierarchy_as_list,
|
|
104
|
+
nmdc_schema_view,
|
|
101
105
|
populated_schema_collection_names_with_id_field,
|
|
102
106
|
)
|
|
103
107
|
from nmdc_schema import nmdc
|
|
104
108
|
from nmdc_schema.nmdc import Database as NMDCDatabase
|
|
105
109
|
from pydantic import BaseModel
|
|
110
|
+
from pymongo import InsertOne
|
|
106
111
|
from pymongo.database import Database as MongoDatabase
|
|
107
112
|
from starlette import status
|
|
108
113
|
from toolz import assoc, dissoc, get_in, valfilter, identity
|
|
@@ -588,18 +593,23 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
|
|
|
588
593
|
"study_id": str,
|
|
589
594
|
"study_type": str,
|
|
590
595
|
"gold_nmdc_instrument_mapping_file_url": str,
|
|
596
|
+
"include_field_site_info": bool,
|
|
591
597
|
},
|
|
592
598
|
out={
|
|
593
599
|
"study_id": Out(str),
|
|
594
600
|
"study_type": Out(str),
|
|
595
601
|
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
602
|
+
"include_field_site_info": Out(bool),
|
|
596
603
|
},
|
|
597
604
|
)
|
|
598
|
-
def get_gold_study_pipeline_inputs(
|
|
605
|
+
def get_gold_study_pipeline_inputs(
    context: OpExecutionContext,
) -> Tuple[str, str, str, bool]:
    """Expose the op's configuration values as separate outputs.

    Reads four entries from ``context.op_config`` and returns them as a
    tuple, in this order: ``study_id``, ``study_type``,
    ``gold_nmdc_instrument_mapping_file_url``, ``include_field_site_info``.

    :raises KeyError: if any of the four keys is absent from the op config.
    """
    op_config = context.op_config
    study_id = op_config["study_id"]
    study_type = op_config["study_type"]
    instrument_mapping_file_url = op_config["gold_nmdc_instrument_mapping_file_url"]
    include_field_site_info = op_config["include_field_site_info"]
    return (
        study_id,
        study_type,
        instrument_mapping_file_url,
        include_field_site_info,
    )
|
|
604
614
|
|
|
605
615
|
|
|
@@ -642,6 +652,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
642
652
|
biosamples: List[Dict[str, Any]],
|
|
643
653
|
analysis_projects: List[Dict[str, Any]],
|
|
644
654
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
655
|
+
include_field_site_info: bool,
|
|
645
656
|
) -> nmdc.Database:
|
|
646
657
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
647
658
|
|
|
@@ -656,6 +667,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
656
667
|
projects,
|
|
657
668
|
analysis_projects,
|
|
658
669
|
gold_nmdc_instrument_map_df,
|
|
670
|
+
include_field_site_info,
|
|
659
671
|
id_minter=id_minter,
|
|
660
672
|
)
|
|
661
673
|
database = translator.get_database()
|
|
@@ -1029,23 +1041,51 @@ def site_code_mapping() -> dict:
|
|
|
1029
1041
|
|
|
1030
1042
|
@op(required_resource_keys={"mongo"})
|
|
1031
1043
|
def materialize_alldocs(context) -> int:
|
|
1044
|
+
"""
|
|
1045
|
+
This function re-creates the alldocs collection to reflect the current state of the Mongo database.
|
|
1046
|
+
See nmdc-runtime/docs/nb/bulk_validation_referential_integrity_check.ipynb for more details.
|
|
1047
|
+
"""
|
|
1032
1048
|
mdb = context.resources.mongo.db
|
|
1033
|
-
|
|
1049
|
+
schema_view = nmdc_schema_view()
|
|
1034
1050
|
|
|
1035
|
-
#
|
|
1036
|
-
|
|
1037
|
-
# Note: There used to be code here that `assert`-ed that each collection could only contain documents of a single
|
|
1038
|
-
# type. With the legacy schema, that assertion was true. With the Berkeley schema, it is false. That code was
|
|
1039
|
-
# in place because subsequent code (further below) used a single document in a collection as the source of the
|
|
1040
|
-
# class ancestry information of _all_ documents in that collection; an optimization that spared us from
|
|
1041
|
-
# having to do the same for every single document in that collection. With the Berkeley schema, we have
|
|
1042
|
-
# eliminated that optimization (since it is inadequate; it would produce some incorrect class ancestries
|
|
1043
|
-
# for descendants of `PlannedProcess`, for example).
|
|
1044
|
-
#
|
|
1045
|
-
pass
|
|
1051
|
+
# batch size for writing documents to alldocs
|
|
1052
|
+
BULK_WRITE_BATCH_SIZE = 2000
|
|
1046
1053
|
|
|
1054
|
+
# TODO include functional_annotation_agg for "real-time" ref integrity checking.
|
|
1055
|
+
# For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
|
|
1056
|
+
collection_names = populated_schema_collection_names_with_id_field(mdb)
|
|
1047
1057
|
context.log.info(f"{collection_names=}")
|
|
1048
1058
|
|
|
1059
|
+
# Build alldocs
|
|
1060
|
+
context.log.info("constructing `alldocs` collection")
|
|
1061
|
+
|
|
1062
|
+
document_class_names = set(
|
|
1063
|
+
chain.from_iterable(collection_name_to_class_names.values())
|
|
1064
|
+
)
|
|
1065
|
+
|
|
1066
|
+
cls_slot_map = {
|
|
1067
|
+
cls_name: {
|
|
1068
|
+
slot.name: slot for slot in schema_view.class_induced_slots(cls_name)
|
|
1069
|
+
}
|
|
1070
|
+
for cls_name in document_class_names
|
|
1071
|
+
}
|
|
1072
|
+
|
|
1073
|
+
# Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.
|
|
1074
|
+
document_referenceable_ranges = set(
|
|
1075
|
+
chain.from_iterable(
|
|
1076
|
+
schema_view.class_ancestors(cls_name) for cls_name in document_class_names
|
|
1077
|
+
)
|
|
1078
|
+
)
|
|
1079
|
+
|
|
1080
|
+
document_reference_ranged_slots = defaultdict(list)
|
|
1081
|
+
for cls_name, slot_map in cls_slot_map.items():
|
|
1082
|
+
for slot_name, slot in slot_map.items():
|
|
1083
|
+
if (
|
|
1084
|
+
set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
|
|
1085
|
+
& document_referenceable_ranges
|
|
1086
|
+
):
|
|
1087
|
+
document_reference_ranged_slots[cls_name].append(slot_name)
|
|
1088
|
+
|
|
1049
1089
|
# Drop any existing `alldocs` collection (e.g. from previous use of this op).
|
|
1050
1090
|
#
|
|
1051
1091
|
# FIXME: This "nuke and pave" approach introduces a race condition.
|
|
@@ -1054,90 +1094,41 @@ def materialize_alldocs(context) -> int:
|
|
|
1054
1094
|
#
|
|
1055
1095
|
mdb.alldocs.drop()
|
|
1056
1096
|
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1097
|
+
for coll_name in collection_names:
|
|
1098
|
+
context.log.info(f"{coll_name=}")
|
|
1099
|
+
requests = []
|
|
1100
|
+
documents_processed_counter = 0
|
|
1101
|
+
for doc in mdb[coll_name].find():
|
|
1102
|
+
doc_type = doc["type"][5:] # lop off "nmdc:" prefix
|
|
1103
|
+
slots_to_include = ["id", "type"] + document_reference_ranged_slots[
|
|
1104
|
+
doc_type
|
|
1105
|
+
]
|
|
1106
|
+
new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
|
|
1107
|
+
new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
|
|
1108
|
+
requests.append(InsertOne(new_doc))
|
|
1109
|
+
if len(requests) == BULK_WRITE_BATCH_SIZE:
|
|
1110
|
+
_ = mdb.alldocs.bulk_write(requests, ordered=False)
|
|
1111
|
+
requests.clear()
|
|
1112
|
+
documents_processed_counter += BULK_WRITE_BATCH_SIZE
|
|
1113
|
+
if len(requests) > 0:
|
|
1114
|
+
_ = mdb.alldocs.bulk_write(requests, ordered=False)
|
|
1115
|
+
documents_processed_counter += len(requests)
|
|
1062
1116
|
context.log.info(
|
|
1063
|
-
f"
|
|
1117
|
+
f"Inserted {documents_processed_counter} documents from {coll_name=} "
|
|
1064
1118
|
)
|
|
1065
1119
|
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
# - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
|
|
1070
|
-
#
|
|
1071
|
-
distinct_type_values = mdb[collection_name].distinct(key="type")
|
|
1072
|
-
context.log.info(
|
|
1073
|
-
f"Found {len(distinct_type_values)} distinct `type` values in {collection_name=}: {distinct_type_values=}"
|
|
1074
|
-
)
|
|
1075
|
-
for type_value in distinct_type_values:
|
|
1076
|
-
|
|
1077
|
-
# Process all the documents in this collection that have this value in their `type` field.
|
|
1078
|
-
#
|
|
1079
|
-
# References:
|
|
1080
|
-
# - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.count_documents
|
|
1081
|
-
# - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.find
|
|
1082
|
-
#
|
|
1083
|
-
filter_ = {"type": type_value}
|
|
1084
|
-
num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
|
|
1085
|
-
docs_having_type = mdb[collection_name].find(filter=filter_)
|
|
1086
|
-
context.log.info(
|
|
1087
|
-
f"Found {num_docs_having_type} documents having {type_value=} in {collection_name=}."
|
|
1088
|
-
)
|
|
1089
|
-
|
|
1090
|
-
# Get a "representative" document from the result.
|
|
1091
|
-
#
|
|
1092
|
-
# Note: Since all of the documents in this batch have the same class ancestry, we will save time by
|
|
1093
|
-
# determining the class ancestry of only _one_ of them (we call this the "representative") and then
|
|
1094
|
-
# (later) attributing that class ancestry to all of them.
|
|
1095
|
-
#
|
|
1096
|
-
representative_doc = next(docs_having_type)
|
|
1097
|
-
|
|
1098
|
-
# Instantiate the Python class represented by the "representative" document.
|
|
1099
|
-
db_dict = {
|
|
1100
|
-
# Shed the `_id` attribute, since the constructor doesn't allow it.
|
|
1101
|
-
collection_name: [dissoc(representative_doc, "_id")]
|
|
1102
|
-
}
|
|
1103
|
-
nmdc_db = NMDCDatabase(**db_dict)
|
|
1104
|
-
representative_instance = getattr(nmdc_db, collection_name)[0]
|
|
1105
|
-
|
|
1106
|
-
# Get the class ancestry of that instance, as a list of class names (including its own class name).
|
|
1107
|
-
ancestor_class_names = class_hierarchy_as_list(representative_instance)
|
|
1108
|
-
|
|
1109
|
-
# Store the documents belonging to this group, in the `alldocs` collection, setting their `type` field
|
|
1110
|
-
# to the list of class names obtained from the "representative" document above.
|
|
1111
|
-
#
|
|
1112
|
-
# TODO: Document why clobbering the existing contents of the `type` field is OK.
|
|
1113
|
-
#
|
|
1114
|
-
# Note: The reason we `chain()` our "representative" document (in an iterable) with the `docs_having_type`
|
|
1115
|
-
# iterator here is that, when we called `next(docs_having_type)` above, we "consumed" our
|
|
1116
|
-
# "representative" document from that iterator. We use `chain()` here so that that document gets
|
|
1117
|
-
# inserted alongside its cousins (i.e. the documents _still_ accessible via `docs_having_type`).
|
|
1118
|
-
# Reference: https://docs.python.org/3/library/itertools.html#itertools.chain
|
|
1119
|
-
#
|
|
1120
|
-
inserted_many_result = mdb.alldocs.insert_many(
|
|
1121
|
-
[
|
|
1122
|
-
assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
|
|
1123
|
-
for doc in chain([representative_doc], docs_having_type)
|
|
1124
|
-
]
|
|
1125
|
-
)
|
|
1126
|
-
context.log.info(
|
|
1127
|
-
f"Inserted {len(inserted_many_result.inserted_ids)} documents from {collection_name=} "
|
|
1128
|
-
f"originally having {type_value=}."
|
|
1129
|
-
)
|
|
1120
|
+
context.log.info(
|
|
1121
|
+
f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
|
|
1122
|
+
)
|
|
1130
1123
|
|
|
1131
1124
|
# Re-idx for `alldocs` collection
|
|
1132
1125
|
mdb.alldocs.create_index("id", unique=True)
|
|
1133
1126
|
# The indexes were added to improve the performance of the
|
|
1134
1127
|
# /data_objects/study/{study_id} endpoint
|
|
1135
|
-
|
|
1136
|
-
mdb.alldocs.create_index(
|
|
1137
|
-
|
|
1138
|
-
context.log.info(
|
|
1139
|
-
f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
|
|
1140
|
-
)
|
|
1128
|
+
slots_to_index = ["has_input", "has_output", "was_informed_by"]
|
|
1129
|
+
[mdb.alldocs.create_index(slot) for slot in slots_to_index]
|
|
1130
|
+
|
|
1131
|
+
context.log.info(f"created indexes on id, {slots_to_index}.")
|
|
1141
1132
|
return mdb.alldocs.estimated_document_count()
|
|
1142
1133
|
|
|
1143
1134
|
|
|
@@ -1221,6 +1212,14 @@ def get_library_preparation_from_biosamples(
|
|
|
1221
1212
|
return biosample_lib_prep
|
|
1222
1213
|
|
|
1223
1214
|
|
|
1215
|
+
@op(required_resource_keys={"mongo"})
def get_all_instruments(context: OpExecutionContext):
    """Return the instrument lookup built from the ``instrument_set`` collection.

    Takes the database handle from the ``mongo`` resource and passes its
    ``instrument_set`` collection to ``get_instruments``, returning whatever
    that helper returns.
    """
    # The op itself holds no logic; it only wires the resource to the helper.
    return get_instruments(context.resources.mongo.db["instrument_set"])
|
|
1221
|
+
|
|
1222
|
+
|
|
1224
1223
|
@op
|
|
1225
1224
|
def ncbi_submission_xml_from_nmdc_study(
|
|
1226
1225
|
context: OpExecutionContext,
|
|
@@ -1230,6 +1229,7 @@ def ncbi_submission_xml_from_nmdc_study(
|
|
|
1230
1229
|
omics_processing_records: list,
|
|
1231
1230
|
data_object_records: list,
|
|
1232
1231
|
library_preparation_records: list,
|
|
1232
|
+
all_instruments: dict,
|
|
1233
1233
|
) -> str:
|
|
1234
1234
|
ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
|
|
1235
1235
|
ncbi_xml = ncbi_exporter.get_submission_xml(
|
|
@@ -1237,5 +1237,6 @@ def ncbi_submission_xml_from_nmdc_study(
|
|
|
1237
1237
|
omics_processing_records,
|
|
1238
1238
|
data_object_records,
|
|
1239
1239
|
library_preparation_records,
|
|
1240
|
+
all_instruments,
|
|
1240
1241
|
)
|
|
1241
1242
|
return ncbi_xml
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -506,6 +506,7 @@ def biosample_submission_ingest():
|
|
|
506
506
|
"study_id": "",
|
|
507
507
|
"study_type": "research_study",
|
|
508
508
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
509
|
+
"include_field_site_info": False,
|
|
509
510
|
},
|
|
510
511
|
},
|
|
511
512
|
"export_json_to_drs": {"config": {"username": ""}},
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -331,9 +331,26 @@ class GoldApiClient(BasicAuthClient):
|
|
|
331
331
|
"""
|
|
332
332
|
return id.replace("gold:", "")
|
|
333
333
|
|
|
334
|
-
def fetch_biosamples_by_study(
|
|
334
|
+
def fetch_biosamples_by_study(
    self, study_id: str, include_project=True
) -> List[Dict[str, Any]]:
    """Fetch the biosamples of a study, optionally with their projects attached.

    The study id is passed through ``self._normalize_id`` and used as the
    ``studyGoldId`` parameter of a ``/biosamples`` request.

    :param study_id: identifier of the study whose biosamples are requested.
    :param include_project: when true, the study's projects are also fetched
        (``self.fetch_projects_by_study``) and each project that names a
        known ``biosampleGoldId`` is appended to that biosample's
        ``projects`` list. Projects without a ``biosampleGoldId``, or naming
        a biosample not in the response, are ignored.
    :return: the list returned by the ``/biosamples`` request; when
        ``include_project`` is true its dicts are updated in place.
    """
    # Named ``gold_study_id`` rather than ``id`` so the builtin is not shadowed.
    gold_study_id = self._normalize_id(study_id)
    results = self.request("/biosamples", params={"studyGoldId": gold_study_id})
    if not include_project:
        return results

    projects = self.fetch_projects_by_study(gold_study_id)
    biosamples_by_id = {
        biosample["biosampleGoldId"]: biosample for biosample in results
    }
    for project in projects:
        sample_id = project.get("biosampleGoldId")
        if not sample_id or sample_id not in biosamples_by_id:
            continue
        # The indexed dicts are the same objects as in ``results``, so this
        # append is visible in the returned list.
        biosamples_by_id[sample_id].setdefault("projects", []).append(project)
    return results
|
|
338
355
|
|
|
339
356
|
def fetch_projects_by_study(self, study_id: str) -> List[Dict[str, Any]]:
|
|
@@ -7,6 +7,10 @@ import pandas as pd
|
|
|
7
7
|
|
|
8
8
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
9
9
|
|
|
10
|
+
# Set of sequencing strategies from GOLD that we are filtering on
|
|
11
|
+
# based on the kind of samples that are required for NMDC
|
|
12
|
+
SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
|
|
13
|
+
|
|
10
14
|
|
|
11
15
|
class GoldStudyTranslator(Translator):
|
|
12
16
|
def __init__(
|
|
@@ -17,6 +21,7 @@ class GoldStudyTranslator(Translator):
|
|
|
17
21
|
projects: List[JSON_OBJECT] = [],
|
|
18
22
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
19
23
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
24
|
+
include_field_site_info: bool = False,
|
|
20
25
|
*args,
|
|
21
26
|
**kwargs,
|
|
22
27
|
) -> None:
|
|
@@ -24,9 +29,39 @@ class GoldStudyTranslator(Translator):
|
|
|
24
29
|
|
|
25
30
|
self.study = study
|
|
26
31
|
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
27
|
-
self.
|
|
28
|
-
|
|
29
|
-
|
|
32
|
+
self.include_field_site_info = include_field_site_info
|
|
33
|
+
# Filter biosamples to only those with `sequencingStrategy` of
|
|
34
|
+
# "Metagenome" or "Metatranscriptome"
|
|
35
|
+
self.biosamples = [
|
|
36
|
+
biosample
|
|
37
|
+
for biosample in biosamples
|
|
38
|
+
if any(
|
|
39
|
+
project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
|
|
40
|
+
for project in biosample.get("projects", [])
|
|
41
|
+
)
|
|
42
|
+
]
|
|
43
|
+
# Fetch the valid projectGoldIds that are associated with filtered
|
|
44
|
+
# biosamples on their `projects` field
|
|
45
|
+
valid_project_ids = {
|
|
46
|
+
project.get("projectGoldId")
|
|
47
|
+
for biosample in self.biosamples
|
|
48
|
+
for project in biosample.get("projects", [])
|
|
49
|
+
}
|
|
50
|
+
# Filter projects to only those with `projectGoldId` in valid_project_ids
|
|
51
|
+
self.projects = [
|
|
52
|
+
project
|
|
53
|
+
for project in projects
|
|
54
|
+
if project.get("projectGoldId") in valid_project_ids
|
|
55
|
+
]
|
|
56
|
+
# Filter analysis_projects to only those with all `projects` in valid_project_ids
|
|
57
|
+
self.analysis_projects = [
|
|
58
|
+
analysis_project
|
|
59
|
+
for analysis_project in analysis_projects
|
|
60
|
+
if all(
|
|
61
|
+
project_id in valid_project_ids
|
|
62
|
+
for project_id in analysis_project.get("projects", [])
|
|
63
|
+
)
|
|
64
|
+
]
|
|
30
65
|
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
31
66
|
|
|
32
67
|
self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
|
|
@@ -596,7 +631,11 @@ class GoldStudyTranslator(Translator):
|
|
|
596
631
|
principal_investigator=self._get_pi(gold_project),
|
|
597
632
|
processing_institution=self._get_processing_institution(gold_project),
|
|
598
633
|
instrument_used=self._get_instrument(gold_project),
|
|
599
|
-
analyte_category=
|
|
634
|
+
analyte_category=(
|
|
635
|
+
gold_project.get("sequencingStrategy").lower()
|
|
636
|
+
if gold_project.get("sequencingStrategy")
|
|
637
|
+
else None
|
|
638
|
+
),
|
|
600
639
|
associated_studies=[nmdc_study_id],
|
|
601
640
|
)
|
|
602
641
|
|
|
@@ -621,21 +660,24 @@ class GoldStudyTranslator(Translator):
|
|
|
621
660
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
|
|
622
661
|
gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
|
|
623
662
|
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
663
|
+
if self.include_field_site_info:
|
|
664
|
+
gold_field_site_names = sorted(
|
|
665
|
+
{self._get_field_site_name(biosample) for biosample in self.biosamples}
|
|
666
|
+
)
|
|
667
|
+
nmdc_field_site_ids = self._id_minter(
|
|
668
|
+
"nmdc:FieldResearchSite", len(gold_field_site_names)
|
|
669
|
+
)
|
|
670
|
+
gold_name_to_nmdc_field_site_ids = dict(
|
|
671
|
+
zip(gold_field_site_names, nmdc_field_site_ids)
|
|
672
|
+
)
|
|
673
|
+
gold_biosample_to_nmdc_field_site_ids = {
|
|
674
|
+
biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
|
|
675
|
+
self._get_field_site_name(biosample)
|
|
676
|
+
]
|
|
677
|
+
for biosample in self.biosamples
|
|
678
|
+
}
|
|
679
|
+
else:
|
|
680
|
+
gold_biosample_to_nmdc_field_site_ids = {}
|
|
639
681
|
|
|
640
682
|
gold_project_ids = [project["projectGoldId"] for project in self.projects]
|
|
641
683
|
nmdc_nucleotide_sequencing_ids = self._id_minter(
|
|
@@ -653,16 +695,17 @@ class GoldStudyTranslator(Translator):
|
|
|
653
695
|
biosample["biosampleGoldId"]
|
|
654
696
|
],
|
|
655
697
|
nmdc_study_id=nmdc_study_id,
|
|
656
|
-
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids
|
|
657
|
-
biosample["biosampleGoldId"]
|
|
658
|
-
|
|
698
|
+
nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
|
|
699
|
+
biosample["biosampleGoldId"], None
|
|
700
|
+
),
|
|
659
701
|
)
|
|
660
702
|
for biosample in self.biosamples
|
|
661
703
|
]
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
704
|
+
if self.include_field_site_info:
|
|
705
|
+
database.field_research_site_set = [
|
|
706
|
+
nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
|
|
707
|
+
for name, id in gold_name_to_nmdc_field_site_ids.items()
|
|
708
|
+
]
|
|
666
709
|
database.data_generation_set = [
|
|
667
710
|
self._translate_nucleotide_sequencing(
|
|
668
711
|
project,
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
import re
|
|
3
3
|
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from functools import lru_cache
|
|
5
6
|
from importlib import resources
|
|
6
7
|
from typing import Any, List, Optional, Union
|
|
@@ -8,14 +9,36 @@ from typing import Any, List, Optional, Union
|
|
|
8
9
|
from linkml_runtime import SchemaView
|
|
9
10
|
from linkml_runtime.linkml_model import SlotDefinition
|
|
10
11
|
from nmdc_schema import nmdc
|
|
11
|
-
from toolz import
|
|
12
|
+
from toolz import concat, dissoc, get_in, groupby, valmap
|
|
12
13
|
|
|
13
14
|
from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
|
|
14
15
|
|
|
15
|
-
|
|
16
16
|
BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
|
|
17
17
|
|
|
18
18
|
|
|
19
|
+
class EnvironmentPackage(Enum):
|
|
20
|
+
r"""
|
|
21
|
+
Enumeration of all possible environmental packages.
|
|
22
|
+
|
|
23
|
+
>>> EnvironmentPackage.AIR.value
|
|
24
|
+
'air'
|
|
25
|
+
>>> EnvironmentPackage.SEDIMENT.value
|
|
26
|
+
'sediment'
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
AIR = "air"
|
|
30
|
+
BIOFILM = "microbial mat_biofilm"
|
|
31
|
+
BUILT_ENV = "built environment"
|
|
32
|
+
HCR_CORES = "hydrocarbon resources-cores"
|
|
33
|
+
HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
|
|
34
|
+
HOST_ASSOCIATED = "host-associated"
|
|
35
|
+
MISC_ENVS = "miscellaneous natural or artificial environment"
|
|
36
|
+
PLANT_ASSOCIATED = "plant-associated"
|
|
37
|
+
SEDIMENT = "sediment"
|
|
38
|
+
SOIL = "soil"
|
|
39
|
+
WATER = "water"
|
|
40
|
+
|
|
41
|
+
|
|
19
42
|
@lru_cache
|
|
20
43
|
def _get_schema_view():
|
|
21
44
|
"""Return a SchemaView instance representing the NMDC schema"""
|
|
@@ -550,7 +573,6 @@ class SubmissionPortalTranslator(Translator):
|
|
|
550
573
|
sample_data: List[JSON_OBJECT],
|
|
551
574
|
nmdc_biosample_id: str,
|
|
552
575
|
nmdc_study_id: str,
|
|
553
|
-
default_env_package: str,
|
|
554
576
|
) -> nmdc.Biosample:
|
|
555
577
|
"""Translate sample data from portal submission into an `nmdc:Biosample` object.
|
|
556
578
|
|
|
@@ -565,18 +587,23 @@ class SubmissionPortalTranslator(Translator):
|
|
|
565
587
|
from each applicable submission portal tab
|
|
566
588
|
:param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
|
|
567
589
|
:param nmdc_study_id: Minted nmdc:Study identifier for the related Study
|
|
568
|
-
:param default_env_package: Default value for `env_package` slot
|
|
569
590
|
:return: nmdc:Biosample
|
|
570
591
|
"""
|
|
571
|
-
|
|
592
|
+
env_idx = next(
|
|
593
|
+
(
|
|
594
|
+
i
|
|
595
|
+
for i, tab in enumerate(sample_data)
|
|
596
|
+
if tab.get("env_package") is not None
|
|
597
|
+
),
|
|
598
|
+
0,
|
|
599
|
+
)
|
|
600
|
+
biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
|
|
572
601
|
slots = {
|
|
573
602
|
"id": nmdc_biosample_id,
|
|
574
603
|
"associated_studies": [nmdc_study_id],
|
|
575
604
|
"type": "nmdc:Biosample",
|
|
576
|
-
"name": sample_data[
|
|
577
|
-
"env_package":
|
|
578
|
-
has_raw_value=default_env_package, type="nmdc:TextValue"
|
|
579
|
-
),
|
|
605
|
+
"name": sample_data[env_idx].get("samp_name", "").strip(),
|
|
606
|
+
"env_package": sample_data[env_idx].get("env_package"),
|
|
580
607
|
}
|
|
581
608
|
for tab in sample_data:
|
|
582
609
|
transformed_tab = self._transform_dict_for_class(tab, "Biosample")
|
|
@@ -613,9 +640,18 @@ class SubmissionPortalTranslator(Translator):
|
|
|
613
640
|
]
|
|
614
641
|
|
|
615
642
|
sample_data = metadata_submission_data.get("sampleData", {})
|
|
616
|
-
|
|
643
|
+
for key in sample_data.keys():
|
|
644
|
+
env = key.removesuffix("_data").upper()
|
|
645
|
+
try:
|
|
646
|
+
package_name = EnvironmentPackage[env].value
|
|
647
|
+
for sample in sample_data[key]:
|
|
648
|
+
sample["env_package"] = package_name
|
|
649
|
+
except KeyError:
|
|
650
|
+
pass
|
|
651
|
+
|
|
617
652
|
sample_data_by_id = groupby(
|
|
618
|
-
BIOSAMPLE_UNIQUE_KEY_SLOT,
|
|
653
|
+
BIOSAMPLE_UNIQUE_KEY_SLOT,
|
|
654
|
+
concat(sample_data.values()),
|
|
619
655
|
)
|
|
620
656
|
nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
|
|
621
657
|
sample_data_to_nmdc_biosample_ids = dict(
|
|
@@ -627,7 +663,6 @@ class SubmissionPortalTranslator(Translator):
|
|
|
627
663
|
sample_data,
|
|
628
664
|
nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
|
|
629
665
|
nmdc_study_id=nmdc_study_id,
|
|
630
|
-
default_env_package=package_name,
|
|
631
666
|
)
|
|
632
667
|
for sample_data_id, sample_data in sample_data_by_id.items()
|
|
633
668
|
if sample_data
|
nmdc_runtime/util.py
CHANGED
|
@@ -17,6 +17,8 @@ import fastjsonschema
|
|
|
17
17
|
import requests
|
|
18
18
|
from frozendict import frozendict
|
|
19
19
|
from jsonschema.validators import Draft7Validator
|
|
20
|
+
from linkml_runtime import linkml_model
|
|
21
|
+
from linkml_runtime.utils.schemaview import SchemaView
|
|
20
22
|
from nmdc_schema.nmdc import Database as NMDCDatabase
|
|
21
23
|
from nmdc_schema.get_nmdc_view import ViewGetter
|
|
22
24
|
from pydantic import Field, BaseModel
|
|
@@ -29,6 +31,48 @@ from nmdc_runtime.api.models.object import DrsObjectIn
|
|
|
29
31
|
from typing_extensions import Annotated
|
|
30
32
|
|
|
31
33
|
|
|
34
|
+
def get_names_of_classes_in_effective_range_of_slot(
|
|
35
|
+
schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
|
|
36
|
+
) -> List[str]:
|
|
37
|
+
r"""
|
|
38
|
+
Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
|
|
39
|
+
|
|
40
|
+
Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
|
|
41
|
+
induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
|
|
42
|
+
of applying those additional constraints, so we do it manually here (if any are defined).
|
|
43
|
+
Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
|
|
44
|
+
|
|
45
|
+
Reference: https://linkml.io/linkml-model/latest/docs/any_of/
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
# Initialize the list to be empty.
|
|
49
|
+
names_of_eligible_target_classes = []
|
|
50
|
+
|
|
51
|
+
# If the `any_of` constraint is defined on this slot, use that instead of the `range`.
|
|
52
|
+
if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
|
|
53
|
+
for slot_expression in slot_definition.any_of:
|
|
54
|
+
# Use the slot expression's `range` to get the specified eligible class name
|
|
55
|
+
# and the names of all classes that inherit from that eligible class.
|
|
56
|
+
if slot_expression.range in schema_view.all_classes():
|
|
57
|
+
own_and_descendant_class_names = schema_view.class_descendants(
|
|
58
|
+
slot_expression.range
|
|
59
|
+
)
|
|
60
|
+
names_of_eligible_target_classes.extend(own_and_descendant_class_names)
|
|
61
|
+
else:
|
|
62
|
+
# Use the slot's `range` to get the specified eligible class name
|
|
63
|
+
# and the names of all classes that inherit from that eligible class.
|
|
64
|
+
if slot_definition.range in schema_view.all_classes():
|
|
65
|
+
own_and_descendant_class_names = schema_view.class_descendants(
|
|
66
|
+
slot_definition.range
|
|
67
|
+
)
|
|
68
|
+
names_of_eligible_target_classes.extend(own_and_descendant_class_names)
|
|
69
|
+
|
|
70
|
+
# Remove duplicate class names.
|
|
71
|
+
names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
|
|
72
|
+
|
|
73
|
+
return names_of_eligible_target_classes
|
|
74
|
+
|
|
75
|
+
|
|
32
76
|
def get_class_names_from_collection_spec(
|
|
33
77
|
spec: dict, prefix: Optional[str] = None
|
|
34
78
|
) -> List[str]:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: nmdc_runtime
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: A runtime system for NMDC data management and orchestration
|
|
5
5
|
Home-page: https://github.com/microbiomedata/nmdc-runtime
|
|
6
6
|
Author: Donny Winston
|
|
@@ -145,8 +145,6 @@ http://127.0.0.1:8000/redoc/.
|
|
|
145
145
|
|
|
146
146
|
Tests can be found in `tests` and are run with the following commands:
|
|
147
147
|
|
|
148
|
-
On an M1 Mac? May need to `export DOCKER_DEFAULT_PLATFORM=linux/amd64`.
|
|
149
|
-
|
|
150
148
|
```bash
|
|
151
149
|
make up-test
|
|
152
150
|
make test
|
|
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
2
2
|
nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
|
|
3
3
|
nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
|
|
4
4
|
nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
-
nmdc_runtime/util.py,sha256=
|
|
5
|
+
nmdc_runtime/util.py,sha256=aMzS8eATEjpXOiuyAFYthx92fb_cgIzWWd5ZQU6ZlAY,22931
|
|
6
6
|
nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
|
|
@@ -36,10 +36,10 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
|
|
|
36
36
|
nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
38
38
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
-
nmdc_runtime/site/graphs.py,sha256=
|
|
40
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
41
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
42
|
-
nmdc_runtime/site/resources.py,sha256=
|
|
39
|
+
nmdc_runtime/site/graphs.py,sha256=mu4bE8799TItWXaPBfOeFB2XMyYwPZcj-VJQmadN2MA,14171
|
|
40
|
+
nmdc_runtime/site/ops.py,sha256=T9_WrwDaySGnu6olwOHQizHQfeofMOaqMcq_vYEIzO0,43140
|
|
41
|
+
nmdc_runtime/site/repository.py,sha256=JtHlp6l3UVo0QhV670TGns9bMfht7NOQrNWQtvsYr2g,39183
|
|
42
|
+
nmdc_runtime/site/resources.py,sha256=6bmvplgql3KdEXKI49BibSk0Sug96SFJi8eOs2zeKK0,18252
|
|
43
43
|
nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
|
|
44
44
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
45
|
nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
|
|
@@ -51,21 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
51
51
|
nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
|
|
52
52
|
nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
|
|
53
53
|
nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
54
|
-
nmdc_runtime/site/export/ncbi_xml.py,sha256=
|
|
55
|
-
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=
|
|
54
|
+
nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
|
|
55
|
+
nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
|
|
56
56
|
nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
|
|
57
57
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
58
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
59
59
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
60
60
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
61
61
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
62
|
-
nmdc_runtime/site/translation/gold_translator.py,sha256=
|
|
62
|
+
nmdc_runtime/site/translation/gold_translator.py,sha256=RfAB68dJ9hDep20wETmCNBc0gugZbEKqVimT8h2t0uM,31470
|
|
63
63
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
64
64
|
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
|
|
65
65
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
|
|
66
66
|
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
|
|
67
67
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
68
|
-
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=
|
|
68
|
+
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
|
|
69
69
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
70
70
|
nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
|
|
71
71
|
nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -73,9 +73,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
73
73
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
74
74
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
75
75
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
76
|
-
nmdc_runtime-2.
|
|
77
|
-
nmdc_runtime-2.
|
|
78
|
-
nmdc_runtime-2.
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
76
|
+
nmdc_runtime-2.2.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
77
|
+
nmdc_runtime-2.2.0.dist-info/METADATA,sha256=igSdpzN5dxlLV9r_O8btdkVPMTvLDzkn032LUdb-3hY,7256
|
|
78
|
+
nmdc_runtime-2.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
|
79
|
+
nmdc_runtime-2.2.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
80
|
+
nmdc_runtime-2.2.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
81
|
+
nmdc_runtime-2.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|