nmdc-runtime 2.8.0__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/config.py +54 -2
- nmdc_runtime/site/graphs.py +32 -6
- nmdc_runtime/site/ops.py +42 -90
- nmdc_runtime/site/repair/database_updater.py +8 -0
- nmdc_runtime/site/repository.py +7 -5
- nmdc_runtime/site/resources.py +4 -4
- nmdc_runtime/site/translation/gold_translator.py +18 -9
- nmdc_runtime/site/translation/submission_portal_translator.py +62 -0
- nmdc_runtime/util.py +6 -274
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/METADATA +1 -1
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/RECORD +15 -15
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.9.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/config.py
CHANGED
|
@@ -1,5 +1,57 @@
|
|
|
1
|
-
|
|
1
|
+
"""
|
|
2
|
+
This module acts as a unified interface between the codebase and the environment.
|
|
3
|
+
We will eventually move all of the Runtime's environment variable reads into this
|
|
4
|
+
module, instead of leaving them sprinkled throughout the codebase.
|
|
5
|
+
|
|
6
|
+
TODO: Move all environment variable reads into this module and update references accordingly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Set
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def is_env_var_true(name: str, default: str = "false") -> bool:
    r"""
    Checks whether the value of the specified environment variable
    meets our criteria for true-ness.

    The value is lowercased and compared against the set of accepted "true"
    strings (currently only "true"); any other value is considered not true.

    :param name: Name of the environment variable to look up in `os.environ`.
    :param default: Value to evaluate when the variable is not defined.
    :return: `True` if the lowercased value is an accepted "true" string,
             otherwise `False`.

    Reference: https://docs.python.org/3/library/os.html#os.environ

    Run doctests via: $ python -m doctest nmdc_runtime/config.py

    Note: The examples remove the variable from `os.environ` directly (rather
    than calling `os.unsetenv`), because `os.unsetenv` does not update the
    `os.environ` mapping that this function reads.

    >>> import os
    >>> name = "EXAMPLE_ENV_VAR"
    >>> _ = os.environ.pop(name, None)  # Undefined
    >>> is_env_var_true(name)
    False
    >>> is_env_var_true(name, "true")  # Undefined, overridden default
    True
    >>> os.environ[name] = "false"  # Defined as false
    >>> is_env_var_true(name)
    False
    >>> os.environ[name] = "true"  # Defined as true
    >>> is_env_var_true(name)
    True
    >>> os.environ[name] = "TRUE"  # Case-insensitive
    >>> is_env_var_true(name)
    True
    >>> os.environ[name] = "potato"  # Non-boolean string
    >>> is_env_var_true(name)
    False
    >>> _ = os.environ.pop(name, None)  # Clean up after the examples
    """
    # All accepted spellings are stored lowercased so the comparison below
    # is case-insensitive.
    lowercase_true_strings: Set[str] = {"true"}
    return os.environ.get(name, default).lower() in lowercase_true_strings
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# The name of the schema class representing the database. We don't bother to
|
|
47
|
+
# make this customizable via the environment, as we expect it to never change.
|
|
48
|
+
DATABASE_CLASS_NAME: str = "Database"
|
|
2
49
|
|
|
3
50
|
# Feature flag that can be used to enable/disable the `/nmdcschema/related_ids`
|
|
4
51
|
# endpoint and the tests that target it.
|
|
5
|
-
IS_RELATED_IDS_ENDPOINT_ENABLED =
|
|
52
|
+
IS_RELATED_IDS_ENDPOINT_ENABLED: bool = is_env_var_true(
|
|
53
|
+
"IS_RELATED_IDS_ENDPOINT_ENABLED", default="true"
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Feature flag that can be used to enable/disable the `/scalar` endpoint.
|
|
57
|
+
IS_SCALAR_ENABLED: bool = is_env_var_true("IS_SCALAR_ENABLED", default="true")
|
nmdc_runtime/site/graphs.py
CHANGED
|
@@ -160,6 +160,7 @@ def gold_study_to_database():
|
|
|
160
160
|
study_type,
|
|
161
161
|
gold_nmdc_instrument_mapping_file_url,
|
|
162
162
|
include_field_site_info,
|
|
163
|
+
enable_biosample_filtering,
|
|
163
164
|
) = get_gold_study_pipeline_inputs()
|
|
164
165
|
|
|
165
166
|
projects = gold_projects_by_study(study_id)
|
|
@@ -176,6 +177,7 @@ def gold_study_to_database():
|
|
|
176
177
|
analysis_projects,
|
|
177
178
|
gold_nmdc_instrument_map_df,
|
|
178
179
|
include_field_site_info,
|
|
180
|
+
enable_biosample_filtering,
|
|
179
181
|
)
|
|
180
182
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
181
183
|
filename = nmdc_schema_database_export_filename(study)
|
|
@@ -506,11 +508,19 @@ def nmdc_study_to_ncbi_submission_export():
|
|
|
506
508
|
|
|
507
509
|
@graph
|
|
508
510
|
def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
509
|
-
(
|
|
511
|
+
(
|
|
512
|
+
study_id,
|
|
513
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
514
|
+
include_field_site_info,
|
|
515
|
+
enable_biosample_filtering,
|
|
516
|
+
) = get_database_updater_inputs()
|
|
510
517
|
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
511
518
|
|
|
512
519
|
database = generate_data_generation_set_post_biosample_ingest(
|
|
513
|
-
study_id,
|
|
520
|
+
study_id,
|
|
521
|
+
gold_nmdc_instrument_map_df,
|
|
522
|
+
include_field_site_info,
|
|
523
|
+
enable_biosample_filtering,
|
|
514
524
|
)
|
|
515
525
|
|
|
516
526
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
@@ -523,11 +533,19 @@ def generate_data_generation_set_for_biosamples_in_nmdc_study():
|
|
|
523
533
|
|
|
524
534
|
@graph
|
|
525
535
|
def generate_biosample_set_from_samples_in_gold():
|
|
526
|
-
(
|
|
536
|
+
(
|
|
537
|
+
study_id,
|
|
538
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
539
|
+
include_field_site_info,
|
|
540
|
+
enable_biosample_filtering,
|
|
541
|
+
) = get_database_updater_inputs()
|
|
527
542
|
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
528
543
|
|
|
529
544
|
database = generate_biosample_set_for_nmdc_study_from_gold(
|
|
530
|
-
study_id,
|
|
545
|
+
study_id,
|
|
546
|
+
gold_nmdc_instrument_map_df,
|
|
547
|
+
include_field_site_info,
|
|
548
|
+
enable_biosample_filtering,
|
|
531
549
|
)
|
|
532
550
|
database_dict = nmdc_schema_object_to_dict(database)
|
|
533
551
|
filename = post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
@@ -545,10 +563,18 @@ def generate_update_script_for_insdc_biosample_identifiers():
|
|
|
545
563
|
to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
|
|
546
564
|
The script is returned as a dictionary that can be executed against MongoDB.
|
|
547
565
|
"""
|
|
548
|
-
(
|
|
566
|
+
(
|
|
567
|
+
study_id,
|
|
568
|
+
gold_nmdc_instrument_mapping_file_url,
|
|
569
|
+
include_field_site_info,
|
|
570
|
+
enable_biosample_filtering,
|
|
571
|
+
) = get_database_updater_inputs()
|
|
549
572
|
gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
|
|
550
573
|
|
|
551
574
|
update_script = run_script_to_update_insdc_biosample_identifiers(
|
|
552
|
-
study_id,
|
|
575
|
+
study_id,
|
|
576
|
+
gold_nmdc_instrument_map_df,
|
|
577
|
+
include_field_site_info,
|
|
578
|
+
enable_biosample_filtering,
|
|
553
579
|
)
|
|
554
580
|
render_text(update_script)
|
nmdc_runtime/site/ops.py
CHANGED
|
@@ -10,7 +10,7 @@ from datetime import datetime, timezone
|
|
|
10
10
|
from io import BytesIO, StringIO
|
|
11
11
|
from pprint import pformat
|
|
12
12
|
from toolz.dicttoolz import keyfilter
|
|
13
|
-
from typing import Tuple, Set
|
|
13
|
+
from typing import Tuple, Set, Union
|
|
14
14
|
from zipfile import ZipFile
|
|
15
15
|
from itertools import chain
|
|
16
16
|
from ontology_loader.ontology_load_controller import OntologyLoaderController
|
|
@@ -44,7 +44,7 @@ from dagster import (
|
|
|
44
44
|
from gridfs import GridFS
|
|
45
45
|
from linkml_runtime.utils.dictutils import as_simple_dict
|
|
46
46
|
from linkml_runtime.utils.yamlutils import YAMLRoot
|
|
47
|
-
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
47
|
+
from nmdc_runtime.api.db.mongo import get_mongo_db, validate_json
|
|
48
48
|
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
49
49
|
from nmdc_runtime.api.core.metadata import (
|
|
50
50
|
_validate_changesheet,
|
|
@@ -106,7 +106,6 @@ from nmdc_runtime.util import (
|
|
|
106
106
|
get_names_of_classes_in_effective_range_of_slot,
|
|
107
107
|
pluralize,
|
|
108
108
|
put_object,
|
|
109
|
-
validate_json,
|
|
110
109
|
specialize_activity_set_docs,
|
|
111
110
|
collection_name_to_class_names,
|
|
112
111
|
class_hierarchy_as_list,
|
|
@@ -481,83 +480,6 @@ def get_json_in(context):
|
|
|
481
480
|
return rv.json()
|
|
482
481
|
|
|
483
482
|
|
|
484
|
-
def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
|
|
485
|
-
"""
|
|
486
|
-
Does not ensure ordering of `docs`.
|
|
487
|
-
|
|
488
|
-
TODO: Document this function. What _does_ it do (or what was it designed to do)?
|
|
489
|
-
What, conceptually, did the author design it to receive (as `docs`); a dict
|
|
490
|
-
having a `data_object_set` item whose value is a list of documents.
|
|
491
|
-
What, conceptually, did the author design it to return?
|
|
492
|
-
"""
|
|
493
|
-
|
|
494
|
-
if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
|
|
495
|
-
return docs, 0
|
|
496
|
-
|
|
497
|
-
do_docs = docs["data_object_set"]
|
|
498
|
-
|
|
499
|
-
class FileTypeEnumBase(BaseModel):
|
|
500
|
-
name: str
|
|
501
|
-
description: str
|
|
502
|
-
filter: str # JSON-encoded data_object_set mongo collection filter document
|
|
503
|
-
|
|
504
|
-
class FileTypeEnum(FileTypeEnumBase):
|
|
505
|
-
id: str
|
|
506
|
-
|
|
507
|
-
# Make a temporary collection (which will be dropped below) and insert the
|
|
508
|
-
# specified `data_object_set` documents into it.
|
|
509
|
-
temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
|
|
510
|
-
temp_collection = mdb[temp_collection_name]
|
|
511
|
-
temp_collection.insert_many(do_docs)
|
|
512
|
-
temp_collection.create_index("id")
|
|
513
|
-
|
|
514
|
-
def fte_matches(fte_filter: str) -> List[dict]:
|
|
515
|
-
r"""
|
|
516
|
-
Returns a list of documents—without their `_id` field—that match the specified filter,
|
|
517
|
-
which is encoded as a JSON string.
|
|
518
|
-
"""
|
|
519
|
-
return [
|
|
520
|
-
dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
|
|
521
|
-
]
|
|
522
|
-
|
|
523
|
-
# Create a mapping from each document's `id` to the document, itself.
|
|
524
|
-
do_docs_map = {d["id"]: d for d in do_docs}
|
|
525
|
-
|
|
526
|
-
n_docs_with_types_added = 0
|
|
527
|
-
|
|
528
|
-
# For each `file_type_enum` document in the database, find all the documents (among the
|
|
529
|
-
# `data_object_set` documents provided by the caller) that match that `file_type_enum`
|
|
530
|
-
# document's filter.
|
|
531
|
-
#
|
|
532
|
-
# If any of those documents lacks a `data_object_type` field, update the original
|
|
533
|
-
# `data_object_set` document so that its `data_object_type` field is set to
|
|
534
|
-
# the `file_type_enum` document's `id` (why not its `name`?).
|
|
535
|
-
#
|
|
536
|
-
# TODO: I don't know why this sets `data_object_type` to `file_type_enum.id`,
|
|
537
|
-
# as opposed to `file_type_enum.name`.
|
|
538
|
-
#
|
|
539
|
-
for fte_doc in mdb.file_type_enum.find():
|
|
540
|
-
fte = FileTypeEnum(**fte_doc)
|
|
541
|
-
docs_matching = fte_matches(fte.filter)
|
|
542
|
-
for doc in docs_matching:
|
|
543
|
-
if "data_object_type" not in doc:
|
|
544
|
-
do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
|
|
545
|
-
n_docs_with_types_added += 1
|
|
546
|
-
|
|
547
|
-
mdb.drop_collection(temp_collection_name)
|
|
548
|
-
|
|
549
|
-
# Returns a tuple. The first item is the original `docs` dictionary, but with the
|
|
550
|
-
# `data_object_set` list replaced by the list of the documents that are in the
|
|
551
|
-
# `do_docs_map` dictionary (with their `_id` fields omitted). The second item is
|
|
552
|
-
# the number of documents to which this function added a `data_object_type` field.
|
|
553
|
-
return (
|
|
554
|
-
assoc(
|
|
555
|
-
docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
|
|
556
|
-
),
|
|
557
|
-
n_docs_with_types_added,
|
|
558
|
-
)
|
|
559
|
-
|
|
560
|
-
|
|
561
483
|
@op(required_resource_keys={"runtime_api_site_client", "mongo"})
|
|
562
484
|
def perform_mongo_updates(context, json_in):
|
|
563
485
|
mongo = context.resources.mongo
|
|
@@ -566,8 +488,6 @@ def perform_mongo_updates(context, json_in):
|
|
|
566
488
|
|
|
567
489
|
docs = json_in
|
|
568
490
|
docs, _ = specialize_activity_set_docs(docs)
|
|
569
|
-
docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
|
|
570
|
-
context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
|
|
571
491
|
context.log.debug(f"{docs}")
|
|
572
492
|
|
|
573
493
|
rv = validate_json(
|
|
@@ -636,22 +556,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
|
|
|
636
556
|
"study_type": str,
|
|
637
557
|
"gold_nmdc_instrument_mapping_file_url": str,
|
|
638
558
|
"include_field_site_info": bool,
|
|
559
|
+
"enable_biosample_filtering": bool,
|
|
639
560
|
},
|
|
640
561
|
out={
|
|
641
562
|
"study_id": Out(str),
|
|
642
563
|
"study_type": Out(str),
|
|
643
564
|
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
644
565
|
"include_field_site_info": Out(bool),
|
|
566
|
+
"enable_biosample_filtering": Out(bool),
|
|
645
567
|
},
|
|
646
568
|
)
|
|
647
569
|
def get_gold_study_pipeline_inputs(
|
|
648
570
|
context: OpExecutionContext,
|
|
649
|
-
) -> Tuple[str, str, str, bool]:
|
|
571
|
+
) -> Tuple[str, str, str, bool, bool]:
|
|
650
572
|
return (
|
|
651
573
|
context.op_config["study_id"],
|
|
652
574
|
context.op_config["study_type"],
|
|
653
575
|
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
654
576
|
context.op_config["include_field_site_info"],
|
|
577
|
+
context.op_config["enable_biosample_filtering"],
|
|
655
578
|
)
|
|
656
579
|
|
|
657
580
|
|
|
@@ -695,6 +618,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
695
618
|
analysis_projects: List[Dict[str, Any]],
|
|
696
619
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
697
620
|
include_field_site_info: bool,
|
|
621
|
+
enable_biosample_filtering: bool,
|
|
698
622
|
) -> nmdc.Database:
|
|
699
623
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
700
624
|
|
|
@@ -710,6 +634,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
710
634
|
analysis_projects,
|
|
711
635
|
gold_nmdc_instrument_map_df,
|
|
712
636
|
include_field_site_info,
|
|
637
|
+
enable_biosample_filtering,
|
|
713
638
|
id_minter=id_minter,
|
|
714
639
|
)
|
|
715
640
|
database = translator.get_database()
|
|
@@ -1110,6 +1035,8 @@ def load_ontology(context: OpExecutionContext):
|
|
|
1110
1035
|
source_ontology=source_ontology,
|
|
1111
1036
|
output_directory=output_directory,
|
|
1112
1037
|
generate_reports=generate_reports,
|
|
1038
|
+
mongo_client=context.resources.mongo.client,
|
|
1039
|
+
db_name=context.resources.mongo.db.name,
|
|
1113
1040
|
)
|
|
1114
1041
|
|
|
1115
1042
|
loader.run_ontology_loader()
|
|
@@ -1192,8 +1119,8 @@ def _add_related_ids_to_alldocs(
|
|
|
1192
1119
|
"has_mass_spectrometry_configuration", # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
|
|
1193
1120
|
"instrument_used", # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
|
|
1194
1121
|
"uses_calibration", # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
|
|
1195
|
-
"was_generated_by", # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy
|
|
1196
|
-
"was_informed_by", # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy
|
|
1122
|
+
"was_generated_by", # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy.
|
|
1123
|
+
"was_informed_by", # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy.
|
|
1197
1124
|
]
|
|
1198
1125
|
# An "outbound" slot is one for which an entity in the domain "influences"
|
|
1199
1126
|
# (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
|
|
@@ -1572,16 +1499,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
|
1572
1499
|
config_schema={
|
|
1573
1500
|
"nmdc_study_id": str,
|
|
1574
1501
|
"gold_nmdc_instrument_mapping_file_url": str,
|
|
1502
|
+
"include_field_site_info": bool,
|
|
1503
|
+
"enable_biosample_filtering": bool,
|
|
1575
1504
|
},
|
|
1576
1505
|
out={
|
|
1577
1506
|
"nmdc_study_id": Out(str),
|
|
1578
1507
|
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
1508
|
+
"include_field_site_info": Out(bool),
|
|
1509
|
+
"enable_biosample_filtering": Out(bool),
|
|
1579
1510
|
},
|
|
1580
1511
|
)
|
|
1581
|
-
def get_database_updater_inputs(
|
|
1512
|
+
def get_database_updater_inputs(
|
|
1513
|
+
context: OpExecutionContext,
|
|
1514
|
+
) -> Tuple[str, str, bool, bool]:
|
|
1582
1515
|
return (
|
|
1583
1516
|
context.op_config["nmdc_study_id"],
|
|
1584
1517
|
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
1518
|
+
context.op_config["include_field_site_info"],
|
|
1519
|
+
context.op_config["enable_biosample_filtering"],
|
|
1585
1520
|
)
|
|
1586
1521
|
|
|
1587
1522
|
|
|
@@ -1596,6 +1531,8 @@ def generate_data_generation_set_post_biosample_ingest(
|
|
|
1596
1531
|
context: OpExecutionContext,
|
|
1597
1532
|
nmdc_study_id: str,
|
|
1598
1533
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1534
|
+
include_field_site_info: bool,
|
|
1535
|
+
enable_biosample_filtering: bool,
|
|
1599
1536
|
) -> nmdc.Database:
|
|
1600
1537
|
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1601
1538
|
context.resources.runtime_api_user_client
|
|
@@ -1611,6 +1548,8 @@ def generate_data_generation_set_post_biosample_ingest(
|
|
|
1611
1548
|
gold_api_client,
|
|
1612
1549
|
nmdc_study_id,
|
|
1613
1550
|
gold_nmdc_instrument_map_df,
|
|
1551
|
+
include_field_site_info,
|
|
1552
|
+
enable_biosample_filtering,
|
|
1614
1553
|
)
|
|
1615
1554
|
database = (
|
|
1616
1555
|
database_updater.generate_data_generation_set_records_from_gold_api_for_study()
|
|
@@ -1630,6 +1569,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1630
1569
|
context: OpExecutionContext,
|
|
1631
1570
|
nmdc_study_id: str,
|
|
1632
1571
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1572
|
+
include_field_site_info: bool = False,
|
|
1573
|
+
enable_biosample_filtering: bool = False,
|
|
1633
1574
|
) -> nmdc.Database:
|
|
1634
1575
|
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1635
1576
|
context.resources.runtime_api_user_client
|
|
@@ -1645,6 +1586,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1645
1586
|
gold_api_client,
|
|
1646
1587
|
nmdc_study_id,
|
|
1647
1588
|
gold_nmdc_instrument_map_df,
|
|
1589
|
+
include_field_site_info,
|
|
1590
|
+
enable_biosample_filtering,
|
|
1648
1591
|
)
|
|
1649
1592
|
database = database_updater.generate_biosample_set_from_gold_api_for_study()
|
|
1650
1593
|
|
|
@@ -1656,13 +1599,16 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1656
1599
|
"runtime_api_user_client",
|
|
1657
1600
|
"runtime_api_site_client",
|
|
1658
1601
|
"gold_api_client",
|
|
1659
|
-
}
|
|
1602
|
+
},
|
|
1603
|
+
out=Out(Any),
|
|
1660
1604
|
)
|
|
1661
1605
|
def run_script_to_update_insdc_biosample_identifiers(
|
|
1662
1606
|
context: OpExecutionContext,
|
|
1663
1607
|
nmdc_study_id: str,
|
|
1664
1608
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1665
|
-
|
|
1609
|
+
include_field_site_info: bool,
|
|
1610
|
+
enable_biosample_filtering: bool,
|
|
1611
|
+
):
|
|
1666
1612
|
"""Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
|
|
1667
1613
|
|
|
1668
1614
|
This op uses the DatabaseUpdater to generate a script that can be used to update biosample
|
|
@@ -1674,7 +1620,7 @@ def run_script_to_update_insdc_biosample_identifiers(
|
|
|
1674
1620
|
gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
|
|
1675
1621
|
|
|
1676
1622
|
Returns:
|
|
1677
|
-
A dictionary containing the MongoDB update script
|
|
1623
|
+
A dictionary or list of dictionaries containing the MongoDB update script(s)
|
|
1678
1624
|
"""
|
|
1679
1625
|
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1680
1626
|
context.resources.runtime_api_user_client
|
|
@@ -1690,11 +1636,17 @@ def run_script_to_update_insdc_biosample_identifiers(
|
|
|
1690
1636
|
gold_api_client,
|
|
1691
1637
|
nmdc_study_id,
|
|
1692
1638
|
gold_nmdc_instrument_map_df,
|
|
1639
|
+
include_field_site_info,
|
|
1640
|
+
enable_biosample_filtering,
|
|
1693
1641
|
)
|
|
1694
1642
|
update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
|
|
1695
1643
|
|
|
1644
|
+
if isinstance(update_script, list):
|
|
1645
|
+
total_updates = sum(len(item.get("updates", [])) for item in update_script)
|
|
1646
|
+
else:
|
|
1647
|
+
total_updates = len(update_script.get("updates", []))
|
|
1696
1648
|
context.log.info(
|
|
1697
|
-
f"Generated update script for study {nmdc_study_id} with {
|
|
1649
|
+
f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
|
|
1698
1650
|
)
|
|
1699
1651
|
|
|
1700
1652
|
return update_script
|
|
@@ -18,6 +18,8 @@ class DatabaseUpdater:
|
|
|
18
18
|
gold_api_client: GoldApiClient,
|
|
19
19
|
study_id: str,
|
|
20
20
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
21
|
+
include_field_site_info: bool = False,
|
|
22
|
+
enable_biosample_filtering: bool = True,
|
|
21
23
|
):
|
|
22
24
|
"""This class serves as an API for repairing connections in the database by
|
|
23
25
|
adding records that are essentially missing "links"/"connections". As we identify
|
|
@@ -39,6 +41,8 @@ class DatabaseUpdater:
|
|
|
39
41
|
self.gold_api_client = gold_api_client
|
|
40
42
|
self.study_id = study_id
|
|
41
43
|
self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
|
|
44
|
+
self.include_field_site_info = include_field_site_info
|
|
45
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
42
46
|
|
|
43
47
|
@lru_cache
|
|
44
48
|
def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
|
|
@@ -95,6 +99,8 @@ class DatabaseUpdater:
|
|
|
95
99
|
biosamples=all_gold_biosamples,
|
|
96
100
|
projects=all_gold_projects,
|
|
97
101
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
102
|
+
include_field_site_info=self.include_field_site_info,
|
|
103
|
+
enable_biosample_filtering=self.enable_biosample_filtering,
|
|
98
104
|
)
|
|
99
105
|
|
|
100
106
|
# The GoldStudyTranslator class has some pre-processing logic which filters out
|
|
@@ -214,6 +220,8 @@ class DatabaseUpdater:
|
|
|
214
220
|
projects=gold_sequencing_projects_for_study,
|
|
215
221
|
analysis_projects=gold_analysis_projects_for_study,
|
|
216
222
|
gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
|
|
223
|
+
include_field_site_info=self.include_field_site_info,
|
|
224
|
+
enable_biosample_filtering=self.enable_biosample_filtering,
|
|
217
225
|
)
|
|
218
226
|
|
|
219
227
|
translated_biosamples = gold_study_translator.biosamples
|
nmdc_runtime/site/repository.py
CHANGED
|
@@ -463,11 +463,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
|
|
|
463
463
|
yield SkipReason("; ".join(skip_notes))
|
|
464
464
|
|
|
465
465
|
|
|
466
|
-
# TODO ensure data_object_type values from file_type_enum
|
|
467
|
-
# see /metadata-translation/notebooks/202106_curation_updates.ipynb
|
|
468
|
-
# for details ("Create file_type_enum collection" section).
|
|
469
|
-
|
|
470
|
-
|
|
471
466
|
@sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
|
|
472
467
|
def done_object_put_ops(_context):
|
|
473
468
|
client = get_runtime_api_site_client(run_config_frozen__normal_env)
|
|
@@ -574,6 +569,7 @@ def biosample_submission_ingest():
|
|
|
574
569
|
"study_type": "research_study",
|
|
575
570
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
576
571
|
"include_field_site_info": False,
|
|
572
|
+
"enable_biosample_filtering": True,
|
|
577
573
|
},
|
|
578
574
|
},
|
|
579
575
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1018,6 +1014,8 @@ def database_records_stitching():
|
|
|
1018
1014
|
"config": {
|
|
1019
1015
|
"nmdc_study_id": "",
|
|
1020
1016
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1017
|
+
"include_field_site_info": False,
|
|
1018
|
+
"enable_biosample_filtering": True,
|
|
1021
1019
|
}
|
|
1022
1020
|
},
|
|
1023
1021
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1060,6 +1058,8 @@ def database_records_stitching():
|
|
|
1060
1058
|
"config": {
|
|
1061
1059
|
"nmdc_study_id": "",
|
|
1062
1060
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1061
|
+
"include_field_site_info": False,
|
|
1062
|
+
"enable_biosample_filtering": True,
|
|
1063
1063
|
}
|
|
1064
1064
|
},
|
|
1065
1065
|
"export_json_to_drs": {"config": {"username": ""}},
|
|
@@ -1102,6 +1102,8 @@ def database_records_stitching():
|
|
|
1102
1102
|
"config": {
|
|
1103
1103
|
"nmdc_study_id": "",
|
|
1104
1104
|
"gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
|
|
1105
|
+
"include_field_site_info": False,
|
|
1106
|
+
"enable_biosample_filtering": True,
|
|
1105
1107
|
}
|
|
1106
1108
|
},
|
|
1107
1109
|
},
|
nmdc_runtime/site/resources.py
CHANGED
|
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
109
109
|
},
|
|
110
110
|
)
|
|
111
111
|
response.raise_for_status()
|
|
112
|
-
return response.json()["cursor"]["
|
|
112
|
+
return response.json()["cursor"]["batch"]
|
|
113
113
|
|
|
114
114
|
def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
|
|
115
115
|
gold_project_id = normalize_gold_id(gold_project_id)
|
|
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
126
126
|
},
|
|
127
127
|
)
|
|
128
128
|
response.raise_for_status()
|
|
129
|
-
return response.json()["cursor"]["
|
|
129
|
+
return response.json()["cursor"]["batch"]
|
|
130
130
|
|
|
131
131
|
def get_biosamples_for_study(self, study_id: str):
|
|
132
132
|
# TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
|
|
@@ -170,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
170
170
|
},
|
|
171
171
|
)
|
|
172
172
|
response.raise_for_status()
|
|
173
|
-
return response.json()["cursor"]["
|
|
173
|
+
return response.json()["cursor"]["batch"]
|
|
174
174
|
|
|
175
175
|
def get_study(self, study_id: str):
|
|
176
176
|
response = self.request(
|
|
@@ -182,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
|
|
|
182
182
|
},
|
|
183
183
|
)
|
|
184
184
|
response.raise_for_status()
|
|
185
|
-
return response.json()["cursor"]["
|
|
185
|
+
return response.json()["cursor"]["batch"]
|
|
186
186
|
|
|
187
187
|
|
|
188
188
|
class RuntimeApiSiteClient(RuntimeApiClient):
|
|
@@ -45,6 +45,7 @@ class GoldStudyTranslator(Translator):
|
|
|
45
45
|
analysis_projects: List[JSON_OBJECT] = [],
|
|
46
46
|
gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
|
|
47
47
|
include_field_site_info: bool = False,
|
|
48
|
+
enable_biosample_filtering: bool = True,
|
|
48
49
|
*args,
|
|
49
50
|
**kwargs,
|
|
50
51
|
) -> None:
|
|
@@ -53,15 +54,20 @@ class GoldStudyTranslator(Translator):
|
|
|
53
54
|
self.study = study
|
|
54
55
|
self.study_type = nmdc.StudyCategoryEnum(study_type)
|
|
55
56
|
self.include_field_site_info = include_field_site_info
|
|
57
|
+
self.enable_biosample_filtering = enable_biosample_filtering
|
|
56
58
|
# Filter biosamples to only those with `sequencingStrategy` of
|
|
57
|
-
# "Metagenome" or "Metatranscriptome"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
59
|
+
# "Metagenome" or "Metatranscriptome" if filtering is enabled
|
|
60
|
+
if enable_biosample_filtering:
|
|
61
|
+
self.biosamples = [
|
|
62
|
+
biosample
|
|
63
|
+
for biosample in biosamples
|
|
64
|
+
if any(
|
|
65
|
+
_is_valid_project(project)
|
|
66
|
+
for project in biosample.get("projects", [])
|
|
67
|
+
)
|
|
68
|
+
]
|
|
69
|
+
else:
|
|
70
|
+
self.biosamples = biosamples
|
|
65
71
|
# Fetch the valid projectGoldIds that are associated with filtered
|
|
66
72
|
# biosamples on their `projects` field
|
|
67
73
|
valid_project_ids = {
|
|
@@ -116,6 +122,9 @@ class GoldStudyTranslator(Translator):
|
|
|
116
122
|
:param gold_entity: GOLD entity object
|
|
117
123
|
:return: PersonValue corresponding to the first PI in the `contacts` field
|
|
118
124
|
"""
|
|
125
|
+
if "contacts" not in gold_entity:
|
|
126
|
+
return None
|
|
127
|
+
|
|
119
128
|
pi_dict = next(
|
|
120
129
|
(
|
|
121
130
|
contact
|
|
@@ -169,7 +178,7 @@ class GoldStudyTranslator(Translator):
|
|
|
169
178
|
project["ncbiBioSampleAccession"], default_prefix="biosample"
|
|
170
179
|
)
|
|
171
180
|
for project in biosample_projects
|
|
172
|
-
if project
|
|
181
|
+
if project.get("ncbiBioSampleAccession")
|
|
173
182
|
]
|
|
174
183
|
|
|
175
184
|
def _get_samp_taxon_id(
|
|
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
|
|
|
47
47
|
(INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
|
|
48
48
|
}
|
|
49
49
|
|
|
50
|
+
UNIT_OVERRIDES: dict[str, dict[str, str]] = {
|
|
51
|
+
"Biosample": {
|
|
52
|
+
"depth": "m",
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
|
|
50
56
|
|
|
51
57
|
class EnvironmentPackage(Enum):
|
|
52
58
|
r"""
|
|
@@ -475,6 +481,50 @@ class SubmissionPortalTranslator(Translator):
|
|
|
475
481
|
|
|
476
482
|
return value
|
|
477
483
|
|
|
484
|
+
def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
    """Build the study's `nmdc.Doi` objects from a submission portal entry.

    Dataset DOIs are read from `studyForm.dataDois` and award DOIs from
    `multiOmicsForm.awardDois`. Each entry's `provider` and `value` are
    copied into an `nmdc.Doi`, with the value passed through
    `_ensure_curie` using the `doi` default prefix. Dataset DOIs come
    first in the result, followed by award DOIs.

    :param metadata_submission: submission portal entry
    :return: list of nmdc.Doi objects, or None if there were no DOIs
    """
    # Look up both lists before building anything, dataset DOIs first.
    categorized_entries = (
        (
            "dataset_doi",
            self._get_from(metadata_submission, ["studyForm", "dataDois"]),
        ),
        (
            "award_doi",
            self._get_from(metadata_submission, ["multiOmicsForm", "awardDois"]),
        ),
    )

    study_dois = []
    for category, entries in categorized_entries:
        # A missing or empty list contributes nothing.
        if not entries:
            continue
        for entry in entries:
            study_dois.append(
                nmdc.Doi(
                    doi_category=category,
                    doi_provider=entry["provider"],
                    doi_value=self._ensure_curie(
                        entry["value"], default_prefix="doi"
                    ),
                    type="nmdc:Doi",
                )
            )

    # Callers receive None rather than an empty list.
    return study_dois or None
|
|
527
|
+
|
|
478
528
|
def _get_data_objects_from_fields(
|
|
479
529
|
self,
|
|
480
530
|
sample_data: JSON_OBJECT,
|
|
@@ -591,6 +641,7 @@ class SubmissionPortalTranslator(Translator):
|
|
|
591
641
|
websites=self._get_from(
|
|
592
642
|
metadata_submission, ["studyForm", "linkOutWebpage"]
|
|
593
643
|
),
|
|
644
|
+
associated_dois=self._get_study_dois(metadata_submission),
|
|
594
645
|
)
|
|
595
646
|
|
|
596
647
|
def _transform_value_for_slot(
|
|
@@ -660,6 +711,17 @@ class SubmissionPortalTranslator(Translator):
|
|
|
660
711
|
logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
|
|
661
712
|
continue
|
|
662
713
|
|
|
714
|
+
# This step handles cases where the submission portal/schema instructs a user to
|
|
715
|
+
# provide a value in a specific unit. The unit cannot be parsed out of the raw value
|
|
716
|
+
# in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
|
|
717
|
+
# go away once units are encoded in the schema itself.
|
|
718
|
+
# See: https://github.com/microbiomedata/nmdc-schema/issues/2517
|
|
719
|
+
if class_name in UNIT_OVERRIDES:
|
|
720
|
+
# If the class has unit overrides, check if the slot is in the overrides
|
|
721
|
+
unit_overrides = UNIT_OVERRIDES[class_name]
|
|
722
|
+
if slot_name in unit_overrides:
|
|
723
|
+
unit = unit_overrides[slot_name]
|
|
724
|
+
|
|
663
725
|
slot_definition = self.schema_view.induced_slot(slot_name, class_name)
|
|
664
726
|
if slot_definition.multivalued:
|
|
665
727
|
value_list = value
|
nmdc_runtime/util.py
CHANGED
|
@@ -3,36 +3,28 @@ import mimetypes
|
|
|
3
3
|
import os
|
|
4
4
|
import pkgutil
|
|
5
5
|
from collections.abc import Iterable
|
|
6
|
-
from contextlib import AbstractContextManager
|
|
7
6
|
from copy import deepcopy
|
|
8
7
|
from datetime import datetime, timezone
|
|
9
8
|
from functools import lru_cache
|
|
10
9
|
from io import BytesIO
|
|
11
10
|
from itertools import chain
|
|
12
11
|
from pathlib import Path
|
|
13
|
-
from uuid import uuid4
|
|
14
12
|
from typing import Callable, List, Optional, Set, Dict
|
|
15
13
|
|
|
16
14
|
import fastjsonschema
|
|
17
15
|
import requests
|
|
18
16
|
from frozendict import frozendict
|
|
19
|
-
from jsonschema.validators import Draft7Validator
|
|
20
17
|
from linkml_runtime import linkml_model
|
|
21
18
|
from linkml_runtime.utils.schemaview import SchemaView
|
|
22
|
-
from nmdc_schema.nmdc import Database as NMDCDatabase
|
|
23
19
|
from nmdc_schema.get_nmdc_view import ViewGetter
|
|
24
|
-
from pydantic import Field, BaseModel
|
|
25
20
|
from pymongo.database import Database as MongoDatabase
|
|
26
21
|
from pymongo.errors import OperationFailure
|
|
27
22
|
from refscan.lib.helpers import identify_references
|
|
28
|
-
from refscan.lib.Finder import Finder
|
|
29
23
|
from refscan.lib.ReferenceList import ReferenceList
|
|
30
|
-
from
|
|
31
|
-
from toolz import merge, unique
|
|
24
|
+
from toolz import merge
|
|
32
25
|
|
|
33
26
|
from nmdc_runtime.api.core.util import sha256hash_from_file
|
|
34
27
|
from nmdc_runtime.api.models.object import DrsObjectIn
|
|
35
|
-
from typing_extensions import Annotated
|
|
36
28
|
|
|
37
29
|
|
|
38
30
|
def get_names_of_classes_in_effective_range_of_slot(
|
|
@@ -499,6 +491,11 @@ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[
|
|
|
499
491
|
|
|
500
492
|
def ensure_unique_id_indexes(mdb: MongoDatabase):
|
|
501
493
|
"""Ensure that any collections with an "id" field have an index on "id"."""
|
|
494
|
+
|
|
495
|
+
# Note: The pipe (i.e. `|`) operator performs a union of the two sets. In this case,
|
|
496
|
+
# it creates a set (i.e. `candidate_names`) consisting of the names of both
|
|
497
|
+
# (a) all collections in the real database, and (b) all collections that
|
|
498
|
+
# the NMDC schema says can contain instances of classes that have an "id" slot.
|
|
502
499
|
candidate_names = (
|
|
503
500
|
set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
|
|
504
501
|
)
|
|
@@ -533,271 +530,6 @@ def ensure_unique_id_indexes(mdb: MongoDatabase):
|
|
|
533
530
|
raise
|
|
534
531
|
|
|
535
532
|
|
|
536
|
-
class UpdateStatement(BaseModel):
    # Shape of one entry of the `updates` list accepted by `OverlayDB.apply_updates`,
    # which validates each entry against this model before forwarding the list to
    # MongoDB's `update` command.
    # NOTE(review): field names presumably mirror the statements of MongoDB's
    # `update` command (`q`, `u`, `upsert`, `multi`) -- confirm.
    q: dict  # query; `apply_updates` passes it to `find` to select the documents to copy
    u: dict
    upsert: bool = False
    multi: bool = False
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
class DeleteStatement(BaseModel):
    # Shape of one entry of the `deletes` list accepted by `OverlayDB.delete`.
    q: dict  # query; `delete` passes it to `find` to select the documents to flag
    # Constrained to 0 or 1, defaulting to 1. `delete` passes it as the `limit` of `find`.
    limit: Annotated[int, Field(ge=0, le=1)] = 1
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
class OverlayDBError(Exception):
    # Raised by `OverlayDB` methods in place of a pymongo `OperationFailure`;
    # the message is the string form of that failure's `details`.
    pass
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
class OverlayDB(AbstractContextManager):
    """Provides a context whereby a base Database is overlaid with a temporary one.

    If you need to run basic simulations of updates to a base database,
    you don't want to actually commit transactions to the base database.

    For example, to insert or replace (matching on "id") many documents into a collection in order
    to then validate the resulting total set of collection documents, an OverlayDB writes to
    an overlay collection that "shadows" the base collection during a "find" query
    (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
    overlay collection, that id is marked as "seen" and will not also be returned when
    subsequently scanning the (unmodified) base-database collection.

    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
          `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
          the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
          "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
          of the `merge_find` method, which internally accesses both the real database and the overlaying database.

    Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
    documents from a base collection to the overlay, and then applying the updates to the overlay,
    so that again, base collections are unmodified, and a "merge_find" call will produce a result
    *as if* the base collection(s) were modified.

    Mongo deletions (as the "delete" method) also copy affected documents from the base collection
    to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
    call will match a relevant document given a suitable filter, and will mark the document's id
    as "seen" *without* returning the document. Thus, the result is as if the document were deleted.

    Usage:
    ```
    with OverlayDB(mdb) as odb:
        # do stuff, e.g. `odb.replace_or_insert_many(...)`
    ```
    """

    # Fields `merge_find` needs on every document in order to merge the two databases.
    _BOOKKEEPING_FIELDS = ("id", "_deleted")

    def __init__(self, mdb: "MongoDatabase"):
        """Create a uniquely named overlay database on the same client as `mdb`.

        :param mdb: The base database; this class never writes to it.
        """
        self._bottom_db = mdb
        self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
        ensure_unique_id_indexes(self._top_db)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Drop the overlay database on exit, whether or not an exception occurred.
        self._bottom_db.client.drop_database(self._top_db.name)

    def replace_or_insert_many(self, coll_name, documents: list):
        """Insert `documents` into the overlay collection named `coll_name`.

        :raises OverlayDBError: If MongoDB reports an `OperationFailure`.
        """
        try:
            self._top_db[coll_name].insert_many(documents)
        except OperationFailure as e:
            raise OverlayDBError(str(e.details)) from e

    def apply_updates(self, coll_name, updates: list):
        """Prepare the overlay database and apply `updates` to it.

        Every document in the base collection that matches an update's `q` is copied
        into the overlay collection, and then the `update` command is run against
        the overlay only.

        :param coll_name: Name of the collection to update
        :param updates: List of dicts, each of which must be a valid `UpdateStatement`
        :raises OverlayDBError: If the `update` command raises an `OperationFailure`.
        """
        # Validate every statement before touching the overlay. This is done with plain
        # constructor calls rather than `assert`, because assertions are stripped when
        # Python runs with `-O`, which would silently disable the validation.
        for update_spec in updates:
            UpdateStatement(**update_spec)

        for update_spec in updates:
            for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
                self._top_db[coll_name].insert_one(bottom_doc)
        try:
            self._top_db.command({"update": coll_name, "updates": updates})
        except OperationFailure as e:
            raise OverlayDBError(str(e.details)) from e

    def delete(self, coll_name, deletes: list):
        """Simulate a delete command by flagging documents in the overlay database.

        Every base-collection document selected by a delete's `q` (honoring its
        `limit`, which defaults to 1) is copied into the overlay collection with
        `_deleted` set to `True`.

        :param coll_name: Name of the collection to delete from
        :param deletes: List of dicts, each of which must be a valid `DeleteStatement`
        """
        # Explicit validation instead of `assert` (see `apply_updates`).
        for delete_spec in deletes:
            DeleteStatement(**delete_spec)

        for delete_spec in deletes:
            for bottom_doc in self._bottom_db[coll_name].find(
                delete_spec["q"], limit=delete_spec.get("limit", 1)
            ):
                bottom_doc["_deleted"] = True
                self._top_db[coll_name].insert_one(bottom_doc)

    @classmethod
    def _with_bookkeeping_fields(cls, find_spec: dict) -> dict:
        """Return a copy of `find_spec` whose projection keeps "id" and "_deleted".

        The caller's `find_spec` is never modified. A spec that has no projection
        (or a projection that is neither a dict nor a list/tuple) is returned as is.
        """
        projection = find_spec.get("projection")
        required = cls._BOOKKEEPING_FIELDS

        if isinstance(projection, (list, tuple)):
            # A sequence projection lists fields to include; append the missing ones
            # while preserving the caller's order.
            merged = list(dict.fromkeys([*projection, *required]))
            return {**find_spec, "projection": merged}

        if isinstance(projection, dict):
            is_inclusion = not projection or any(
                isinstance(value, (bool, int, float)) and value
                for field, value in projection.items()
                if field != "_id"
            )
            if is_inclusion:
                merged = {**projection, **{field: 1 for field in required}}
                return {**find_spec, "projection": merged}

            # Exclusion-style projection: inclusions cannot be mixed in, so instead
            # stop excluding the fields that are needed.
            remaining = {
                field: value
                for field, value in projection.items()
                if not (field in required and not value)
            }
            if remaining:
                return {**find_spec, "projection": remaining}
            # Nothing is left to exclude. Omit the projection entirely rather than
            # passing an empty one, which the driver may not treat as "all fields".
            return {
                key: value for key, value in find_spec.items() if key != "projection"
            }

        return find_spec

    def merge_find(self, coll_name, find_spec: dict):
        """Yield docs first from overlay and then from base db, minding deletion flags.

        `find_spec` holds the keyword arguments for pymongo's `find`. If it contains
        a projection, the projection actually sent to MongoDB is widened so that "id"
        and "_deleted" are always returned; yielded documents may therefore contain
        "id" even when the caller did not ask for it.

        :param coll_name: Name of the collection to query
        :param find_spec: Keyword arguments for `Collection.find` (not modified)
        """
        effective_spec = self._with_bookkeeping_fields(find_spec)

        top_docs = self._top_db[coll_name].find(**effective_spec)
        bottom_docs = self._bottom_db[coll_name].find(**effective_spec)
        top_seen_ids = set()
        for doc in top_docs:
            # A flagged document is withheld, but its id still shadows the base document.
            if not doc.get("_deleted"):
                yield doc
            top_seen_ids.add(doc["id"])

        for doc in bottom_docs:
            if doc["id"] not in top_seen_ids:
                yield doc
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
def validate_json(
    in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
) -> dict:
    r"""
    Checks whether the specified dictionary represents a valid instance of the `Database` class
    defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.

    Example dictionary:
    {
        "biosample_set": [
            {"id": "nmdc:bsm-00-000001", ...},
            {"id": "nmdc:bsm-00-000002", ...}
        ],
        "study_set": [
            {"id": "nmdc:sty-00-000001", ...},
            {"id": "nmdc:sty-00-000002", ...}
        ]
    }

    Validation happens in up to three passes:
    1. Per collection: JSON Schema validation, basic shape checks, and a trial insert
       into a temporary `OverlayDB`.
    2. Instantiation of the `NMDCDatabase` class from the whole payload.
    3. (Opt-in) A scan of inter-document references using `refscan`.

    :param in_docs: The dictionary you want to validate. If the first pass finds no
                    errors, any top-level "@type" key is removed from this dictionary
                    in place.
    :param mdb: A reference to a MongoDB database
    :param check_inter_document_references: Whether you want this function to check whether every document that
                                            is referenced by any of the documents passed in would, indeed, exist
                                            in the database, if the documents passed in were to be inserted into
                                            the database. In other words, set this to `True` if you want this
                                            function to perform referential integrity checks.
    :return: `{"result": "All Okay!"}` on success; otherwise `{"result": "errors", "detail": ...}`,
             where `detail` is either a dictionary mapping collection names to lists of error
             messages, or (when `NMDCDatabase` instantiation fails) the exception's message.
    """
    validator = Draft7Validator(get_nmdc_jsonschema_dict())
    # The first pass iterates over a deep copy of the payload.
    docs = deepcopy(in_docs)
    validation_errors = {}

    known_coll_names = set(nmdc_database_collection_names())
    for coll_name, coll_docs in docs.items():
        if coll_name not in known_coll_names:
            # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
            #        See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
            if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
                # Tolerated: no error list is recorded for this key.
                continue
            else:
                validation_errors[coll_name] = [
                    f"'{coll_name}' is not a known schema collection name"
                ]
                continue

        # Validate this collection on its own, as a single-key payload.
        errors = list(validator.iter_errors({coll_name: coll_docs}))
        validation_errors[coll_name] = [e.message for e in errors]
        if coll_docs:
            if not isinstance(coll_docs, list):
                validation_errors[coll_name].append("value must be a list")
            elif not all(isinstance(d, dict) for d in coll_docs):
                validation_errors[coll_name].append(
                    "all elements of list must be dicts"
                )
            if not validation_errors[coll_name]:
                # Trial insert into a throwaway overlay database; a failed insert
                # is recorded as a validation error for this collection.
                try:
                    with OverlayDB(mdb) as odb:
                        odb.replace_or_insert_many(coll_name, coll_docs)
                except OverlayDBError as e:
                    validation_errors[coll_name].append(str(e))

    if all(len(v) == 0 for v in validation_errors.values()):
        # Second pass. Try instantiating linkml-sourced dataclass
        # NOTE(review): This mutates the caller's `in_docs` (not the `docs` copy). The third
        #               pass below iterates `in_docs` and treats each value as a list of
        #               documents, so it relies on "@type" being gone; callers may rely
        #               on the removal too -- confirm before changing.
        in_docs.pop("@type", None)
        try:
            NMDCDatabase(**in_docs)
        except Exception as e:
            return {"result": "errors", "detail": str(e)}

        # Third pass (if enabled): Check inter-document references.
        # Note: The identity check means only the literal `True` enables this pass.
        if check_inter_document_references is True:
            # Prepare to use `refscan`.
            #
            # Note: We check the inter-document references in two stages, which are:
            #       1. For each document in the JSON payload, check whether each document it references already exists
            #          (in the collections the schema says it can exist in) in the database. We use the
            #          `refscan` package to do this, which returns violation details we'll use in the second stage.
            #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
            #          check whether that document exists (in the collections the schema says it can exist in) in the
            #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
            #       The violations that remain after those two stages are the ones we return to the caller.
            #
            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
            #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
            #       `refscan`'s `Finder` class accepts.
            #
            finder = Finder(database=mdb)
            references = get_allowed_references()
            reference_field_names_by_source_class_name = (
                references.get_reference_field_names_by_source_class_name()
            )

            # Iterate over the collections in the JSON payload.
            for source_collection_name, documents in in_docs.items():
                for document in documents:
                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
                    source_document = dict(document, _id=None)
                    violations = scan_outgoing_references(
                        document=source_document,
                        schema_view=nmdc_schema_view(),
                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
                        references=references,
                        finder=finder,
                        collection_names=nmdc_database_collection_names(),
                        source_collection_name=source_collection_name,
                        user_wants_to_locate_misplaced_documents=False,
                    )

                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
                    for violation in violations:
                        can_waive_violation = False
                        # Determine which collections can contain the referenced document, based upon
                        # the schema class of which this source document is an instance.
                        target_collection_names = (
                            references.get_target_collection_names(
                                source_class_name=violation.source_class_name,
                                source_field_name=violation.source_field_name,
                            )
                        )
                        # Check whether the referenced document exists in any of those collections in the JSON payload.
                        for json_coll_name, json_coll_docs in in_docs.items():
                            if json_coll_name in target_collection_names:
                                for json_coll_doc in json_coll_docs:
                                    if json_coll_doc["id"] == violation.target_id:
                                        can_waive_violation = True
                                        break  # stop checking
                            if can_waive_violation:
                                break  # stop checking
                        if not can_waive_violation:
                            violation_as_str = (
                                f"Document '{violation.source_document_id}' "
                                f"in collection '{violation.source_collection_name}' "
                                f"has a field '{violation.source_field_name}' that "
                                f"references a document having id "
                                f"'{violation.target_id}', but the latter document "
                                f"does not exist in any of the collections the "
                                f"NMDC Schema says it can exist in."
                            )
                            validation_errors[source_collection_name].append(
                                violation_as_str
                            )

            # If any collection's error list is not empty, return an error response.
            if any(len(v) > 0 for v in validation_errors.values()):
                return {"result": "errors", "detail": validation_errors}

        return {"result": "All Okay!"}
    else:
        return {"result": "errors", "detail": validation_errors}
|
|
799
|
-
|
|
800
|
-
|
|
801
533
|
def decorate_if(condition: bool = False) -> Callable:
|
|
802
534
|
r"""
|
|
803
535
|
Decorator that applies another decorator only when `condition` is `True`.
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
-
nmdc_runtime/config.py,sha256=
|
|
2
|
+
nmdc_runtime/config.py,sha256=CW6LnN8Idsbra_mZnHU-kcWsYBZWbgivqVEp8rpOMi4,1989
|
|
3
3
|
nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
|
|
4
4
|
nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
nmdc_runtime/mongo_util.py,sha256=7NRvqFE8W2CUcpcXAA4KElUACIdAkBehZ9TBG4k7zNE,3000
|
|
6
|
-
nmdc_runtime/util.py,sha256=
|
|
6
|
+
nmdc_runtime/util.py,sha256=Rw-OiQDHrz4cNX3ZdC-cgfHYUMq1qsk-_Mv81UrDlC8,19823
|
|
7
7
|
nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
|
|
@@ -37,10 +37,10 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
|
|
|
37
37
|
nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
38
|
nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
|
|
39
39
|
nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
nmdc_runtime/site/graphs.py,sha256=
|
|
41
|
-
nmdc_runtime/site/ops.py,sha256=
|
|
42
|
-
nmdc_runtime/site/repository.py,sha256=
|
|
43
|
-
nmdc_runtime/site/resources.py,sha256=
|
|
40
|
+
nmdc_runtime/site/graphs.py,sha256=CWbLLtoaakmNgSoaQWylXvcOY6qS7qwkTexEUDiMNfM,18295
|
|
41
|
+
nmdc_runtime/site/ops.py,sha256=y6bBJhAytrSqt0COkOqXVKgfSGVdgQ7uByUP8S-zUB4,63935
|
|
42
|
+
nmdc_runtime/site/repository.py,sha256=g0bZytvCrUjLpWuvkAzzmI16mChsrYPbWcvVFPNZFnM,47687
|
|
43
|
+
nmdc_runtime/site/resources.py,sha256=dLNtNa4FfSKN_6b21eItn-i8e0ZHyveoBsexl2I6zmo,20144
|
|
44
44
|
nmdc_runtime/site/util.py,sha256=h70UJCT9g-I63EJn0drZjv1iaQ8LHJTbG29R9kqJ04c,1821
|
|
45
45
|
nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
46
46
|
nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
|
|
@@ -58,17 +58,17 @@ nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ
|
|
|
58
58
|
nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
59
|
nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
|
|
60
60
|
nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
61
|
-
nmdc_runtime/site/repair/database_updater.py,sha256=
|
|
61
|
+
nmdc_runtime/site/repair/database_updater.py,sha256=a6POYZcLEl0JvnuWxPjaOJtwZjkJhhvvUg1ABhnBiP8,21268
|
|
62
62
|
nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
|
|
64
64
|
nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
|
|
65
|
-
nmdc_runtime/site/translation/gold_translator.py,sha256=
|
|
65
|
+
nmdc_runtime/site/translation/gold_translator.py,sha256=n7PrAyZb6ODG1uaZ0cay91DygAHIefOL2qXLuukOyIM,33075
|
|
66
66
|
nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
|
|
67
67
|
nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=8_QF75Gf-dc2xVeO6jzTmdDrlGdh1-QrLJKG2SwUhCA,23797
|
|
68
68
|
nmdc_runtime/site/translation/neon_soil_translator.py,sha256=IMeq4ABgWaSUbB_gmG8vBCMeynQSlbCUw9p2be6o8kE,38620
|
|
69
69
|
nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=Js8_r6vHBW8b-_BpFySTUuYOFe7r51k8HwaNCQ7nAAg,30587
|
|
70
70
|
nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
|
|
71
|
-
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=
|
|
71
|
+
nmdc_runtime/site/translation/submission_portal_translator.py,sha256=d5ycQhd-I07iUeuqN0vcHvMkOHqrwB67j2Q64aFkKBw,44147
|
|
72
72
|
nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
|
|
73
73
|
nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
|
|
74
74
|
nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -76,9 +76,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
|
|
|
76
76
|
nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
|
|
77
77
|
nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
|
|
78
78
|
nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
|
|
79
|
-
nmdc_runtime-2.
|
|
80
|
-
nmdc_runtime-2.
|
|
81
|
-
nmdc_runtime-2.
|
|
82
|
-
nmdc_runtime-2.
|
|
83
|
-
nmdc_runtime-2.
|
|
84
|
-
nmdc_runtime-2.
|
|
79
|
+
nmdc_runtime-2.9.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
|
|
80
|
+
nmdc_runtime-2.9.0.dist-info/METADATA,sha256=4NgNI-Et3t1WLDfZPbSFT18JnMBVEuSCoFAZbm_V0xk,8953
|
|
81
|
+
nmdc_runtime-2.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
82
|
+
nmdc_runtime-2.9.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
|
|
83
|
+
nmdc_runtime-2.9.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
|
|
84
|
+
nmdc_runtime-2.9.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|