nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nmdc-runtime might be problematic. Click here for more details.
- nmdc_runtime/config.py +57 -1
- nmdc_runtime/mongo_util.py +90 -0
- nmdc_runtime/site/export/ncbi_xml.py +98 -27
- nmdc_runtime/site/export/ncbi_xml_utils.py +27 -25
- nmdc_runtime/site/graphs.py +72 -9
- nmdc_runtime/site/ops.py +408 -65
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +107 -6
- nmdc_runtime/site/resources.py +17 -4
- nmdc_runtime/site/translation/gold_translator.py +18 -9
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -0
- nmdc_runtime/site/translation/neon_soil_translator.py +1 -0
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -0
- nmdc_runtime/site/translation/submission_portal_translator.py +62 -0
- nmdc_runtime/util.py +53 -267
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/METADATA +18 -3
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/RECORD +21 -20
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/ops.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import json
|
|
3
|
+
import logging
|
|
3
4
|
import mimetypes
|
|
4
5
|
import os
|
|
5
6
|
import subprocess
|
|
@@ -9,10 +10,10 @@ from datetime import datetime, timezone
|
|
|
9
10
|
from io import BytesIO, StringIO
|
|
10
11
|
from pprint import pformat
|
|
11
12
|
from toolz.dicttoolz import keyfilter
|
|
12
|
-
from typing import Tuple
|
|
13
|
+
from typing import Tuple, Set, Union
|
|
13
14
|
from zipfile import ZipFile
|
|
14
15
|
from itertools import chain
|
|
15
|
-
|
|
16
|
+
from ontology_loader.ontology_load_controller import OntologyLoaderController
|
|
16
17
|
import pandas as pd
|
|
17
18
|
import requests
|
|
18
19
|
|
|
@@ -26,6 +27,7 @@ from dagster import (
|
|
|
26
27
|
Failure,
|
|
27
28
|
List,
|
|
28
29
|
MetadataValue,
|
|
30
|
+
Noneable,
|
|
29
31
|
OpExecutionContext,
|
|
30
32
|
Out,
|
|
31
33
|
Output,
|
|
@@ -36,12 +38,13 @@ from dagster import (
|
|
|
36
38
|
Optional,
|
|
37
39
|
Field,
|
|
38
40
|
Permissive,
|
|
39
|
-
|
|
41
|
+
In,
|
|
42
|
+
Nothing,
|
|
40
43
|
)
|
|
41
44
|
from gridfs import GridFS
|
|
42
45
|
from linkml_runtime.utils.dictutils import as_simple_dict
|
|
43
46
|
from linkml_runtime.utils.yamlutils import YAMLRoot
|
|
44
|
-
from nmdc_runtime.api.db.mongo import get_mongo_db
|
|
47
|
+
from nmdc_runtime.api.db.mongo import get_mongo_db, validate_json
|
|
45
48
|
from nmdc_runtime.api.core.idgen import generate_one_id
|
|
46
49
|
from nmdc_runtime.api.core.metadata import (
|
|
47
50
|
_validate_changesheet,
|
|
@@ -103,7 +106,6 @@ from nmdc_runtime.util import (
|
|
|
103
106
|
get_names_of_classes_in_effective_range_of_slot,
|
|
104
107
|
pluralize,
|
|
105
108
|
put_object,
|
|
106
|
-
validate_json,
|
|
107
109
|
specialize_activity_set_docs,
|
|
108
110
|
collection_name_to_class_names,
|
|
109
111
|
class_hierarchy_as_list,
|
|
@@ -113,11 +115,14 @@ from nmdc_runtime.util import (
|
|
|
113
115
|
from nmdc_schema import nmdc
|
|
114
116
|
from nmdc_schema.nmdc import Database as NMDCDatabase
|
|
115
117
|
from pydantic import BaseModel
|
|
116
|
-
from pymongo import InsertOne
|
|
118
|
+
from pymongo import InsertOne, UpdateOne
|
|
117
119
|
from pymongo.database import Database as MongoDatabase
|
|
118
120
|
from starlette import status
|
|
119
121
|
from toolz import assoc, dissoc, get_in, valfilter, identity
|
|
120
122
|
|
|
123
|
+
# batch size for writing documents to alldocs
|
|
124
|
+
BULK_WRITE_BATCH_SIZE = 2000
|
|
125
|
+
|
|
121
126
|
|
|
122
127
|
@op
|
|
123
128
|
def hello(context):
|
|
@@ -475,53 +480,6 @@ def get_json_in(context):
|
|
|
475
480
|
return rv.json()
|
|
476
481
|
|
|
477
482
|
|
|
478
|
-
def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
|
|
479
|
-
"""Does not ensure ordering of `docs`."""
|
|
480
|
-
|
|
481
|
-
if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
|
|
482
|
-
return docs, 0
|
|
483
|
-
|
|
484
|
-
do_docs = docs["data_object_set"]
|
|
485
|
-
|
|
486
|
-
class FileTypeEnumBase(BaseModel):
|
|
487
|
-
name: str
|
|
488
|
-
description: str
|
|
489
|
-
filter: str # JSON-encoded data_object_set mongo collection filter document
|
|
490
|
-
|
|
491
|
-
class FileTypeEnum(FileTypeEnumBase):
|
|
492
|
-
id: str
|
|
493
|
-
|
|
494
|
-
temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
|
|
495
|
-
temp_collection = mdb[temp_collection_name]
|
|
496
|
-
temp_collection.insert_many(do_docs)
|
|
497
|
-
temp_collection.create_index("id")
|
|
498
|
-
|
|
499
|
-
def fte_matches(fte_filter: str):
|
|
500
|
-
return [
|
|
501
|
-
dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
|
|
502
|
-
]
|
|
503
|
-
|
|
504
|
-
do_docs_map = {d["id"]: d for d in do_docs}
|
|
505
|
-
|
|
506
|
-
n_docs_with_types_added = 0
|
|
507
|
-
|
|
508
|
-
for fte_doc in mdb.file_type_enum.find():
|
|
509
|
-
fte = FileTypeEnum(**fte_doc)
|
|
510
|
-
docs_matching = fte_matches(fte.filter)
|
|
511
|
-
for doc in docs_matching:
|
|
512
|
-
if "data_object_type" not in doc:
|
|
513
|
-
do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
|
|
514
|
-
n_docs_with_types_added += 1
|
|
515
|
-
|
|
516
|
-
mdb.drop_collection(temp_collection_name)
|
|
517
|
-
return (
|
|
518
|
-
assoc(
|
|
519
|
-
docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
|
|
520
|
-
),
|
|
521
|
-
n_docs_with_types_added,
|
|
522
|
-
)
|
|
523
|
-
|
|
524
|
-
|
|
525
483
|
@op(required_resource_keys={"runtime_api_site_client", "mongo"})
|
|
526
484
|
def perform_mongo_updates(context, json_in):
|
|
527
485
|
mongo = context.resources.mongo
|
|
@@ -530,8 +488,6 @@ def perform_mongo_updates(context, json_in):
|
|
|
530
488
|
|
|
531
489
|
docs = json_in
|
|
532
490
|
docs, _ = specialize_activity_set_docs(docs)
|
|
533
|
-
docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
|
|
534
|
-
context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
|
|
535
491
|
context.log.debug(f"{docs}")
|
|
536
492
|
|
|
537
493
|
rv = validate_json(
|
|
@@ -600,22 +556,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
|
|
|
600
556
|
"study_type": str,
|
|
601
557
|
"gold_nmdc_instrument_mapping_file_url": str,
|
|
602
558
|
"include_field_site_info": bool,
|
|
559
|
+
"enable_biosample_filtering": bool,
|
|
603
560
|
},
|
|
604
561
|
out={
|
|
605
562
|
"study_id": Out(str),
|
|
606
563
|
"study_type": Out(str),
|
|
607
564
|
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
608
565
|
"include_field_site_info": Out(bool),
|
|
566
|
+
"enable_biosample_filtering": Out(bool),
|
|
609
567
|
},
|
|
610
568
|
)
|
|
611
569
|
def get_gold_study_pipeline_inputs(
|
|
612
570
|
context: OpExecutionContext,
|
|
613
|
-
) -> Tuple[str, str, str, bool]:
|
|
571
|
+
) -> Tuple[str, str, str, bool, bool]:
|
|
614
572
|
return (
|
|
615
573
|
context.op_config["study_id"],
|
|
616
574
|
context.op_config["study_type"],
|
|
617
575
|
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
618
576
|
context.op_config["include_field_site_info"],
|
|
577
|
+
context.op_config["enable_biosample_filtering"],
|
|
619
578
|
)
|
|
620
579
|
|
|
621
580
|
|
|
@@ -659,6 +618,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
659
618
|
analysis_projects: List[Dict[str, Any]],
|
|
660
619
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
661
620
|
include_field_site_info: bool,
|
|
621
|
+
enable_biosample_filtering: bool,
|
|
662
622
|
) -> nmdc.Database:
|
|
663
623
|
client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
|
|
664
624
|
|
|
@@ -674,6 +634,7 @@ def nmdc_schema_database_from_gold_study(
|
|
|
674
634
|
analysis_projects,
|
|
675
635
|
gold_nmdc_instrument_map_df,
|
|
676
636
|
include_field_site_info,
|
|
637
|
+
enable_biosample_filtering,
|
|
677
638
|
id_minter=id_minter,
|
|
678
639
|
)
|
|
679
640
|
database = translator.get_database()
|
|
@@ -1043,18 +1004,249 @@ def site_code_mapping() -> dict:
|
|
|
1043
1004
|
)
|
|
1044
1005
|
|
|
1045
1006
|
|
|
1046
|
-
@op(
|
|
1007
|
+
@op(
|
|
1008
|
+
required_resource_keys={"mongo"},
|
|
1009
|
+
config_schema={
|
|
1010
|
+
"source_ontology": str,
|
|
1011
|
+
"output_directory": Field(Noneable(str), default_value=None, is_required=False),
|
|
1012
|
+
"generate_reports": Field(bool, default_value=True, is_required=False),
|
|
1013
|
+
},
|
|
1014
|
+
)
|
|
1015
|
+
def load_ontology(context: OpExecutionContext):
|
|
1016
|
+
cfg = context.op_config
|
|
1017
|
+
source_ontology = cfg["source_ontology"]
|
|
1018
|
+
output_directory = cfg.get("output_directory")
|
|
1019
|
+
generate_reports = cfg.get("generate_reports", True)
|
|
1020
|
+
|
|
1021
|
+
if output_directory is None:
|
|
1022
|
+
output_directory = os.path.join(os.getcwd(), "ontology_reports")
|
|
1023
|
+
|
|
1024
|
+
# Redirect Python logging to Dagster context
|
|
1025
|
+
handler = logging.Handler()
|
|
1026
|
+
handler.emit = lambda record: context.log.info(record.getMessage())
|
|
1027
|
+
|
|
1028
|
+
# Get logger from ontology-loader package
|
|
1029
|
+
controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
|
|
1030
|
+
controller_logger.setLevel(logging.INFO)
|
|
1031
|
+
controller_logger.addHandler(handler)
|
|
1032
|
+
|
|
1033
|
+
context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
|
|
1034
|
+
loader = OntologyLoaderController(
|
|
1035
|
+
source_ontology=source_ontology,
|
|
1036
|
+
output_directory=output_directory,
|
|
1037
|
+
generate_reports=generate_reports,
|
|
1038
|
+
mongo_client=context.resources.mongo.client,
|
|
1039
|
+
db_name=context.resources.mongo.db.name,
|
|
1040
|
+
)
|
|
1041
|
+
|
|
1042
|
+
loader.run_ontology_loader()
|
|
1043
|
+
context.log.info(f"Ontology load for {source_ontology} completed successfully!")
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def _add_related_ids_to_alldocs(
|
|
1047
|
+
temp_collection, context, document_reference_ranged_slots_by_type
|
|
1048
|
+
) -> None:
|
|
1049
|
+
"""
|
|
1050
|
+
Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
|
|
1051
|
+
|
|
1052
|
+
The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
|
|
1053
|
+
Each subdocument represents a link to any other document that either links to or is linked from
|
|
1054
|
+
the document via document-reference-ranged slots.
|
|
1055
|
+
|
|
1056
|
+
Args:
|
|
1057
|
+
temp_collection: The temporary MongoDB collection to process
|
|
1058
|
+
context: The Dagster execution context for logging
|
|
1059
|
+
document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
|
|
1060
|
+
|
|
1061
|
+
Returns:
|
|
1062
|
+
None (modifies the documents in place)
|
|
1063
|
+
"""
|
|
1064
|
+
|
|
1065
|
+
context.log.info(
|
|
1066
|
+
"Building relationships and adding `_inbound` and `_outbound` fields..."
|
|
1067
|
+
)
|
|
1068
|
+
|
|
1069
|
+
# document ID -> type (with "nmdc:" prefix preserved)
|
|
1070
|
+
id_to_type_map: Dict[str, str] = {}
|
|
1071
|
+
|
|
1072
|
+
# set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
|
|
1073
|
+
relationship_triples: Set[Tuple[str, str, str]] = set()
|
|
1074
|
+
|
|
1075
|
+
# Collect relationship triples.
|
|
1076
|
+
for doc in temp_collection.find():
|
|
1077
|
+
doc_id = doc["id"]
|
|
1078
|
+
# Store the full type with prefix intact
|
|
1079
|
+
doc_type = doc["type"]
|
|
1080
|
+
# For looking up reference slots, we still need the type without prefix
|
|
1081
|
+
doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
|
|
1082
|
+
|
|
1083
|
+
# Record ID to type mapping - preserve the original type with prefix
|
|
1084
|
+
id_to_type_map[doc_id] = doc_type
|
|
1085
|
+
|
|
1086
|
+
# Find all document references from this document
|
|
1087
|
+
reference_slots = document_reference_ranged_slots_by_type.get(
|
|
1088
|
+
doc_type_no_prefix, []
|
|
1089
|
+
)
|
|
1090
|
+
for slot in reference_slots:
|
|
1091
|
+
if slot in doc:
|
|
1092
|
+
# Handle both single-value and array references
|
|
1093
|
+
refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
|
|
1094
|
+
for ref_doc in temp_collection.find(
|
|
1095
|
+
{"id": {"$in": refs}}, ["id", "type"]
|
|
1096
|
+
):
|
|
1097
|
+
id_to_type_map[ref_doc["id"]] = ref_doc["type"]
|
|
1098
|
+
for ref_id in refs:
|
|
1099
|
+
relationship_triples.add((doc_id, slot, ref_id))
|
|
1100
|
+
|
|
1101
|
+
context.log.info(
|
|
1102
|
+
f"Found {len(id_to_type_map)} documents, with "
|
|
1103
|
+
f"{len({d for (d, _, _) in relationship_triples})} containing references"
|
|
1104
|
+
)
|
|
1105
|
+
|
|
1106
|
+
# The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
|
|
1107
|
+
# in order to perform graph traversal and collect all entities "related" to a given entity without
|
|
1108
|
+
# recursion "exploding".
|
|
1109
|
+
#
|
|
1110
|
+
# Note: We are hard-coding this "direction" information here in the Runtime
|
|
1111
|
+
# because the NMDC schema does not currently contain or expose it.
|
|
1112
|
+
#
|
|
1113
|
+
# An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
|
|
1114
|
+
# <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
|
|
1115
|
+
inbound_document_reference_ranged_slots = [
|
|
1116
|
+
"collected_from", # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
|
|
1117
|
+
"has_chromatography_configuration", # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
|
|
1118
|
+
"has_input", # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
|
|
1119
|
+
"has_mass_spectrometry_configuration", # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
|
|
1120
|
+
"instrument_used", # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
|
|
1121
|
+
"uses_calibration", # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
|
|
1122
|
+
"was_generated_by", # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy.
|
|
1123
|
+
"was_informed_by", # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy.
|
|
1124
|
+
]
|
|
1125
|
+
# An "outbound" slot is one for which an entity in the domain "influences"
|
|
1126
|
+
# (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
|
|
1127
|
+
outbound_document_reference_ranged_slots = [
|
|
1128
|
+
"associated_studies", # a `nmdc:Biosample` influences a `nmdc:Study`.
|
|
1129
|
+
"calibration_object", # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
|
|
1130
|
+
"generates_calibration", # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
|
|
1131
|
+
"has_output", # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
|
|
1132
|
+
"in_manifest", # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
|
|
1133
|
+
"part_of", # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
|
|
1134
|
+
]
|
|
1135
|
+
|
|
1136
|
+
unique_document_reference_ranged_slot_names = set()
|
|
1137
|
+
for slot_names in document_reference_ranged_slots_by_type.values():
|
|
1138
|
+
for slot_name in slot_names:
|
|
1139
|
+
unique_document_reference_ranged_slot_names.add(slot_name)
|
|
1140
|
+
context.log.info(f"{unique_document_reference_ranged_slot_names=}")
|
|
1141
|
+
if len(inbound_document_reference_ranged_slots) + len(
|
|
1142
|
+
outbound_document_reference_ranged_slots
|
|
1143
|
+
) != len(unique_document_reference_ranged_slot_names):
|
|
1144
|
+
raise Failure(
|
|
1145
|
+
"Number of detected unique document-reference-ranged slot names does not match "
|
|
1146
|
+
"sum of accounted-for inbound and outbound document-reference-ranged slot names."
|
|
1147
|
+
)
|
|
1148
|
+
|
|
1149
|
+
# Construct, and update documents with, `_incoming` and `_outgoing` field values.
|
|
1150
|
+
#
|
|
1151
|
+
# manage batching of MongoDB `bulk_write` operations
|
|
1152
|
+
bulk_operations, update_count = [], 0
|
|
1153
|
+
for doc_id, slot, ref_id in relationship_triples:
|
|
1154
|
+
|
|
1155
|
+
# Determine in which respective fields to push this relationship
|
|
1156
|
+
# for the subject (doc) and object (ref) of this triple.
|
|
1157
|
+
if slot in inbound_document_reference_ranged_slots:
|
|
1158
|
+
field_for_doc, field_for_ref = "_inbound", "_outbound"
|
|
1159
|
+
elif slot in outbound_document_reference_ranged_slots:
|
|
1160
|
+
field_for_doc, field_for_ref = "_outbound", "_inbound"
|
|
1161
|
+
else:
|
|
1162
|
+
raise Failure(f"Unknown slot {slot} for document {doc_id}")
|
|
1163
|
+
|
|
1164
|
+
updates = [
|
|
1165
|
+
{
|
|
1166
|
+
"filter": {"id": doc_id},
|
|
1167
|
+
"update": {
|
|
1168
|
+
"$push": {
|
|
1169
|
+
field_for_doc: {
|
|
1170
|
+
"id": ref_id,
|
|
1171
|
+
# TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
|
|
1172
|
+
# which acts as an implicit referential integrity checker (!). Using `.get` with
|
|
1173
|
+
# "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
|
|
1174
|
+
"type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
},
|
|
1178
|
+
},
|
|
1179
|
+
{
|
|
1180
|
+
"filter": {"id": ref_id},
|
|
1181
|
+
"update": {
|
|
1182
|
+
"$push": {
|
|
1183
|
+
field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
|
|
1184
|
+
}
|
|
1185
|
+
},
|
|
1186
|
+
},
|
|
1187
|
+
]
|
|
1188
|
+
for update in updates:
|
|
1189
|
+
bulk_operations.append(UpdateOne(**update))
|
|
1190
|
+
|
|
1191
|
+
# Execute in batches for efficiency
|
|
1192
|
+
if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
|
|
1193
|
+
temp_collection.bulk_write(bulk_operations)
|
|
1194
|
+
update_count += len(bulk_operations)
|
|
1195
|
+
context.log.info(
|
|
1196
|
+
f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
|
|
1197
|
+
)
|
|
1198
|
+
bulk_operations = []
|
|
1199
|
+
|
|
1200
|
+
# Execute any remaining operations
|
|
1201
|
+
if bulk_operations:
|
|
1202
|
+
temp_collection.bulk_write(bulk_operations)
|
|
1203
|
+
update_count += len(bulk_operations)
|
|
1204
|
+
|
|
1205
|
+
context.log.info(f"Pushed {update_count} updates in total")
|
|
1206
|
+
|
|
1207
|
+
context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
|
|
1208
|
+
temp_collection.create_index("_inbound.id")
|
|
1209
|
+
temp_collection.create_index("_outbound.id")
|
|
1210
|
+
# Create compound indexes to ensure index-covered queries
|
|
1211
|
+
temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
|
|
1212
|
+
temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
|
|
1213
|
+
context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
# Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
|
|
1217
|
+
# pass an argument to the op (in order to specify the order of the ops in the graph)
|
|
1218
|
+
# while also telling Dagster that this op doesn't need the _value_ of that argument.
|
|
1219
|
+
# This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
|
|
1220
|
+
# Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
|
|
1221
|
+
#
|
|
1222
|
+
@op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
|
|
1047
1223
|
def materialize_alldocs(context) -> int:
|
|
1048
1224
|
"""
|
|
1049
|
-
This function re
|
|
1050
|
-
|
|
1225
|
+
This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
|
|
1226
|
+
|
|
1227
|
+
1. Getting all populated schema collection names with an `id` field.
|
|
1228
|
+
2. Create a temporary collection to build the new alldocs collection.
|
|
1229
|
+
3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
|
|
1230
|
+
4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
|
|
1231
|
+
5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
|
|
1232
|
+
6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
|
|
1233
|
+
7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
|
|
1234
|
+
|
|
1235
|
+
The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
|
|
1236
|
+
`nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
|
|
1237
|
+
such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
|
|
1238
|
+
|
|
1239
|
+
The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
|
|
1240
|
+
`/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
|
|
1241
|
+
related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
|
|
1242
|
+
|
|
1243
|
+
The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
|
|
1244
|
+
that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
|
|
1245
|
+
expansions.
|
|
1051
1246
|
"""
|
|
1052
1247
|
mdb = context.resources.mongo.db
|
|
1053
1248
|
schema_view = nmdc_schema_view()
|
|
1054
1249
|
|
|
1055
|
-
# batch size for writing documents to alldocs
|
|
1056
|
-
BULK_WRITE_BATCH_SIZE = 2000
|
|
1057
|
-
|
|
1058
1250
|
# TODO include functional_annotation_agg for "real-time" ref integrity checking.
|
|
1059
1251
|
# For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
|
|
1060
1252
|
collection_names = populated_schema_collection_names_with_id_field(mdb)
|
|
@@ -1100,7 +1292,14 @@ def materialize_alldocs(context) -> int:
|
|
|
1100
1292
|
documents_processed_counter = 0
|
|
1101
1293
|
for doc in mdb[coll_name].find():
|
|
1102
1294
|
try:
|
|
1103
|
-
|
|
1295
|
+
# Keep the full type with prefix for document
|
|
1296
|
+
doc_type_full = doc["type"]
|
|
1297
|
+
# Remove prefix for slot lookup and ancestor lookup
|
|
1298
|
+
doc_type = (
|
|
1299
|
+
doc_type_full[5:]
|
|
1300
|
+
if doc_type_full.startswith("nmdc:")
|
|
1301
|
+
else doc_type_full
|
|
1302
|
+
)
|
|
1104
1303
|
except KeyError:
|
|
1105
1304
|
raise Exception(
|
|
1106
1305
|
f"doc {doc['id']} in collection {coll_name} has no 'type'!"
|
|
@@ -1109,13 +1308,21 @@ def materialize_alldocs(context) -> int:
|
|
|
1109
1308
|
doc_type
|
|
1110
1309
|
]
|
|
1111
1310
|
new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
|
|
1311
|
+
|
|
1112
1312
|
new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
|
|
1313
|
+
# InsertOne is a method on the py-mongo Client class.
|
|
1314
|
+
# Get ancestors without the prefix, but add prefix to each one in the output
|
|
1315
|
+
ancestors = schema_view.class_ancestors(doc_type)
|
|
1316
|
+
new_doc["_type_and_ancestors"] = [
|
|
1317
|
+
"nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
|
|
1318
|
+
]
|
|
1113
1319
|
write_operations.append(InsertOne(new_doc))
|
|
1114
1320
|
if len(write_operations) == BULK_WRITE_BATCH_SIZE:
|
|
1115
1321
|
_ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
|
|
1116
1322
|
write_operations.clear()
|
|
1117
1323
|
documents_processed_counter += BULK_WRITE_BATCH_SIZE
|
|
1118
1324
|
if len(write_operations) > 0:
|
|
1325
|
+
# here bulk_write is a method on the py-mongo db Client class
|
|
1119
1326
|
_ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
|
|
1120
1327
|
documents_processed_counter += len(write_operations)
|
|
1121
1328
|
context.log.info(
|
|
@@ -1136,10 +1343,20 @@ def materialize_alldocs(context) -> int:
|
|
|
1136
1343
|
[temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
|
|
1137
1344
|
context.log.info(f"created indexes on id, {slots_to_index}.")
|
|
1138
1345
|
|
|
1346
|
+
# Add related-ids fields to enable efficient relationship traversal
|
|
1347
|
+
context.log.info("Adding fields for related ids to documents...")
|
|
1348
|
+
_add_related_ids_to_alldocs(
|
|
1349
|
+
temp_alldocs_collection, context, document_reference_ranged_slots
|
|
1350
|
+
)
|
|
1351
|
+
|
|
1139
1352
|
context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
|
|
1140
1353
|
temp_alldocs_collection.rename("alldocs", dropTarget=True)
|
|
1141
1354
|
|
|
1142
|
-
|
|
1355
|
+
n_alldocs_documents = mdb.alldocs.estimated_document_count()
|
|
1356
|
+
context.log.info(
|
|
1357
|
+
f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
|
|
1358
|
+
)
|
|
1359
|
+
return n_alldocs_documents
|
|
1143
1360
|
|
|
1144
1361
|
|
|
1145
1362
|
@op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
|
|
@@ -1282,16 +1499,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
|
|
|
1282
1499
|
config_schema={
|
|
1283
1500
|
"nmdc_study_id": str,
|
|
1284
1501
|
"gold_nmdc_instrument_mapping_file_url": str,
|
|
1502
|
+
"include_field_site_info": bool,
|
|
1503
|
+
"enable_biosample_filtering": bool,
|
|
1285
1504
|
},
|
|
1286
1505
|
out={
|
|
1287
1506
|
"nmdc_study_id": Out(str),
|
|
1288
1507
|
"gold_nmdc_instrument_mapping_file_url": Out(str),
|
|
1508
|
+
"include_field_site_info": Out(bool),
|
|
1509
|
+
"enable_biosample_filtering": Out(bool),
|
|
1289
1510
|
},
|
|
1290
1511
|
)
|
|
1291
|
-
def get_database_updater_inputs(
|
|
1512
|
+
def get_database_updater_inputs(
|
|
1513
|
+
context: OpExecutionContext,
|
|
1514
|
+
) -> Tuple[str, str, bool, bool]:
|
|
1292
1515
|
return (
|
|
1293
1516
|
context.op_config["nmdc_study_id"],
|
|
1294
1517
|
context.op_config["gold_nmdc_instrument_mapping_file_url"],
|
|
1518
|
+
context.op_config["include_field_site_info"],
|
|
1519
|
+
context.op_config["enable_biosample_filtering"],
|
|
1295
1520
|
)
|
|
1296
1521
|
|
|
1297
1522
|
|
|
@@ -1306,6 +1531,8 @@ def generate_data_generation_set_post_biosample_ingest(
|
|
|
1306
1531
|
context: OpExecutionContext,
|
|
1307
1532
|
nmdc_study_id: str,
|
|
1308
1533
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1534
|
+
include_field_site_info: bool,
|
|
1535
|
+
enable_biosample_filtering: bool,
|
|
1309
1536
|
) -> nmdc.Database:
|
|
1310
1537
|
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1311
1538
|
context.resources.runtime_api_user_client
|
|
@@ -1321,6 +1548,8 @@ def generate_data_generation_set_post_biosample_ingest(
|
|
|
1321
1548
|
gold_api_client,
|
|
1322
1549
|
nmdc_study_id,
|
|
1323
1550
|
gold_nmdc_instrument_map_df,
|
|
1551
|
+
include_field_site_info,
|
|
1552
|
+
enable_biosample_filtering,
|
|
1324
1553
|
)
|
|
1325
1554
|
database = (
|
|
1326
1555
|
database_updater.generate_data_generation_set_records_from_gold_api_for_study()
|
|
@@ -1340,6 +1569,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1340
1569
|
context: OpExecutionContext,
|
|
1341
1570
|
nmdc_study_id: str,
|
|
1342
1571
|
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1572
|
+
include_field_site_info: bool = False,
|
|
1573
|
+
enable_biosample_filtering: bool = False,
|
|
1343
1574
|
) -> nmdc.Database:
|
|
1344
1575
|
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1345
1576
|
context.resources.runtime_api_user_client
|
|
@@ -1355,12 +1586,72 @@ def generate_biosample_set_for_nmdc_study_from_gold(
|
|
|
1355
1586
|
gold_api_client,
|
|
1356
1587
|
nmdc_study_id,
|
|
1357
1588
|
gold_nmdc_instrument_map_df,
|
|
1589
|
+
include_field_site_info,
|
|
1590
|
+
enable_biosample_filtering,
|
|
1358
1591
|
)
|
|
1359
1592
|
database = database_updater.generate_biosample_set_from_gold_api_for_study()
|
|
1360
1593
|
|
|
1361
1594
|
return database
|
|
1362
1595
|
|
|
1363
1596
|
|
|
1597
|
+
@op(
|
|
1598
|
+
required_resource_keys={
|
|
1599
|
+
"runtime_api_user_client",
|
|
1600
|
+
"runtime_api_site_client",
|
|
1601
|
+
"gold_api_client",
|
|
1602
|
+
},
|
|
1603
|
+
out=Out(Any),
|
|
1604
|
+
)
|
|
1605
|
+
def run_script_to_update_insdc_biosample_identifiers(
|
|
1606
|
+
context: OpExecutionContext,
|
|
1607
|
+
nmdc_study_id: str,
|
|
1608
|
+
gold_nmdc_instrument_map_df: pd.DataFrame,
|
|
1609
|
+
include_field_site_info: bool,
|
|
1610
|
+
enable_biosample_filtering: bool,
|
|
1611
|
+
):
|
|
1612
|
+
"""Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
|
|
1613
|
+
|
|
1614
|
+
This op uses the DatabaseUpdater to generate a script that can be used to update biosample
|
|
1615
|
+
records with INSDC identifiers obtained from GOLD.
|
|
1616
|
+
|
|
1617
|
+
Args:
|
|
1618
|
+
context: The execution context
|
|
1619
|
+
nmdc_study_id: The NMDC study ID for which to generate the update script
|
|
1620
|
+
gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
|
|
1621
|
+
|
|
1622
|
+
Returns:
|
|
1623
|
+
A dictionary or list of dictionaries containing the MongoDB update script(s)
|
|
1624
|
+
"""
|
|
1625
|
+
runtime_api_user_client: RuntimeApiUserClient = (
|
|
1626
|
+
context.resources.runtime_api_user_client
|
|
1627
|
+
)
|
|
1628
|
+
runtime_api_site_client: RuntimeApiSiteClient = (
|
|
1629
|
+
context.resources.runtime_api_site_client
|
|
1630
|
+
)
|
|
1631
|
+
gold_api_client: GoldApiClient = context.resources.gold_api_client
|
|
1632
|
+
|
|
1633
|
+
database_updater = DatabaseUpdater(
|
|
1634
|
+
runtime_api_user_client,
|
|
1635
|
+
runtime_api_site_client,
|
|
1636
|
+
gold_api_client,
|
|
1637
|
+
nmdc_study_id,
|
|
1638
|
+
gold_nmdc_instrument_map_df,
|
|
1639
|
+
include_field_site_info,
|
|
1640
|
+
enable_biosample_filtering,
|
|
1641
|
+
)
|
|
1642
|
+
update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
|
|
1643
|
+
|
|
1644
|
+
if isinstance(update_script, list):
|
|
1645
|
+
total_updates = sum(len(item.get("updates", [])) for item in update_script)
|
|
1646
|
+
else:
|
|
1647
|
+
total_updates = len(update_script.get("updates", []))
|
|
1648
|
+
context.log.info(
|
|
1649
|
+
f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
|
|
1650
|
+
)
|
|
1651
|
+
|
|
1652
|
+
return update_script
|
|
1653
|
+
|
|
1654
|
+
|
|
1364
1655
|
@op
|
|
1365
1656
|
def log_database_ids(
|
|
1366
1657
|
context: OpExecutionContext,
|
|
@@ -1382,3 +1673,55 @@ def log_database_ids(
|
|
|
1382
1673
|
message += "\n"
|
|
1383
1674
|
if message:
|
|
1384
1675
|
context.log.info(message)
|
|
1676
|
+
|
|
1677
|
+
|
|
1678
|
+
@op(
|
|
1679
|
+
description="Render free text through the Dagit UI",
|
|
1680
|
+
out=Out(description="Text content rendered through Dagit UI"),
|
|
1681
|
+
)
|
|
1682
|
+
def render_text(context: OpExecutionContext, text: Any):
|
|
1683
|
+
"""
|
|
1684
|
+
Renders content as a Dagster Asset in the Dagit UI.
|
|
1685
|
+
|
|
1686
|
+
This operation creates a Dagster Asset with the provided content, making it
|
|
1687
|
+
visible in the Dagit UI for easy viewing and sharing.
|
|
1688
|
+
|
|
1689
|
+
Args:
|
|
1690
|
+
context: The execution context
|
|
1691
|
+
text: The content to render (can be a string or a dictionary that will be converted to JSON)
|
|
1692
|
+
|
|
1693
|
+
Returns:
|
|
1694
|
+
The same content that was provided as input
|
|
1695
|
+
"""
|
|
1696
|
+
# Convert dictionary to formatted JSON string if needed
|
|
1697
|
+
if isinstance(text, dict):
|
|
1698
|
+
import json
|
|
1699
|
+
|
|
1700
|
+
content = json.dumps(text, indent=2)
|
|
1701
|
+
file_extension = "json"
|
|
1702
|
+
hash_text = json.dumps(text, sort_keys=True)[:20] # For consistent hashing
|
|
1703
|
+
else:
|
|
1704
|
+
content = str(text) # Convert to string in case it's not already
|
|
1705
|
+
file_extension = "txt"
|
|
1706
|
+
hash_text = content[:20]
|
|
1707
|
+
|
|
1708
|
+
filename = f"rendered_text_{context.run_id}.{file_extension}"
|
|
1709
|
+
file_path = os.path.join(context.instance.storage_directory(), filename)
|
|
1710
|
+
|
|
1711
|
+
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
|
1712
|
+
|
|
1713
|
+
with open(file_path, "w") as f:
|
|
1714
|
+
f.write(content)
|
|
1715
|
+
|
|
1716
|
+
context.log_event(
|
|
1717
|
+
AssetMaterialization(
|
|
1718
|
+
asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
|
|
1719
|
+
description="Rendered Content",
|
|
1720
|
+
metadata={
|
|
1721
|
+
"file_path": MetadataValue.path(file_path),
|
|
1722
|
+
"content": MetadataValue.text(content),
|
|
1723
|
+
},
|
|
1724
|
+
)
|
|
1725
|
+
)
|
|
1726
|
+
|
|
1727
|
+
return Output(text)
|