nmdc-runtime 2.6.0__py3-none-any.whl → 2.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- nmdc_runtime/config.py +4 -0
- nmdc_runtime/mongo_util.py +90 -0
- nmdc_runtime/site/export/ncbi_xml.py +98 -27
- nmdc_runtime/site/export/ncbi_xml_utils.py +27 -25
- nmdc_runtime/site/graphs.py +42 -5
- nmdc_runtime/site/ops.py +405 -14
- nmdc_runtime/site/repair/database_updater.py +202 -1
- nmdc_runtime/site/repository.py +100 -1
- nmdc_runtime/site/resources.py +13 -0
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -0
- nmdc_runtime/site/translation/neon_soil_translator.py +1 -0
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -0
- nmdc_runtime/util.py +56 -2
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/METADATA +18 -3
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/RECORD +19 -18
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.8.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/site/ops.py
CHANGED
@@ -1,5 +1,6 @@
 import csv
 import json
+import logging
 import mimetypes
 import os
 import subprocess
@@ -9,10 +10,10 @@ from datetime import datetime, timezone
 from io import BytesIO, StringIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
-from typing import Tuple
+from typing import Tuple, Set
 from zipfile import ZipFile
 from itertools import chain
-
+from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests

@@ -26,6 +27,7 @@ from dagster import (
     Failure,
     List,
     MetadataValue,
+    Noneable,
     OpExecutionContext,
     Out,
     Output,
@@ -36,7 +38,8 @@ from dagster import (
     Optional,
     Field,
     Permissive,
-
+    In,
+    Nothing,
 )
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
@@ -113,11 +116,14 @@ from nmdc_runtime.util import (
 from nmdc_schema import nmdc
 from nmdc_schema.nmdc import Database as NMDCDatabase
 from pydantic import BaseModel
-from pymongo import InsertOne
+from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
 from starlette import status
 from toolz import assoc, dissoc, get_in, valfilter, identity

+# batch size for writing documents to alldocs
+BULK_WRITE_BATCH_SIZE = 2000
+

 @op
 def hello(context):
@@ -476,7 +482,14 @@ def get_json_in(context):


 def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
-    """
+    """
+    Does not ensure ordering of `docs`.
+
+    TODO: Document this function. What _does_ it do (or what was it designed to do)?
+          What, conceptually, did the author design it to receive (as `docs`); a dict
+          having a `data_object_set` item whose value is a list of documents.
+          What, conceptually, did the author design it to return?
+    """

     if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
         return docs, 0
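For illustration, a minimal sketch of the call shape the surrounding code implies for `ensure_data_object_type` (the `docs` value and the ID below are hypothetical, not from the diff):

```python
# Hypothetical input: a dict with a `data_object_set` list of documents.
docs = {"data_object_set": [{"id": "nmdc:dobj-00-000001", "name": "Raw reads"}]}

# Returns the (possibly augmented) `docs`, plus a count of the documents
# that were given a `data_object_type` field (see the return statement below).
docs, n_docs_with_types_added = ensure_data_object_type(docs, mdb)
```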
@@ -491,20 +504,38 @@ def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
     class FileTypeEnum(FileTypeEnumBase):
         id: str

+    # Make a temporary collection (which will be dropped below) and insert the
+    # specified `data_object_set` documents into it.
     temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
     temp_collection = mdb[temp_collection_name]
     temp_collection.insert_many(do_docs)
     temp_collection.create_index("id")

-    def fte_matches(fte_filter: str):
+    def fte_matches(fte_filter: str) -> List[dict]:
+        r"""
+        Returns a list of documents—without their `_id` field—that match the specified filter,
+        which is encoded as a JSON string.
+        """
         return [
             dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
         ]

+    # Create a mapping from each document's `id` to the document, itself.
     do_docs_map = {d["id"]: d for d in do_docs}

     n_docs_with_types_added = 0

+    # For each `file_type_enum` document in the database, find all the documents (among the
+    # `data_object_set` documents provided by the caller) that match that `file_type_enum`
+    # document's filter.
+    #
+    # If any of those documents lacks a `data_object_type` field, update the original
+    # `data_object_set` document so that its `data_object_type` field is set to
+    # the `file_type_enum` document's `id` (why not its `name`?).
+    #
+    # TODO: I don't know why this sets `data_object_type` to `file_type_enum.id`,
+    #       as opposed to `file_type_enum.name`.
+    #
     for fte_doc in mdb.file_type_enum.find():
         fte = FileTypeEnum(**fte_doc)
         docs_matching = fte_matches(fte.filter)
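Note that in `fte_matches` above, `mdb.temp_collection` is pymongo attribute access, which resolves to a collection literally named `temp_collection`, not the uniquely named `tmp.data_object_set.<ObjectId>` collection created a few lines earlier. A sketch of what the lookup appears intended to do, using the local `temp_collection` handle instead (an assumption about intent, not what the release ships):

```python
def fte_matches(fte_filter: str) -> List[dict]:
    # `fte_filter` is a JSON-encoded MongoDB filter, e.g. '{"name": {"$regex": "\\.fastq$"}}'
    # (that example filter is hypothetical). Query the temp collection created above.
    return [dissoc(d, "_id") for d in temp_collection.find(json.loads(fte_filter))]
```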
@@ -514,6 +545,11 @@ def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
             n_docs_with_types_added += 1

     mdb.drop_collection(temp_collection_name)
+
+    # Returns a tuple. The first item is the original `docs` dictionary, but with the
+    # `data_object_set` list replaced by the list of the documents that are in the
+    # `do_docs_map` dictionary (with their `_id` fields omitted). The second item is
+    # the number of documents to which this function added a `data_object_type` field.
     return (
         assoc(
             docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
@@ -1043,18 +1079,247 @@ def site_code_mapping() -> dict:
     )


-@op(
+@op(
+    required_resource_keys={"mongo"},
+    config_schema={
+        "source_ontology": str,
+        "output_directory": Field(Noneable(str), default_value=None, is_required=False),
+        "generate_reports": Field(bool, default_value=True, is_required=False),
+    },
+)
+def load_ontology(context: OpExecutionContext):
+    cfg = context.op_config
+    source_ontology = cfg["source_ontology"]
+    output_directory = cfg.get("output_directory")
+    generate_reports = cfg.get("generate_reports", True)
+
+    if output_directory is None:
+        output_directory = os.path.join(os.getcwd(), "ontology_reports")
+
+    # Redirect Python logging to Dagster context
+    handler = logging.Handler()
+    handler.emit = lambda record: context.log.info(record.getMessage())
+
+    # Get logger from ontology-loader package
+    controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
+    controller_logger.setLevel(logging.INFO)
+    controller_logger.addHandler(handler)
+
+    context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
+    loader = OntologyLoaderController(
+        source_ontology=source_ontology,
+        output_directory=output_directory,
+        generate_reports=generate_reports,
+    )
+
+    loader.run_ontology_loader()
+    context.log.info(f"Ontology load for {source_ontology} completed successfully!")
+
+
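A minimal sketch of how the new `load_ontology` op might be configured in a Dagster run config (the ontology name `envo` is an assumption; omitting `output_directory` falls back to `./ontology_reports`, per the op body above):

```python
run_config = {
    "ops": {
        "load_ontology": {
            "config": {
                "source_ontology": "envo",  # assumed ontology name
                "generate_reports": True,
            }
        }
    }
}
```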
+def _add_related_ids_to_alldocs(
+    temp_collection, context, document_reference_ranged_slots_by_type
+) -> None:
+    """
+    Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
+
+    The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to any other document that either links to or is linked from
+    the document via document-reference-ranged slots.
+
+    Args:
+        temp_collection: The temporary MongoDB collection to process
+        context: The Dagster execution context for logging
+        document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
+
+    Returns:
+        None (modifies the documents in place)
+    """
+
+    context.log.info(
+        "Building relationships and adding `_inbound` and `_outbound` fields..."
+    )
+
+    # document ID -> type (with "nmdc:" prefix preserved)
+    id_to_type_map: Dict[str, str] = {}
+
+    # set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
+    relationship_triples: Set[Tuple[str, str, str]] = set()
+
+    # Collect relationship triples.
+    for doc in temp_collection.find():
+        doc_id = doc["id"]
+        # Store the full type with prefix intact
+        doc_type = doc["type"]
+        # For looking up reference slots, we still need the type without prefix
+        doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
+
+        # Record ID to type mapping - preserve the original type with prefix
+        id_to_type_map[doc_id] = doc_type
+
+        # Find all document references from this document
+        reference_slots = document_reference_ranged_slots_by_type.get(
+            doc_type_no_prefix, []
+        )
+        for slot in reference_slots:
+            if slot in doc:
+                # Handle both single-value and array references
+                refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
+                for ref_doc in temp_collection.find(
+                    {"id": {"$in": refs}}, ["id", "type"]
+                ):
+                    id_to_type_map[ref_doc["id"]] = ref_doc["type"]
+                for ref_id in refs:
+                    relationship_triples.add((doc_id, slot, ref_id))
+
+    context.log.info(
+        f"Found {len(id_to_type_map)} documents, with "
+        f"{len({d for (d, _, _) in relationship_triples})} containing references"
+    )
+
+    # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
+    # in order to perform graph traversal and collect all entities "related" to a given entity without
+    # recursion "exploding".
+    #
+    # Note: We are hard-coding this "direction" information here in the Runtime
+    # because the NMDC schema does not currently contain or expose it.
+    #
+    # An "inbound" slot is one for which an entity in the domain "was influenced by" (formally,
+    # <https://www.w3.org/ns/prov#wasInfluencedBy>, with typical CURIE prov:wasInfluencedBy) an entity in the range.
+    inbound_document_reference_ranged_slots = [
+        "collected_from",  # a `nmdc:Biosample` was influenced by the `nmdc:Site` from which it was collected.
+        "has_chromatography_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
+        "has_input",  # a `nmdc:PlannedProcess` was influenced by a `nmdc:NamedThing`.
+        "has_mass_spectrometry_configuration",  # a `nmdc:PlannedProcess` was influenced by its `nmdc:Configuration`.
+        "instrument_used",  # a `nmdc:PlannedProcess` was influenced by a used `nmdc:Instrument`.
+        "uses_calibration",  # a `nmdc:PlannedProcess` was influenced by `nmdc:CalibrationInformation`.
+        "was_generated_by",  # prov:wasGeneratedBy rdfs:subPropertyOf prov:wasInfluencedBy .
+        "was_informed_by",  # prov:wasInformedBy rdfs:subPropertyOf prov:wasInfluencedBy .
+    ]
+    # An "outbound" slot is one for which an entity in the domain "influences"
+    # (i.e., [owl:inverseOf prov:wasInfluencedBy]) an entity in the range.
+    outbound_document_reference_ranged_slots = [
+        "associated_studies",  # a `nmdc:Biosample` influences a `nmdc:Study`.
+        "calibration_object",  # `nmdc:CalibrationInformation` generates a `nmdc:DataObject`.
+        "generates_calibration",  # a `nmdc:PlannedProcess` generates `nmdc:CalibrationInformation`.
+        "has_output",  # a `nmdc:PlannedProcess` generates a `nmdc:NamedThing`.
+        "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
+        "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+    ]
+
+    unique_document_reference_ranged_slot_names = set()
+    for slot_names in document_reference_ranged_slots_by_type.values():
+        for slot_name in slot_names:
+            unique_document_reference_ranged_slot_names.add(slot_name)
+    context.log.info(f"{unique_document_reference_ranged_slot_names=}")
+    if len(inbound_document_reference_ranged_slots) + len(
+        outbound_document_reference_ranged_slots
+    ) != len(unique_document_reference_ranged_slot_names):
+        raise Failure(
+            "Number of detected unique document-reference-ranged slot names does not match "
+            "sum of accounted-for inbound and outbound document-reference-ranged slot names."
+        )
+
+    # Construct, and update documents with, `_inbound` and `_outbound` field values.
+    #
+    # manage batching of MongoDB `bulk_write` operations
+    bulk_operations, update_count = [], 0
+    for doc_id, slot, ref_id in relationship_triples:
+
+        # Determine in which respective fields to push this relationship
+        # for the subject (doc) and object (ref) of this triple.
+        if slot in inbound_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_inbound", "_outbound"
+        elif slot in outbound_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_outbound", "_inbound"
+        else:
+            raise Failure(f"Unknown slot {slot} for document {doc_id}")
+
+        updates = [
+            {
+                "filter": {"id": doc_id},
+                "update": {
+                    "$push": {
+                        field_for_doc: {
+                            "id": ref_id,
+                            # TODO existing tests are failing due to `KeyError`s for `id_to_type_map[ref_id]` here,
+                            #      which acts as an implicit referential integrity checker (!). Using `.get` with
+                            #      "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
+                            "type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
+                        }
+                    }
+                },
+            },
+            {
+                "filter": {"id": ref_id},
+                "update": {
+                    "$push": {
+                        field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
+                    }
+                },
+            },
+        ]
+        for update in updates:
+            bulk_operations.append(UpdateOne(**update))
+
+        # Execute in batches for efficiency
+        if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
+            temp_collection.bulk_write(bulk_operations)
+            update_count += len(bulk_operations)
+            context.log.info(
+                f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
+            )
+            bulk_operations = []
+
+    # Execute any remaining operations
+    if bulk_operations:
+        temp_collection.bulk_write(bulk_operations)
+        update_count += len(bulk_operations)
+
+    context.log.info(f"Pushed {update_count} updates in total")
+
+    context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
+    temp_collection.create_index("_inbound.id")
+    temp_collection.create_index("_outbound.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
+    temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
+    context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
+
+
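A minimal sketch (not from the diff) of the kind of relationship lookup these `_inbound`/`_outbound` fields and their indexes support, assuming a materialized `alldocs` collection reachable via a pymongo database handle `mdb` and a hypothetical list of IDs:

```python
biosample_ids = ["nmdc:bsm-11-abc123"]  # hypothetical IDs
# Find documents whose outbound links point at any of the given IDs,
# regardless of which document-reference-ranged slot produced the link.
related = mdb.alldocs.find(
    {"_outbound.id": {"$in": biosample_ids}},
    {"id": 1, "type": 1, "_id": 0},
)
```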
+# Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
+# pass an argument to the op (in order to specify the order of the ops in the graph)
+# while also telling Dagster that this op doesn't need the _value_ of that argument.
+# This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
+# Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
+#
+@op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
 def materialize_alldocs(context) -> int:
     """
-    This function re
-
+    This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
+
+    1. Getting all populated schema collection names with an `id` field.
+    2. Create a temporary collection to build the new alldocs collection.
+    3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
+    4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
+    5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}.type`/`.id` compound indexes.
+    7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
+
+    The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
+    `nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
+    such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
+
+    The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
+    `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
+    related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
+
+    The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
+    that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
+    expansions.
     """
     mdb = context.resources.mongo.db
     schema_view = nmdc_schema_view()

-    # batch size for writing documents to alldocs
-    BULK_WRITE_BATCH_SIZE = 2000
-
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
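The "Nothing dependency" on `waits_for` works as in the Dagster docs linked above: an upstream op's output is passed in the graph purely for ordering, and no value reaches the op body. A self-contained sketch (the op and graph names here are hypothetical):

```python
from dagster import In, Nothing, graph, op

@op
def prepare_data():
    ...  # some upstream step

@op(ins={"waits_for": In(dagster_type=Nothing)})
def rebuild_index():
    ...  # runs after `prepare_data`, but receives no value from it

@graph
def ordered_steps():
    rebuild_index(waits_for=prepare_data())
```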
@@ -1100,7 +1365,14 @@ def materialize_alldocs(context) -> int:
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             try:
-
+                # Keep the full type with prefix for document
+                doc_type_full = doc["type"]
+                # Remove prefix for slot lookup and ancestor lookup
+                doc_type = (
+                    doc_type_full[5:]
+                    if doc_type_full.startswith("nmdc:")
+                    else doc_type_full
+                )
             except KeyError:
                 raise Exception(
                     f"doc {doc['id']} in collection {coll_name} has no 'type'!"
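The guarded `[5:]` slice above encodes `len("nmdc:") == 5`. On Python 3.9+, `str.removeprefix` expresses the same intent self-documentingly (a stylistic alternative, not what the release ships):

```python
doc_type = doc_type_full.removeprefix("nmdc:")  # unchanged if the prefix is absent
```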
@@ -1109,13 +1381,21 @@ def materialize_alldocs(context) -> int:
                 doc_type
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
+
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
+            # InsertOne is a method on the py-mongo Client class.
+            # Get ancestors without the prefix, but add prefix to each one in the output
+            ancestors = schema_view.class_ancestors(doc_type)
+            new_doc["_type_and_ancestors"] = [
+                "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
+            ]
             write_operations.append(InsertOne(new_doc))
             if len(write_operations) == BULK_WRITE_BATCH_SIZE:
                 _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
                 write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
         if len(write_operations) > 0:
+            # here bulk_write is a method on the py-mongo db Client class
             _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
             documents_processed_counter += len(write_operations)
         context.log.info(
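Two of the comments added in this hunk are misleading as written: `InsertOne` is a bulk-write request class imported from the `pymongo` package (not a method on a client), and `bulk_write` is a method of pymongo's `Collection`. The batching pattern itself, in isolation (`source_docs` and `target_collection` are hypothetical stand-ins):

```python
from pymongo import InsertOne

ops = []
for doc in source_docs:  # hypothetical iterable of dicts
    ops.append(InsertOne(doc))
    if len(ops) == BULK_WRITE_BATCH_SIZE:
        target_collection.bulk_write(ops, ordered=False)  # Collection.bulk_write
        ops.clear()
if ops:  # flush the final partial batch
    target_collection.bulk_write(ops, ordered=False)
```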
@@ -1136,10 +1416,20 @@ def materialize_alldocs(context) -> int:
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")

+    # Add related-ids fields to enable efficient relationship traversal
+    context.log.info("Adding fields for related ids to documents...")
+    _add_related_ids_to_alldocs(
+        temp_alldocs_collection, context, document_reference_ranged_slots
+    )
+
     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)

-
+    n_alldocs_documents = mdb.alldocs.estimated_document_count()
+    context.log.info(
+        f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
+    )
+    return n_alldocs_documents


 @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
@@ -1361,6 +1651,55 @@ def generate_biosample_set_for_nmdc_study_from_gold(
     return database


+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def run_script_to_update_insdc_biosample_identifiers(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> Dict[str, Any]:
+    """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
+
+    This op uses the DatabaseUpdater to generate a script that can be used to update biosample
+    records with INSDC identifiers obtained from GOLD.
+
+    Args:
+        context: The execution context
+        nmdc_study_id: The NMDC study ID for which to generate the update script
+        gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
+
+    Returns:
+        A dictionary containing the MongoDB update script
+    """
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
+
+    context.log.info(
+        f"Generated update script for study {nmdc_study_id} with {len(update_script.get('updates', []))} updates"
+    )
+
+    return update_script
+
+
 @op
 def log_database_ids(
     context: OpExecutionContext,
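The diff does not show the shape of `update_script`; given the `update_script.get('updates', [])` access above, a plausible shape is a MongoDB update command. The document below is purely illustrative; the real value comes from `DatabaseUpdater.queries_run_script_to_update_insdc_identifiers`:

```python
# Hypothetical illustration only.
update_script = {
    "update": "biosample_set",
    "updates": [
        {
            "q": {"id": "nmdc:bsm-11-abc123"},  # made-up ID
            "u": {"$addToSet": {"insdc_biosample_identifiers": "biosample:SAMN00000001"}},
        }
    ],
}
```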
@@ -1382,3 +1721,55 @@ def log_database_ids(
         message += "\n"
     if message:
         context.log.info(message)
+
+
+@op(
+    description="Render free text through the Dagit UI",
+    out=Out(description="Text content rendered through Dagit UI"),
+)
+def render_text(context: OpExecutionContext, text: Any):
+    """
+    Renders content as a Dagster Asset in the Dagit UI.
+
+    This operation creates a Dagster Asset with the provided content, making it
+    visible in the Dagit UI for easy viewing and sharing.
+
+    Args:
+        context: The execution context
+        text: The content to render (can be a string or a dictionary that will be converted to JSON)
+
+    Returns:
+        The same content that was provided as input
+    """
+    # Convert dictionary to formatted JSON string if needed
+    if isinstance(text, dict):
+        import json
+
+        content = json.dumps(text, indent=2)
+        file_extension = "json"
+        hash_text = json.dumps(text, sort_keys=True)[:20]  # For consistent hashing
+    else:
+        content = str(text)  # Convert to string in case it's not already
+        file_extension = "txt"
+        hash_text = content[:20]
+
+    filename = f"rendered_text_{context.run_id}.{file_extension}"
+    file_path = os.path.join(context.instance.storage_directory(), filename)
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w") as f:
+        f.write(content)
+
+    context.log_event(
+        AssetMaterialization(
+            asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
+            description="Rendered Content",
+            metadata={
+                "file_path": MetadataValue.path(file_path),
+                "content": MetadataValue.text(content),
+            },
+        )
+    )
+
+    return Output(text)
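One way the two new ops could be wired together, rendering the generated script in Dagit (the graph and the `get_study_id`/`get_instrument_map` ops are hypothetical stand-ins; the release's actual wiring lives in `nmdc_runtime/site/graphs.py`, also changed in this release):

```python
from dagster import graph

@graph
def update_insdc_biosample_identifiers():
    # `get_study_id` and `get_instrument_map` stand in for whatever ops
    # supply these inputs in the real repository graph.
    update_script = run_script_to_update_insdc_biosample_identifiers(
        get_study_id(), get_instrument_map()
    )
    render_text(update_script)
```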