nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/ops.py
CHANGED
@@ -1,21 +1,21 @@
 import csv
 import json
-import …
+import logging
 import os
 import subprocess
-import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
-from io import BytesIO
+from io import BytesIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
-from typing import Tuple
+from typing import Tuple, Set
 from zipfile import ZipFile
 from itertools import chain
-
+from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
-
+from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+from toolz import dissoc

 from bson import ObjectId, json_util
 from dagster import (
@@ -26,6 +26,7 @@ from dagster import (
     Failure,
     List,
     MetadataValue,
+    Noneable,
     OpExecutionContext,
     Out,
     Output,
@@ -36,12 +37,13 @@ from dagster import (
     Optional,
     Field,
     Permissive,
-…
+    In,
+    Nothing,
 )
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
-from nmdc_runtime.api.db.mongo import …
+from nmdc_runtime.api.db.mongo import validate_json
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -71,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
 )
-from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
     GoldApiClient,
@@ -93,30 +94,26 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 )
 from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
 from nmdc_runtime.site.util import (
-    run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
     get_instruments_by_id,
 )
 from nmdc_runtime.util import (
-    drs_object_in_for,
-    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
-    put_object,
-    validate_json,
     specialize_activity_set_docs,
     collection_name_to_class_names,
-    class_hierarchy_as_list,
     nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
-from …
-from pydantic import BaseModel
-from pymongo import InsertOne
+from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
-from …
-from toolz import …
+from pymongo.collection import Collection as MongoCollection
+from toolz import get_in, valfilter, identity
+
+
+# batch size for writing documents to alldocs
+BULK_WRITE_BATCH_SIZE = 2000


 @op
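Note on the new module-level constant: `BULK_WRITE_BATCH_SIZE` caps how many pymongo write models are buffered before each `bulk_write` call, a pattern used by both the `alldocs` build and the relationship-linking pass later in this file. A minimal sketch of that batching pattern, with an illustrative collection handle and documents (not code from this diff):

from pymongo import MongoClient, InsertOne

BULK_WRITE_BATCH_SIZE = 2000  # flush buffered writes every 2000 documents


def batched_insert(collection, docs):
    """Buffer InsertOne models and flush them in fixed-size batches."""
    ops = []
    for doc in docs:
        ops.append(InsertOne(doc))
        if len(ops) >= BULK_WRITE_BATCH_SIZE:
            # ordered=False lets the server apply writes in parallel
            collection.bulk_write(ops, ordered=False)
            ops.clear()
    if ops:  # flush the final partial batch
        collection.bulk_write(ops, ordered=False)


# usage (hypothetical connection details):
# mdb = MongoClient("mongodb://localhost:27017")["nmdc"]
# batched_insert(mdb["alldocs"], ({"id": f"nmdc:doc-{i}", "type": "nmdc:NamedThing"} for i in range(5000)))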
@@ -148,99 +145,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names


-@op(
-    required_resource_keys={"mongo", "runtime_api_site_client"},
-    retry_policy=RetryPolicy(max_retries=2),
-)
-def local_file_to_api_object(context, file_info):
-    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    storage_path: str = file_info["storage_path"]
-    mime_type = file_info.get("mime_type")
-    if mime_type is None:
-        mime_type = mimetypes.guess_type(storage_path)[0]
-    rv = client.put_object_in_site(
-        {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-    )
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object_in_site failed: {rv.content}")
-    op = rv.json()
-    context.log.info(f"put_object_in_site: {op}")
-    rv = put_object(storage_path, op["metadata"]["url"])
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object failed: {rv.content}")
-    op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-    rv = client.update_operation(op["id"], op_patch)
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description="update_operation failed")
-    op = rv.json()
-    context.log.info(f"update_operation: {op}")
-    rv = client.create_object_from_op(op)
-    if rv.status_code != status.HTTP_201_CREATED:
-        raise Failure("create_object_from_op failed")
-    obj = rv.json()
-    context.log.info(f'Created /objects/{obj["id"]}')
-    mdb = context.resources.mongo.db
-    rv = mdb.operations.delete_one({"id": op["id"]})
-    if rv.deleted_count != 1:
-        context.log.error("deleting op failed")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["object", obj["name"]]),
-        description="output of metadata-translation run_etl",
-        metadata={"object_id": MetadataValue.text(obj["id"])},
-    )
-    yield Output(obj)
-
-
-@op(
-    out={
-        "merged_data_path": Out(
-            str,
-            description="path to TSV merging of source metadata",
-        )
-    }
-)
-def build_merged_db(context) -> str:
-    context.log.info("metadata-translation: running `make build-merged-db`")
-    run_and_log(
-        "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-    )
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-    )
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-        description="input to metadata-translation run_etl",
-        metadata={"path": MetadataValue.path(storage_path)},
-    )
-    yield Output(storage_path, "merged_data_path")
-
-
-@op(
-    required_resource_keys={"runtime_api_site_client"},
-)
-def run_etl(context, merged_data_path: str):
-    context.log.info("metadata-translation: running `make run-etl`")
-    if not os.path.exists(merged_data_path):
-        raise Failure(description=f"merged_db not present at {merged_data_path}")
-    run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-    )
-    with ZipFile(storage_path) as zf:
-        name = zf.namelist()[0]
-        with zf.open(name) as f:
-            rv = json.load(f)
-    context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-        description="output of metadata-translation run_etl",
-        metadata={
-            "path": MetadataValue.path(storage_path),
-        },
-    )
-    yield Output({"storage_path": storage_path})
-
-
 @op(required_resource_keys={"mongo"})
 def get_operation(context):
     mdb = context.resources.mongo.db
@@ -465,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):

 @op(required_resource_keys={"runtime_api_site_client"})
 def get_json_in(context):
+    """
+    TODO: Document this function.
+    """
     object_id = context.op_config.get("object_id")
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     rv = client.get_object_bytes(object_id)
@@ -475,63 +382,17 @@ def get_json_in(context):
     return rv.json()


-def ensure_data_object_type(docs: Dict[str, list], mdb: MongoDatabase):
-    """Does not ensure ordering of `docs`."""
-
-    if ("data_object_set" not in docs) or len(docs["data_object_set"]) == 0:
-        return docs, 0
-
-    do_docs = docs["data_object_set"]
-
-    class FileTypeEnumBase(BaseModel):
-        name: str
-        description: str
-        filter: str  # JSON-encoded data_object_set mongo collection filter document
-
-    class FileTypeEnum(FileTypeEnumBase):
-        id: str
-
-    temp_collection_name = f"tmp.data_object_set.{ObjectId()}"
-    temp_collection = mdb[temp_collection_name]
-    temp_collection.insert_many(do_docs)
-    temp_collection.create_index("id")
-
-    def fte_matches(fte_filter: str):
-        return [
-            dissoc(d, "_id") for d in mdb.temp_collection.find(json.loads(fte_filter))
-        ]
-
-    do_docs_map = {d["id"]: d for d in do_docs}
-
-    n_docs_with_types_added = 0
-
-    for fte_doc in mdb.file_type_enum.find():
-        fte = FileTypeEnum(**fte_doc)
-        docs_matching = fte_matches(fte.filter)
-        for doc in docs_matching:
-            if "data_object_type" not in doc:
-                do_docs_map[doc["id"]] = assoc(doc, "data_object_type", fte.id)
-                n_docs_with_types_added += 1
-
-    mdb.drop_collection(temp_collection_name)
-    return (
-        assoc(
-            docs, "data_object_set", [dissoc(v, "_id") for v in do_docs_map.values()]
-        ),
-        n_docs_with_types_added,
-    )
-
-
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
+    """
+    TODO: Document this function.
+    """
     mongo = context.resources.mongo
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     op_id = context.op_config.get("operation_id")

     docs = json_in
     docs, _ = specialize_activity_set_docs(docs)
-    docs, n_docs_with_types_added = ensure_data_object_type(docs, mongo.db)
-    context.log.info(f"added `data_object_type` to {n_docs_with_types_added} docs")
     context.log.debug(f"{docs}")

     rv = validate_json(
@@ -555,6 +416,9 @@ def perform_mongo_updates(context, json_in):
 def _add_schema_docs_with_or_without_replacement(
     mongo: MongoDBResource, docs: Dict[str, list]
 ):
+    """
+    TODO: Document this function.
+    """
     coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -578,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
         f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
     )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-…
+
+    # Translate the operation result into a dictionary in which each item's key is a collection name
+    # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+    return {
+        collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+        for collection_name, bulk_write_result in op_result.items()
+    }


 @op(required_resource_keys={"mongo"})
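For context, `toolz.dissoc` (newly imported above) returns a copy of a mapping without the given key and leaves the original untouched, which is why the new return value can safely drop the potentially large "upserted" array. A quick illustration with a stand-in for a pymongo `bulk_api_result` document:

from toolz import dissoc

# Stand-in for a pymongo BulkWriteResult.bulk_api_result document.
bulk_api_result = {"nInserted": 5, "nUpserted": 1, "upserted": [{"index": 0, "_id": "..."}]}

summary = dissoc(bulk_api_result, "upserted")  # drop the "upserted" array
assert summary == {"nInserted": 5, "nUpserted": 1}
assert "upserted" in bulk_api_result  # the original dict is not mutated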
@@ -600,22 +470,25 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
         "study_type": str,
         "gold_nmdc_instrument_mapping_file_url": str,
         "include_field_site_info": bool,
+        "enable_biosample_filtering": bool,
     },
     out={
         "study_id": Out(str),
         "study_type": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
         "include_field_site_info": Out(bool),
+        "enable_biosample_filtering": Out(bool),
     },
 )
 def get_gold_study_pipeline_inputs(
     context: OpExecutionContext,
-) -> Tuple[str, str, str, bool]:
+) -> Tuple[str, str, str, bool, bool]:
     return (
         context.op_config["study_id"],
         context.op_config["study_type"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
         context.op_config["include_field_site_info"],
+        context.op_config["enable_biosample_filtering"],
     )

@@ -659,6 +532,7 @@ def nmdc_schema_database_from_gold_study(
     analysis_projects: List[Dict[str, Any]],
     gold_nmdc_instrument_map_df: pd.DataFrame,
     include_field_site_info: bool,
+    enable_biosample_filtering: bool,
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -674,6 +548,7 @@ def nmdc_schema_database_from_gold_study(
         analysis_projects,
         gold_nmdc_instrument_map_df,
         include_field_site_info,
+        enable_biosample_filtering,
         id_minter=id_minter,
     )
     database = translator.get_database()
@@ -681,27 +556,39 @@ def nmdc_schema_database_from_gold_study(


 @op(
+    required_resource_keys={"mongo"},
     out={
         "submission_id": Out(),
         "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
         "data_object_mapping_file_url": Out(Optional[str]),
         "biosample_extras_file_url": Out(Optional[str]),
         "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+        "study_id": Out(Optional[str]),
     },
 )
 def get_submission_portal_pipeline_inputs(
+    context: OpExecutionContext,
     submission_id: str,
     nucleotide_sequencing_mapping_file_url: Optional[str],
     data_object_mapping_file_url: Optional[str],
     biosample_extras_file_url: Optional[str],
     biosample_extras_slot_mapping_file_url: Optional[str],
-…
+    study_id: Optional[str],
+) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+    # query for studies matching the ID to see if it eists
+    if study_id:
+        mdb = context.resources.mongo.db
+        result = mdb.study_set.find_one({"id": study_id})
+        if not result:
+            raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
     return (
         submission_id,
         nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     )

@@ -726,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
+    study_id: Optional[str],
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client

@@ -743,11 +631,37 @@ def translate_portal_submission_to_nmdc_schema_database(
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         illumina_instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
     database = translator.get_database()
     return database


+@op(required_resource_keys={"nmdc_portal_api_client"})
+def add_public_image_urls(
+    context: OpExecutionContext, database: nmdc.Database, submission_id: str
+) -> nmdc.Database:
+    client: NmdcPortalApiClient = context.resources.nmdc_portal_api_client
+
+    if len(database.study_set) != 1:
+        raise Failure(
+            description="Expected exactly one study in the database to add public image URLs."
+        )
+
+    study_id = database.study_set[0].id
+    public_images = client.make_submission_images_public(
+        submission_id, study_id=study_id
+    )
+    SubmissionPortalTranslator.set_study_images(
+        database.study_set[0],
+        public_images.get("pi_image_url"),
+        public_images.get("primary_study_image_url"),
+        public_images.get("study_image_urls"),
+    )
+
+    return database
+
+
 @op
 def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
     source_id = None
@@ -1043,18 +957,246 @@ def site_code_mapping() -> dict:
     )


-@op(
-…
+@op(
+    required_resource_keys={"mongo"},
+    config_schema={
+        "source_ontology": str,
+        "output_directory": Field(Noneable(str), default_value=None, is_required=False),
+        "generate_reports": Field(bool, default_value=True, is_required=False),
+    },
+)
+def load_ontology(context: OpExecutionContext):
+    cfg = context.op_config
+    source_ontology = cfg["source_ontology"]
+    output_directory = cfg.get("output_directory")
+    generate_reports = cfg.get("generate_reports", True)
+
+    if output_directory is None:
+        output_directory = os.path.join(os.getcwd(), "ontology_reports")
+
+    # Redirect Python logging to Dagster context
+    handler = logging.Handler()
+    handler.emit = lambda record: context.log.info(record.getMessage())
+
+    # Get logger from ontology-loader package
+    controller_logger = logging.getLogger("ontology_loader.ontology_load_controller")
+    controller_logger.setLevel(logging.INFO)
+    controller_logger.addHandler(handler)
+
+    context.log.info(f"Running Ontology Loader for ontology: {source_ontology}")
+    loader = OntologyLoaderController(
+        source_ontology=source_ontology,
+        output_directory=output_directory,
+        generate_reports=generate_reports,
+        mongo_client=context.resources.mongo.client,
+        db_name=context.resources.mongo.db.name,
+    )
+
+    loader.run_ontology_loader()
+    context.log.info(f"Ontology load for {source_ontology} completed successfully!")
+
+
+def _add_linked_instances_to_alldocs(
+    temp_collection: MongoCollection,
+    context: OpExecutionContext,
+    document_reference_ranged_slots_by_type: dict,
+) -> None:
+    """
+    Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
+
+    The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to another document that either links to or is linked from the document via
+    document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+    document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+    considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
+
+    Args:
+        temp_collection: The temporary MongoDB collection to process
+        context: The Dagster execution context for logging
+        document_reference_ranged_slots_by_type: Dictionary mapping document types to their reference-ranged slot names
+
+    Returns:
+        None (modifies the documents in place)
+    """
+
+    context.log.info(
+        "Building relationships and adding `_upstream` and `_downstream` fields..."
+    )
+
+    # document ID -> type (with "nmdc:" prefix preserved)
+    id_to_type_map: Dict[str, str] = {}
+
+    # set of (<referencing document ID>, <slot>, <referenced document ID>) 3-tuples.
+    relationship_triples: Set[Tuple[str, str, str]] = set()
+
+    # Collect relationship triples.
+    for doc in temp_collection.find():
+        doc_id = doc["id"]
+        # Store the full type with prefix intact
+        doc_type = doc["type"]
+        # For looking up reference slots, we still need the type without prefix
+        doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
+
+        # Record ID to type mapping - preserve the original type with prefix
+        id_to_type_map[doc_id] = doc_type
+
+        # Find all document references from this document
+        reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
+        for slot in reference_slots:
+            if slot in doc:
+                # Handle both single-value and array references
+                refs = doc[slot] if isinstance(doc[slot], list) else [doc[slot]]
+                for ref_doc in temp_collection.find(
+                    {"id": {"$in": refs}}, ["id", "type"]
+                ):
+                    id_to_type_map[ref_doc["id"]] = ref_doc["type"]
+                for ref_id in refs:
+                    relationship_triples.add((doc_id, slot, ref_id))

+    context.log.info(
+        f"Found {len(id_to_type_map)} documents, with "
+        f"{len({d for (d, _, _) in relationship_triples})} containing references"
+    )
+
+    # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
+    # in order to perform graph traversal and collect all entities "related" to a given entity without
+    # recursion "exploding".
+    #
+    # Note: We are hard-coding this "direction" information here in the Runtime
+    # because the NMDC schema does not currently contain or expose it.
+    #
+    # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+    upstream_document_reference_ranged_slots = [
+        "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+        "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+        "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+        "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+        "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+        "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+        "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
+    ]
+    # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+    downstream_document_reference_ranged_slots = [
+        "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+        "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+        "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+        "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+        "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
+        # Note: I don't think of superseding something as being either upstream or downstream of that thing;
+        # but this function requires every document-reference-ranged slot to be accounted for in one
+        # list or the other, and the superseding thing does arise _later_ than the thing it supersedes,
+        # so I have opted to treat the superseding thing as being downstream.
+        "superseded_by",  # when a `nmdc:WorkflowExecution` or `nmdc:DataObject` is superseded by a `nmdc:WorkflowExecution`.
+    ]
+
+    unique_document_reference_ranged_slot_names = set()
+    for slot_names in document_reference_ranged_slots_by_type.values():
+        for slot_name in slot_names:
+            unique_document_reference_ranged_slot_names.add(slot_name)
+    context.log.info(f"{unique_document_reference_ranged_slot_names=}")
+    if len(upstream_document_reference_ranged_slots) + len(
+        downstream_document_reference_ranged_slots
+    ) != len(unique_document_reference_ranged_slot_names):
+        raise Failure(
+            "Number of detected unique document-reference-ranged slot names does not match "
+            "sum of accounted-for upstream and downstream document-reference-ranged slot names."
+        )
+
+    # Construct, and update documents with, `_upstream` and `_downstream` field values.
+    #
+    # manage batching of MongoDB `bulk_write` operations
+    bulk_operations, update_count = [], 0
+    for doc_id, slot, ref_id in relationship_triples:
+
+        # Determine in which respective fields to push this relationship
+        # for the subject (doc) and object (ref) of this triple.
+        if slot in upstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_upstream", "_downstream"
+        elif slot in downstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_downstream", "_upstream"
+        else:
+            raise Failure(f"Unknown slot {slot} for document {doc_id}")
+
+        updates = [
+            {
+                "filter": {"id": doc_id},
+                "update": {
+                    "$push": {
+                        field_for_doc: {
+                            "id": ref_id,
+                            # TODO existing tests are failing due to `KeyError`s for `id_to_type_map.get[ref_id]` here,
+                            # which acts as an implicit referential integrity checker (!). Using `.get` with
+                            # "nmdc:NamedThing" as default in order to (for now) allow such tests to continue to pass.
+                            "type": id_to_type_map.get(ref_id, "nmdc:NamedThing"),
+                        }
+                    }
+                },
+            },
+            {
+                "filter": {"id": ref_id},
+                "update": {
+                    "$push": {
+                        field_for_ref: {"id": doc_id, "type": id_to_type_map[doc_id]}
+                    }
+                },
+            },
+        ]
+        for update in updates:
+            bulk_operations.append(UpdateOne(**update))
+
+        # Execute in batches for efficiency
+        if len(bulk_operations) >= BULK_WRITE_BATCH_SIZE:
+            temp_collection.bulk_write(bulk_operations)
+            update_count += len(bulk_operations)
+            context.log.info(
+                f"Pushed {update_count/(2*len(relationship_triples)):.1%} of updates so far..."
+            )
+            bulk_operations = []
+
+    # Execute any remaining operations
+    if bulk_operations:
+        temp_collection.bulk_write(bulk_operations)
+        update_count += len(bulk_operations)
+
+    context.log.info(f"Pushed {update_count} updates in total")
+
+
+# Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
+# pass an argument to the op (in order to specify the order of the ops in the graph)
+# while also telling Dagster that this op doesn't need the _value_ of that argument.
+# This is the approach shown on: https://docs.dagster.io/api/dagster/types#dagster.Nothing
+# Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
+#
+@op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
+def materialize_alldocs(context: OpExecutionContext) -> int:
     """
-    This function re…
-
+    This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
+
+    1. Getting all populated schema collection names with an `id` field.
+    2. Create a temporary collection to build the new alldocs collection.
+    3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
+    4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
+    5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
+    7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
+
+    The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
+    `nmdc_runtime.site.repository.ensure_alldocs_daily`. The collection is also updated as part of various workflows,
+    such as when applying a changesheet or metadata updates (see `nmdc_runtime.site.graphs`).
+
+    The `alldocs` collection is used primarily by API endpoints like `/data_objects/study/{study_id}` and
+    `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
+    related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
+
+    The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
+    that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
+    expansions.
     """
     mdb = context.resources.mongo.db
     schema_view = nmdc_schema_view()

-    # batch size for writing documents to alldocs
-    BULK_WRITE_BATCH_SIZE = 2000
-
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
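For context on the "Nothing dependency" comment added above: it lets a Dagster graph order `materialize_alldocs` after another op without passing real data between them. A minimal sketch of that wiring, with illustrative op names (not code from this diff):

from dagster import In, Nothing, graph, op


@op
def apply_metadata_updates() -> Nothing:
    ...  # perform writes whose completion the rebuild must wait for


@op(ins={"waits_for": In(dagster_type=Nothing)})
def rebuild_alldocs():
    ...  # receives no value; the input only enforces ordering


@graph
def update_then_rebuild():
    # Wiring the Nothing output to `waits_for` sequences the two ops.
    rebuild_alldocs(waits_for=apply_metadata_updates())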
@@ -1079,14 +1221,16 @@ def materialize_alldocs(context) -> int:
         )
     )

-…
+    document_reference_ranged_slots_by_type = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
             if (
                 set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
                 & document_referenceable_ranges
             ):
-…
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )

     # Build `alldocs` to a temporary collection for atomic replacement
     # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
@@ -1100,22 +1244,31 @@ def materialize_alldocs(context) -> int:
        documents_processed_counter = 0
        for doc in mdb[coll_name].find():
            try:
-…
+                # Keep the full type with prefix for document
+                doc_type_full = doc["type"]
+                # Remove prefix for slot lookup and ancestor lookup
+                doc_type = doc_type_full.removeprefix("nmdc:")
            except KeyError:
                raise Exception(
                    f"doc {doc['id']} in collection {coll_name} has no 'type'!"
                )
-            slots_to_include = ["id", "type"] + …
-…
+            slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+                doc_type_full
            ]
            new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
-
+
+            # Get ancestors without the prefix, but add prefix to each one in the output
+            new_doc["_type_and_ancestors"] = [
+                f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
+            ]
+            # InsertOne is a pymongo representation of a mongo command.
            write_operations.append(InsertOne(new_doc))
            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
                write_operations.clear()
                documents_processed_counter += BULK_WRITE_BATCH_SIZE
        if len(write_operations) > 0:
+            # here bulk_write is a method on the pymongo db Collection class
            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
            documents_processed_counter += len(write_operations)
        context.log.info(
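For context, the `_type_and_ancestors` field written above turns subtype-aware lookups into a flat array match, since each document lists its own class plus every ancestor class. A sketch of the kind of query this enables (connection details are illustrative):

from pymongo import MongoClient

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # hypothetical connection

# Matches documents whose type is nmdc:DataGeneration or any subclass of it,
# because each document's _type_and_ancestors lists its class and all ancestors.
for doc in mdb.alldocs.find(
    {"_type_and_ancestors": "nmdc:DataGeneration"},
    {"_id": 0, "id": 1, "type": 1},
):
    print(doc["id"], doc["type"])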
@@ -1132,14 +1285,36 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
-    slots_to_index = …
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
-    context.log.info(f"created indexes on id …
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
+
+    # Add related-ids fields to enable efficient relationship traversal
+    context.log.info("Adding fields for related ids to documents...")
+    _add_linked_instances_to_alldocs(
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
+    )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
+    )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")

     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
-
+    n_alldocs_documents = mdb.alldocs.estimated_document_count()
+    context.log.info(
+        f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
+    )
+    return n_alldocs_documents


 @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
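For context, the compound `(_upstream.type, _upstream.id)` index created above supports lookups that filter on both the type and ID of linked entities. A sketch of such a lookup (connection details and the study ID are illustrative):

from pymongo import MongoClient

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # hypothetical connection

# Find documents that list a given study among their upstream links; the
# (_upstream.type, _upstream.id) compound index can satisfy this filter.
cursor = mdb.alldocs.find(
    {"_upstream": {"$elemMatch": {"type": "nmdc:Study", "id": "nmdc:sty-11-abc123"}}},
    {"_id": 0, "id": 1, "type": 1},
)
# cursor.explain() can confirm which index the query planner chose.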
@@ -1225,6 +1400,42 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep


+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
 @op(required_resource_keys={"mongo"})
 def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
@@ -1258,6 +1469,7 @@ def ncbi_submission_xml_from_nmdc_study(
     data_object_records: list,
     library_preparation_records: list,
     all_instruments: dict,
+    pooled_biosamples_data: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1266,6 +1478,7 @@ def ncbi_submission_xml_from_nmdc_study(
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     return ncbi_xml

@@ -1282,16 +1495,24 @@ def post_submission_portal_biosample_ingest_record_stitching_filename(
     config_schema={
         "nmdc_study_id": str,
         "gold_nmdc_instrument_mapping_file_url": str,
+        "include_field_site_info": bool,
+        "enable_biosample_filtering": bool,
     },
     out={
         "nmdc_study_id": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
+        "include_field_site_info": Out(bool),
+        "enable_biosample_filtering": Out(bool),
     },
 )
-def get_database_updater_inputs(…
+def get_database_updater_inputs(
+    context: OpExecutionContext,
+) -> Tuple[str, str, bool, bool]:
     return (
         context.op_config["nmdc_study_id"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
+        context.op_config["include_field_site_info"],
+        context.op_config["enable_biosample_filtering"],
     )

@@ -1306,6 +1527,8 @@ def generate_data_generation_set_post_biosample_ingest(
     context: OpExecutionContext,
     nmdc_study_id: str,
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
 ) -> nmdc.Database:
     runtime_api_user_client: RuntimeApiUserClient = (
         context.resources.runtime_api_user_client
@@ -1321,6 +1544,8 @@ def generate_data_generation_set_post_biosample_ingest(
         gold_api_client,
         nmdc_study_id,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database = (
         database_updater.generate_data_generation_set_records_from_gold_api_for_study()
@@ -1340,6 +1565,8 @@ def generate_biosample_set_for_nmdc_study_from_gold(
     context: OpExecutionContext,
     nmdc_study_id: str,
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool = False,
+    enable_biosample_filtering: bool = False,
 ) -> nmdc.Database:
     runtime_api_user_client: RuntimeApiUserClient = (
         context.resources.runtime_api_user_client
@@ -1355,12 +1582,72 @@ def generate_biosample_set_for_nmdc_study_from_gold(
         gold_api_client,
         nmdc_study_id,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database = database_updater.generate_biosample_set_from_gold_api_for_study()

     return database


+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    },
+    out=Out(Any),
+)
+def run_script_to_update_insdc_biosample_identifiers(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
+    enable_biosample_filtering: bool,
+):
+    """Generates a MongoDB update script to add INSDC biosample identifiers to biosamples.
+
+    This op uses the DatabaseUpdater to generate a script that can be used to update biosample
+    records with INSDC identifiers obtained from GOLD.
+
+    Args:
+        context: The execution context
+        nmdc_study_id: The NMDC study ID for which to generate the update script
+        gold_nmdc_instrument_map_df: A dataframe mapping GOLD instrument IDs to NMDC instrument set records
+
+    Returns:
+        A dictionary or list of dictionaries containing the MongoDB update script(s)
+    """
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
+    )
+    update_script = database_updater.queries_run_script_to_update_insdc_identifiers()
+
+    if isinstance(update_script, list):
+        total_updates = sum(len(item.get("updates", [])) for item in update_script)
+    else:
+        total_updates = len(update_script.get("updates", []))
+    context.log.info(
+        f"Generated update script for study {nmdc_study_id} with {total_updates} updates"
+    )
+
+    return update_script
+
+
 @op
 def log_database_ids(
     context: OpExecutionContext,
@@ -1382,3 +1669,55 @@ def log_database_ids(
         message += "\n"
     if message:
         context.log.info(message)
+
+
+@op(
+    description="Render free text through the Dagit UI",
+    out=Out(description="Text content rendered through Dagit UI"),
+)
+def render_text(context: OpExecutionContext, text: Any):
+    """
+    Renders content as a Dagster Asset in the Dagit UI.
+
+    This operation creates a Dagster Asset with the provided content, making it
+    visible in the Dagit UI for easy viewing and sharing.
+
+    Args:
+        context: The execution context
+        text: The content to render (can be a string or a dictionary that will be converted to JSON)
+
+    Returns:
+        The same content that was provided as input
+    """
+    # Convert dictionary to formatted JSON string if needed
+    if isinstance(text, dict):
+        import json
+
+        content = json.dumps(text, indent=2)
+        file_extension = "json"
+        hash_text = json.dumps(text, sort_keys=True)[:20]  # For consistent hashing
+    else:
+        content = str(text)  # Convert to string in case it's not already
+        file_extension = "txt"
+        hash_text = content[:20]
+
+    filename = f"rendered_text_{context.run_id}.{file_extension}"
+    file_path = os.path.join(context.instance.storage_directory(), filename)
+
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    with open(file_path, "w") as f:
+        f.write(content)
+
+    context.log_event(
+        AssetMaterialization(
+            asset_key=f"rendered_text_{hash_from_str(hash_text, 'md5')[:8]}",
+            description="Rendered Content",
+            metadata={
+                "file_path": MetadataValue.path(file_path),
+                "content": MetadataValue.text(content),
+            },
+        )
+    )
+
+    return Output(text)