nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nmdc-runtime has been flagged by the registry; see the registry listing for details.
- nmdc_runtime/Dockerfile +167 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/mongo.py +435 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +270 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +796 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +425 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +633 -13
- nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
- nmdc_runtime/site/graphs.py +8 -22
- nmdc_runtime/site/ops.py +147 -181
- nmdc_runtime/site/repository.py +2 -112
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +90 -48
- nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
- nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/ops.py
CHANGED
```diff
@@ -1,22 +1,21 @@
 import csv
 import json
 import logging
-import mimetypes
 import os
 import subprocess
-import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
-from io import BytesIO
+from io import BytesIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
-from typing import Tuple, Set
+from typing import Tuple, Set
 from zipfile import ZipFile
 from itertools import chain
 from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
-
+from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+from toolz import dissoc
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
-from nmdc_runtime.api.db.mongo import
+from nmdc_runtime.api.db.mongo import validate_json
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -74,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
 )
-from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
     GoldApiClient,
@@ -96,29 +94,23 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 )
 from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
 from nmdc_runtime.site.util import (
-    run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
     get_instruments_by_id,
 )
 from nmdc_runtime.util import (
-    drs_object_in_for,
-    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
-    put_object,
     specialize_activity_set_docs,
     collection_name_to_class_names,
-    class_hierarchy_as_list,
     nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
-from nmdc_schema.nmdc import Database as NMDCDatabase
-from pydantic import BaseModel
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
-from
-from toolz import
+from pymongo.collection import Collection as MongoCollection
+from toolz import get_in, valfilter, identity
+
 
 # batch size for writing documents to alldocs
 BULK_WRITE_BATCH_SIZE = 2000
@@ -153,99 +145,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names
 
 
-@op(
-    required_resource_keys={"mongo", "runtime_api_site_client"},
-    retry_policy=RetryPolicy(max_retries=2),
-)
-def local_file_to_api_object(context, file_info):
-    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    storage_path: str = file_info["storage_path"]
-    mime_type = file_info.get("mime_type")
-    if mime_type is None:
-        mime_type = mimetypes.guess_type(storage_path)[0]
-    rv = client.put_object_in_site(
-        {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-    )
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object_in_site failed: {rv.content}")
-    op = rv.json()
-    context.log.info(f"put_object_in_site: {op}")
-    rv = put_object(storage_path, op["metadata"]["url"])
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object failed: {rv.content}")
-    op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-    rv = client.update_operation(op["id"], op_patch)
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description="update_operation failed")
-    op = rv.json()
-    context.log.info(f"update_operation: {op}")
-    rv = client.create_object_from_op(op)
-    if rv.status_code != status.HTTP_201_CREATED:
-        raise Failure("create_object_from_op failed")
-    obj = rv.json()
-    context.log.info(f'Created /objects/{obj["id"]}')
-    mdb = context.resources.mongo.db
-    rv = mdb.operations.delete_one({"id": op["id"]})
-    if rv.deleted_count != 1:
-        context.log.error("deleting op failed")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["object", obj["name"]]),
-        description="output of metadata-translation run_etl",
-        metadata={"object_id": MetadataValue.text(obj["id"])},
-    )
-    yield Output(obj)
-
-
-@op(
-    out={
-        "merged_data_path": Out(
-            str,
-            description="path to TSV merging of source metadata",
-        )
-    }
-)
-def build_merged_db(context) -> str:
-    context.log.info("metadata-translation: running `make build-merged-db`")
-    run_and_log(
-        "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-    )
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-    )
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-        description="input to metadata-translation run_etl",
-        metadata={"path": MetadataValue.path(storage_path)},
-    )
-    yield Output(storage_path, "merged_data_path")
-
-
-@op(
-    required_resource_keys={"runtime_api_site_client"},
-)
-def run_etl(context, merged_data_path: str):
-    context.log.info("metadata-translation: running `make run-etl`")
-    if not os.path.exists(merged_data_path):
-        raise Failure(description=f"merged_db not present at {merged_data_path}")
-    run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-    )
-    with ZipFile(storage_path) as zf:
-        name = zf.namelist()[0]
-        with zf.open(name) as f:
-            rv = json.load(f)
-    context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-        description="output of metadata-translation run_etl",
-        metadata={
-            "path": MetadataValue.path(storage_path),
-        },
-    )
-    yield Output({"storage_path": storage_path})
-
-
 @op(required_resource_keys={"mongo"})
 def get_operation(context):
     mdb = context.resources.mongo.db
@@ -470,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
 
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_json_in(context):
+    """
+    TODO: Document this function.
+    """
     object_id = context.op_config.get("object_id")
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     rv = client.get_object_bytes(object_id)
@@ -482,6 +384,9 @@ def get_json_in(context):
 
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
+    """
+    TODO: Document this function.
+    """
     mongo = context.resources.mongo
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     op_id = context.op_config.get("operation_id")
@@ -511,6 +416,9 @@ def perform_mongo_updates(context, json_in):
 def _add_schema_docs_with_or_without_replacement(
     mongo: MongoDBResource, docs: Dict[str, list]
 ):
+    """
+    TODO: Document this function.
+    """
    coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -534,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
         f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
     )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-
+
+    # Translate the operation result into a dictionary in which each item's key is a collection name
+    # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+    return {
+        collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+        for collection_name, bulk_write_result in op_result.items()
+    }
 
 
 @op(required_resource_keys={"mongo"})
```
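The newly imported `toolz.dissoc` supports the reshaped return value of `_add_schema_docs_with_or_without_replacement` in the hunk above. A minimal sketch of the behavior, using a fabricated stand-in for one pymongo `bulk_api_result` payload (the real values come from `BulkWriteResult` objects in `op_result`):

```python
from toolz import dissoc

# Fabricated example payload; the "upserted" array can be large, so the op drops it.
bulk_api_result = {"nInserted": 2000, "nMatched": 0, "upserted": [{"index": 0, "_id": "abc"}]}

summary = dissoc(bulk_api_result, "upserted")  # returns a new dict; input is not mutated
assert "upserted" not in summary
assert "upserted" in bulk_api_result
```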
```diff
@@ -642,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
 
 
 @op(
+    required_resource_keys={"mongo"},
     out={
         "submission_id": Out(),
         "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
         "data_object_mapping_file_url": Out(Optional[str]),
         "biosample_extras_file_url": Out(Optional[str]),
         "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+        "study_id": Out(Optional[str]),
     },
 )
 def get_submission_portal_pipeline_inputs(
+    context: OpExecutionContext,
     submission_id: str,
     nucleotide_sequencing_mapping_file_url: Optional[str],
     data_object_mapping_file_url: Optional[str],
     biosample_extras_file_url: Optional[str],
     biosample_extras_slot_mapping_file_url: Optional[str],
-):
+    study_id: Optional[str],
+) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+    # query for studies matching the ID to see if it eists
+    if study_id:
+        mdb = context.resources.mongo.db
+        result = mdb.study_set.find_one({"id": study_id})
+        if not result:
+            raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
     return (
         submission_id,
         nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     )
 
 
@@ -687,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
+    study_id: Optional[str],
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -704,6 +631,7 @@ def translate_portal_submission_to_nmdc_schema_database(
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         illumina_instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
     database = translator.get_database()
     return database
@@ -1043,15 +971,19 @@ def load_ontology(context: OpExecutionContext):
     context.log.info(f"Ontology load for {source_ontology} completed successfully!")
 
 
-def _add_related_ids_to_alldocs(
-    temp_collection
+def _add_linked_instances_to_alldocs(
+    temp_collection: MongoCollection,
+    context: OpExecutionContext,
+    document_reference_ranged_slots_by_type: dict,
 ) -> None:
     """
-    Adds {`_inbound`,`_outbound`} fields to each document in the temporary alldocs collection.
+    Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
 
-    The {`_inbound`,`_outbound`} fields each contain an array of subdocuments, each with fields `id` and `type`.
-    Each subdocument represents a link to
-
+    The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to another document that either links to or is linked from the document via
+    document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+    document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+    considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
 
     Args:
         temp_collection: The temporary MongoDB collection to process
@@ -1063,7 +995,7 @@ def _add_related_ids_to_alldocs(
     """
 
     context.log.info(
-        "Building relationships and adding `_inbound` and `_outbound` fields..."
+        "Building relationships and adding `_upstream` and `_downstream` fields..."
     )
 
     # document ID -> type (with "nmdc:" prefix preserved)
@@ -1084,9 +1016,7 @@ def _add_related_ids_to_alldocs(
             id_to_type_map[doc_id] = doc_type
 
             # Find all document references from this document
-            reference_slots = document_reference_ranged_slots_by_type.get(
-                doc_type_no_prefix, []
-            )
+            reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
             for slot in reference_slots:
                 if slot in doc:
                     # Handle both single-value and array references
@@ -1103,34 +1033,32 @@ def _add_related_ids_to_alldocs(
         f"{len({d for (d, _, _) in relationship_triples})} containing references"
     )
 
-    # The bifurcation of document-reference-ranged slots as "inbound" and "outbound" is essential
+    # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
     # in order to perform graph traversal and collect all entities "related" to a given entity without
     # recursion "exploding".
     #
     # Note: We are hard-coding this "direction" information here in the Runtime
     # because the NMDC schema does not currently contain or expose it.
     #
-    # An "
-
-
-        "collected_from",  # a `nmdc:
-        "has_chromatography_configuration",  # a `nmdc:
-        "has_input",  # a `nmdc:
-        "has_mass_spectrometry_configuration",  # a `nmdc:
-        "instrument_used",  # a `nmdc:
-        "
-        "was_generated_by",  #
-        "was_informed_by",  #
+    # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+    upstream_document_reference_ranged_slots = [
+        "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+        "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+        "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+        "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+        "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+        "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+        "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
     ]
-    #
-
-
-        "
-        "
-        "
-        "
-        "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-        "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+    # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+    downstream_document_reference_ranged_slots = [
+        "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+        "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+        "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+        "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+        "uses_calibration",  # when a `nmdc:CalibrationInformation`is part of a `nmdc:PlannedProcess`.
     ]
 
     unique_document_reference_ranged_slot_names = set()
@@ -1138,15 +1066,15 @@ def _add_related_ids_to_alldocs(
         for slot_name in slot_names:
             unique_document_reference_ranged_slot_names.add(slot_name)
     context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-    if len(
-
+    if len(upstream_document_reference_ranged_slots) + len(
+        downstream_document_reference_ranged_slots
     ) != len(unique_document_reference_ranged_slot_names):
         raise Failure(
             "Number of detected unique document-reference-ranged slot names does not match "
-            "sum of accounted-for inbound and outbound document-reference-ranged slot names."
+            "sum of accounted-for upstream and downstream document-reference-ranged slot names."
         )
 
-    # Construct, and update documents with, `_inbound` and `_outbound` field values.
+    # Construct, and update documents with, `_upstream` and `_downstream` field values.
     #
     # manage batching of MongoDB `bulk_write` operations
     bulk_operations, update_count = [], 0
```
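To make the upstream/downstream bifurcation concrete, here is a minimal sketch (with invented IDs; not code from the package) of how a single `associated_studies` link is materialized in both directions, per the docstring above:

```python
# `associated_studies` is declared an "upstream" slot: the referenced Study is upstream
# of the Biosample that references it. So the Biosample document gains an `_upstream`
# entry for the Study, and the Study document gains a `_downstream` entry for the Biosample.
biosample = {
    "id": "nmdc:bsm-11-abc123",  # invented ID
    "type": "nmdc:Biosample",
    "associated_studies": ["nmdc:sty-11-xyz789"],  # invented ID
}

doc_id, slot, ref_id = biosample["id"], "associated_studies", biosample["associated_studies"][0]
field_for_doc, field_for_ref = "_upstream", "_downstream"  # slot is in the upstream list

expected_biosample_field = {field_for_doc: [{"id": ref_id, "type": "nmdc:Study"}]}
expected_study_field = {field_for_ref: [{"id": doc_id, "type": "nmdc:Biosample"}]}
```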
```diff
@@ -1154,10 +1082,10 @@ def _add_related_ids_to_alldocs(
 
         # Determine in which respective fields to push this relationship
         # for the subject (doc) and object (ref) of this triple.
-        if slot in
-            field_for_doc, field_for_ref = "
-        elif slot in
-            field_for_doc, field_for_ref = "
+        if slot in upstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_upstream", "_downstream"
+        elif slot in downstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_downstream", "_upstream"
         else:
             raise Failure(f"Unknown slot {slot} for document {doc_id}")
 
@@ -1204,14 +1132,6 @@ def _add_related_ids_to_alldocs(
 
     context.log.info(f"Pushed {update_count} updates in total")
 
-    context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-    temp_collection.create_index("_inbound.id")
-    temp_collection.create_index("_outbound.id")
-    # Create compound indexes to ensure index-covered queries
-    temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-    temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-    context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-
 
 # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
 # pass an argument to the op (in order to specify the order of the ops in the graph)
@@ -1220,7 +1140,7 @@ def _add_related_ids_to_alldocs(
 # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
 #
 @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
-def materialize_alldocs(context) -> int:
+def materialize_alldocs(context: OpExecutionContext) -> int:
     """
     This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
 
```
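The "Nothing dependency" pattern referenced in the comments above is standard Dagster; a self-contained toy example (op and job names invented here) of ordering two ops without passing a value between them:

```python
from dagster import In, Nothing, job, op


@op
def prepare() -> Nothing:
    # Side effects only; no value is passed downstream.
    pass


@op(ins={"waits_for": In(dagster_type=Nothing)})
def consume():
    # Runs only after `prepare` completes, without receiving any output from it.
    pass


@job
def ordered_job():
    consume(waits_for=prepare())
```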
```diff
@@ -1228,8 +1148,8 @@ def materialize_alldocs(context) -> int:
     2. Create a temporary collection to build the new alldocs collection.
     3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
     4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-    5. Add special `_inbound` and `_outbound` fields with subdocuments containing ID and type of related entities.
-    6. Add indexes for `id`, relationship fields, and `{_inbound,_outbound}{.id,(.type, .id)}` (compound) indexes.
+    5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
     7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
@@ -1240,7 +1160,7 @@ def materialize_alldocs(context) -> int:
     `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
     related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
 
-    The {`_inbound`,`_outbound`} fields enable efficient index-covered queries to find all entities of specific types
+    The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
     that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
     expansions.
     """
@@ -1271,14 +1191,16 @@ def materialize_alldocs(context) -> int:
         )
     )
 
-
+    document_reference_ranged_slots_by_type = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
             if (
                 set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
                 & document_referenceable_ranges
             ):
-
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )
 
     # Build `alldocs` to a temporary collection for atomic replacement
     # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
@@ -1295,34 +1217,28 @@ def materialize_alldocs(context) -> int:
             # Keep the full type with prefix for document
             doc_type_full = doc["type"]
             # Remove prefix for slot lookup and ancestor lookup
-            doc_type = (
-                doc_type_full[5:]
-                if doc_type_full.startswith("nmdc:")
-                else doc_type_full
-            )
+            doc_type = doc_type_full.removeprefix("nmdc:")
         except KeyError:
             raise Exception(
                 f"doc {doc['id']} in collection {coll_name} has no 'type'!"
            )
-        slots_to_include = ["id", "type"] +
-
+        slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+            doc_type_full
         ]
         new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
-        new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-        # InsertOne is a method on the py-mongo Client class.
         # Get ancestors without the prefix, but add prefix to each one in the output
-        ancestors = schema_view.class_ancestors(doc_type)
         new_doc["_type_and_ancestors"] = [
-            "nmdc:"
+            f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
         ]
+        # InsertOne is a pymongo representation of a mongo command.
         write_operations.append(InsertOne(new_doc))
         if len(write_operations) == BULK_WRITE_BATCH_SIZE:
             _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
             write_operations.clear()
             documents_processed_counter += BULK_WRITE_BATCH_SIZE
     if len(write_operations) > 0:
-        # here bulk_write is a method on the
+        # here bulk_write is a method on the pymongo db Collection class
         _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
         documents_processed_counter += len(write_operations)
     context.log.info(
```
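The prefix handling in the hunk above can be illustrated in isolation. The ancestor list below is an assumed, abbreviated example; the real one comes from `schema_view.class_ancestors`, which returns unprefixed class names:

```python
doc_type_full = "nmdc:Biosample"
doc_type = doc_type_full.removeprefix("nmdc:")  # "Biosample" (str.removeprefix, Python 3.9+)

# Assumed, abbreviated stand-in for schema_view.class_ancestors(doc_type):
ancestors = ["Biosample", "Sample", "MaterialEntity", "NamedThing"]

type_and_ancestors = [f"nmdc:{a}" for a in ancestors]
# -> ["nmdc:Biosample", "nmdc:Sample", "nmdc:MaterialEntity", "nmdc:NamedThing"]
```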
```diff
@@ -1339,19 +1255,31 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
-    slots_to_index =
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
-    context.log.info(f"created indexes on id
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
-
-        temp_alldocs_collection, context,
+    _add_linked_instances_to_alldocs(
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
+    )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
     )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
 
     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
     n_alldocs_documents = mdb.alldocs.estimated_document_count()
     context.log.info(
         f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."
```
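A sketch of the kind of read the new `(_upstream.type, _upstream.id)` and `(_downstream.type, _downstream.id)` indexes are built for. The database handle and IDs here are assumed for illustration, and whether a given query is fully index-covered also depends on MongoDB's multikey-index rules:

```python
from pymongo import MongoClient

mdb = MongoClient().nmdc  # assumed handle; ops.py gets this from its Dagster mongo resource
study_ids = ["nmdc:sty-11-xyz789"]  # invented ID

# "Which alldocs documents have one of these Studies upstream of them?"
cursor = mdb.alldocs.find(
    {"_upstream": {"$elemMatch": {"type": "nmdc:Study", "id": {"$in": study_ids}}}},
    {"_id": 0, "id": 1, "type": 1},  # project only what callers need
)
```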
```diff
@@ -1442,6 +1370,42 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
 @op(required_resource_keys={"mongo"})
 def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
@@ -1475,6 +1439,7 @@ def ncbi_submission_xml_from_nmdc_study(
     data_object_records: list,
     library_preparation_records: list,
     all_instruments: dict,
+    pooled_biosamples_data: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1483,6 +1448,7 @@ def ncbi_submission_xml_from_nmdc_study(
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     return ncbi_xml
```
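For readers tracing the new `pooled_biosamples_data` argument into `ncbi_submission_xml_from_nmdc_study`: judging from `get_aggregated_pooled_biosamples` above, it is a dict keyed by biosample ID. The concrete values below are invented for illustration, and other keys may be present in what `check_pooling_for_biosamples` (defined in `nmdc_runtime/site/export/ncbi_xml_utils.py`, also changed in this release) actually returns:

```python
# Assumed, illustrative shape of `pooled_biosamples_data`:
pooled_biosamples_data = {
    "nmdc:bsm-11-abc123": {  # biosample that went through a pooling process
        "processed_sample_id": "nmdc:procsm-11-def456",
        "processed_sample_name": "Pooled sample A",
    },
    "nmdc:bsm-11-ghi789": None,  # biosample with no associated pooling info
}
```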