nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/api/__init__.py +0 -0
- nmdc_runtime/api/analytics.py +70 -0
- nmdc_runtime/api/boot/__init__.py +0 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/__init__.py +0 -0
- nmdc_runtime/api/core/auth.py +208 -0
- nmdc_runtime/api/core/idgen.py +170 -0
- nmdc_runtime/api/core/metadata.py +788 -0
- nmdc_runtime/api/core/util.py +109 -0
- nmdc_runtime/api/db/__init__.py +0 -0
- nmdc_runtime/api/db/mongo.py +447 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/__init__.py +0 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +794 -0
- nmdc_runtime/api/endpoints/ids.py +192 -0
- nmdc_runtime/api/endpoints/jobs.py +143 -0
- nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +105 -0
- nmdc_runtime/api/endpoints/queries.py +679 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +229 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +774 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/main.py +401 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/__init__.py +0 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +37 -0
- nmdc_runtime/api/models/lib/__init__.py +0 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/minter.py +0 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +140 -0
- nmdc_runtime/api/models/util.py +253 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +242 -0
- nmdc_runtime/config.py +7 -8
- nmdc_runtime/core/db/Database.py +1 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -9
- nmdc_runtime/lib/extract_nmdc_data.py +0 -8
- nmdc_runtime/lib/nmdc_dataframes.py +3 -7
- nmdc_runtime/lib/nmdc_etl_class.py +1 -7
- nmdc_runtime/minter/adapters/repository.py +1 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +35 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +1 -2
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/export/ncbi_xml.py +1 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
- nmdc_runtime/site/graphs.py +1 -22
- nmdc_runtime/site/ops.py +60 -152
- nmdc_runtime/site/repository.py +0 -112
- nmdc_runtime/site/translation/gold_translator.py +4 -12
- nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
- nmdc_runtime/site/translation/translator.py +63 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +10 -5
- nmdc_runtime/util.py +3 -47
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
- nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
- {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/domain/model.py
CHANGED

@@ -1,9 +1,11 @@
 from enum import Enum
+import re
 from typing import Optional
 
+from base32_lib import base32
 from pydantic import BaseModel, PositiveInt
 
-from nmdc_runtime.minter.config import schema_classes
+from nmdc_runtime.minter.config import schema_classes, typecodes
 
 
 class Entity(BaseModel):
@@ -71,3 +73,35 @@ class Identifier(Entity):
 class Typecode(Entity):
     schema_class: str
     name: str
+
+
+id_prefix_pattern = rf"(?P<prefix>nmdc)"
+id_typecode_pattern = rf"(?P<typecode>[a-z]{{1,6}})"
+id_shoulder_pattern = rf"(?P<shoulder>[0-9][a-z]{{0,6}}[0-9])"
+id_blade_pattern = rf"(?P<blade>[A-Za-z0-9]+)"
+id_version_pattern = rf"(?P<version>(\.[A-Za-z0-9]+)*)"
+id_locus_pattern = rf"(?P<locus>_[A-Za-z0-9_\.-]+)?"
+id_pattern = (
+    rf"^{id_prefix_pattern}:{id_typecode_pattern}-{id_shoulder_pattern}-"
+    rf"{id_blade_pattern}{id_version_pattern}{id_locus_pattern}$"
+)
+ID_TYPECODE_VALUES = [t["name"] for t in typecodes()]
+id_typecode_pattern_strict = rf"(?P<typecode_strict>({'|'.join(ID_TYPECODE_VALUES)}))"
+id_blade_pattern_strict = rf"(?P<blade_strict>[{base32.ENCODING_CHARS}]+)"
+id_pattern_strict = (
+    rf"^{id_prefix_pattern}:{id_typecode_pattern_strict}-{id_shoulder_pattern}-"
+    rf"{id_blade_pattern_strict}{id_version_pattern}{id_locus_pattern}$"
+)
+id_pattern_strict_compiled = re.compile(id_pattern_strict)
+
+
+def check_valid_ids(ids: list[str]):
+    for id_ in ids:
+        if not re.match(id_pattern, id_):
+            raise ValueError(
+                (
+                    f"Invalid ID format for given ID: '{id_}'.\n\nAn ID must match the pattern: '{id_pattern}'.\n\n"
+                    "See: <https://microbiomedata.github.io/nmdc-schema/identifiers/#ids-minted-for-use-within-nmdc>"
+                )
+            )
+    return ids
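As a quick illustration of the new ID validation above: a minimal sketch of matching against the non-strict pattern. The pattern pieces are copied from the diff; the example ID and printed group values are illustrative, not taken from the package.

```python
import re

# Pattern pieces as added in nmdc_runtime/minter/domain/model.py (see diff above).
id_prefix_pattern = rf"(?P<prefix>nmdc)"
id_typecode_pattern = rf"(?P<typecode>[a-z]{{1,6}})"
id_shoulder_pattern = rf"(?P<shoulder>[0-9][a-z]{{0,6}}[0-9])"
id_blade_pattern = rf"(?P<blade>[A-Za-z0-9]+)"
id_version_pattern = rf"(?P<version>(\.[A-Za-z0-9]+)*)"
id_locus_pattern = rf"(?P<locus>_[A-Za-z0-9_\.-]+)?"
id_pattern = (
    rf"^{id_prefix_pattern}:{id_typecode_pattern}-{id_shoulder_pattern}-"
    rf"{id_blade_pattern}{id_version_pattern}{id_locus_pattern}$"
)

match = re.match(id_pattern, "nmdc:bsm-11-abc123")  # hypothetical ID
assert match is not None
# -> {'prefix': 'nmdc', 'typecode': 'bsm', 'shoulder': '11',
#     'blade': 'abc123', 'version': '', 'locus': None}
print(match.groupdict())
```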
nmdc_runtime/minter/entrypoints/fastapi_app.py
CHANGED

@@ -8,7 +8,7 @@ from nmdc_runtime.api.core.util import raise404_if_none
 from nmdc_runtime.api.db.mongo import get_mongo_db
 from nmdc_runtime.api.models.site import get_current_client_site, Site
 from nmdc_runtime.minter.adapters.repository import MongoIDStore, MinterError
-from nmdc_runtime.minter.config import minting_service_id
+from nmdc_runtime.minter.config import minting_service_id
 from nmdc_runtime.minter.domain.model import (
     Identifier,
     AuthenticatedMintingRequest,
nmdc_runtime/mongo_util.py
CHANGED

@@ -1,7 +1,6 @@
-from pymongo import MongoClient
 from pymongo.database import Database
 from pymongo.collection import Collection
-from typing import Any,
+from typing import Any, Optional
 from pymongo.client_session import ClientSession
 import inspect
 
@@ -16,9 +16,7 @@ from toolz import assoc
 
 from nmdc_runtime.api.core.util import pick
 from nmdc_runtime.api.db.mongo import get_mongo_db
-from nmdc_runtime.
-from nmdc_runtime.site.resources import get_mongo
-from nmdc_runtime.util import nmdc_jsonschema, schema_collection_names_with_id_field
+from nmdc_runtime.util import schema_collection_names_with_id_field
 
 
 def collection_stats(mdb: MongoDatabase):
nmdc_runtime/site/export/ncbi_xml.py
CHANGED

@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
-from typing import Any, List
+from typing import Any, List
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_controlled_identified_term_value,
@@ -16,7 +16,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_float_value,
     handle_string_value,
     load_mappings,
-    validate_xml,
 )
 
 
nmdc_runtime/site/graphs.py
CHANGED

@@ -1,7 +1,6 @@
-from dagster import graph
+from dagster import graph
 
 from nmdc_runtime.site.ops import (
-    build_merged_db,
     generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
@@ -12,8 +11,6 @@ from nmdc_runtime.site.ops import (
     gold_projects_by_study,
     gold_study,
     poll_for_run_completion,
-    run_etl,
-    local_file_to_api_object,
     get_operation,
     produce_curated_db,
     delete_operations,
@@ -70,24 +67,6 @@ from nmdc_runtime.site.ops import (
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
 
-@graph
-def gold_translation():
-    """
-    Translating an export of the JGI GOLD [1] SQL database to the NMDC database JSON schema.
-
-    [1] Genomes OnLine Database (GOLD) <https://gold.jgi.doe.gov/>.
-    """
-    local_file_to_api_object(run_etl(build_merged_db()))
-
-
-@graph()
-def gold_translation_curation():
-    # TODO
-    # - have produce_curated_db do actual curation (see notebook), persisting to db.
-    # - more steps in pipeline? Or handoff via run_status_sensor on DagsterRunStatus.SUCCESS.
-    produce_curated_db(get_operation())
-
-
 @graph()
 def create_objects_from_site_object_puts():
     delete_operations(
nmdc_runtime/site/ops.py
CHANGED

@@ -4,19 +4,18 @@ import logging
 import mimetypes
 import os
 import subprocess
-import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
-from io import BytesIO
+from io import BytesIO
 from pprint import pformat
 from toolz.dicttoolz import keyfilter
-from typing import Tuple, Set
+from typing import Tuple, Set
 from zipfile import ZipFile
 from itertools import chain
 from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
-
+from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -44,7 +43,7 @@ from dagster import (
 from gridfs import GridFS
 from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
-from nmdc_runtime.api.db.mongo import
+from nmdc_runtime.api.db.mongo import validate_json
 from nmdc_runtime.api.core.idgen import generate_one_id
 from nmdc_runtime.api.core.metadata import (
     _validate_changesheet,
@@ -103,22 +102,19 @@ from nmdc_runtime.site.util import (
 )
 from nmdc_runtime.util import (
     drs_object_in_for,
-    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
     put_object,
     specialize_activity_set_docs,
     collection_name_to_class_names,
-    class_hierarchy_as_list,
     nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
-from nmdc_schema.nmdc import Database as NMDCDatabase
-from pydantic import BaseModel
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
 from starlette import status
-from toolz import
+from toolz import get_in, valfilter, identity
+
 
 # batch size for writing documents to alldocs
 BULK_WRITE_BATCH_SIZE = 2000
@@ -153,99 +149,6 @@ def mongo_stats(context) -> List[str]:
     return collection_names
 
 
-@op(
-    required_resource_keys={"mongo", "runtime_api_site_client"},
-    retry_policy=RetryPolicy(max_retries=2),
-)
-def local_file_to_api_object(context, file_info):
-    client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    storage_path: str = file_info["storage_path"]
-    mime_type = file_info.get("mime_type")
-    if mime_type is None:
-        mime_type = mimetypes.guess_type(storage_path)[0]
-    rv = client.put_object_in_site(
-        {"mime_type": mime_type, "name": storage_path.rpartition("/")[-1]}
-    )
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object_in_site failed: {rv.content}")
-    op = rv.json()
-    context.log.info(f"put_object_in_site: {op}")
-    rv = put_object(storage_path, op["metadata"]["url"])
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description=f"put_object failed: {rv.content}")
-    op_patch = {"done": True, "result": drs_object_in_for(storage_path, op)}
-    rv = client.update_operation(op["id"], op_patch)
-    if not rv.status_code == status.HTTP_200_OK:
-        raise Failure(description="update_operation failed")
-    op = rv.json()
-    context.log.info(f"update_operation: {op}")
-    rv = client.create_object_from_op(op)
-    if rv.status_code != status.HTTP_201_CREATED:
-        raise Failure("create_object_from_op failed")
-    obj = rv.json()
-    context.log.info(f'Created /objects/{obj["id"]}')
-    mdb = context.resources.mongo.db
-    rv = mdb.operations.delete_one({"id": op["id"]})
-    if rv.deleted_count != 1:
-        context.log.error("deleting op failed")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["object", obj["name"]]),
-        description="output of metadata-translation run_etl",
-        metadata={"object_id": MetadataValue.text(obj["id"])},
-    )
-    yield Output(obj)
-
-
-@op(
-    out={
-        "merged_data_path": Out(
-            str,
-            description="path to TSV merging of source metadata",
-        )
-    }
-)
-def build_merged_db(context) -> str:
-    context.log.info("metadata-translation: running `make build-merged-db`")
-    run_and_log(
-        "cd /opt/dagster/lib/metadata-translation/ && make build-merged-db", context
-    )
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_merged_data.tsv.zip"
-    )
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "merged_data.tsv.zip"]),
-        description="input to metadata-translation run_etl",
-        metadata={"path": MetadataValue.path(storage_path)},
-    )
-    yield Output(storage_path, "merged_data_path")
-
-
-@op(
-    required_resource_keys={"runtime_api_site_client"},
-)
-def run_etl(context, merged_data_path: str):
-    context.log.info("metadata-translation: running `make run-etl`")
-    if not os.path.exists(merged_data_path):
-        raise Failure(description=f"merged_db not present at {merged_data_path}")
-    run_and_log("cd /opt/dagster/lib/metadata-translation/ && make run-etl", context)
-    storage_path = (
-        "/opt/dagster/lib/metadata-translation/src/data/nmdc_database.json.zip"
-    )
-    with ZipFile(storage_path) as zf:
-        name = zf.namelist()[0]
-        with zf.open(name) as f:
-            rv = json.load(f)
-    context.log.info(f"nmdc_database.json keys: {list(rv.keys())}")
-    yield AssetMaterialization(
-        asset_key=AssetKey(["gold_translation", "database.json.zip"]),
-        description="output of metadata-translation run_etl",
-        metadata={
-            "path": MetadataValue.path(storage_path),
-        },
-    )
-    yield Output({"storage_path": storage_path})
-
-
 @op(required_resource_keys={"mongo"})
 def get_operation(context):
     mdb = context.resources.mongo.db
@@ -1043,15 +946,17 @@ def load_ontology(context: OpExecutionContext):
     context.log.info(f"Ontology load for {source_ontology} completed successfully!")
 
 
-def _add_related_ids_to_alldocs(
+def _add_linked_instances_to_alldocs(
     temp_collection, context, document_reference_ranged_slots_by_type
 ) -> None:
     """
-    Adds {`
+    Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
 
-    The {`
-    Each subdocument represents a link to
-
+    The {`_upstream`,`_downstream`} fields each contain an array of subdocuments, each with fields `id` and `type`.
+    Each subdocument represents a link to another document that either links to or is linked from the document via
+    document-reference-ranged slots. If document A links to document B, document A is not necessarily "upstream of"
+    document B. Rather, "upstream" and "downstream" are defined by domain semantics. For example, a Study is
+    considered upstream of a Biosample even though the link `associated_studies` goes from a Biosample to a Study.
 
     Args:
         temp_collection: The temporary MongoDB collection to process
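To make the new field shape concrete: a hypothetical `alldocs` document after this op runs, assuming a Biosample linked to a Study via the upstream slot `associated_studies` (IDs and ancestor list are illustrative):

```python
# Hypothetical alldocs document once _add_linked_instances_to_alldocs has run.
# `associated_studies` is treated as an "upstream" slot, so the referenced
# Study appears in `_upstream` even though the link originates at the Biosample.
biosample_doc = {
    "id": "nmdc:bsm-11-abc123",
    "type": "nmdc:Biosample",
    "_type_and_ancestors": ["nmdc:Biosample", "nmdc:MaterialEntity", "nmdc:NamedThing"],
    "associated_studies": ["nmdc:sty-11-xyz789"],
    "_upstream": [{"id": "nmdc:sty-11-xyz789", "type": "nmdc:Study"}],
    "_downstream": [],
}
```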
@@ -1063,7 +968,7 @@ def _add_related_ids_to_alldocs(
     """
 
     context.log.info(
-        "Building relationships and adding `
+        "Building relationships and adding `_upstream` and `_downstream` fields..."
     )
 
     # document ID -> type (with "nmdc:" prefix preserved)
@@ -1078,6 +983,7 @@ def _add_related_ids_to_alldocs(
         # Store the full type with prefix intact
         doc_type = doc["type"]
         # For looking up reference slots, we still need the type without prefix
+        # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
         doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
 
         # Record ID to type mapping - preserve the original type with prefix
@@ -1103,34 +1009,32 @@ def _add_related_ids_to_alldocs(
         f"{len({d for (d, _, _) in relationship_triples})} containing references"
     )
 
-    # The bifurcation of document-reference-ranged slots as "
+    # The bifurcation of document-reference-ranged slots as "upstream" and "downstream" is essential
     # in order to perform graph traversal and collect all entities "related" to a given entity without
     # recursion "exploding".
    #
     # Note: We are hard-coding this "direction" information here in the Runtime
     # because the NMDC schema does not currently contain or expose it.
     #
-    # An "
-
-
-        "collected_from",  # a `nmdc:
-        "has_chromatography_configuration",  # a `nmdc:
-        "has_input",  # a `nmdc:
-        "has_mass_spectrometry_configuration",  # a `nmdc:
-        "instrument_used",  # a `nmdc:
-        "
-        "was_generated_by",  #
-        "was_informed_by",  #
+    # An "upstream" slot is such that the range entity originated, or helped produce, the domain entity.
+    upstream_document_reference_ranged_slots = [
+        "associated_studies",  # when a `nmdc:Study` is upstream of a `nmdc:Biosample`.
+        "collected_from",  # when a `nmdc:Site` is upstream of a `nmdc:Biosample`.
+        "has_chromatography_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "has_input",  # when a `nmdc:NamedThing` is upstream of a `nmdc:PlannedProcess`.
+        "has_mass_spectrometry_configuration",  # when a `nmdc:Configuration` is upstream of a `nmdc:PlannedProcess`.
+        "instrument_used",  # when a `nmdc:Instrument` is upstream of a `nmdc:PlannedProcess`.
+        "part_of",  # when a `nmdc:NamedThing` is upstream of a `nmdc:NamedThing`.
+        "was_generated_by",  # when a `nmdc:DataEmitterProcess` is upstream of a `nmdc:DataObject`.
+        "was_informed_by",  # when a `nmdc:DataGeneration` is upstream of a `nmdc:WorkflowExecution`.
     ]
-    #
-
-
-        "
-        "
-        "
-        "
-        "in_manifest",  # a `nmdc:DataObject` becomes associated with `nmdc:Manifest`.
-        "part_of",  # a "contained" `nmdc:NamedThing` influences its "container" `nmdc:NamedThing`,
+    # A "downstream" slot is such that the range entity originated from, or is considered part of, the domain entity.
+    downstream_document_reference_ranged_slots = [
+        "calibration_object",  # when a `nmdc:DataObject` is downstream of a `nmdc:CalibrationInformation`.
+        "generates_calibration",  # when a `nmdc:CalibrationInformation` is downstream of a `nmdc:PlannedProcess`.
+        "has_output",  # when a `nmdc:NamedThing` is downstream of a `nmdc:PlannedProcess`.
+        "in_manifest",  # when a `nmdc:Manifest` is downstream of a `nmdc:DataObject`.
+        "uses_calibration",  # when a `nmdc:CalibrationInformation` is part of a `nmdc:PlannedProcess`.
     ]
 
     unique_document_reference_ranged_slot_names = set()
@@ -1138,15 +1042,15 @@ def _add_related_ids_to_alldocs(
         for slot_name in slot_names:
             unique_document_reference_ranged_slot_names.add(slot_name)
     context.log.info(f"{unique_document_reference_ranged_slot_names=}")
-    if len(
-
+    if len(upstream_document_reference_ranged_slots) + len(
+        downstream_document_reference_ranged_slots
     ) != len(unique_document_reference_ranged_slot_names):
         raise Failure(
             "Number of detected unique document-reference-ranged slot names does not match "
-            "sum of accounted-for
+            "sum of accounted-for upstream and downstream document-reference-ranged slot names."
         )
 
-    # Construct, and update documents with, `
+    # Construct, and update documents with, `_upstream` and `_downstream` field values.
     #
     # manage batching of MongoDB `bulk_write` operations
     bulk_operations, update_count = [], 0
@@ -1154,10 +1058,10 @@ def _add_related_ids_to_alldocs(
 
         # Determine in which respective fields to push this relationship
         # for the subject (doc) and object (ref) of this triple.
-        if slot in
-            field_for_doc, field_for_ref = "
-        elif slot in
-            field_for_doc, field_for_ref = "
+        if slot in upstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_upstream", "_downstream"
+        elif slot in downstream_document_reference_ranged_slots:
+            field_for_doc, field_for_ref = "_downstream", "_upstream"
         else:
             raise Failure(f"Unknown slot {slot} for document {doc_id}")
 
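The payoff of this bifurcation is that a traversal can follow edges in one direction only and never revisit a node, so recursion cannot "explode". A rough sketch of such a walk over the materialized fields (helper name and collection access are assumptions, not part of this diff):

```python
from pymongo.database import Database


def walk_downstream(mdb: Database, seed_ids: list[str]) -> set[str]:
    """Breadth-first walk over `_downstream` links in `alldocs`.

    Illustrative only; the Runtime's actual traversal endpoints may differ.
    """
    visited: set[str] = set()
    frontier = set(seed_ids)
    while frontier:
        visited |= frontier
        next_frontier: set[str] = set()
        cursor = mdb.alldocs.find({"id": {"$in": list(frontier)}}, {"_downstream": 1})
        for doc in cursor:
            for link in doc.get("_downstream", []):
                if link["id"] not in visited:
                    next_frontier.add(link["id"])
        frontier = next_frontier
    return visited
```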
@@ -1204,14 +1108,6 @@ def _add_related_ids_to_alldocs(
 
     context.log.info(f"Pushed {update_count} updates in total")
 
-    context.log.info("Creating {`_inbound`,`_outbound`} indexes...")
-    temp_collection.create_index("_inbound.id")
-    temp_collection.create_index("_outbound.id")
-    # Create compound indexes to ensure index-covered queries
-    temp_collection.create_index([("_inbound.type", 1), ("_inbound.id", 1)])
-    temp_collection.create_index([("_outbound.type", 1), ("_outbound.id", 1)])
-    context.log.info("Successfully created {`_inbound`,`_outbound`} indexes")
-
 
 # Note: Here, we define a so-called "Nothing dependency," which allows us to (in a graph)
 #       pass an argument to the op (in order to specify the order of the ops in the graph)
@@ -1228,8 +1124,8 @@ def materialize_alldocs(context) -> int:
     2. Create a temporary collection to build the new alldocs collection.
     3. For each document in schema collections, extract `id`, `type`, and document-reference-ranged slot values.
     4. Add a special `_type_and_ancestors` field that contains the class hierarchy for the document's type.
-    5. Add special `
-    6. Add indexes for `id`, relationship fields, and `{
+    5. Add special `_upstream` and `_downstream` fields with subdocuments containing ID and type of related entities.
+    6. Add indexes for `id`, relationship fields, and `{_upstream,_downstream}{.id,(.type, .id)}` (compound) indexes.
     7. Finally, atomically replace the existing `alldocs` collection with the temporary one.
 
     The `alldocs` collection is scheduled to be updated daily via a scheduled job defined as
@@ -1240,7 +1136,7 @@ def materialize_alldocs(context) -> int:
     `/workflow_executions/{workflow_execution_id}/related_resources` that need to perform graph traversal to find
     related documents. It serves as a denormalized view of the database to make these complex queries more efficient.
 
-    The {`
+    The {`_upstream`,`_downstream`} fields enable efficient index-covered queries to find all entities of specific types
     that are related to a given set of source entities, leveraging the `_type_and_ancestors` field for subtype
     expansions.
     """
@@ -1271,6 +1167,9 @@ def materialize_alldocs(context) -> int:
         )
     )
 
+    # FIXME rename to `document_reference_ranged_slots_by_type`
+    # FIXME key on CURIE, e.g. `nmdc:Study`
+    # (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
     document_reference_ranged_slots = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
@@ -1310,12 +1209,12 @@ def materialize_alldocs(context) -> int:
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            # InsertOne is a method on the py-mongo Client class.
             # Get ancestors without the prefix, but add prefix to each one in the output
             ancestors = schema_view.class_ancestors(doc_type)
             new_doc["_type_and_ancestors"] = [
                 "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
             ]
+            # InsertOne is a pymongo representation of a mongo command.
             write_operations.append(InsertOne(new_doc))
             if len(write_operations) == BULK_WRITE_BATCH_SIZE:
                 _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
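The ancestor-prefixing comprehension above behaves like this (example `class_ancestors` output assumed):

```python
>>> ancestors = ["Biosample", "MaterialEntity", "NamedThing"]  # hypothetical
>>> ["nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors]
['nmdc:Biosample', 'nmdc:MaterialEntity', 'nmdc:NamedThing']
```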
@@ -1339,19 +1238,28 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
+    # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
-    _add_related_ids_to_alldocs(
+    _add_linked_instances_to_alldocs(
         temp_alldocs_collection, context, document_reference_ranged_slots
     )
+    context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
+    temp_alldocs_collection.create_index("_upstream.id")
+    temp_alldocs_collection.create_index("_downstream.id")
+    # Create compound indexes to ensure index-covered queries
+    temp_alldocs_collection.create_index([("_upstream.type", 1), ("_upstream.id", 1)])
+    temp_alldocs_collection.create_index(
+        [("_downstream.type", 1), ("_downstream.id", 1)]
+    )
+    context.log.info("Successfully created {`_upstream`,`_downstream`} indexes")
 
     context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
     temp_alldocs_collection.rename("alldocs", dropTarget=True)
-
     n_alldocs_documents = mdb.alldocs.estimated_document_count()
     context.log.info(
         f"Rebuilt `alldocs` collection with {n_alldocs_documents} documents."