nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in those public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/api/core/util.py
@@ -0,0 +1,114 @@
+import hashlib
+import json
+import os
+import secrets
+import string
+from datetime import datetime, timezone, timedelta
+from importlib import import_module
+
+from fastapi import HTTPException, status
+from pydantic import BaseModel
+from toolz import keyfilter
+
+API_SITE_ID = os.getenv("API_SITE_ID")
+API_SITE_CLIENT_ID = os.getenv("API_SITE_CLIENT_ID")
+
+
+def omit(blacklist, d):
+    return keyfilter(lambda k: k not in blacklist, d)
+
+
+def pick(whitelist, d):
+    return keyfilter(lambda k: k in whitelist, d)
+
+
+def hash_from_str(s: str, algo="sha256") -> str:
+    if algo not in hashlib.algorithms_guaranteed:
+        raise ValueError(f"desired algorithm {algo} not supported")
+    return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest()
+
+
+def sha256hash_from_file(file_path: str, timestamp: str):
+    # https://stackoverflow.com/a/55542529
+    h = hashlib.sha256()
+
+    timestamp_bytes = timestamp.encode("utf-8")
+    h.update(timestamp_bytes)
+
+    with open(file_path, "rb") as file:
+        while True:
+            # Reading is buffered, so we can read smaller chunks.
+            chunk = file.read(h.block_size)
+            if not chunk:
+                break
+            h.update(chunk)
+
+    return h.hexdigest()
+
+
+def raise404_if_none(doc, detail="Not found"):
+    if doc is None:
+        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=detail)
+    return doc
+
+
+def now() -> datetime:
+    """Get a `datetime` representing the current time in UTC."""
+    return datetime.now(timezone.utc)
+
+
+def now_str() -> str:
+    """Get an ISO string representing the current time in UTC."""
+    return now().isoformat()
+
+
+def expiry_dt_from_now(days=0, hours=0, minutes=0, seconds=0):
+    return now() + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+
+
+def has_passed(dt):
+    return now() > dt
+
+
+def import_via_dotted_path(dotted_path: str):
+    module_name, _, member_name = dotted_path.rpartition(".")
+    return getattr(import_module(module_name), member_name)
+
+
+def dotted_path_for(member):
+    return f"{member.__module__}.{member.__name__}"
+
+
+def generate_secret(length=12):
+    """Generate a secret.
+
+    With
+    - at least one lowercase character,
+    - at least one uppercase character, and
+    - at least three digits
+
+    """
+    if length < 8:
+        raise ValueError(f"{length=} must be >=8.")
+    alphabet = string.ascii_letters + string.digits + "!@#$%^*-_+="
+    # based on https://docs.python.org/3.8/library/secrets.html#recipes-and-best-practices
+    while True:
+        _secret = "".join(secrets.choice(alphabet) for i in range(length))
+        if (
+            any(c.islower() for c in _secret)
+            and any(c.isupper() for c in _secret)
+            and sum(c.isdigit() for c in _secret) >= 3
+        ):
+            break
+    return _secret
+
+
+def json_clean(data, model, exclude_unset=False) -> dict:
+    """Run data through a JSON serializer for a pydantic model."""
+    if not isinstance(data, (dict, BaseModel)):
+        raise TypeError("`data` must be a pydantic model or its .model_dump()")
+    m = model(**data) if isinstance(data, dict) else data
+
+    # Note: Between Pydantic v1 and v2, the `json` method was renamed to `model_dump_json`.
+    # Reference: https://docs.pydantic.dev/2.11/migration/#changes-to-pydanticbasemodel
+    return json.loads(m.model_dump_json(exclude_unset=exclude_unset))
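
For orientation, a minimal usage sketch of the helpers added above (this sketch is not part of the diff itself); it assumes the module is importable as `nmdc_runtime.api.core.util` and that the variable names are illustrative:

from nmdc_runtime.api.core.util import (
    expiry_dt_from_now,
    generate_secret,
    has_passed,
    hash_from_str,
)

secret = generate_secret(length=12)       # 12 chars; >=1 lower, >=1 upper, >=3 digits
digest = hash_from_str(secret)            # hex-encoded SHA-256 of the secret string
expires = expiry_dt_from_now(minutes=30)  # timezone-aware UTC datetime, 30 min out
assert not has_passed(expires)            # the expiry is still in the future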
nmdc_runtime/api/db/mongo.py
@@ -0,0 +1,436 @@
+import gzip
+import os
+from contextlib import AbstractContextManager
+from copy import deepcopy
+from functools import lru_cache
+from typing import Set
+from uuid import uuid4
+
+import bson
+from nmdc_schema.nmdc import Database as NMDCDatabase
+from pymongo.errors import AutoReconnect, OperationFailure
+from refscan.lib.Finder import Finder
+from refscan.scanner import scan_outgoing_references
+from tenacity import wait_random_exponential, retry, retry_if_exception_type
+from toolz import merge, unique
+from refscan.lib.helpers import get_collection_names_from_schema
+
+from nmdc_runtime.api.models.query import UpdateStatement, DeleteStatement
+from nmdc_runtime.mongo_util import SessionBoundDatabase
+from nmdc_runtime.util import (
+    nmdc_schema_view,
+    collection_name_to_class_names,
+    ensure_unique_id_indexes,
+    nmdc_database_collection_names,
+    get_allowed_references,
+    get_nmdc_schema_validator,
+)
+from pymongo import MongoClient
+from pymongo.database import Database as MongoDatabase
+
+
+@retry(
+    retry=retry_if_exception_type(AutoReconnect),
+    wait=wait_random_exponential(multiplier=0.5, max=60),
+)
+def check_mongo_ok_autoreconnect(mdb: MongoDatabase):
+    r"""
+    Check whether the application can write to the database.
+    """
+    collection = mdb.get_collection("_runtime.healthcheck")
+    collection.insert_one({"status": "ok"})
+    collection.delete_many({"status": "ok"})
+    return True
+
+
+@lru_cache
+def get_mongo_client() -> MongoClient:
+    r"""
+    Returns a `MongoClient` instance you can use to access the MongoDB server specified via environment variables.
+    Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
+    """
+    return MongoClient(
+        host=os.getenv("MONGO_HOST"),
+        username=os.getenv("MONGO_USERNAME"),
+        password=os.getenv("MONGO_PASSWORD"),
+        directConnection=True,
+    )
+
+
+@lru_cache
+def get_mongo_db() -> MongoDatabase:
+    r"""
+    Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+    Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+    """
+    _client = get_mongo_client()
+    mdb = _client[os.getenv("MONGO_DBNAME")]
+    check_mongo_ok_autoreconnect(mdb)
+    return mdb
+
+
+@lru_cache
+def get_session_bound_mongo_db(session=None) -> MongoDatabase:
+    r"""
+    Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+    Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+    """
+    _client = get_mongo_client()
+    mdb = _client[os.getenv("MONGO_DBNAME")]
+    check_mongo_ok_autoreconnect(mdb)
+    return SessionBoundDatabase(mdb, session) if session is not None else mdb
+
+
+def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
+    """
+    Returns the names of the collections that (a) exist in the database,
+    (b) are described by the schema, and (c) contain at least one document.
+
+    Note: The ampersand (`&`) is the "set intersection" operator.
+    """
+    collection_names_from_database = mdb.list_collection_names()
+    schema_view = nmdc_schema_view()
+    collection_names_from_schema = get_collection_names_from_schema(schema_view)
+    names = set(collection_names_from_database) & set(collection_names_from_schema)
+    return {name for name in names if mdb[name].estimated_document_count() > 0}
+
+
+@lru_cache
+def activity_collection_names(mdb: MongoDatabase) -> Set[str]:
+    r"""
+    TODO: Document this function.
+    """
+    return get_nonempty_nmdc_schema_collection_names(mdb) - {
+        "biosample_set",
+        "study_set",
+        "data_object_set",
+        "functional_annotation_set",
+        "genome_feature_set",
+    }
+
+
+@lru_cache
+def get_planned_process_collection_names() -> Set[str]:
+    r"""
+    Returns the names of all collections that the schema says can contain documents
+    that represent instances of the `PlannedProcess` class or any of its subclasses.
+    """
+    schema_view = nmdc_schema_view()
+    collection_names = set()
+    planned_process_descendants = set(schema_view.class_descendants("PlannedProcess"))
+
+    for collection_name, class_names in collection_name_to_class_names.items():
+        for class_name in class_names:
+            # If the name of this class is the name of the `PlannedProcess` class
+            # or any of its subclasses, add it to the result set.
+            if class_name in planned_process_descendants:
+                collection_names.add(collection_name)
+
+    return collection_names
+
+
+def mongodump_excluded_collections() -> str:
+    """
+    TODO: Document this function.
+    """
+    _mdb = get_mongo_db()
+    schema_view = nmdc_schema_view()
+    collection_names_from_database = _mdb.list_collection_names()
+    collection_names_from_schema = get_collection_names_from_schema(schema_view)
+    excluded_collections = " ".join(
+        f"--excludeCollection={c}"
+        for c in sorted(
+            set(collection_names_from_database) - set(collection_names_from_schema)
+        )
+    )
+    return excluded_collections
+
+
+def mongorestore_collection(mdb, collection_name, bson_file_path):
+    """
+    Replaces the specified collection with one that reflects the contents of the
+    specified BSON file.
+    """
+    with gzip.open(bson_file_path, "rb") as bson_file:
+        data = bson.decode_all(bson_file.read())
+        if data:
+            mdb.drop_collection(collection_name)
+            mdb[collection_name].insert_many(data)
+            print(
+                f"mongorestore_collection: inserted {len(data)} documents into {collection_name} after drop"
+            )
+        else:
+            print(f"mongorestore_collection: no {collection_name} documents found")
+
+
+def mongorestore_from_dir(mdb, dump_directory, skip_collections=None):
+    """
+    Effectively runs a `mongorestore` command in pure Python.
+    Helpful in a container context that does not have the `mongorestore` command available.
+    """
+    skip_collections = skip_collections or []
+    for root, dirs, files in os.walk(dump_directory):
+        for file in files:
+            if file.endswith(".bson.gz"):
+                collection_name = file.replace(".bson.gz", "")
+                if collection_name in skip_collections:
+                    continue
+                bson_file_path = os.path.join(root, file)
+                mongorestore_collection(mdb, collection_name, bson_file_path)
+
+    print("mongorestore_from_dir completed successfully.")
+
+
+class OverlayDBError(Exception):
+    pass
+
+
+class OverlayDB(AbstractContextManager):
+    """Provides a context whereby a base Database is overlaid with a temporary one.
+
+    If you need to run basic simulations of updates to a base database,
+    you don't want to actually commit transactions to the base database.
+
+    For example, to insert or replace (matching on "id") many documents into a collection in order
+    to then validate the resulting total set of collection documents, an OverlayDB writes to
+    an overlay collection that "shadows" the base collection during a "find" query
+    (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
+    overlay collection, that id is marked as "seen" and will not also be returned when
+    subsequently scanning the (unmodified) base-database collection.
+
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+    database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+    `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+    the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+    "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+    of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
+    Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
+    documents from a base collection to the overlay, and then applying the updates to the overlay,
+    so that again, base collections are unmodified, and a "merge_find" call will produce a result
+    *as if* the base collection(s) were modified.
+
+    Mongo deletions (as the "delete" method) also copy affected documents from the base collection
+    to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
+    call will match a relevant document given a suitable filter, and will mark the document's id
+    as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
+
+    Usage:
+    ```
+    with OverlayDB(mdb) as odb:
+        # do stuff, e.g. `odb.replace_or_insert_many(...)`
+    ```
+    """
+
+    def __init__(self, mdb: MongoDatabase):
+        self._bottom_db = mdb
+        self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
+        ensure_unique_id_indexes(self._top_db)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self._bottom_db.client.drop_database(self._top_db.name)
+
+    def replace_or_insert_many(self, coll_name, documents: list):
+        try:
+            self._top_db[coll_name].insert_many(documents)
+        except OperationFailure as e:
+            raise OverlayDBError(str(e.details))
+
+    def apply_updates(self, coll_name, updates: list):
+        """Prepare overlay db and apply updates to it."""
+        assert all(UpdateStatement(**us) for us in updates)
+        for update_spec in updates:
+            for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
+                self._top_db[coll_name].insert_one(bottom_doc)
+        try:
+            self._top_db.command({"update": coll_name, "updates": updates})
+        except OperationFailure as e:
+            raise OverlayDBError(str(e.details))
+
+    def delete(self, coll_name, deletes: list):
+        """ "Apply" delete command by flagging docs in overlay database."""
+        assert all(DeleteStatement(**us) for us in deletes)
+        for delete_spec in deletes:
+            for bottom_doc in self._bottom_db[coll_name].find(
+                delete_spec["q"], limit=delete_spec["limit"]
+            ):
+                bottom_doc["_deleted"] = True
+                self._top_db[coll_name].insert_one(bottom_doc)
+
+    def merge_find(self, coll_name, find_spec: dict):
+        """Yield docs first from overlay and then from base db, minding deletion flags."""
+        # ensure projection of "id" and "_deleted"
+        if "projection" in find_spec:
+            proj = find_spec["projection"]
+            if isinstance(proj, dict):
+                proj = merge(proj, {"id": 1, "_deleted": 1})
+            elif isinstance(proj, list):
+                proj = list(unique(proj + ["id", "_deleted"]))
+
+        top_docs = self._top_db[coll_name].find(**find_spec)
+        bottom_docs = self._bottom_db[coll_name].find(**find_spec)
+        top_seen_ids = set()
+        for doc in top_docs:
+            if not doc.get("_deleted"):
+                yield doc
+            top_seen_ids.add(doc["id"])
+
+        for doc in bottom_docs:
+            if doc["id"] not in top_seen_ids:
+                yield doc
+
+
+def validate_json(
+    in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+):
+    r"""
+    Checks whether the specified dictionary represents a valid instance of the `Database` class
+    defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+    Example dictionary:
+        {
+            "biosample_set": [
+                {"id": "nmdc:bsm-00-000001", ...},
+                {"id": "nmdc:bsm-00-000002", ...}
+            ],
+            "study_set": [
+                {"id": "nmdc:sty-00-000001", ...},
+                {"id": "nmdc:sty-00-000002", ...}
+            ]
+        }
+
+    :param in_docs: The dictionary you want to validate
+    :param mdb: A reference to a MongoDB database
+    :param check_inter_document_references: Whether you want this function to check whether every document that
+                                            is referenced by any of the documents passed in would, indeed, exist
+                                            in the database, if the documents passed in were to be inserted into
+                                            the database. In other words, set this to `True` if you want this
+                                            function to perform referential integrity checks.
+    """
+    validator = get_nmdc_schema_validator()
+    docs = deepcopy(in_docs)
+    validation_errors = {}
+
+    known_coll_names = set(nmdc_database_collection_names())
+    for coll_name, coll_docs in docs.items():
+        if coll_name not in known_coll_names:
+            # We expect each key in `in_docs` to be a known schema collection name. However, `@type` is a special key
+            # for JSON-LD, used for JSON serialization of e.g. LinkML objects. That is, the value of `@type` lets a
+            # client know that the JSON object (a dict in Python) should be interpreted as a
+            # <https://w3id.org/nmdc/Database>. If `@type` is present as a key, and its value indicates that
+            # `in_docs` is indeed a nmdc:Database, that's fine, and we don't want to raise an exception.
+            #
+            # prompted by: https://github.com/microbiomedata/nmdc-runtime/discussions/858
+            if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
+                continue
+            else:
+                validation_errors[coll_name] = [
+                    f"'{coll_name}' is not a known schema collection name"
+                ]
+                continue
+
+        errors = list(
+            validator.iter_results({coll_name: coll_docs}, target_class="Database")
+        )
+        validation_errors[coll_name] = [e.message for e in errors]
+        if coll_docs:
+            if not isinstance(coll_docs, list):
+                validation_errors[coll_name].append("value must be a list")
+            elif not all(isinstance(d, dict) for d in coll_docs):
+                validation_errors[coll_name].append(
+                    "all elements of list must be dicts"
+                )
+            if not validation_errors[coll_name]:
+                try:
+                    with OverlayDB(mdb) as odb:
+                        odb.replace_or_insert_many(coll_name, coll_docs)
+                except OverlayDBError as e:
+                    validation_errors[coll_name].append(str(e))
+
+    if all(len(v) == 0 for v in validation_errors.values()):
+        # Second pass. Try instantiating linkml-sourced dataclass
+        in_docs.pop("@type", None)
+        try:
+            NMDCDatabase(**in_docs)
+        except Exception as e:
+            return {"result": "errors", "detail": str(e)}
+
+        # Third pass (if enabled): Check inter-document references.
+        if check_inter_document_references is True:
+            # Prepare to use `refscan`.
+            #
+            # Note: We check the inter-document references in two stages, which are:
+            #       1. For each document in the JSON payload, check whether each document it references already exists
+            #          (in the collections the schema says it can exist in) in the database. We use the
+            #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+            #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+            #          check whether that document exists (in the collections the schema says it can exist in) in the
+            #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+            #       The violations that remain after those two stages are the ones we return to the caller.
+            #
+            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+            #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
+            #       `refscan`'s `Finder` class accepts.
+            #
+            finder = Finder(database=mdb)
+            references = get_allowed_references()
+
+            # Iterate over the collections in the JSON payload.
+            for source_collection_name, documents in in_docs.items():
+                for document in documents:
+                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                    source_document = dict(document, _id=None)
+                    violations = scan_outgoing_references(
+                        document=source_document,
+                        schema_view=nmdc_schema_view(),
+                        references=references,
+                        finder=finder,
+                        source_collection_name=source_collection_name,
+                        user_wants_to_locate_misplaced_documents=False,
+                    )
+
+                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                    for violation in violations:
+                        can_waive_violation = False
+                        # Determine which collections can contain the referenced document, based upon
+                        # the schema class of which this source document is an instance.
+                        target_collection_names = (
+                            references.get_target_collection_names(
+                                source_class_name=violation.source_class_name,
+                                source_field_name=violation.source_field_name,
+                            )
+                        )
+                        # Check whether the referenced document exists in any of those collections in the JSON payload.
+                        for json_coll_name, json_coll_docs in in_docs.items():
+                            if json_coll_name in target_collection_names:
+                                for json_coll_doc in json_coll_docs:
+                                    if json_coll_doc["id"] == violation.target_id:
+                                        can_waive_violation = True
+                                        break  # stop checking
+                            if can_waive_violation:
+                                break  # stop checking
+                        if not can_waive_violation:
+                            violation_as_str = (
+                                f"Document '{violation.source_document_id}' "
+                                f"in collection '{violation.source_collection_name}' "
+                                f"has a field '{violation.source_field_name}' that "
+                                f"references a document having id "
+                                f"'{violation.target_id}', but the latter document "
+                                f"does not exist in any of the collections the "
+                                f"NMDC Schema says it can exist in."
+                            )
+                            validation_errors[source_collection_name].append(
+                                violation_as_str
+                            )
+
+        # If any collection's error list is not empty, return an error response.
+        if any(len(v) > 0 for v in validation_errors.values()):
+            return {"result": "errors", "detail": validation_errors}
+
+        return {"result": "All Okay!"}
+    else:
+        return {"result": "errors", "detail": validation_errors}
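
To make the OverlayDB "shadowing" behavior described in its docstring concrete, here is a hypothetical sketch (not part of the diff); `mdb` is assumed to point at a live MongoDB database, and the staged document is illustrative:

from nmdc_runtime.api.db.mongo import OverlayDB, get_mongo_db

mdb = get_mongo_db()
with OverlayDB(mdb) as odb:
    # Stage a document in the temporary overlay; the base collection is untouched.
    odb.replace_or_insert_many(
        "study_set", [{"id": "nmdc:sty-00-000001", "name": "draft title"}]
    )
    # merge_find yields the overlay version first (marking its id as "seen"),
    # then any base-collection documents whose ids were not shadowed.
    # Note: the keys of find_spec are passed through as pymongo find() kwargs.
    docs = list(odb.merge_find("study_set", {"filter": {}}))
# On exit, the temporary overlay database is dropped.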
nmdc_runtime/api/db/s3.py
@@ -0,0 +1,37 @@
+from functools import lru_cache
+import os
+
+import boto3
+
+API_SITE_BUCKET = os.getenv("API_SITE_ID")
+S3_ID_NS = "do"  # Namespace for Drs Objects in Site S3-bucket store.
+
+
+@lru_cache
+def get_s3_client():
+    _session = boto3.session.Session()
+    return _session.client(
+        "s3",
+        region_name=os.getenv("DO_REGION_NAME"),
+        endpoint_url=os.getenv("DO_ENDPOINT_URL"),
+        aws_access_key_id=os.getenv("DO_SPACES_KEY"),
+        aws_secret_access_key=os.getenv("DO_SPACES_SECRET"),
+    )
+
+
+def presigned_url_to_put(
+    key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
+):
+    return client.generate_presigned_url(
+        ClientMethod="put_object",
+        Params={"Bucket": bucket, "Key": key, "ContentType": mime_type},
+        ExpiresIn=expires_in,
+    )
+
+
+def presigned_url_to_get(key, client=None, bucket=API_SITE_BUCKET, expires_in=300):
+    return client.generate_presigned_url(
+        ClientMethod="get_object",
+        Params={"Bucket": bucket, "Key": key},
+        ExpiresIn=expires_in,
+    )
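
A hypothetical usage sketch (not part of the diff): mint a time-limited upload URL and a matching download URL for the same key. Note that both helpers require the client to be passed explicitly, since the `client=None` default is never replaced inside the function bodies; the object key below is illustrative:

from nmdc_runtime.api.db.s3 import (
    get_s3_client,
    presigned_url_to_get,
    presigned_url_to_put,
)

s3 = get_s3_client()
# "do/example-object" is a made-up key under the S3_ID_NS namespace.
put_url = presigned_url_to_put(
    "do/example-object", client=s3, mime_type="application/json"
)
get_url = presigned_url_to_get("do/example-object", client=s3)
# Each URL is valid for `expires_in` seconds (300 by default).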
nmdc_runtime/api/endpoints/capabilities.py
@@ -0,0 +1,25 @@
+from typing import List
+
+import pymongo
+from fastapi import APIRouter, Depends
+
+from nmdc_runtime.api.core.util import raise404_if_none
+from nmdc_runtime.api.db.mongo import get_mongo_db
+from nmdc_runtime.api.models.capability import Capability
+
+router = APIRouter()
+
+
+@router.get("/capabilities", response_model=List[Capability])
+def list_capabilities(
+    mdb: pymongo.database.Database = Depends(get_mongo_db),
+):
+    return list(mdb.capabilities.find())
+
+
+@router.get("/capabilities/{capability_id}", response_model=Capability)
+def get_capability(
+    capability_id: str,
+    mdb: pymongo.database.Database = Depends(get_mongo_db),
+):
+    return raise404_if_none(mdb.capabilities.find_one({"id": capability_id}))
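
A sketch of how such a router is typically mounted on a FastAPI app; the actual wiring presumably lives in nmdc_runtime/api/main.py (also added in this release), and the `tags` value here is illustrative:

from fastapi import FastAPI

from nmdc_runtime.api.endpoints import capabilities

app = FastAPI()
app.include_router(capabilities.router, tags=["capabilities"])
# GET /capabilities                  -> list_capabilities (all documents)
# GET /capabilities/{capability_id}  -> get_capability (404 if no match)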