nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<svg width="32" height="32" fill="none" xmlns="http://www.w3.org/2000/svg">
|
|
2
|
+
<path fill-rule="evenodd" clip-rule="evenodd" d="M32 16c0 8.837-7.163 16-16 16-8.838 0-16-7.163-16-16C0 7.162 7.162 0 16 0c8.837 0 16 7.162 16 16Z" fill="#A6CE39"/>
|
|
3
|
+
<path fill-rule="evenodd" clip-rule="evenodd" d="M18.813 9.637h-5.45v13.9h5.474c4.555 0 7.35-3.378 7.35-6.95 0-1.635-.562-3.372-1.77-4.704-1.215-1.336-3.065-2.246-5.605-2.246ZM18.6 21.3h-2.813v-9.425H18.5c1.823 0 3.12.552 3.96 1.4.842.849 1.252 2.021 1.252 3.312 0 .784-.239 1.967-.993 2.948-.745.969-2.01 1.765-4.119 1.765Zm5.311-4.026c-.251 1.74-1.494 4.276-5.311 4.276h-3.063H18.6c3.817 0 5.06-2.536 5.311-4.276Zm1.812-2.405c-.657-2.601-2.85-4.982-6.91-4.982h-5.2 5.2c4.06 0 6.253 2.38 6.91 4.982Zm.215 1.718ZM8.363 9.675v13.887h2.425V9.675H8.363Zm2.175 13.637H8.612h1.925ZM9.575 8.65c.84 0 1.513-.689 1.513-1.513 0-.823-.673-1.512-1.513-1.512-.838 0-1.512.674-1.512 1.513 0 .823.672 1.512 1.512 1.512Z" fill="#fff"/>
|
|
4
|
+
</svg>
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Static
|
|
2
|
+
|
|
3
|
+
This document contains information about the origins of the other files in this directory.
|
|
4
|
+
|
|
5
|
+
- `ORCID-iD_icon_vector.svg`: On September 27, 2025, we downloaded this SVG file from ORCID's [Brand Library](https://orcid.filecamp.com/s/o/3CCuLloCl73Knntn/VU19wHSMUnX9TD4R), which we found a link to on the [Brand Guidelines](https://info.orcid.org/brand-guidelines/) page of ORCID's website.
|
|
Binary file
|
nmdc_runtime/util.py
CHANGED
|
@@ -1,31 +1,34 @@
|
|
|
1
|
+
import importlib.resources
|
|
1
2
|
import json
|
|
2
3
|
import mimetypes
|
|
3
4
|
import os
|
|
4
|
-
import
|
|
5
|
+
from collections import defaultdict
|
|
5
6
|
from collections.abc import Iterable
|
|
6
|
-
from contextlib import AbstractContextManager
|
|
7
|
-
from copy import deepcopy
|
|
8
7
|
from datetime import datetime, timezone
|
|
9
8
|
from functools import lru_cache
|
|
10
|
-
from
|
|
9
|
+
from itertools import chain
|
|
11
10
|
from pathlib import Path
|
|
12
|
-
from
|
|
13
|
-
from typing import List, Optional, Set, Dict
|
|
11
|
+
from typing import Callable, List, Optional, Set, Dict
|
|
14
12
|
|
|
15
|
-
import fastjsonschema
|
|
16
13
|
import requests
|
|
14
|
+
from bson.son import SON
|
|
17
15
|
from frozendict import frozendict
|
|
18
|
-
from
|
|
19
|
-
from
|
|
16
|
+
from linkml.validator import Validator
|
|
17
|
+
from linkml.validator.plugins import JsonschemaValidationPlugin
|
|
18
|
+
from linkml_runtime import SchemaView
|
|
19
|
+
from nmdc_schema import NmdcSchemaValidationPlugin
|
|
20
20
|
from nmdc_schema.get_nmdc_view import ViewGetter
|
|
21
|
-
from pydantic import Field, BaseModel
|
|
22
21
|
from pymongo.database import Database as MongoDatabase
|
|
23
22
|
from pymongo.errors import OperationFailure
|
|
24
|
-
from
|
|
23
|
+
from refscan.lib.helpers import (
|
|
24
|
+
identify_references,
|
|
25
|
+
get_collection_name_to_class_names_map,
|
|
26
|
+
)
|
|
27
|
+
from refscan.lib.ReferenceList import ReferenceList
|
|
28
|
+
from toolz import merge
|
|
25
29
|
|
|
26
30
|
from nmdc_runtime.api.core.util import sha256hash_from_file
|
|
27
31
|
from nmdc_runtime.api.models.object import DrsObjectIn
|
|
28
|
-
from typing_extensions import Annotated
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
def get_class_names_from_collection_spec(
|
|
@@ -75,6 +78,23 @@ def get_class_names_from_collection_spec(
|
|
|
75
78
|
return class_names
|
|
76
79
|
|
|
77
80
|
|
|
81
|
+
@lru_cache
|
|
82
|
+
def get_allowed_references() -> ReferenceList:
|
|
83
|
+
r"""
|
|
84
|
+
Returns a `ReferenceList` of all the inter-document references that
|
|
85
|
+
the NMDC Schema allows a schema-compliant MongoDB database to contain.
|
|
86
|
+
"""
|
|
87
|
+
|
|
88
|
+
# Identify the inter-document references that the schema allows a database to contain.
|
|
89
|
+
print("Identifying schema-allowed references.")
|
|
90
|
+
references = identify_references(
|
|
91
|
+
schema_view=nmdc_schema_view(),
|
|
92
|
+
collection_name_to_class_names=collection_name_to_class_names,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
return references
|
|
96
|
+
|
|
97
|
+
|
|
78
98
|
@lru_cache
|
|
79
99
|
def get_type_collections() -> dict:
|
|
80
100
|
"""Returns a dictionary mapping class names to Mongo collection names."""
|
|
@@ -91,41 +111,23 @@ def get_type_collections() -> dict:
|
|
|
91
111
|
return mappings
|
|
92
112
|
|
|
93
113
|
|
|
94
|
-
def without_id_patterns(nmdc_jsonschema):
|
|
95
|
-
rv = deepcopy(nmdc_jsonschema)
|
|
96
|
-
for cls_, spec in rv["$defs"].items():
|
|
97
|
-
if "properties" in spec:
|
|
98
|
-
if "id" in spec["properties"]:
|
|
99
|
-
spec["properties"]["id"].pop("pattern", None)
|
|
100
|
-
return rv
|
|
101
|
-
|
|
102
|
-
|
|
103
114
|
@lru_cache
|
|
104
|
-
def
|
|
105
|
-
"""Get NMDC JSON Schema
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
.getvalue()
|
|
111
|
-
.decode("utf-8")
|
|
112
|
-
)
|
|
113
|
-
return d if enforce_id_patterns else without_id_patterns(d)
|
|
115
|
+
def get_nmdc_jsonschema_path() -> Path:
|
|
116
|
+
"""Get path to NMDC JSON Schema file."""
|
|
117
|
+
with importlib.resources.path(
|
|
118
|
+
"nmdc_schema", "nmdc_materialized_patterns.schema.json"
|
|
119
|
+
) as p:
|
|
120
|
+
return p
|
|
114
121
|
|
|
115
122
|
|
|
116
|
-
@lru_cache
|
|
117
|
-
def
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
123
|
+
@lru_cache()
|
|
124
|
+
def get_nmdc_jsonschema_dict() -> dict:
|
|
125
|
+
"""Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
|
|
126
|
+
with open(get_nmdc_jsonschema_path(), "r") as f:
|
|
127
|
+
return json.load(f)
|
|
121
128
|
|
|
122
129
|
|
|
123
130
|
nmdc_jsonschema = get_nmdc_jsonschema_dict()
|
|
124
|
-
nmdc_jsonschema_validator = get_nmdc_jsonschema_validator()
|
|
125
|
-
nmdc_jsonschema_noidpatterns = get_nmdc_jsonschema_dict(enforce_id_patterns=False)
|
|
126
|
-
nmdc_jsonschema_validator_noidpatterns = get_nmdc_jsonschema_validator(
|
|
127
|
-
enforce_id_patterns=False
|
|
128
|
-
)
|
|
129
131
|
|
|
130
132
|
REPO_ROOT_DIR = Path(__file__).parent.parent
|
|
131
133
|
|
|
@@ -266,9 +268,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
|
|
|
266
268
|
"""Find the first entity with key-value pair k_v, if any?
|
|
267
269
|
|
|
268
270
|
>>> find_one({"id": "foo"}, [{"id": "foo"}])
|
|
271
|
+
{'id': 'foo'}
|
|
272
|
+
>>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
|
|
269
273
|
True
|
|
270
|
-
>>> find_one({"id": "foo"}, [{"id": "bar"}])
|
|
271
|
-
False
|
|
272
274
|
"""
|
|
273
275
|
if len(k_v) > 1:
|
|
274
276
|
raise Exception("Supports only one key-value pair")
|
|
@@ -294,6 +296,49 @@ def nmdc_schema_view():
|
|
|
294
296
|
return ViewGetter().get_view()
|
|
295
297
|
|
|
296
298
|
|
|
299
|
+
@lru_cache()
|
|
300
|
+
def get_nmdc_schema_validator() -> Validator:
|
|
301
|
+
schema_view = nmdc_schema_view()
|
|
302
|
+
return Validator(
|
|
303
|
+
schema_view.schema,
|
|
304
|
+
validation_plugins=[
|
|
305
|
+
JsonschemaValidationPlugin(
|
|
306
|
+
closed=True,
|
|
307
|
+
# Since the `nmdc-schema` package exports a pre-built JSON Schema file, use that
|
|
308
|
+
# instead of relying on the plugin to generate one on the fly.
|
|
309
|
+
json_schema_path=get_nmdc_jsonschema_path(),
|
|
310
|
+
),
|
|
311
|
+
NmdcSchemaValidationPlugin(),
|
|
312
|
+
],
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@lru_cache
|
|
317
|
+
def get_class_name_to_collection_names_map(
|
|
318
|
+
schema_view: SchemaView,
|
|
319
|
+
) -> Dict[str, List[str]]:
|
|
320
|
+
"""
|
|
321
|
+
Returns a mapping of class names to the names of the collections that can store instances of those classes/types,
|
|
322
|
+
according to the specified `SchemaView`.
|
|
323
|
+
|
|
324
|
+
Example output:
|
|
325
|
+
```
|
|
326
|
+
{
|
|
327
|
+
"Study": ["study_set"],
|
|
328
|
+
"Biosample": ["biosample_set"],
|
|
329
|
+
...
|
|
330
|
+
}
|
|
331
|
+
```
|
|
332
|
+
"""
|
|
333
|
+
class_name_to_collection_names = defaultdict(list)
|
|
334
|
+
for collection_name, class_names in get_collection_name_to_class_names_map(
|
|
335
|
+
schema_view
|
|
336
|
+
).items():
|
|
337
|
+
for class_name in class_names:
|
|
338
|
+
class_name_to_collection_names[class_name].append(collection_name)
|
|
339
|
+
return class_name_to_collection_names
|
|
340
|
+
|
|
341
|
+
|
|
297
342
|
@lru_cache
|
|
298
343
|
def nmdc_database_collection_instance_class_names():
|
|
299
344
|
names = []
|
|
@@ -308,6 +353,14 @@ def nmdc_database_collection_instance_class_names():
|
|
|
308
353
|
|
|
309
354
|
@lru_cache
|
|
310
355
|
def nmdc_database_collection_names():
|
|
356
|
+
r"""
|
|
357
|
+
TODO: Document this function.
|
|
358
|
+
|
|
359
|
+
TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
|
|
360
|
+
collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
|
|
361
|
+
instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
|
|
362
|
+
maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
|
|
363
|
+
"""
|
|
311
364
|
names = []
|
|
312
365
|
view = nmdc_schema_view()
|
|
313
366
|
all_classes = set(view.all_classes())
|
|
@@ -340,6 +393,12 @@ def all_docs_have_unique_id(coll) -> bool:
|
|
|
340
393
|
|
|
341
394
|
|
|
342
395
|
def specialize_activity_set_docs(docs):
|
|
396
|
+
"""
|
|
397
|
+
TODO: Document this function.
|
|
398
|
+
|
|
399
|
+
TODO: Check whether this function is still necessary, given that the `Database` class
|
|
400
|
+
in `nmdc-schema` does not have a slot named `activity_set`.
|
|
401
|
+
"""
|
|
343
402
|
validation_errors = {}
|
|
344
403
|
type_collections = get_type_collections()
|
|
345
404
|
if "activity_set" in docs:
|
|
@@ -369,13 +428,38 @@ def specialize_activity_set_docs(docs):
|
|
|
369
428
|
|
|
370
429
|
# Define a mapping from collection name to a list of class names allowable for that collection's documents.
|
|
371
430
|
collection_name_to_class_names: Dict[str, List[str]] = {
|
|
372
|
-
collection_name:
|
|
431
|
+
collection_name: list(
|
|
432
|
+
set(
|
|
433
|
+
chain.from_iterable(
|
|
434
|
+
nmdc_schema_view().class_descendants(cls_name)
|
|
435
|
+
for cls_name in get_class_names_from_collection_spec(spec)
|
|
436
|
+
)
|
|
437
|
+
)
|
|
438
|
+
)
|
|
373
439
|
for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
|
|
374
440
|
"properties"
|
|
375
441
|
].items()
|
|
376
442
|
}
|
|
377
443
|
|
|
378
444
|
|
|
445
|
+
def class_hierarchy_as_list(obj) -> list[str]:
|
|
446
|
+
"""
|
|
447
|
+
get list of inherited classes for each concrete class
|
|
448
|
+
"""
|
|
449
|
+
rv = []
|
|
450
|
+
current_class = obj.__class__
|
|
451
|
+
|
|
452
|
+
def recurse_through_bases(cls):
|
|
453
|
+
if cls.__name__ == "YAMLRoot":
|
|
454
|
+
return rv
|
|
455
|
+
rv.append(cls.__name__)
|
|
456
|
+
for base in cls.__bases__:
|
|
457
|
+
recurse_through_bases(base)
|
|
458
|
+
return rv
|
|
459
|
+
|
|
460
|
+
return recurse_through_bases(current_class)
|
|
461
|
+
|
|
462
|
+
|
|
379
463
|
@lru_cache
|
|
380
464
|
def schema_collection_names_with_id_field() -> Set[str]:
|
|
381
465
|
"""
|
|
@@ -393,169 +477,129 @@ def schema_collection_names_with_id_field() -> Set[str]:
|
|
|
393
477
|
return target_collection_names
|
|
394
478
|
|
|
395
479
|
|
|
396
|
-
def
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
|
|
400
|
-
)
|
|
401
|
-
for collection_name in candidate_names:
|
|
402
|
-
if collection_name.startswith("system."): # reserved by mongodb
|
|
403
|
-
continue
|
|
480
|
+
def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
|
|
481
|
+
collection_names = sorted(schema_collection_names_with_id_field())
|
|
482
|
+
return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
|
|
404
483
|
|
|
405
|
-
if (
|
|
406
|
-
collection_name in schema_collection_names_with_id_field()
|
|
407
|
-
or all_docs_have_unique_id(mdb[collection_name])
|
|
408
|
-
):
|
|
409
|
-
mdb[collection_name].create_index("id", unique=True)
|
|
410
484
|
|
|
485
|
+
def does_collection_have_unique_index_on_id_field(
|
|
486
|
+
collection_name: str, db: MongoDatabase
|
|
487
|
+
) -> bool:
|
|
488
|
+
"""Check whether the specified MongoDB collection has a unique index on its `id` field (not `_id`).
|
|
411
489
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
u: dict
|
|
415
|
-
upsert: bool = False
|
|
416
|
-
multi: bool = False
|
|
490
|
+
Note: If the specified MongoDB collection either does not exist or is a _view_ instead of a collection,
|
|
491
|
+
this function will return `False`.
|
|
417
492
|
|
|
493
|
+
References:
|
|
494
|
+
- https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes
|
|
495
|
+
- https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.index_information
|
|
496
|
+
"""
|
|
497
|
+
# Check whether the specified collection actually exists in the database; and, if it does,
|
|
498
|
+
# whether it is really a _collection_ (as opposed to being a _view_). If it doesn't exist,
|
|
499
|
+
# or it is a view, return `False` right away.
|
|
500
|
+
collection_infos_cursor = db.list_collections(filter={"name": collection_name})
|
|
501
|
+
collection_infos = list(collection_infos_cursor)
|
|
502
|
+
if len(collection_infos) == 0:
|
|
503
|
+
return False
|
|
504
|
+
collection_info = collection_infos[0]
|
|
505
|
+
if collection_info["type"] != "collection":
|
|
506
|
+
return False
|
|
418
507
|
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
508
|
+
# Now that we know we're dealing with a collection, get information about each of its indexes.
|
|
509
|
+
collection = db.get_collection(collection_name)
|
|
510
|
+
for index_information in collection.list_indexes():
|
|
511
|
+
# Get the "field_name-direction" pairs that make up this index.
|
|
512
|
+
field_name_and_direction_pairs: SON = index_information["key"]
|
|
422
513
|
|
|
514
|
+
# If this index involves a number of fields other than one, skip it.
|
|
515
|
+
# We're only interested in indexes that involve the `id` field by itself.
|
|
516
|
+
if len(field_name_and_direction_pairs.keys()) != 1:
|
|
517
|
+
continue
|
|
423
518
|
|
|
424
|
-
|
|
425
|
-
|
|
519
|
+
# Check whether the field this index involves is the `id` field,
|
|
520
|
+
# and whether this index is `unique`.
|
|
521
|
+
field_name = list(field_name_and_direction_pairs.keys())[0]
|
|
522
|
+
if field_name == "id" and index_information.get("unique", False):
|
|
523
|
+
return True
|
|
426
524
|
|
|
525
|
+
return False
|
|
427
526
|
|
|
428
|
-
class OverlayDB(AbstractContextManager):
|
|
429
|
-
"""Provides a context whereby a base Database is overlaid with a temporary one.
|
|
430
527
|
|
|
431
|
-
|
|
432
|
-
|
|
528
|
+
def ensure_unique_id_indexes(mdb: MongoDatabase):
|
|
529
|
+
"""Ensure that any collections with an "id" field have an index on "id"."""
|
|
433
530
|
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
531
|
+
# Note: The pipe (i.e. `|`) operator performs a union of the two sets. In this case,
|
|
532
|
+
# it creates a set (i.e. `candidate_names`) consisting of the names of both
|
|
533
|
+
# (a) all collections in the real database, and (b) all collections that
|
|
534
|
+
# the NMDC schema says can contain instances of classes that have an "id" slot.
|
|
535
|
+
candidate_names = (
|
|
536
|
+
set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
|
|
537
|
+
)
|
|
538
|
+
for collection_name in candidate_names:
|
|
539
|
+
if collection_name.startswith("system."): # reserved by mongodb
|
|
540
|
+
continue
|
|
440
541
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
542
|
+
# If the collection already has a unique index on `id`, there's no need
|
|
543
|
+
# to check anything else about the collection.
|
|
544
|
+
if does_collection_have_unique_index_on_id_field(collection_name, mdb):
|
|
545
|
+
continue
|
|
445
546
|
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
547
|
+
if (
|
|
548
|
+
collection_name in schema_collection_names_with_id_field()
|
|
549
|
+
or all_docs_have_unique_id(mdb[collection_name])
|
|
550
|
+
):
|
|
551
|
+
# Check if index already exists, and if so, drop it if not unique
|
|
552
|
+
try:
|
|
553
|
+
existing_indexes = list(mdb[collection_name].list_indexes())
|
|
554
|
+
id_index = next(
|
|
555
|
+
(idx for idx in existing_indexes if idx["name"] == "id_1"), None
|
|
556
|
+
)
|
|
450
557
|
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
558
|
+
if id_index:
|
|
559
|
+
# If index exists but isn't unique, drop it so we can recreate
|
|
560
|
+
if not id_index.get("unique", False):
|
|
561
|
+
mdb[collection_name].drop_index("id_1")
|
|
562
|
+
|
|
563
|
+
# Create index with unique constraint
|
|
564
|
+
mdb[collection_name].create_index("id", unique=True)
|
|
565
|
+
except OperationFailure as e:
|
|
566
|
+
# If error is about index with same name, just continue
|
|
567
|
+
if "An existing index has the same name" in str(e):
|
|
568
|
+
continue
|
|
569
|
+
else:
|
|
570
|
+
# Re-raise other errors
|
|
571
|
+
raise
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
def decorate_if(condition: bool = False) -> Callable:
|
|
575
|
+
r"""
|
|
576
|
+
Decorator that applies another decorator only when `condition` is `True`.
|
|
577
|
+
|
|
578
|
+
Note: We implemented this so we could conditionally register
|
|
579
|
+
endpoints with FastAPI's `@router`.
|
|
580
|
+
|
|
581
|
+
Example usages:
|
|
582
|
+
A. Apply the `@router.get` decorator:
|
|
583
|
+
```python
|
|
584
|
+
@decorate_if(True)(router.get("/me"))
|
|
585
|
+
def get_me(...):
|
|
586
|
+
...
|
|
587
|
+
```
|
|
588
|
+
B. Bypass the `@router.get` decorator:
|
|
589
|
+
```python
|
|
590
|
+
@decorate_if(False)(router.get("/me"))
|
|
591
|
+
def get_me(...):
|
|
592
|
+
...
|
|
593
|
+
```
|
|
456
594
|
"""
|
|
457
595
|
|
|
458
|
-
def
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
def __enter__(self):
|
|
464
|
-
return self
|
|
465
|
-
|
|
466
|
-
def __exit__(self, exc_type, exc_value, traceback):
|
|
467
|
-
self._bottom_db.client.drop_database(self._top_db.name)
|
|
468
|
-
|
|
469
|
-
def replace_or_insert_many(self, coll_name, documents: list):
|
|
470
|
-
try:
|
|
471
|
-
self._top_db[coll_name].insert_many(documents)
|
|
472
|
-
except OperationFailure as e:
|
|
473
|
-
raise OverlayDBError(str(e.details))
|
|
474
|
-
|
|
475
|
-
def apply_updates(self, coll_name, updates: list):
|
|
476
|
-
"""prepare overlay db and apply updates to it."""
|
|
477
|
-
assert all(UpdateStatement(**us) for us in updates)
|
|
478
|
-
for update_spec in updates:
|
|
479
|
-
for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
|
|
480
|
-
self._top_db[coll_name].insert_one(bottom_doc)
|
|
481
|
-
try:
|
|
482
|
-
self._top_db.command({"update": coll_name, "updates": updates})
|
|
483
|
-
except OperationFailure as e:
|
|
484
|
-
raise OverlayDBError(str(e.details))
|
|
485
|
-
|
|
486
|
-
def delete(self, coll_name, deletes: list):
|
|
487
|
-
""" "apply" delete command by flagging docs in overlay database"""
|
|
488
|
-
assert all(DeleteStatement(**us) for us in deletes)
|
|
489
|
-
for delete_spec in deletes:
|
|
490
|
-
for bottom_doc in self._bottom_db[coll_name].find(
|
|
491
|
-
delete_spec["q"], limit=delete_spec.get("limit", 1)
|
|
492
|
-
):
|
|
493
|
-
bottom_doc["_deleted"] = True
|
|
494
|
-
self._top_db[coll_name].insert_one(bottom_doc)
|
|
495
|
-
|
|
496
|
-
def merge_find(self, coll_name, find_spec: dict):
|
|
497
|
-
"""Yield docs first from overlay and then from base db, minding deletion flags."""
|
|
498
|
-
# ensure projection of "id" and "_deleted"
|
|
499
|
-
if "projection" in find_spec:
|
|
500
|
-
proj = find_spec["projection"]
|
|
501
|
-
if isinstance(proj, dict):
|
|
502
|
-
proj = merge(proj, {"id": 1, "_deleted": 1})
|
|
503
|
-
elif isinstance(proj, list):
|
|
504
|
-
proj = list(unique(proj + ["id", "_deleted"]))
|
|
505
|
-
|
|
506
|
-
top_docs = self._top_db[coll_name].find(**find_spec)
|
|
507
|
-
bottom_docs = self._bottom_db[coll_name].find(**find_spec)
|
|
508
|
-
top_seen_ids = set()
|
|
509
|
-
for doc in top_docs:
|
|
510
|
-
if not doc.get("_deleted"):
|
|
511
|
-
yield doc
|
|
512
|
-
top_seen_ids.add(doc["id"])
|
|
513
|
-
|
|
514
|
-
for doc in bottom_docs:
|
|
515
|
-
if doc["id"] not in top_seen_ids:
|
|
516
|
-
yield doc
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
def validate_json(in_docs: dict, mdb: MongoDatabase):
|
|
520
|
-
validator = Draft7Validator(get_nmdc_jsonschema_dict())
|
|
521
|
-
docs = deepcopy(in_docs)
|
|
522
|
-
validation_errors = {}
|
|
523
|
-
|
|
524
|
-
known_coll_names = set(nmdc_database_collection_names())
|
|
525
|
-
for coll_name, coll_docs in docs.items():
|
|
526
|
-
if coll_name not in known_coll_names:
|
|
527
|
-
if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
|
|
528
|
-
continue
|
|
596
|
+
def apply_original_decorator(original_decorator: Callable) -> Callable:
|
|
597
|
+
def check_condition(original_function: Callable) -> Callable:
|
|
598
|
+
if condition:
|
|
599
|
+
return original_decorator(original_function)
|
|
529
600
|
else:
|
|
530
|
-
|
|
531
|
-
f"'{coll_name}' is not a known schema collection name"
|
|
532
|
-
]
|
|
533
|
-
continue
|
|
601
|
+
return original_function
|
|
534
602
|
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
if not isinstance(coll_docs, list):
|
|
539
|
-
validation_errors[coll_name].append("value must be a list")
|
|
540
|
-
elif not all(isinstance(d, dict) for d in coll_docs):
|
|
541
|
-
validation_errors[coll_name].append(
|
|
542
|
-
"all elements of list must be dicts"
|
|
543
|
-
)
|
|
544
|
-
if not validation_errors[coll_name]:
|
|
545
|
-
try:
|
|
546
|
-
with OverlayDB(mdb) as odb:
|
|
547
|
-
odb.replace_or_insert_many(coll_name, coll_docs)
|
|
548
|
-
except OverlayDBError as e:
|
|
549
|
-
validation_errors[coll_name].append(str(e))
|
|
550
|
-
|
|
551
|
-
if all(len(v) == 0 for v in validation_errors.values()):
|
|
552
|
-
# Second pass. Try instantiating linkml-sourced dataclass
|
|
553
|
-
in_docs.pop("@type", None)
|
|
554
|
-
try:
|
|
555
|
-
NMDCDatabase(**in_docs)
|
|
556
|
-
except Exception as e:
|
|
557
|
-
return {"result": "errors", "detail": str(e)}
|
|
558
|
-
|
|
559
|
-
return {"result": "All Okay!"}
|
|
560
|
-
else:
|
|
561
|
-
return {"result": "errors", "detail": validation_errors}
|
|
603
|
+
return check_condition
|
|
604
|
+
|
|
605
|
+
return apply_original_decorator
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nmdc-runtime
|
|
3
|
+
Version: 2.12.0
|
|
4
|
+
Summary: A runtime system for NMDC data management and orchestration
|
|
5
|
+
Project-URL: Changelog, https://github.com/microbiomedata/nmdc-runtime/releases
|
|
6
|
+
Project-URL: Documentation, https://docs.microbiomedata.org/runtime
|
|
7
|
+
Project-URL: Issues, https://github.com/microbiomedata/nmdc-runtime/issues
|
|
8
|
+
Project-URL: Repository, https://github.com/microbiomedata/nmdc-runtime
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: base32-lib
|
|
12
|
+
Requires-Dist: boto3
|
|
13
|
+
Requires-Dist: click
|
|
14
|
+
Requires-Dist: dagit
|
|
15
|
+
Requires-Dist: dagster
|
|
16
|
+
Requires-Dist: dagster-graphql
|
|
17
|
+
Requires-Dist: dagster-postgres
|
|
18
|
+
Requires-Dist: fastapi>=0.115.0
|
|
19
|
+
Requires-Dist: frozendict
|
|
20
|
+
Requires-Dist: git-root
|
|
21
|
+
Requires-Dist: jq
|
|
22
|
+
Requires-Dist: jsonasobj2
|
|
23
|
+
Requires-Dist: linkml
|
|
24
|
+
Requires-Dist: linkml-runtime
|
|
25
|
+
Requires-Dist: lxml
|
|
26
|
+
Requires-Dist: nmdc-schema==11.13.0
|
|
27
|
+
Requires-Dist: ontology-loader==0.2.2
|
|
28
|
+
Requires-Dist: pandas
|
|
29
|
+
Requires-Dist: passlib[bcrypt]
|
|
30
|
+
Requires-Dist: pydantic[email]>=1.10.0
|
|
31
|
+
Requires-Dist: pyinstrument
|
|
32
|
+
Requires-Dist: pymongo
|
|
33
|
+
Requires-Dist: python-dotenv
|
|
34
|
+
Requires-Dist: python-jose[cryptography]
|
|
35
|
+
Requires-Dist: python-multipart>=0.0.18
|
|
36
|
+
Requires-Dist: pyyaml
|
|
37
|
+
Requires-Dist: refscan==0.3.2
|
|
38
|
+
Requires-Dist: requests
|
|
39
|
+
Requires-Dist: requests-cache
|
|
40
|
+
Requires-Dist: scalar-fastapi<2.0.0,>=1.4.1
|
|
41
|
+
Requires-Dist: tenacity
|
|
42
|
+
Requires-Dist: toolz
|
|
43
|
+
Requires-Dist: tqdm
|
|
44
|
+
Requires-Dist: unidecode
|
|
45
|
+
Requires-Dist: uvicorn[standard]
|