nmdc-runtime 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Files changed (98)
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +7 -8
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +1 -22
  76. nmdc_runtime/site/ops.py +60 -152
  77. nmdc_runtime/site/repository.py +0 -112
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +2 -54
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/util.py +3 -47
  87. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  88. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  89. nmdc_runtime/site/translation/emsl.py +0 -43
  90. nmdc_runtime/site/translation/gold.py +0 -53
  91. nmdc_runtime/site/translation/jgi.py +0 -32
  92. nmdc_runtime/site/translation/util.py +0 -132
  93. nmdc_runtime/site/validation/jgi.py +0 -43
  94. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  95. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  96. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  97. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  98. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/api/core/util.py
@@ -0,0 +1,109 @@
+ import hashlib
+ import json
+ import os
+ import secrets
+ import string
+ from datetime import datetime, timezone, timedelta
+ from importlib import import_module
+
+ from fastapi import HTTPException, status
+ from pydantic import BaseModel
+ from toolz import keyfilter
+
+ API_SITE_ID = os.getenv("API_SITE_ID")
+ API_SITE_CLIENT_ID = os.getenv("API_SITE_CLIENT_ID")
+
+
+ def omit(blacklist, d):
+     return keyfilter(lambda k: k not in blacklist, d)
+
+
+ def pick(whitelist, d):
+     return keyfilter(lambda k: k in whitelist, d)
+
+
+ def hash_from_str(s: str, algo="sha256") -> str:
+     if algo not in hashlib.algorithms_guaranteed:
+         raise ValueError(f"desired algorithm {algo} not supported")
+     return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest()
+
+
+ def sha256hash_from_file(file_path: str, timestamp: str):
+     # https://stackoverflow.com/a/55542529
+     h = hashlib.sha256()
+
+     timestamp_bytes = timestamp.encode("utf-8")
+     h.update(timestamp_bytes)
+
+     with open(file_path, "rb") as file:
+         while True:
+             # Reading is buffered, so we can read smaller chunks.
+             chunk = file.read(h.block_size)
+             if not chunk:
+                 break
+             h.update(chunk)
+
+     return h.hexdigest()
+
+
+ def raise404_if_none(doc, detail="Not found"):
+     if doc is None:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=detail)
+     return doc
+
+
+ def now(as_str=False):
+     dt = datetime.now(timezone.utc)
+     return dt.isoformat() if as_str else dt
+
+
+ def expiry_dt_from_now(days=0, hours=0, minutes=0, seconds=0):
+     return now() + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+
+
+ def has_passed(dt):
+     return now() > dt
+
+
+ def import_via_dotted_path(dotted_path: str):
+     module_name, _, member_name = dotted_path.rpartition(".")
+     return getattr(import_module(module_name), member_name)
+
+
+ def dotted_path_for(member):
+     return f"{member.__module__}.{member.__name__}"
+
+
+ def generate_secret(length=12):
+     """Generate a secret.
+
+     With
+     - at least one lowercase character,
+     - at least one uppercase character, and
+     - at least three digits
+
+     """
+     if length < 8:
+         raise ValueError(f"{length=} must be >=8.")
+     alphabet = string.ascii_letters + string.digits + "!@#$%^*-_+="
+     # based on https://docs.python.org/3.8/library/secrets.html#recipes-and-best-practices
+     while True:
+         _secret = "".join(secrets.choice(alphabet) for i in range(length))
+         if (
+             any(c.islower() for c in _secret)
+             and any(c.isupper() for c in _secret)
+             and sum(c.isdigit() for c in _secret) >= 3
+         ):
+             break
+     return _secret
+
+
+ def json_clean(data, model, exclude_unset=False) -> dict:
+     """Run data through a JSON serializer for a pydantic model."""
+     if not isinstance(data, (dict, BaseModel)):
+         raise TypeError("`data` must be a pydantic model or its .model_dump()")
+     m = model(**data) if isinstance(data, dict) else data
+
+     # Note: Between Pydantic v1 and v2, the `json` method was renamed to `model_dump_json`.
+     # Reference: https://docs.pydantic.dev/2.11/migration/#changes-to-pydanticbasemodel
+     return json.loads(m.model_dump_json(exclude_unset=exclude_unset))
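
For orientation, here is a minimal, hypothetical sketch of how the helpers added in `nmdc_runtime/api/core/util.py` compose. The credential-issuing scenario is illustrative, not taken from the package:

```python
from nmdc_runtime.api.core.util import (
    expiry_dt_from_now,
    generate_secret,
    has_passed,
    hash_from_str,
)

# Hypothetical flow: mint a client secret, store only its hash, and give the
# credential a 30-day lifetime (all function names come from the diff above).
secret = generate_secret(length=16)   # >=1 lowercase, >=1 uppercase, >=3 digits
secret_hash = hash_from_str(secret)   # sha256 hex digest by default
expires_at = expiry_dt_from_now(days=30)

assert not has_passed(expires_at)     # the credential has not yet expired
```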
nmdc_runtime/api/db/__init__.py (file without changes)
nmdc_runtime/api/db/mongo.py
@@ -0,0 +1,447 @@
+ import gzip
+ import os
+ from contextlib import AbstractContextManager
+ from copy import deepcopy
+ from functools import lru_cache
+ from typing import Set
+ from uuid import uuid4
+
+ import bson
+ from jsonschema import Draft7Validator
+ from nmdc_schema.nmdc import Database as NMDCDatabase
+ from pymongo.errors import AutoReconnect, OperationFailure
+ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
+ from refscan.lib.Finder import Finder
+ from refscan.scanner import scan_outgoing_references
+ from tenacity import wait_random_exponential, retry, retry_if_exception_type
+ from toolz import merge, unique
+ from refscan.lib.helpers import get_collection_names_from_schema
+
+ from nmdc_runtime.api.models.query import UpdateStatement, DeleteStatement
+ from nmdc_runtime.mongo_util import SessionBoundDatabase
+ from nmdc_runtime.util import (
+     nmdc_schema_view,
+     collection_name_to_class_names,
+     ensure_unique_id_indexes,
+     get_nmdc_jsonschema_dict,
+     nmdc_database_collection_names,
+     get_allowed_references,
+ )
+ from pymongo import MongoClient
+ from pymongo.database import Database as MongoDatabase
+
+
+ @retry(
+     retry=retry_if_exception_type(AutoReconnect),
+     wait=wait_random_exponential(multiplier=0.5, max=60),
+ )
+ def check_mongo_ok_autoreconnect(mdb: MongoDatabase):
+     r"""
+     Check whether the application can write to the database.
+     """
+     collection = mdb.get_collection("_runtime.healthcheck")
+     collection.insert_one({"status": "ok"})
+     collection.delete_many({"status": "ok"})
+     return True
+
+
+ @lru_cache
+ def get_mongo_client() -> MongoClient:
+     r"""
+     Returns a `MongoClient` instance you can use to access the MongoDB server specified via environment variables.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
+     """
+     return MongoClient(
+         host=os.getenv("MONGO_HOST"),
+         username=os.getenv("MONGO_USERNAME"),
+         password=os.getenv("MONGO_PASSWORD"),
+         directConnection=True,
+     )
+
+
+ @lru_cache
+ def get_mongo_db() -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return mdb
+
+
+ @lru_cache
+ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return SessionBoundDatabase(mdb, session) if session is not None else mdb
+
+
+ @lru_cache
+ def get_async_mongo_db() -> AsyncIOMotorDatabase:
+     _client = AsyncIOMotorClient(
+         host=os.getenv("MONGO_HOST"),
+         username=os.getenv("MONGO_USERNAME"),
+         password=os.getenv("MONGO_PASSWORD"),
+         directConnection=True,
+     )
+     return _client[os.getenv("MONGO_DBNAME")]
+
+
+ def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
+     """
+     Returns the names of the collections that (a) exist in the database,
+     (b) are described by the schema, and (c) contain at least one document.
+
+     Note: The ampersand (`&`) is the "set intersection" operator.
+     """
+     collection_names_from_database = mdb.list_collection_names()
+     schema_view = nmdc_schema_view()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     names = set(collection_names_from_database) & set(collection_names_from_schema)
+     return {name for name in names if mdb[name].estimated_document_count() > 0}
+
+
+ @lru_cache
+ def activity_collection_names(mdb: MongoDatabase) -> Set[str]:
+     r"""
+     TODO: Document this function.
+     """
+     return get_nonempty_nmdc_schema_collection_names(mdb) - {
+         "biosample_set",
+         "study_set",
+         "data_object_set",
+         "functional_annotation_set",
+         "genome_feature_set",
+     }
+
+
+ @lru_cache
+ def get_planned_process_collection_names() -> Set[str]:
+     r"""
+     Returns the names of all collections that the schema says can contain documents
+     that represent instances of the `PlannedProcess` class or any of its subclasses.
+     """
+     schema_view = nmdc_schema_view()
+     collection_names = set()
+     planned_process_descendants = set(schema_view.class_descendants("PlannedProcess"))
+
+     for collection_name, class_names in collection_name_to_class_names.items():
+         for class_name in class_names:
+             # If the name of this class is the name of the `PlannedProcess` class
+             # or any of its subclasses, add it to the result set.
+             if class_name in planned_process_descendants:
+                 collection_names.add(collection_name)
+
+     return collection_names
+
+
+ def mongodump_excluded_collections() -> str:
+     """
+     TODO: Document this function.
+     """
+     _mdb = get_mongo_db()
+     schema_view = nmdc_schema_view()
+     collection_names_from_database = _mdb.list_collection_names()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     excluded_collections = " ".join(
+         f"--excludeCollection={c}"
+         for c in sorted(
+             set(collection_names_from_database) - set(collection_names_from_schema)
+         )
+     )
+     return excluded_collections
+
+
+ def mongorestore_collection(mdb, collection_name, bson_file_path):
+     """
+     Replaces the specified collection with one that reflects the contents of the
+     specified BSON file.
+     """
+     with gzip.open(bson_file_path, "rb") as bson_file:
+         data = bson.decode_all(bson_file.read())
+         if data:
+             mdb.drop_collection(collection_name)
+             mdb[collection_name].insert_many(data)
+             print(
+                 f"mongorestore_collection: inserted {len(data)} documents into {collection_name} after drop"
+             )
+         else:
+             print(f"mongorestore_collection: no {collection_name} documents found")
+
+
+ def mongorestore_from_dir(mdb, dump_directory, skip_collections=None):
+     """
+     Effectively runs a `mongorestore` command in pure Python.
+     Helpful in a container context that does not have the `mongorestore` command available.
+     """
+     skip_collections = skip_collections or []
+     for root, dirs, files in os.walk(dump_directory):
+         for file in files:
+             if file.endswith(".bson.gz"):
+                 collection_name = file.replace(".bson.gz", "")
+                 if collection_name in skip_collections:
+                     continue
+                 bson_file_path = os.path.join(root, file)
+                 mongorestore_collection(mdb, collection_name, bson_file_path)
+
+     print("mongorestore_from_dir completed successfully.")
+
+
+ class OverlayDBError(Exception):
+     pass
+
+
+ class OverlayDB(AbstractContextManager):
+     """Provides a context whereby a base Database is overlaid with a temporary one.
+
+     If you need to run basic simulations of updates to a base database,
+     you don't want to actually commit transactions to the base database.
+
+     For example, to insert or replace (matching on "id") many documents into a collection in order
+     to then validate the resulting total set of collection documents, an OverlayDB writes to
+     an overlay collection that "shadows" the base collection during a "find" query
+     (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
+     overlay collection, that id is marked as "seen" and will not also be returned when
+     subsequently scanning the (unmodified) base-database collection.
+
+     Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+     database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+     `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+     the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+     "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+     of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
+     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
+     documents from a base collection to the overlay, and then applying the updates to the overlay,
+     so that again, base collections are unmodified, and a "merge_find" call will produce a result
+     *as if* the base collection(s) were modified.
+
+     Mongo deletions (as the "delete" method) also copy affected documents from the base collection
+     to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
+     call will match a relevant document given a suitable filter, and will mark the document's id
+     as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
+
+     Usage:
+     ```
+     with OverlayDB(mdb) as odb:
+         # do stuff, e.g. `odb.replace_or_insert_many(...)`
+     ```
+     """
+
+     def __init__(self, mdb: MongoDatabase):
+         self._bottom_db = mdb
+         self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
+         ensure_unique_id_indexes(self._top_db)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         self._bottom_db.client.drop_database(self._top_db.name)
+
+     def replace_or_insert_many(self, coll_name, documents: list):
+         try:
+             self._top_db[coll_name].insert_many(documents)
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def apply_updates(self, coll_name, updates: list):
+         """prepare overlay db and apply updates to it."""
+         assert all(UpdateStatement(**us) for us in updates)
+         for update_spec in updates:
+             for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
+                 self._top_db[coll_name].insert_one(bottom_doc)
+         try:
+             self._top_db.command({"update": coll_name, "updates": updates})
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def delete(self, coll_name, deletes: list):
+         """ "apply" delete command by flagging docs in overlay database"""
+         assert all(DeleteStatement(**us) for us in deletes)
+         for delete_spec in deletes:
+             for bottom_doc in self._bottom_db[coll_name].find(
+                 delete_spec["q"], limit=delete_spec["limit"]
+             ):
+                 bottom_doc["_deleted"] = True
+                 self._top_db[coll_name].insert_one(bottom_doc)
+
+     def merge_find(self, coll_name, find_spec: dict):
+         """Yield docs first from overlay and then from base db, minding deletion flags."""
+         # ensure projection of "id" and "_deleted"
+         if "projection" in find_spec:
+             proj = find_spec["projection"]
+             if isinstance(proj, dict):
+                 proj = merge(proj, {"id": 1, "_deleted": 1})
+             elif isinstance(proj, list):
+                 proj = list(unique(proj + ["id", "_deleted"]))
+
+         top_docs = self._top_db[coll_name].find(**find_spec)
+         bottom_docs = self._bottom_db[coll_name].find(**find_spec)
+         top_seen_ids = set()
+         for doc in top_docs:
+             if not doc.get("_deleted"):
+                 yield doc
+             top_seen_ids.add(doc["id"])
+
+         for doc in bottom_docs:
+             if doc["id"] not in top_seen_ids:
+                 yield doc
+
+
+ def validate_json(
+     in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+ ):
+     r"""
+     Checks whether the specified dictionary represents a valid instance of the `Database` class
+     defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+     Example dictionary:
+     {
+         "biosample_set": [
+             {"id": "nmdc:bsm-00-000001", ...},
+             {"id": "nmdc:bsm-00-000002", ...}
+         ],
+         "study_set": [
+             {"id": "nmdc:sty-00-000001", ...},
+             {"id": "nmdc:sty-00-000002", ...}
+         ]
+     }
+
+     :param in_docs: The dictionary you want to validate
+     :param mdb: A reference to a MongoDB database
+     :param check_inter_document_references: Whether you want this function to check whether every document that
+                                             is referenced by any of the documents passed in would, indeed, exist
+                                             in the database, if the documents passed in were to be inserted into
+                                             the database. In other words, set this to `True` if you want this
+                                             function to perform referential integrity checks.
+     """
+     validator = Draft7Validator(get_nmdc_jsonschema_dict())
+     docs = deepcopy(in_docs)
+     validation_errors = {}
+
+     known_coll_names = set(nmdc_database_collection_names())
+     for coll_name, coll_docs in docs.items():
+         if coll_name not in known_coll_names:
+             # We expect each key in `in_docs` to be a known schema collection name. However, `@type` is a special key
+             # for JSON-LD, used for JSON serialization of e.g. LinkML objects. That is, the value of `@type` lets a
+             # client know that the JSON object (a dict in Python) should be interpreted as a
+             # <https://w3id.org/nmdc/Database>. If `@type` is present as a key, and its value indicates that
+             # `in_docs` is indeed a nmdc:Database, that's fine, and we don't want to raise an exception.
+             #
+             # prompted by: https://github.com/microbiomedata/nmdc-runtime/discussions/858
+             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
+                 continue
+             else:
+                 validation_errors[coll_name] = [
+                     f"'{coll_name}' is not a known schema collection name"
+                 ]
+                 continue
+
+         errors = list(validator.iter_errors({coll_name: coll_docs}))
+         validation_errors[coll_name] = [e.message for e in errors]
+         if coll_docs:
+             if not isinstance(coll_docs, list):
+                 validation_errors[coll_name].append("value must be a list")
+             elif not all(isinstance(d, dict) for d in coll_docs):
+                 validation_errors[coll_name].append(
+                     "all elements of list must be dicts"
+                 )
+             if not validation_errors[coll_name]:
+                 try:
+                     with OverlayDB(mdb) as odb:
+                         odb.replace_or_insert_many(coll_name, coll_docs)
+                 except OverlayDBError as e:
+                     validation_errors[coll_name].append(str(e))
+
+     if all(len(v) == 0 for v in validation_errors.values()):
+         # Second pass. Try instantiating linkml-sourced dataclass
+         in_docs.pop("@type", None)
+         try:
+             NMDCDatabase(**in_docs)
+         except Exception as e:
+             return {"result": "errors", "detail": str(e)}
+
+         # Third pass (if enabled): Check inter-document references.
+         if check_inter_document_references is True:
+             # Prepare to use `refscan`.
+             #
+             # Note: We check the inter-document references in two stages, which are:
+             # 1. For each document in the JSON payload, check whether each document it references already exists
+             #    (in the collections the schema says it can exist in) in the database. We use the
+             #    `refscan` package to do this, which returns violation details we'll use in the second stage.
+             # 2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+             #    check whether that document exists (in the collections the schema says it can exist in) in the
+             #    JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+             # The violations that remain after those two stages are the ones we return to the caller.
+             #
+             # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+             #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+             #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
+             #       `refscan`'s `Finder` class accepts.
+             #
+             finder = Finder(database=mdb)
+             references = get_allowed_references()
+
+             # Iterate over the collections in the JSON payload.
+             for source_collection_name, documents in in_docs.items():
+                 for document in documents:
+                     # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                     source_document = dict(document, _id=None)
+                     violations = scan_outgoing_references(
+                         document=source_document,
+                         schema_view=nmdc_schema_view(),
+                         references=references,
+                         finder=finder,
+                         source_collection_name=source_collection_name,
+                         user_wants_to_locate_misplaced_documents=False,
+                     )
+
+                     # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                     for violation in violations:
+                         can_waive_violation = False
+                         # Determine which collections can contain the referenced document, based upon
+                         # the schema class of which this source document is an instance.
+                         target_collection_names = (
+                             references.get_target_collection_names(
+                                 source_class_name=violation.source_class_name,
+                                 source_field_name=violation.source_field_name,
+                             )
+                         )
+                         # Check whether the referenced document exists in any of those collections in the JSON payload.
+                         for json_coll_name, json_coll_docs in in_docs.items():
+                             if json_coll_name in target_collection_names:
+                                 for json_coll_doc in json_coll_docs:
+                                     if json_coll_doc["id"] == violation.target_id:
+                                         can_waive_violation = True
+                                         break  # stop checking
+                             if can_waive_violation:
+                                 break  # stop checking
+                         if not can_waive_violation:
+                             violation_as_str = (
+                                 f"Document '{violation.source_document_id}' "
+                                 f"in collection '{violation.source_collection_name}' "
+                                 f"has a field '{violation.source_field_name}' that "
+                                 f"references a document having id "
+                                 f"'{violation.target_id}', but the latter document "
+                                 f"does not exist in any of the collections the "
+                                 f"NMDC Schema says it can exist in."
+                             )
+                             validation_errors[source_collection_name].append(
+                                 violation_as_str
+                             )
+
+         # If any collection's error list is not empty, return an error response.
+         if any(len(v) > 0 for v in validation_errors.values()):
+             return {"result": "errors", "detail": validation_errors}
+
+         return {"result": "All Okay!"}
+     else:
+         return {"result": "errors", "detail": validation_errors}
nmdc_runtime/api/db/s3.py
@@ -0,0 +1,37 @@
+ from functools import lru_cache
+ import os
+
+ import boto3
+
+ API_SITE_BUCKET = os.getenv("API_SITE_ID")
+ S3_ID_NS = "do"  # Namespace for Drs Objects in Site S3-bucket store.
+
+
+ @lru_cache
+ def get_s3_client():
+     _session = boto3.session.Session()
+     return _session.client(
+         "s3",
+         region_name=os.getenv("DO_REGION_NAME"),
+         endpoint_url=os.getenv("DO_ENDPOINT_URL"),
+         aws_access_key_id=os.getenv("DO_SPACES_KEY"),
+         aws_secret_access_key=os.getenv("DO_SPACES_SECRET"),
+     )
+
+
+ def presigned_url_to_put(
+     key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
+ ):
+     return client.generate_presigned_url(
+         ClientMethod="put_object",
+         Params={"Bucket": bucket, "Key": key, "ContentType": mime_type},
+         ExpiresIn=expires_in,
+     )
+
+
+ def presigned_url_to_get(key, client=None, bucket=API_SITE_BUCKET, expires_in=300):
+     return client.generate_presigned_url(
+         ClientMethod="get_object",
+         Params={"Bucket": bucket, "Key": key},
+         ExpiresIn=expires_in,
+     )
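
A sketch of issuing a presigned upload URL with the `s3.py` helpers. Note that `client` defaults to `None` in both presigned-URL functions, so the caller must pass the cached client explicitly; the key name below is hypothetical:

```python
from nmdc_runtime.api.db.s3 import S3_ID_NS, get_s3_client, presigned_url_to_put

# Reads DO_REGION_NAME, DO_ENDPOINT_URL, DO_SPACES_KEY, and DO_SPACES_SECRET.
s3 = get_s3_client()

url = presigned_url_to_put(
    key=f"{S3_ID_NS}/my-drs-object",   # hypothetical key in the "do" namespace
    client=s3,                         # must be supplied; the default is None
    mime_type="application/json",
    expires_in=300,                    # seconds
)
# An HTTP PUT of the object bytes to `url` is now authorized for five minutes.
```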
nmdc_runtime/api/endpoints/__init__.py (file without changes)
nmdc_runtime/api/endpoints/capabilities.py
@@ -0,0 +1,25 @@
+ from typing import List
+
+ import pymongo
+ from fastapi import APIRouter, Depends
+
+ from nmdc_runtime.api.core.util import raise404_if_none
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.models.capability import Capability
+
+ router = APIRouter()
+
+
+ @router.get("/capabilities", response_model=List[Capability])
+ def list_capabilities(
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return list(mdb.capabilities.find())
+
+
+ @router.get("/capabilities/{capability_id}", response_model=Capability)
+ def get_capability(
+     capability_id: str,
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.capabilities.find_one({"id": capability_id}))
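
Finally, a hedged sketch of how the capabilities router could be mounted in a FastAPI app. The wiring shown here is an assumption for illustration; the package's actual wiring lives in `nmdc_runtime/api/main.py`, which is not shown in this diff:

```python
from fastapi import FastAPI

from nmdc_runtime.api.endpoints.capabilities import router as capabilities_router

# Hypothetical app setup; tag name is illustrative.
app = FastAPI()
app.include_router(capabilities_router, tags=["capabilities"])

# GET /capabilities       -> list_capabilities: every document in mdb.capabilities
# GET /capabilities/{id}  -> get_capability: 404 via raise404_if_none when absent
```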