nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/api/core/util.py
@@ -0,0 +1,109 @@
+ import hashlib
+ import json
+ import os
+ import secrets
+ import string
+ from datetime import datetime, timezone, timedelta
+ from importlib import import_module
+
+ from fastapi import HTTPException, status
+ from pydantic import BaseModel
+ from toolz import keyfilter
+
+ API_SITE_ID = os.getenv("API_SITE_ID")
+ API_SITE_CLIENT_ID = os.getenv("API_SITE_CLIENT_ID")
+
+
+ def omit(blacklist, d):
+     return keyfilter(lambda k: k not in blacklist, d)
+
+
+ def pick(whitelist, d):
+     return keyfilter(lambda k: k in whitelist, d)
+
+
+ def hash_from_str(s: str, algo="sha256") -> str:
+     if algo not in hashlib.algorithms_guaranteed:
+         raise ValueError(f"desired algorithm {algo} not supported")
+     return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest()
+
+
+ def sha256hash_from_file(file_path: str, timestamp: str):
+     # https://stackoverflow.com/a/55542529
+     h = hashlib.sha256()
+
+     timestamp_bytes = timestamp.encode("utf-8")
+     h.update(timestamp_bytes)
+
+     with open(file_path, "rb") as file:
+         while True:
+             # Reading is buffered, so we can read smaller chunks.
+             chunk = file.read(h.block_size)
+             if not chunk:
+                 break
+             h.update(chunk)
+
+     return h.hexdigest()
+
+
+ def raise404_if_none(doc, detail="Not found"):
+     if doc is None:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=detail)
+     return doc
+
+
+ def now(as_str=False):
+     dt = datetime.now(timezone.utc)
+     return dt.isoformat() if as_str else dt
+
+
+ def expiry_dt_from_now(days=0, hours=0, minutes=0, seconds=0):
+     return now() + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+
+
+ def has_passed(dt):
+     return now() > dt
+
+
+ def import_via_dotted_path(dotted_path: str):
+     module_name, _, member_name = dotted_path.rpartition(".")
+     return getattr(import_module(module_name), member_name)
+
+
+ def dotted_path_for(member):
+     return f"{member.__module__}.{member.__name__}"
+
+
+ def generate_secret(length=12):
+     """Generate a secret.
+
+     With
+     - at least one lowercase character,
+     - at least one uppercase character, and
+     - at least three digits
+
+     """
+     if length < 8:
+         raise ValueError(f"{length=} must be >=8.")
+     alphabet = string.ascii_letters + string.digits + "!@#$%^*-_+="
+     # based on https://docs.python.org/3.8/library/secrets.html#recipes-and-best-practices
+     while True:
+         _secret = "".join(secrets.choice(alphabet) for i in range(length))
+         if (
+             any(c.islower() for c in _secret)
+             and any(c.isupper() for c in _secret)
+             and sum(c.isdigit() for c in _secret) >= 3
+         ):
+             break
+     return _secret
+
+
+ def json_clean(data, model, exclude_unset=False) -> dict:
+     """Run data through a JSON serializer for a pydantic model."""
+     if not isinstance(data, (dict, BaseModel)):
+         raise TypeError("`data` must be a pydantic model or its .model_dump()")
+     m = model(**data) if isinstance(data, dict) else data
+
+     # Note: Between Pydantic v1 and v2, the `json` method was renamed to `model_dump_json`.
+     # Reference: https://docs.pydantic.dev/2.11/migration/#changes-to-pydanticbasemodel
+     return json.loads(m.model_dump_json(exclude_unset=exclude_unset))
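For context, a minimal usage sketch of the helpers added above. The `TokenDoc` model is hypothetical, invented here for illustration; the helper functions are the ones in this hunk:

    from pydantic import BaseModel

    from nmdc_runtime.api.core.util import (
        expiry_dt_from_now,
        generate_secret,
        has_passed,
        hash_from_str,
        json_clean,
    )

    class TokenDoc(BaseModel):  # hypothetical model, not part of the package
        secret_hash: str
        expires_at: str

    secret = generate_secret(length=16)  # guaranteed >=1 lower, >=1 upper, >=3 digits
    doc = TokenDoc(
        secret_hash=hash_from_str(secret),  # sha256 hex digest by default
        expires_at=expiry_dt_from_now(days=30).isoformat(),
    )
    clean = json_clean(doc, TokenDoc)  # plain dict, round-tripped through the model's JSON serializer
    assert not has_passed(expiry_dt_from_now(minutes=5))  # expiry is still in the future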
nmdc_runtime/api/db/mongo.py
@@ -0,0 +1,435 @@
+ import gzip
+ import os
+ from contextlib import AbstractContextManager
+ from copy import deepcopy
+ from functools import lru_cache
+ from typing import Set
+ from uuid import uuid4
+
+ import bson
+ from jsonschema import Draft7Validator
+ from nmdc_schema.nmdc import Database as NMDCDatabase
+ from pymongo.errors import AutoReconnect, OperationFailure
+ from refscan.lib.Finder import Finder
+ from refscan.scanner import scan_outgoing_references
+ from tenacity import wait_random_exponential, retry, retry_if_exception_type
+ from toolz import merge, unique
+ from refscan.lib.helpers import get_collection_names_from_schema
+
+ from nmdc_runtime.api.models.query import UpdateStatement, DeleteStatement
+ from nmdc_runtime.mongo_util import SessionBoundDatabase
+ from nmdc_runtime.util import (
+     nmdc_schema_view,
+     collection_name_to_class_names,
+     ensure_unique_id_indexes,
+     get_nmdc_jsonschema_dict,
+     nmdc_database_collection_names,
+     get_allowed_references,
+ )
+ from pymongo import MongoClient
+ from pymongo.database import Database as MongoDatabase
+
+
+ @retry(
+     retry=retry_if_exception_type(AutoReconnect),
+     wait=wait_random_exponential(multiplier=0.5, max=60),
+ )
+ def check_mongo_ok_autoreconnect(mdb: MongoDatabase):
+     r"""
+     Check whether the application can write to the database.
+     """
+     collection = mdb.get_collection("_runtime.healthcheck")
+     collection.insert_one({"status": "ok"})
+     collection.delete_many({"status": "ok"})
+     return True
+
+
+ @lru_cache
+ def get_mongo_client() -> MongoClient:
+     r"""
+     Returns a `MongoClient` instance you can use to access the MongoDB server specified via environment variables.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
+     """
+     return MongoClient(
+         host=os.getenv("MONGO_HOST"),
+         username=os.getenv("MONGO_USERNAME"),
+         password=os.getenv("MONGO_PASSWORD"),
+         directConnection=True,
+     )
+
+
+ @lru_cache
+ def get_mongo_db() -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return mdb
+
+
+ @lru_cache
+ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return SessionBoundDatabase(mdb, session) if session is not None else mdb
+
+
+ def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
+     """
+     Returns the names of the collections that (a) exist in the database,
+     (b) are described by the schema, and (c) contain at least one document.
+
+     Note: The ampersand (`&`) is the "set intersection" operator.
+     """
+     collection_names_from_database = mdb.list_collection_names()
+     schema_view = nmdc_schema_view()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     names = set(collection_names_from_database) & set(collection_names_from_schema)
+     return {name for name in names if mdb[name].estimated_document_count() > 0}
+
+
+ @lru_cache
+ def activity_collection_names(mdb: MongoDatabase) -> Set[str]:
+     r"""
+     TODO: Document this function.
+     """
+     return get_nonempty_nmdc_schema_collection_names(mdb) - {
+         "biosample_set",
+         "study_set",
+         "data_object_set",
+         "functional_annotation_set",
+         "genome_feature_set",
+     }
+
+
+ @lru_cache
+ def get_planned_process_collection_names() -> Set[str]:
+     r"""
+     Returns the names of all collections that the schema says can contain documents
+     that represent instances of the `PlannedProcess` class or any of its subclasses.
+     """
+     schema_view = nmdc_schema_view()
+     collection_names = set()
+     planned_process_descendants = set(schema_view.class_descendants("PlannedProcess"))
+
+     for collection_name, class_names in collection_name_to_class_names.items():
+         for class_name in class_names:
+             # If the name of this class is the name of the `PlannedProcess` class
+             # or any of its subclasses, add it to the result set.
+             if class_name in planned_process_descendants:
+                 collection_names.add(collection_name)
+
+     return collection_names
+
+
+ def mongodump_excluded_collections() -> str:
+     """
+     TODO: Document this function.
+     """
+     _mdb = get_mongo_db()
+     schema_view = nmdc_schema_view()
+     collection_names_from_database = _mdb.list_collection_names()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     excluded_collections = " ".join(
+         f"--excludeCollection={c}"
+         for c in sorted(
+             set(collection_names_from_database) - set(collection_names_from_schema)
+         )
+     )
+     return excluded_collections
+
+
+ def mongorestore_collection(mdb, collection_name, bson_file_path):
+     """
+     Replaces the specified collection with one that reflects the contents of the
+     specified BSON file.
+     """
+     with gzip.open(bson_file_path, "rb") as bson_file:
+         data = bson.decode_all(bson_file.read())
+         if data:
+             mdb.drop_collection(collection_name)
+             mdb[collection_name].insert_many(data)
+             print(
+                 f"mongorestore_collection: inserted {len(data)} documents into {collection_name} after drop"
+             )
+         else:
+             print(f"mongorestore_collection: no {collection_name} documents found")
+
+
+ def mongorestore_from_dir(mdb, dump_directory, skip_collections=None):
+     """
+     Effectively runs a `mongorestore` command in pure Python.
+     Helpful in a container context that does not have the `mongorestore` command available.
+     """
+     skip_collections = skip_collections or []
+     for root, dirs, files in os.walk(dump_directory):
+         for file in files:
+             if file.endswith(".bson.gz"):
+                 collection_name = file.replace(".bson.gz", "")
+                 if collection_name in skip_collections:
+                     continue
+                 bson_file_path = os.path.join(root, file)
+                 mongorestore_collection(mdb, collection_name, bson_file_path)
+
+     print("mongorestore_from_dir completed successfully.")
+
+
+ class OverlayDBError(Exception):
+     pass
+
+
+ class OverlayDB(AbstractContextManager):
+     """Provides a context whereby a base Database is overlaid with a temporary one.
+
+     If you need to run basic simulations of updates to a base database,
+     you don't want to actually commit transactions to the base database.
+
+     For example, to insert or replace (matching on "id") many documents into a collection in order
+     to then validate the resulting total set of collection documents, an OverlayDB writes to
+     an overlay collection that "shadows" the base collection during a "find" query
+     (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
+     overlay collection, that id is marked as "seen" and will not also be returned when
+     subsequently scanning the (unmodified) base-database collection.
+
+     Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+     database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+     `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+     the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+     "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+     of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
+     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
+     documents from a base collection to the overlay, and then applying the updates to the overlay,
+     so that again, base collections are unmodified, and a "merge_find" call will produce a result
+     *as if* the base collection(s) were modified.
+
+     Mongo deletions (as the "delete" method) also copy affected documents from the base collection
+     to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
+     call will match a relevant document given a suitable filter, and will mark the document's id
+     as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
+
+     Usage:
+     ```
+     with OverlayDB(mdb) as odb:
+         # do stuff, e.g. `odb.replace_or_insert_many(...)`
+     ```
+     """
+
+     def __init__(self, mdb: MongoDatabase):
+         self._bottom_db = mdb
+         self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
+         ensure_unique_id_indexes(self._top_db)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         self._bottom_db.client.drop_database(self._top_db.name)
+
+     def replace_or_insert_many(self, coll_name, documents: list):
+         try:
+             self._top_db[coll_name].insert_many(documents)
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def apply_updates(self, coll_name, updates: list):
+         """prepare overlay db and apply updates to it."""
+         assert all(UpdateStatement(**us) for us in updates)
+         for update_spec in updates:
+             for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
+                 self._top_db[coll_name].insert_one(bottom_doc)
+         try:
+             self._top_db.command({"update": coll_name, "updates": updates})
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def delete(self, coll_name, deletes: list):
+         """ "apply" delete command by flagging docs in overlay database"""
+         assert all(DeleteStatement(**us) for us in deletes)
+         for delete_spec in deletes:
+             for bottom_doc in self._bottom_db[coll_name].find(
+                 delete_spec["q"], limit=delete_spec["limit"]
+             ):
+                 bottom_doc["_deleted"] = True
+                 self._top_db[coll_name].insert_one(bottom_doc)
+
+     def merge_find(self, coll_name, find_spec: dict):
+         """Yield docs first from overlay and then from base db, minding deletion flags."""
+         # ensure projection of "id" and "_deleted"
+         if "projection" in find_spec:
+             proj = find_spec["projection"]
+             if isinstance(proj, dict):
+                 proj = merge(proj, {"id": 1, "_deleted": 1})
+             elif isinstance(proj, list):
+                 proj = list(unique(proj + ["id", "_deleted"]))
+
+         top_docs = self._top_db[coll_name].find(**find_spec)
+         bottom_docs = self._bottom_db[coll_name].find(**find_spec)
+         top_seen_ids = set()
+         for doc in top_docs:
+             if not doc.get("_deleted"):
+                 yield doc
+             top_seen_ids.add(doc["id"])
+
+         for doc in bottom_docs:
+             if doc["id"] not in top_seen_ids:
+                 yield doc
+
+
+ def validate_json(
+     in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+ ):
+     r"""
+     Checks whether the specified dictionary represents a valid instance of the `Database` class
+     defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+     Example dictionary:
+     {
+         "biosample_set": [
+             {"id": "nmdc:bsm-00-000001", ...},
+             {"id": "nmdc:bsm-00-000002", ...}
+         ],
+         "study_set": [
+             {"id": "nmdc:sty-00-000001", ...},
+             {"id": "nmdc:sty-00-000002", ...}
+         ]
+     }
+
+     :param in_docs: The dictionary you want to validate
+     :param mdb: A reference to a MongoDB database
+     :param check_inter_document_references: Whether you want this function to check whether every document that
+                                             is referenced by any of the documents passed in would, indeed, exist
+                                             in the database, if the documents passed in were to be inserted into
+                                             the database. In other words, set this to `True` if you want this
+                                             function to perform referential integrity checks.
+     """
+     validator = Draft7Validator(get_nmdc_jsonschema_dict())
+     docs = deepcopy(in_docs)
+     validation_errors = {}
+
+     known_coll_names = set(nmdc_database_collection_names())
+     for coll_name, coll_docs in docs.items():
+         if coll_name not in known_coll_names:
+             # We expect each key in `in_docs` to be a known schema collection name. However, `@type` is a special key
+             # for JSON-LD, used for JSON serialization of e.g. LinkML objects. That is, the value of `@type` lets a
+             # client know that the JSON object (a dict in Python) should be interpreted as a
+             # <https://w3id.org/nmdc/Database>. If `@type` is present as a key, and its value indicates that
+             # `in_docs` is indeed a nmdc:Database, that's fine, and we don't want to raise an exception.
+             #
+             # prompted by: https://github.com/microbiomedata/nmdc-runtime/discussions/858
+             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
+                 continue
+             else:
+                 validation_errors[coll_name] = [
+                     f"'{coll_name}' is not a known schema collection name"
+                 ]
+                 continue
+
+         errors = list(validator.iter_errors({coll_name: coll_docs}))
+         validation_errors[coll_name] = [e.message for e in errors]
+         if coll_docs:
+             if not isinstance(coll_docs, list):
+                 validation_errors[coll_name].append("value must be a list")
+             elif not all(isinstance(d, dict) for d in coll_docs):
+                 validation_errors[coll_name].append(
+                     "all elements of list must be dicts"
+                 )
+             if not validation_errors[coll_name]:
+                 try:
+                     with OverlayDB(mdb) as odb:
+                         odb.replace_or_insert_many(coll_name, coll_docs)
+                 except OverlayDBError as e:
+                     validation_errors[coll_name].append(str(e))
+
+     if all(len(v) == 0 for v in validation_errors.values()):
+         # Second pass. Try instantiating linkml-sourced dataclass
+         in_docs.pop("@type", None)
+         try:
+             NMDCDatabase(**in_docs)
+         except Exception as e:
+             return {"result": "errors", "detail": str(e)}
+
+         # Third pass (if enabled): Check inter-document references.
+         if check_inter_document_references is True:
+             # Prepare to use `refscan`.
+             #
+             # Note: We check the inter-document references in two stages, which are:
+             #       1. For each document in the JSON payload, check whether each document it references already exists
+             #          (in the collections the schema says it can exist in) in the database. We use the
+             #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+             #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+             #          check whether that document exists (in the collections the schema says it can exist in) in the
+             #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+             #       The violations that remain after those two stages are the ones we return to the caller.
+             #
+             # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+             #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+             #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
+             #       `refscan`'s `Finder` class accepts.
+             #
+             finder = Finder(database=mdb)
+             references = get_allowed_references()
+
+             # Iterate over the collections in the JSON payload.
+             for source_collection_name, documents in in_docs.items():
+                 for document in documents:
+                     # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                     source_document = dict(document, _id=None)
+                     violations = scan_outgoing_references(
+                         document=source_document,
+                         schema_view=nmdc_schema_view(),
+                         references=references,
+                         finder=finder,
+                         source_collection_name=source_collection_name,
+                         user_wants_to_locate_misplaced_documents=False,
+                     )
+
+                     # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                     for violation in violations:
+                         can_waive_violation = False
+                         # Determine which collections can contain the referenced document, based upon
+                         # the schema class of which this source document is an instance.
+                         target_collection_names = (
+                             references.get_target_collection_names(
+                                 source_class_name=violation.source_class_name,
+                                 source_field_name=violation.source_field_name,
+                             )
+                         )
+                         # Check whether the referenced document exists in any of those collections in the JSON payload.
+                         for json_coll_name, json_coll_docs in in_docs.items():
+                             if json_coll_name in target_collection_names:
+                                 for json_coll_doc in json_coll_docs:
+                                     if json_coll_doc["id"] == violation.target_id:
+                                         can_waive_violation = True
+                                         break  # stop checking
+                             if can_waive_violation:
+                                 break  # stop checking
+                         if not can_waive_violation:
+                             violation_as_str = (
+                                 f"Document '{violation.source_document_id}' "
+                                 f"in collection '{violation.source_collection_name}' "
+                                 f"has a field '{violation.source_field_name}' that "
+                                 f"references a document having id "
+                                 f"'{violation.target_id}', but the latter document "
+                                 f"does not exist in any of the collections the "
+                                 f"NMDC Schema says it can exist in."
+                             )
+                             validation_errors[source_collection_name].append(
+                                 violation_as_str
+                             )
+
+             # If any collection's error list is not empty, return an error response.
+             if any(len(v) > 0 for v in validation_errors.values()):
+                 return {"result": "errors", "detail": validation_errors}
+
+         return {"result": "All Okay!"}
+     else:
+         return {"result": "errors", "detail": validation_errors}
nmdc_runtime/api/db/s3.py
@@ -0,0 +1,37 @@
+ from functools import lru_cache
+ import os
+
+ import boto3
+
+ API_SITE_BUCKET = os.getenv("API_SITE_ID")
+ S3_ID_NS = "do"  # Namespace for Drs Objects in Site S3-bucket store.
+
+
+ @lru_cache
+ def get_s3_client():
+     _session = boto3.session.Session()
+     return _session.client(
+         "s3",
+         region_name=os.getenv("DO_REGION_NAME"),
+         endpoint_url=os.getenv("DO_ENDPOINT_URL"),
+         aws_access_key_id=os.getenv("DO_SPACES_KEY"),
+         aws_secret_access_key=os.getenv("DO_SPACES_SECRET"),
+     )
+
+
+ def presigned_url_to_put(
+     key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
+ ):
+     return client.generate_presigned_url(
+         ClientMethod="put_object",
+         Params={"Bucket": bucket, "Key": key, "ContentType": mime_type},
+         ExpiresIn=expires_in,
+     )
+
+
+ def presigned_url_to_get(key, client=None, bucket=API_SITE_BUCKET, expires_in=300):
+     return client.generate_presigned_url(
+         ClientMethod="get_object",
+         Params={"Bucket": bucket, "Key": key},
+         ExpiresIn=expires_in,
+     )
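Note that `client` defaults to `None` in both presigned-URL helpers, so callers are expected to pass in the client returned by `get_s3_client()`. A sketch, with a hypothetical object key (any HTTP client would do for the upload; requests is shown for brevity):

    import requests

    from nmdc_runtime.api.db.s3 import S3_ID_NS, get_s3_client, presigned_url_to_put

    client = get_s3_client()  # configured via the DO_* environment variables
    url = presigned_url_to_put(
        key=f"{S3_ID_NS}/nmdc:example-object-id",  # hypothetical key, for illustration
        client=client,
        mime_type="application/json",
    )
    # The returned URL authorizes a PUT for `expires_in` seconds (default 300),
    # provided the Content-Type header matches the signed `mime_type`.
    requests.put(url, data=b"{}", headers={"Content-Type": "application/json"})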
nmdc_runtime/api/endpoints/capabilities.py
@@ -0,0 +1,25 @@
+ from typing import List
+
+ import pymongo
+ from fastapi import APIRouter, Depends
+
+ from nmdc_runtime.api.core.util import raise404_if_none
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.models.capability import Capability
+
+ router = APIRouter()
+
+
+ @router.get("/capabilities", response_model=List[Capability])
+ def list_capabilities(
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return list(mdb.capabilities.find())
+
+
+ @router.get("/capabilities/{capability_id}", response_model=Capability)
+ def get_capability(
+     capability_id: str,
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.capabilities.find_one({"id": capability_id}))
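Exercised over HTTP, the router above behaves as in this sketch. The base URL is illustrative; the actual mount point depends on how `main.py` includes the router:

    import requests

    base = "http://localhost:8000"  # illustrative deployment URL

    resp = requests.get(f"{base}/capabilities")
    resp.raise_for_status()
    for capability in resp.json():  # list of Capability documents
        print(capability["id"])

    # An unknown id propagates through raise404_if_none as an HTTP 404.
    assert requests.get(f"{base}/capabilities/nmdc:no-such-capability").status_code == 404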