nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
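Four of the new files are reproduced in full below: nmdc_runtime/api/core/util.py (+114), nmdc_runtime/api/db/mongo.py (+436), nmdc_runtime/api/db/s3.py (+37), and nmdc_runtime/api/endpoints/capabilities.py (+25).

To verify a comparison like this one yourself, a minimal sketch (assuming both wheels have already been downloaded under their standard filenames, e.g. via `pip download nmdc-runtime==1.3.1 --no-deps`) is to diff the wheels' file listings with the standard-library zipfile module:

import zipfile

# Wheels are zip archives; compare their file listings directly.
old = zipfile.ZipFile("nmdc_runtime-1.3.1-py3-none-any.whl")
new = zipfile.ZipFile("nmdc_runtime-2.12.0-py3-none-any.whl")
old_names, new_names = set(old.namelist()), set(new.namelist())
print("added:", sorted(new_names - old_names))
print("removed:", sorted(old_names - new_names))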
nmdc_runtime/api/core/util.py
@@ -0,0 +1,114 @@
+ import hashlib
+ import json
+ import os
+ import secrets
+ import string
+ from datetime import datetime, timezone, timedelta
+ from importlib import import_module
+
+ from fastapi import HTTPException, status
+ from pydantic import BaseModel
+ from toolz import keyfilter
+
+ API_SITE_ID = os.getenv("API_SITE_ID")
+ API_SITE_CLIENT_ID = os.getenv("API_SITE_CLIENT_ID")
+
+
+ def omit(blacklist, d):
+     return keyfilter(lambda k: k not in blacklist, d)
+
+
+ def pick(whitelist, d):
+     return keyfilter(lambda k: k in whitelist, d)
+
+
+ def hash_from_str(s: str, algo="sha256") -> str:
+     if algo not in hashlib.algorithms_guaranteed:
+         raise ValueError(f"desired algorithm {algo} not supported")
+     return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest()
+
+
+ def sha256hash_from_file(file_path: str, timestamp: str):
+     # https://stackoverflow.com/a/55542529
+     h = hashlib.sha256()
+
+     timestamp_bytes = timestamp.encode("utf-8")
+     h.update(timestamp_bytes)
+
+     with open(file_path, "rb") as file:
+         while True:
+             # Reading is buffered, so we can read smaller chunks.
+             chunk = file.read(h.block_size)
+             if not chunk:
+                 break
+             h.update(chunk)
+
+     return h.hexdigest()
+
+
+ def raise404_if_none(doc, detail="Not found"):
+     if doc is None:
+         raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=detail)
+     return doc
+
+
+ def now() -> datetime:
+     """Get a `datetime` representing the current time in UTC."""
+     return datetime.now(timezone.utc)
+
+
+ def now_str() -> str:
+     """Get an ISO string representing the current time in UTC."""
+     return now().isoformat()
+
+
+ def expiry_dt_from_now(days=0, hours=0, minutes=0, seconds=0):
+     return now() + timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)
+
+
+ def has_passed(dt):
+     return now() > dt
+
+
+ def import_via_dotted_path(dotted_path: str):
+     module_name, _, member_name = dotted_path.rpartition(".")
+     return getattr(import_module(module_name), member_name)
+
+
+ def dotted_path_for(member):
+     return f"{member.__module__}.{member.__name__}"
+
+
+ def generate_secret(length=12):
+     """Generate a secret.
+
+     With
+     - at least one lowercase character,
+     - at least one uppercase character, and
+     - at least three digits
+
+     """
+     if length < 8:
+         raise ValueError(f"{length=} must be >=8.")
+     alphabet = string.ascii_letters + string.digits + "!@#$%^*-_+="
+     # based on https://docs.python.org/3.8/library/secrets.html#recipes-and-best-practices
+     while True:
+         _secret = "".join(secrets.choice(alphabet) for i in range(length))
+         if (
+             any(c.islower() for c in _secret)
+             and any(c.isupper() for c in _secret)
+             and sum(c.isdigit() for c in _secret) >= 3
+         ):
+             break
+     return _secret
+
+
+ def json_clean(data, model, exclude_unset=False) -> dict:
+     """Run data through a JSON serializer for a pydantic model."""
+     if not isinstance(data, (dict, BaseModel)):
+         raise TypeError("`data` must be a pydantic model or its .model_dump()")
+     m = model(**data) if isinstance(data, dict) else data
+
+     # Note: Between Pydantic v1 and v2, the `json` method was renamed to `model_dump_json`.
+     # Reference: https://docs.pydantic.dev/2.11/migration/#changes-to-pydanticbasemodel
+     return json.loads(m.model_dump_json(exclude_unset=exclude_unset))
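A minimal sketch of how these helpers compose (hypothetical usage; the `Token` model below is illustrative and not defined anywhere in this package):

from pydantic import BaseModel

from nmdc_runtime.api.core.util import (
    expiry_dt_from_now,
    generate_secret,
    has_passed,
    json_clean,
    pick,
)


class Token(BaseModel):  # illustrative model, not part of nmdc-runtime
    secret: str
    expires_at: str


secret = generate_secret(length=16)    # mixed case, >= 3 digits, length >= 8
expires = expiry_dt_from_now(hours=1)  # timezone-aware UTC datetime
doc = json_clean({"secret": secret, "expires_at": expires.isoformat()}, Token)
assert not has_passed(expires)
print(pick(["expires_at"], doc))       # -> {'expires_at': '...'}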
nmdc_runtime/api/db/mongo.py
@@ -0,0 +1,436 @@
+ import gzip
+ import os
+ from contextlib import AbstractContextManager
+ from copy import deepcopy
+ from functools import lru_cache
+ from typing import Set
+ from uuid import uuid4
+
+ import bson
+ from nmdc_schema.nmdc import Database as NMDCDatabase
+ from pymongo.errors import AutoReconnect, OperationFailure
+ from refscan.lib.Finder import Finder
+ from refscan.scanner import scan_outgoing_references
+ from tenacity import wait_random_exponential, retry, retry_if_exception_type
+ from toolz import merge, unique
+ from refscan.lib.helpers import get_collection_names_from_schema
+
+ from nmdc_runtime.api.models.query import UpdateStatement, DeleteStatement
+ from nmdc_runtime.mongo_util import SessionBoundDatabase
+ from nmdc_runtime.util import (
+     nmdc_schema_view,
+     collection_name_to_class_names,
+     ensure_unique_id_indexes,
+     nmdc_database_collection_names,
+     get_allowed_references,
+     get_nmdc_schema_validator,
+ )
+ from pymongo import MongoClient
+ from pymongo.database import Database as MongoDatabase
+
+
+ @retry(
+     retry=retry_if_exception_type(AutoReconnect),
+     wait=wait_random_exponential(multiplier=0.5, max=60),
+ )
+ def check_mongo_ok_autoreconnect(mdb: MongoDatabase):
+     r"""
+     Check whether the application can write to the database.
+     """
+     collection = mdb.get_collection("_runtime.healthcheck")
+     collection.insert_one({"status": "ok"})
+     collection.delete_many({"status": "ok"})
+     return True
+
+
+ @lru_cache
+ def get_mongo_client() -> MongoClient:
+     r"""
+     Returns a `MongoClient` instance you can use to access the MongoDB server specified via environment variables.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/mongo_client.html#pymongo.mongo_client.MongoClient
+     """
+     return MongoClient(
+         host=os.getenv("MONGO_HOST"),
+         username=os.getenv("MONGO_USERNAME"),
+         password=os.getenv("MONGO_PASSWORD"),
+         directConnection=True,
+     )
+
+
+ @lru_cache
+ def get_mongo_db() -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return mdb
+
+
+ @lru_cache
+ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
+     r"""
+     Returns a `Database` instance you can use to access the MongoDB database specified via an environment variable.
+     Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database
+     """
+     _client = get_mongo_client()
+     mdb = _client[os.getenv("MONGO_DBNAME")]
+     check_mongo_ok_autoreconnect(mdb)
+     return SessionBoundDatabase(mdb, session) if session is not None else mdb
+
+
+ def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
+     """
+     Returns the names of the collections that (a) exist in the database,
+     (b) are described by the schema, and (c) contain at least one document.
+
+     Note: The ampersand (`&`) is the "set intersection" operator.
+     """
+     collection_names_from_database = mdb.list_collection_names()
+     schema_view = nmdc_schema_view()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     names = set(collection_names_from_database) & set(collection_names_from_schema)
+     return {name for name in names if mdb[name].estimated_document_count() > 0}
+
+
+ @lru_cache
+ def activity_collection_names(mdb: MongoDatabase) -> Set[str]:
+     r"""
+     Returns the names of the nonempty schema collections, excluding the entity
+     collections listed below (biosamples, studies, data objects, functional
+     annotations, and genome features); that is, the activity-style collections.
+     """
+     return get_nonempty_nmdc_schema_collection_names(mdb) - {
+         "biosample_set",
+         "study_set",
+         "data_object_set",
+         "functional_annotation_set",
+         "genome_feature_set",
+     }
+
+
+ @lru_cache
+ def get_planned_process_collection_names() -> Set[str]:
+     r"""
+     Returns the names of all collections that the schema says can contain documents
+     that represent instances of the `PlannedProcess` class or any of its subclasses.
+     """
+     schema_view = nmdc_schema_view()
+     collection_names = set()
+     planned_process_descendants = set(schema_view.class_descendants("PlannedProcess"))
+
+     for collection_name, class_names in collection_name_to_class_names.items():
+         for class_name in class_names:
+             # If the name of this class is the name of the `PlannedProcess` class
+             # or any of its subclasses, add it to the result set.
+             if class_name in planned_process_descendants:
+                 collection_names.add(collection_name)
+
+     return collection_names
+
+
+ def mongodump_excluded_collections() -> str:
+     """
+     Returns a space-separated string of `--excludeCollection=<name>` flags, one for
+     each collection that exists in the database but is not described by the NMDC
+     schema, suitable for appending to a `mongodump` command.
+     """
+     _mdb = get_mongo_db()
+     schema_view = nmdc_schema_view()
+     collection_names_from_database = _mdb.list_collection_names()
+     collection_names_from_schema = get_collection_names_from_schema(schema_view)
+     excluded_collections = " ".join(
+         f"--excludeCollection={c}"
+         for c in sorted(
+             set(collection_names_from_database) - set(collection_names_from_schema)
+         )
+     )
+     return excluded_collections
+
+
+ def mongorestore_collection(mdb, collection_name, bson_file_path):
+     """
+     Replaces the specified collection with one that reflects the contents of the
+     specified BSON file.
+     """
+     with gzip.open(bson_file_path, "rb") as bson_file:
+         data = bson.decode_all(bson_file.read())
+         if data:
+             mdb.drop_collection(collection_name)
+             mdb[collection_name].insert_many(data)
+             print(
+                 f"mongorestore_collection: inserted {len(data)} documents into {collection_name} after drop"
+             )
+         else:
+             print(f"mongorestore_collection: no {collection_name} documents found")
+
+
+ def mongorestore_from_dir(mdb, dump_directory, skip_collections=None):
+     """
+     Effectively runs a `mongorestore` command in pure Python.
+     Helpful in a container context that does not have the `mongorestore` command available.
+     """
+     skip_collections = skip_collections or []
+     for root, dirs, files in os.walk(dump_directory):
+         for file in files:
+             if file.endswith(".bson.gz"):
+                 collection_name = file.replace(".bson.gz", "")
+                 if collection_name in skip_collections:
+                     continue
+                 bson_file_path = os.path.join(root, file)
+                 mongorestore_collection(mdb, collection_name, bson_file_path)
+
+     print("mongorestore_from_dir completed successfully.")
+
+
+ class OverlayDBError(Exception):
+     pass
+
+
+ class OverlayDB(AbstractContextManager):
+     """Provides a context whereby a base Database is overlaid with a temporary one.
+
+     If you need to run basic simulations of updates to a base database,
+     you don't want to actually commit transactions to the base database.
+
+     For example, to insert or replace (matching on "id") many documents into a collection in order
+     to then validate the resulting total set of collection documents, an OverlayDB writes to
+     an overlay collection that "shadows" the base collection during a "find" query
+     (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
+     overlay collection, that id is marked as "seen" and will not also be returned when
+     subsequently scanning the (unmodified) base-database collection.
+
+     Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+     database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+     `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+     the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+     "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+     of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
+     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
+     documents from a base collection to the overlay, and then applying the updates to the overlay,
+     so that again, base collections are unmodified, and a "merge_find" call will produce a result
+     *as if* the base collection(s) were modified.
+
+     Mongo deletions (as the "delete" method) also copy affected documents from the base collection
+     to the overlay collection, and flag them using the "_deleted" field. In this way, a `merge_find`
+     call will match a relevant document given a suitable filter, and will mark the document's id
+     as "seen" *without* returning the document. Thus, the result is as if the document were deleted.
+
+     Usage:
+     ```
+     with OverlayDB(mdb) as odb:
+         # do stuff, e.g. `odb.replace_or_insert_many(...)`
+     ```
+     """
+
+     def __init__(self, mdb: MongoDatabase):
+         self._bottom_db = mdb
+         self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
+         ensure_unique_id_indexes(self._top_db)
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         self._bottom_db.client.drop_database(self._top_db.name)
+
+     def replace_or_insert_many(self, coll_name, documents: list):
+         try:
+             self._top_db[coll_name].insert_many(documents)
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def apply_updates(self, coll_name, updates: list):
+         """prepare overlay db and apply updates to it."""
+         assert all(UpdateStatement(**us) for us in updates)
+         for update_spec in updates:
+             for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
+                 self._top_db[coll_name].insert_one(bottom_doc)
+         try:
+             self._top_db.command({"update": coll_name, "updates": updates})
+         except OperationFailure as e:
+             raise OverlayDBError(str(e.details))
+
+     def delete(self, coll_name, deletes: list):
+         """ "apply" delete command by flagging docs in overlay database"""
+         assert all(DeleteStatement(**us) for us in deletes)
+         for delete_spec in deletes:
+             for bottom_doc in self._bottom_db[coll_name].find(
+                 delete_spec["q"], limit=delete_spec["limit"]
+             ):
+                 bottom_doc["_deleted"] = True
+                 self._top_db[coll_name].insert_one(bottom_doc)
+
+     def merge_find(self, coll_name, find_spec: dict):
+         """Yield docs first from overlay and then from base db, minding deletion flags."""
+         # ensure projection of "id" and "_deleted"
+         if "projection" in find_spec:
+             proj = find_spec["projection"]
+             if isinstance(proj, dict):
+                 proj = merge(proj, {"id": 1, "_deleted": 1})
+             elif isinstance(proj, list):
+                 proj = list(unique(proj + ["id", "_deleted"]))
+             # write the augmented projection back so the `find` calls below use it
+             find_spec["projection"] = proj
+
+         top_docs = self._top_db[coll_name].find(**find_spec)
+         bottom_docs = self._bottom_db[coll_name].find(**find_spec)
+         top_seen_ids = set()
+         for doc in top_docs:
+             if not doc.get("_deleted"):
+                 yield doc
+             top_seen_ids.add(doc["id"])
+
+         for doc in bottom_docs:
+             if doc["id"] not in top_seen_ids:
+                 yield doc
+
+
+ def validate_json(
+     in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+ ):
+     r"""
+     Checks whether the specified dictionary represents a valid instance of the `Database` class
+     defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+     Example dictionary:
+     {
+         "biosample_set": [
+             {"id": "nmdc:bsm-00-000001", ...},
+             {"id": "nmdc:bsm-00-000002", ...}
+         ],
+         "study_set": [
+             {"id": "nmdc:sty-00-000001", ...},
+             {"id": "nmdc:sty-00-000002", ...}
+         ]
+     }
+
+     :param in_docs: The dictionary you want to validate
+     :param mdb: A reference to a MongoDB database
+     :param check_inter_document_references: Whether you want this function to check whether every document that
+                                             is referenced by any of the documents passed in would, indeed, exist
+                                             in the database, if the documents passed in were to be inserted into
+                                             the database. In other words, set this to `True` if you want this
+                                             function to perform referential integrity checks.
+     """
+     validator = get_nmdc_schema_validator()
+     docs = deepcopy(in_docs)
+     validation_errors = {}
+
+     known_coll_names = set(nmdc_database_collection_names())
+     for coll_name, coll_docs in docs.items():
+         if coll_name not in known_coll_names:
+             # We expect each key in `in_docs` to be a known schema collection name. However, `@type` is a special key
+             # for JSON-LD, used for JSON serialization of e.g. LinkML objects. That is, the value of `@type` lets a
+             # client know that the JSON object (a dict in Python) should be interpreted as a
+             # <https://w3id.org/nmdc/Database>. If `@type` is present as a key, and its value indicates that
+             # `in_docs` is indeed a nmdc:Database, that's fine, and we don't want to raise an exception.
+             #
+             # prompted by: https://github.com/microbiomedata/nmdc-runtime/discussions/858
+             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
+                 continue
+             else:
+                 validation_errors[coll_name] = [
+                     f"'{coll_name}' is not a known schema collection name"
+                 ]
+                 continue
+
+         errors = list(
+             validator.iter_results({coll_name: coll_docs}, target_class="Database")
+         )
+         validation_errors[coll_name] = [e.message for e in errors]
+         if coll_docs:
+             if not isinstance(coll_docs, list):
+                 validation_errors[coll_name].append("value must be a list")
+             elif not all(isinstance(d, dict) for d in coll_docs):
+                 validation_errors[coll_name].append(
+                     "all elements of list must be dicts"
+                 )
+             if not validation_errors[coll_name]:
+                 try:
+                     with OverlayDB(mdb) as odb:
+                         odb.replace_or_insert_many(coll_name, coll_docs)
+                 except OverlayDBError as e:
+                     validation_errors[coll_name].append(str(e))
+
+     if all(len(v) == 0 for v in validation_errors.values()):
+         # Second pass. Try instantiating linkml-sourced dataclass
+         in_docs.pop("@type", None)
+         try:
+             NMDCDatabase(**in_docs)
+         except Exception as e:
+             return {"result": "errors", "detail": str(e)}
+
+         # Third pass (if enabled): Check inter-document references.
+         if check_inter_document_references is True:
+             # Prepare to use `refscan`.
+             #
+             # Note: We check the inter-document references in two stages, which are:
+             #       1. For each document in the JSON payload, check whether each document it references already exists
+             #          (in the collections the schema says it can exist in) in the database. We use the
+             #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+             #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+             #          check whether that document exists (in the collections the schema says it can exist in) in the
+             #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+             #       The violations that remain after those two stages are the ones we return to the caller.
+             #
+             # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+             #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+             #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
+             #       `refscan`'s `Finder` class accepts.
+             #
+             finder = Finder(database=mdb)
+             references = get_allowed_references()
+
+             # Iterate over the collections in the JSON payload.
+             for source_collection_name, documents in in_docs.items():
+                 for document in documents:
+                     # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                     source_document = dict(document, _id=None)
+                     violations = scan_outgoing_references(
+                         document=source_document,
+                         schema_view=nmdc_schema_view(),
+                         references=references,
+                         finder=finder,
+                         source_collection_name=source_collection_name,
+                         user_wants_to_locate_misplaced_documents=False,
+                     )
+
+                     # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                     for violation in violations:
+                         can_waive_violation = False
+                         # Determine which collections can contain the referenced document, based upon
+                         # the schema class of which this source document is an instance.
+                         target_collection_names = (
+                             references.get_target_collection_names(
+                                 source_class_name=violation.source_class_name,
+                                 source_field_name=violation.source_field_name,
+                             )
+                         )
+                         # Check whether the referenced document exists in any of those collections in the JSON payload.
+                         for json_coll_name, json_coll_docs in in_docs.items():
+                             if json_coll_name in target_collection_names:
+                                 for json_coll_doc in json_coll_docs:
+                                     if json_coll_doc["id"] == violation.target_id:
+                                         can_waive_violation = True
+                                         break  # stop checking
+                             if can_waive_violation:
+                                 break  # stop checking
+                         if not can_waive_violation:
+                             violation_as_str = (
+                                 f"Document '{violation.source_document_id}' "
+                                 f"in collection '{violation.source_collection_name}' "
+                                 f"has a field '{violation.source_field_name}' that "
+                                 f"references a document having id "
+                                 f"'{violation.target_id}', but the latter document "
+                                 f"does not exist in any of the collections the "
+                                 f"NMDC Schema says it can exist in."
+                             )
+                             validation_errors[source_collection_name].append(
+                                 violation_as_str
+                             )
+
+         # If any collection's error list is not empty, return an error response.
+         if any(len(v) > 0 for v in validation_errors.values()):
+             return {"result": "errors", "detail": validation_errors}
+
+         return {"result": "All Okay!"}
+     else:
+         return {"result": "errors", "detail": validation_errors}
nmdc_runtime/api/db/s3.py
@@ -0,0 +1,37 @@
+ from functools import lru_cache
+ import os
+
+ import boto3
+
+ API_SITE_BUCKET = os.getenv("API_SITE_ID")
+ S3_ID_NS = "do"  # Namespace for DRS objects in site S3-bucket store.
+
+
+ @lru_cache
+ def get_s3_client():
+     _session = boto3.session.Session()
+     return _session.client(
+         "s3",
+         region_name=os.getenv("DO_REGION_NAME"),
+         endpoint_url=os.getenv("DO_ENDPOINT_URL"),
+         aws_access_key_id=os.getenv("DO_SPACES_KEY"),
+         aws_secret_access_key=os.getenv("DO_SPACES_SECRET"),
+     )
+
+
+ def presigned_url_to_put(
+     key, client=None, mime_type=None, bucket=API_SITE_BUCKET, expires_in=300
+ ):
+     client = client or get_s3_client()  # guard against the `None` default
+     return client.generate_presigned_url(
+         ClientMethod="put_object",
+         Params={"Bucket": bucket, "Key": key, "ContentType": mime_type},
+         ExpiresIn=expires_in,
+     )
+
+
+ def presigned_url_to_get(key, client=None, bucket=API_SITE_BUCKET, expires_in=300):
+     client = client or get_s3_client()  # guard against the `None` default
+     return client.generate_presigned_url(
+         ClientMethod="get_object",
+         Params={"Bucket": bucket, "Key": key},
+         ExpiresIn=expires_in,
+     )
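A sketch of the intended call pattern (the object key is hypothetical, and `requests` is an assumed HTTP client, not a dependency of this module). Note that the upload's Content-Type must match the one the URL was signed with:

import requests  # any HTTP client works; requests is an assumption here

from nmdc_runtime.api.db.s3 import S3_ID_NS, get_s3_client, presigned_url_to_put

key = f"{S3_ID_NS}/example-object-id"  # hypothetical DRS object key
url = presigned_url_to_put(key, client=get_s3_client(), mime_type="application/json")

# The signed URL is time-limited (expires_in=300 seconds by default).
resp = requests.put(url, data=b"{}", headers={"Content-Type": "application/json"})
resp.raise_for_status()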
nmdc_runtime/api/endpoints/capabilities.py
@@ -0,0 +1,25 @@
+ from typing import List
+
+ import pymongo
+ from fastapi import APIRouter, Depends
+
+ from nmdc_runtime.api.core.util import raise404_if_none
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.models.capability import Capability
+
+ router = APIRouter()
+
+
+ @router.get("/capabilities", response_model=List[Capability])
+ def list_capabilities(
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return list(mdb.capabilities.find())
+
+
+ @router.get("/capabilities/{capability_id}", response_model=Capability)
+ def get_capability(
+     capability_id: str,
+     mdb: pymongo.database.Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.capabilities.find_one({"id": capability_id}))
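And a sketch of exercising this router in isolation with FastAPI's TestClient, overriding the Mongo dependency. Here mongomock is an assumed test dependency (not declared by this package), and the inserted document assumes the `Capability` model accepts `id` and `name` fields:

import mongomock  # assumed test dependency, not part of nmdc-runtime
from fastapi import FastAPI
from fastapi.testclient import TestClient

from nmdc_runtime.api.db.mongo import get_mongo_db
from nmdc_runtime.api.endpoints.capabilities import router

app = FastAPI()
app.include_router(router)

mdb = mongomock.MongoClient()["test"]
mdb.capabilities.insert_one({"id": "cap-1", "name": "example"})
app.dependency_overrides[get_mongo_db] = lambda: mdb

client = TestClient(app)
assert client.get("/capabilities/cap-1").status_code == 200
assert client.get("/capabilities/does-not-exist").status_code == 404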