nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131) hide show
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,701 @@
1
+ import json
2
+ import logging
3
+ from typing import Annotated
4
+
5
+ import bson.json_util
6
+ from fastapi import APIRouter, Depends, Query, status, HTTPException
7
+ from pymongo.database import Database as MongoDatabase
8
+ from toolz import assoc_in
9
+ from refscan.lib.Finder import Finder
10
+ from refscan.scanner import identify_referring_documents
11
+
12
+ from nmdc_runtime.api.core.util import now
13
+ from nmdc_runtime.api.db.mongo import (
14
+ get_mongo_db,
15
+ get_nonempty_nmdc_schema_collection_names,
16
+ OverlayDB,
17
+ validate_json,
18
+ )
19
+ from nmdc_runtime.api.endpoints.lib.helpers import simulate_updates_and_check_references
20
+ from nmdc_runtime.api.endpoints.util import (
21
+ check_action_permitted,
22
+ strip_oid,
23
+ )
24
+ import nmdc_runtime.api.models.query_continuation as qc
25
+ from nmdc_runtime.api.models.query import (
26
+ DeleteCommand,
27
+ DeleteCommandResponse,
28
+ CommandResponse,
29
+ command_response_for,
30
+ QueryCmd,
31
+ UpdateCommand,
32
+ UpdateCommandResponse,
33
+ AggregateCommand,
34
+ FindCommand,
35
+ GetMoreCommand,
36
+ CommandResponseOptions,
37
+ Cmd,
38
+ CursorYieldingCommandResponse,
39
+ CursorYieldingCommand,
40
+ DeleteSpecs,
41
+ UpdateSpecs,
42
+ )
43
+ from nmdc_runtime.api.models.lib.helpers import derive_delete_specs, derive_update_specs
44
+ from nmdc_runtime.api.models.user import get_current_active_user, User
45
+ from nmdc_runtime.util import (
46
+ get_allowed_references,
47
+ nmdc_schema_view,
48
+ )
49
+
50
+ router = APIRouter()
51
+
52
+
53
def check_can_update_and_delete(user: User):
    """
    Ensure the given user is allowed to issue "update" and "delete" commands.

    Both kinds of command are gated by the same permission: the one
    registered for running a `DeleteCommand` via `/queries:run`.

    :param user: The user whose `username` is checked.
    :raises HTTPException: HTTP 403 when the user lacks that permission.
    """
    # update and delete queries require same level of permissions
    is_permitted = check_action_permitted(
        user.username, "/queries:run(query_cmd:DeleteCommand)"
    )
    if is_permitted:
        return

    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Only specific users are allowed to issue update and delete commands.",
    )
62
+
63
+
64
def check_can_aggregate(user: User):
    """
    Ensure the given user is allowed to issue "aggregate" commands.

    :param user: The user whose `username` is checked.
    :raises HTTPException: HTTP 403 when the user lacks the permission
                           registered for running an `AggregateCommand`
                           via `/queries:run`.
    """
    is_permitted = check_action_permitted(
        user.username, "/queries:run(query_cmd:AggregateCommand)"
    )
    if is_permitted:
        return

    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Only specific users are allowed to issue aggregate commands.",
    )
72
+
73
+
74
+ # Note: We set `response_model_exclude_unset=True` so that all the properties of the `CommandResponseOptions` object
75
+ # that we don't explicitly assign values to while handling the HTTP request, are omitted from the HTTP response.
76
+ # Reference: https://fastapi.tiangolo.com/tutorial/response-model/#use-the-response_model_exclude_unset-parameter
77
@router.post(
    "/queries:run",
    response_model=CommandResponseOptions,
    response_model_exclude_unset=True,
)
def run_query(
    cmd: Cmd,
    mdb: MongoDatabase = Depends(get_mongo_db),
    user: User = Depends(get_current_active_user),
    allow_broken_refs: Annotated[
        bool,
        Query(
            description="When `true`, the server will allow operations that leave behind broken references."
        ),
    ] = False,
):
    r"""
    Performs `find`, `aggregate`, `update`, `delete`, and `getMore` commands for users that have adequate permissions.

    For `find` and `aggregate` commands, the requested items will be in `cursor.batch`.
    When the response includes a non-null `cursor.id`, there _may_ be more items available.
    To retrieve the next batch of items, submit a request with `getMore` set to that non-null `cursor.id`.
    When the response includes a null `cursor.id`, there are no more items available.

    **Example request bodies:**

    Get all\* biosamples.
    ```
    {
      "find": "biosample_set",
      "filter": {}
    }
    ```

    Get all\* biosamples associated with a given study.
    ```
    {
      "find": "biosample_set",
      "filter": {"associated_studies": "nmdc:sty-11-34xj1150"}
    }
    ```

    \*<small>Up to 101, which is the default "batchSize" for the "find" command.</small>

    Get the first 200 biosamples associated with a given study.
    ```
    {
      "find": "biosample_set",
      "filter": {"associated_studies": "nmdc:sty-11-34xj1150"},
      "batchSize": 200
    }
    ```

    Delete the first biosample having a given `id`.
    ```
    {
      "delete": "biosample_set",
      "deletes": [{"q": {"id": "A_BIOSAMPLE_ID"}, "limit": 1}]
    }
    ```

    Rename the first\* embargoed biosample.
    ```
    {
      "update": "biosample_set",
      "updates": [{"q": {"embargoed": true}, "u": {"$set": {"name": "A_NEW_NAME"}}}]
    }
    ```

    \*<small>Updates at most 1 matching document, since `"multi": true` is not present.</small>

    Rename all\* embargoed biosamples.
    ```
    {
      "update": "biosample_set",
      "updates": [{"q": {"embargoed": true}, "u": {"$set": {"name": "A_NEW_NAME"}}, "multi": true}]
    }
    ```

    \*<small>Updates all matching documents, since `"multi": true` is present.</small>

    Get all\* biosamples, sorted by the number of studies associated with them (greatest to least).
    ```
    {
      "aggregate": "biosample_set",
      "pipeline": [{"$sortByCount": "$associated_studies"}]
    }
    ```

    \*<small>Up to 25, which is the default "batchSize" for the "aggregate" command.</small>

    Get the first 10 biosamples having the largest numbers of studies associated with them,
    sorted by that number of studies (greatest to least).
    ```
    {
      "aggregate": "biosample_set",
      "pipeline": [{"$sortByCount": "$associated_studies"}],
      "cursor": {"batchSize": 10}
    }
    ```

    Get a specific study and all the biosamples associated with that study.
    ```
    {
      "aggregate": "study_set",
      "pipeline": [
        {
          "$match": {
            "id": "nmdc:sty-11-8fb6t785"
          }
        },
        {
          "$lookup": {
            "from": "biosample_set",
            "localField": "id",
            "foreignField": "associated_studies",
            "as": "biosamples_of_study"
          }
        }
      ]
    }
    ```

    Use the `cursor.id` from a previous response to get the next batch of results,
    whether that batch is empty or non-empty.
    ```
    {
      "getMore": "somecursorid"
    }
    ```

    **Limitations:**

    1. The maximum size of the response payload is 16 MB. You can use the "batchSize" property
       (for "find" commands) or the "cursor.batchSize" property (for "aggregate" commands)—along
       with some trial and error—to ensure the response payload size remains under that limit.
    2. When using an "aggregate" command, if any of the objects output by the pipeline lacks an
       "_id" field, the endpoint will return only the first batch of objects and will not offer
       pagination (i.e. "cursor.id" will be null).
    3. Manipulating the values of "_id" fields within an aggregation pipeline (e.g. via "$set")
       can result in pagination not working. If this impacts your use case, please contact us.
    """
    # Additional notes for developers:
    # --------------------------------
    # Note: Because this section isn't in the docstring, it isn't visible on Swagger UI.
    #
    # 1. The sorting part of the pagination algorithm is based upon the assumption
    #    that the `_id` values of the documents are "deterministically sortable."
    #    However, an aggregation pipeline can be used to populate the `_id` field
    #    with values that are _not_ "deterministically sortable." For example,
    #    the final stage of the pipeline could be: `{ "$set": { "_id": "potato" } }`
    #    References:
    #    - https://www.unicode.org/notes/tn9/
    #    - https://www.mongodb.com/docs/manual/reference/operator/aggregation/set/

    # If the command is one that requires the user to have specific permissions, check for those permissions now.
    # Note: The permission-checking function will raise an exception if the user lacks those permissions.
    if isinstance(cmd, (DeleteCommand, UpdateCommand)):
        check_can_update_and_delete(user)

    # check if the user has permission to run aggregate commands
    if isinstance(cmd, AggregateCommand):
        check_can_aggregate(user)

    # Forward the injected database handle so that the command runs against the
    # database FastAPI resolved for this request (which honors any dependency
    # override of `get_mongo_db`), rather than the helper's import-time default.
    cmd_response = _run_mdb_cmd(cmd, mdb=mdb, allow_broken_refs=allow_broken_refs)
    return cmd_response
246
+
247
+
248
# Module-level database handle, obtained once when this module is imported.
# It serves as the default value of the `mdb` parameter of `_run_mdb_cmd` and `_run_delete_nonschema`.
+ _mdb = get_mongo_db()
249
+
250
+
251
def _back_up_documents(
    mdb: MongoDatabase,
    collection_name: str,
    specs,
    backup_db_name: str,
    timestamp_field: str,
    ran_at,
    failure_detail: str,
) -> None:
    r"""
    Copies the documents matched by each spec into a backup database.

    :param mdb: Database containing the collection whose documents get backed up.
    :param collection_name: Name of that collection; the backup collection has the same name.
    :param specs: Iterable of dicts, each of which is passed as keyword arguments to `find`.
    :param backup_db_name: Name of the database (on the same client) that receives the copies.
    :param timestamp_field: Name of the field in which `ran_at` is stored alongside each copy.
    :param ran_at: Timestamp stored with each copy.
    :param failure_detail: Detail of the HTTP 500 raised when not every copy was inserted.
    """
    for spec in specs:
        docs = list(mdb[collection_name].find(**spec))
        if not docs:
            continue
        insert_many_result = mdb.client[backup_db_name][collection_name].insert_many(
            {"doc": d, timestamp_field: ran_at} for d in docs
        )
        if len(insert_many_result.inserted_ids) != len(docs):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=failure_detail,
            )


def _check_delete_for_broken_references(
    mdb: MongoDatabase,
    collection_name: str,
    delete_specs: DeleteSpecs,
    allow_broken_refs: bool,
) -> None:
    r"""
    Checks whether deleting the targeted documents would leave behind broken references.

    When a would-be-broken reference is found, this function logs a warning (if
    `allow_broken_refs` is true) or raises an HTTP 422 for the first one found (otherwise).

    TODO: Consider accounting for the "limit" property of the delete specs.
          Currently, we ignore it and—instead—perform the validation as though
          the user wants to delete _all_ matching documents.

    TODO: Account for the fact that this validation step and the actual deletion
          step do not occur within a transaction; so, the database may change
          between the two events (i.e. there's a race condition).
    """
    target_document_descriptors = list(
        mdb[collection_name].find(
            filter={"$or": [spec["filter"] for spec in delete_specs]},
            projection={"_id": 1, "id": 1, "type": 1},
        )
    )

    # Collect the `_id` values of the target documents so that we can tell whether a given
    # _referring_ document is itself among the documents the user wants to delete.
    target_document_object_ids = {tdd["_id"] for tdd in target_document_descriptors}

    finder = Finder(database=mdb)
    for target_document_descriptor in target_document_descriptors:
        # A document that lacks an "id" field cannot be referenced by other documents
        # (since they would have to _use_ that "id" value to do so); skip it.
        if "id" not in target_document_descriptor:
            continue

        referring_document_descriptors = identify_referring_documents(
            document=target_document_descriptor,  # expects at least "id" and "type"
            schema_view=nmdc_schema_view(),
            references=get_allowed_references(),
            finder=finder,
        )
        for referring_document_descriptor in referring_document_descriptors:
            # A referring document that will, itself, be deleted does not count.
            if (
                referring_document_descriptor["source_document_object_id"]
                in target_document_object_ids
            ):
                continue

            source_document_id = referring_document_descriptor["source_document_id"]
            source_collection_name = referring_document_descriptor[
                "source_collection_name"
            ]
            target_document_id = target_document_descriptor["id"]
            if allow_broken_refs:
                logging.warning(
                    f"The document having 'id'='{target_document_id}' in "
                    f"the collection '{collection_name}' is referenced by "
                    f"the document having 'id'='{source_document_id}' in "
                    f"the collection '{source_collection_name}'. "
                    f"Deleting the former will leave behind a broken reference."
                )
            else:
                # TODO: Consider reporting _all_ would-be-broken references instead of
                #       only the _first_ one we encounter. That would make the response
                #       more informative to the user in cases where there are multiple
                #       such references; but it would also take longer to compute and
                #       would increase the response size (consider the case where the
                #       user-specified filter matches many, many documents).
                raise HTTPException(
                    status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                    detail=(
                        f"The operation was not performed, because performing it would "
                        f"have left behind one or more broken references. For example: "
                        f"The document having 'id'='{target_document_id}' in "
                        f"the collection '{collection_name}' is referenced by "
                        f"the document having 'id'='{source_document_id}' in "
                        f"the collection '{source_collection_name}'. "
                        f"Deleting the former would leave behind a broken reference. "
                        f"Update or delete referring document(s) and try again."
                    ),
                )


def _prepare_delete(
    cmd: DeleteCommand, mdb: MongoDatabase, allow_broken_refs: bool, ran_at
) -> None:
    r"""
    Validates a "delete" command and backs up the documents it targets.

    Raises an HTTP 422 if the collection is empty or not described by the NMDC schema,
    or if the deletion would break references (unless `allow_broken_refs` is true).
    """
    collection_name = cmd.delete
    if collection_name not in get_nonempty_nmdc_schema_collection_names(mdb):
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=(
                "Can only delete documents from collections that are "
                "not empty and are described by the NMDC schema."
            ),
        )
    delete_specs: DeleteSpecs = derive_delete_specs(delete_command=cmd)

    _check_delete_for_broken_references(
        mdb, collection_name, delete_specs, allow_broken_refs
    )
    _back_up_documents(
        mdb,
        collection_name,
        delete_specs,
        backup_db_name="nmdc_deleted",
        timestamp_field="deleted_at",
        ran_at=ran_at,
        failure_detail="Failed to back up to-be-deleted documents. operation aborted.",
    )


def _prepare_update(
    cmd: UpdateCommand, mdb: MongoDatabase, allow_broken_refs: bool, ran_at
) -> None:
    r"""
    Validates an "update" command and backs up the documents it targets.

    Raises an HTTP 422 if the collection is empty or not described by the NMDC schema,
    if the updated documents would fail validation, or if the update would break
    references (unless `allow_broken_refs` is true).
    """
    collection_name = cmd.update
    if collection_name not in get_nonempty_nmdc_schema_collection_names(mdb):
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=(
                "Can only update documents in collections that are "
                "not empty and are described by the NMDC schema."
            ),
        )
    update_specs: UpdateSpecs = derive_update_specs(update_command=cmd)

    # Execute this "update" command on a temporary "overlay" database so we can
    # validate its outcome before executing it on the real database. If its outcome
    # is invalid, we will abort and raise an "HTTP 422" exception.
    #
    # TODO: Consider wrapping this entire "preview-then-apply" sequence within a
    #       MongoDB transaction so as to avoid race conditions where the overlay
    #       database at "preview" time does not reflect the state of the database
    #       at "apply" time. This will be necessary once the "preview" step
    #       accounts for referential integrity.
    #
    with OverlayDB(mdb) as odb:
        odb.apply_updates(
            collection_name,
            [u.model_dump(mode="json", exclude="hint") for u in cmd.updates],
        )
        _ids_to_check = set()
        for spec in update_specs:
            for doc in mdb[collection_name].find(
                filter=spec["filter"],
                limit=spec["limit"],
                projection={
                    "_id": 1
                },  # unique `id` not guaranteed (see e.g. `functional_annotation_agg`)
            ):
                _ids_to_check.add(doc["_id"])
        docs_to_check = odb._top_db[collection_name].find(
            {"_id": {"$in": list(_ids_to_check)}}
        )
        rv = validate_json(
            {collection_name: [strip_oid(d) for d in docs_to_check]}, mdb
        )
        if rv["result"] == "errors":
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=f"Schema document(s) would be invalid after proposed update: {rv['detail']}",
            )

    # Perform referential integrity checking.
    #
    # TODO: As usual with this endpoint, the operation is susceptible to a race
    #       condition, wherein the database gets modified between this validation
    #       step and the eventual "apply" step.
    #
    violation_messages = simulate_updates_and_check_references(db=mdb, update_cmd=cmd)
    if len(violation_messages) > 0:
        detail = (
            "The operation was not performed, because performing it would "
            "have left behind one or more broken references. Details: "
            f"{', '.join(violation_messages)}"
        )
        if allow_broken_refs:
            logging.warning(detail)
        else:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=detail,
            )

    _back_up_documents(
        mdb,
        collection_name,
        update_specs,
        backup_db_name="nmdc_updated",
        timestamp_field="updated_at",
        ran_at=ran_at,
        failure_detail="Failed to back up to-be-updated documents. operation aborted.",
    )


def _command_for_get_more(cursor_id):
    r"""
    Builds the "find" or "aggregate" command that is equivalent to a "getMore" request.

    The query continuation identified by `cursor_id` is used to look up the originating
    command, which is then restricted to documents whose `_id` is greater than the `_id`
    of the last document recorded for that continuation.

    :param cursor_id: The `getMore` value submitted by the client.
    :return: A `FindCommand` or an `AggregateCommand`.
    :raises HTTPException: HTTP 422 when the originating command is neither of those kinds.
    """
    query_continuation = qc.get_qc_by__id(cursor_id)
    initial_cmd_doc: dict = qc.get_initial_query_for_qc(query_continuation).model_dump(
        exclude_unset=True
    )
    if "find" in initial_cmd_doc:
        modified_cmd_doc = assoc_in(
            initial_cmd_doc,
            ["filter", "_id", "$gt"],
            qc.get_last_doc__id_for_qc(query_continuation),
        )
        return FindCommand(**modified_cmd_doc)

    if "aggregate" in initial_cmd_doc:
        # Add the resume stage exactly once. (It used to be added twice: once via an
        # in-place `append` and once more via list concatenation.)
        resume_stage = {
            "$match": {"_id": {"$gt": qc.get_last_doc__id_for_qc(query_continuation)}}
        }
        modified_cmd_doc = assoc_in(
            initial_cmd_doc,
            ["pipeline"],
            initial_cmd_doc["pipeline"] + [resume_stage],
        )
        return AggregateCommand(**modified_cmd_doc)

    raise HTTPException(
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        detail="The specified 'getMore' value resolved to an invalid command.",
    )


def _run_mdb_cmd(
    cmd: Cmd, mdb: MongoDatabase = _mdb, allow_broken_refs: bool = False
) -> CommandResponse:
    r"""
    Validates, prepares, and runs a command against the database, then adapts the response.

    Per command kind, the preparation consists of:
    - "delete"/"update": validating the command (see `allow_broken_refs`) and backing up the
      targeted documents to the "nmdc_deleted"/"nmdc_updated" database, respectively.
    - "aggregate": appending a `$sort` by `_id` stage and enabling `allowDiskUse`.
    - "find": adding `_id` to the sort specification.
    - "getMore": replacing the command with the equivalent "find"/"aggregate" command that
      resumes after the last document of the referenced query continuation.

    For cursor-yielding commands, the raw `cursor.firstBatch`/`cursor.nextBatch` is exposed
    as `cursor.batch`, and `cursor.id` is replaced with either `None` (no more pagination)
    or the `id` of a query continuation.

    TODO: How does this function behave when the "batchSize" is invalid (e.g. 0, negative, non-numeric)?

    :param cmd: The command to run. Note: "aggregate" and "find" commands are modified in place.
    :param mdb: The database against which to run the command.
    :param allow_broken_refs: Under normal circumstances, if this function determines that performing
                              the specified command would leave behind broken references, this function
                              will reject the command (i.e. raise an HTTP 422). In contrast, when the
                              `allow_broken_refs` parameter is set to `true`, this function will not
                              reject the command for that reason (however, it may reject the command
                              for other reasons).
    :return: The command response. When the response is not "ok", it is returned as is.
    :raises HTTPException: HTTP 422 for rejected commands and for write errors; HTTP 500 when
                           a backup is incomplete; HTTP 418 when an "update" or "delete"
                           command modified zero documents.
    """
    ran_at = now()
    cursor_id = cmd.getMore if isinstance(cmd, GetMoreCommand) else None
    logging.info(f"Command type: {type(cmd).__name__}")
    logging.info(f"Cursor ID: {cursor_id}")

    if isinstance(cmd, DeleteCommand):
        _prepare_delete(cmd, mdb, allow_broken_refs, ran_at)
    elif isinstance(cmd, UpdateCommand):
        _prepare_update(cmd, mdb, allow_broken_refs, ran_at)
    elif isinstance(cmd, AggregateCommand):
        # Append $sort stage to pipeline and allow disk use.
        cmd.pipeline.append({"$sort": {"_id": 1}})
        cmd.allowDiskUse = True
    elif isinstance(cmd, FindCommand):
        cmd.sort = (cmd.sort or {}) | {"_id": 1}
    elif isinstance(cmd, GetMoreCommand):
        cmd = _command_for_get_more(cursor_id)

    # Build the command document once, for both logging and execution. The JSON round trip
    # transforms e.g. `{"$oid": "..."}` instances in the model dump to `ObjectId("...")` instances.
    cmd_doc = bson.json_util.loads(json.dumps(cmd.model_dump(exclude_unset=True)))
    logging.info(f"Command JSON: {cmd_doc}")

    # Send the command to the database and get the raw response. If the command was a
    # cursor-yielding command, make a new response object in which the raw response's
    # `cursor.firstBatch`/`cursor.nextBatch` value is in a field named `cursor.batch`.
    # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database.command
    cmd_response_raw: dict = mdb.command(cmd_doc)
    if isinstance(cmd, CursorYieldingCommand):
        batch_key = "firstBatch" if isinstance(cmd, QueryCmd) else "nextBatch"
        cmd_response_adapted = assoc_in(
            cmd_response_raw,
            ["cursor", "batch"],
            cmd_response_raw["cursor"][batch_key],
        )
        del cmd_response_adapted["cursor"][batch_key]
    else:
        cmd_response_adapted = cmd_response_raw

    cmd_response: CommandResponse = command_response_for(type(cmd))(
        **cmd_response_adapted
    )

    # Not okay? Early return.
    if not cmd_response.ok:
        return cmd_response

    # If the command response is of a kind that has a `writeErrors` attribute, and the value of that
    # attribute is a non-empty list, respond with an HTTP 422 status code and the list of those errors.
    if isinstance(cmd_response, (DeleteCommandResponse, UpdateCommandResponse)):
        if (
            isinstance(cmd_response.writeErrors, list)
            and len(cmd_response.writeErrors) > 0
        ):
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=cmd_response.writeErrors,
            )

    if isinstance(cmd, (DeleteCommand, UpdateCommand)):
        # TODO `_request_dagster_run` of `ensure_alldocs`?
        if cmd_response.n == 0:
            # NOTE(review): The trailing comma makes `detail` a one-element tuple rather than
            #               a string. It is kept as is, to preserve the existing response shape.
            raise HTTPException(
                status_code=status.HTTP_418_IM_A_TEAPOT,
                detail=(
                    f"{'update' if isinstance(cmd, UpdateCommand) else 'delete'} command modified zero documents."
                    " I'm guessing that's not what you expected. Check the syntax of your request."
                    " But what do I know? I'm just a teapot.",
                ),
            )

    # Cursor-command response? Prep runtime-managed cursor id and replace mongo session cursor id in response.
    query_continuation = None

    # TODO: Handle empty cursor response or situations where batch < batchSize.
    #
    #       Note: This "TODO" comment has not been removed, but — based upon the
    #             results of the automated tests, which do submit "find" and
    #             "aggregation" commands that produce empty result sets and result
    #             sets smaller than one batch — I think this has been resolved.
    #
    if isinstance(cmd, CursorYieldingCommand) and cmd_response.cursor.id == "0":
        # No cursor id returned. No need to create a continuation.
        cmd_response.cursor.id = None
        return cmd_response

    # Cursor id returned. Create a continuation.
    if isinstance(cmd, AggregateCommand):
        slimmed_command_response = CursorYieldingCommandResponse.slimmed(cmd_response)

        # First, we check whether the "slimmed" command response is `None`. That can only happen
        # when some of the documents in the batch lack an `_id` field. We do not support pagination
        # in that scenario (since our pagination algorithm relies on the `_id` values).
        if slimmed_command_response is None:
            logging.warning(
                "Failed to obtain list of `_id` values. Will return batch and no pagination token."
            )
            cmd_response.cursor.id = None  # explicitly set the pagination token to null
            return cmd_response

        query_continuation = qc.create_qc(cmd, slimmed_command_response)
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )
    elif isinstance(cmd, FindCommand):
        query_continuation = qc.create_qc(
            cmd, CursorYieldingCommandResponse.slimmed(cmd_response)
        )
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )
    elif isinstance(cmd, GetMoreCommand):
        # NOTE(review): By this point, a "getMore" command has already been replaced with its
        #               "find"/"aggregate" equivalent, so this branch looks unreachable — confirm
        #               before removing it.
        # Append query run to current continuation
        query_continuation = qc.get_qc_by__id(cursor_id)
        query_continuation.cmd_responses.append(
            CursorYieldingCommandResponse.cursor_batch__ids_only(cmd_response)
        )
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )

    return cmd_response
632
+
633
+
634
def _run_delete_nonschema(
    cmd: DeleteCommand, mdb: MongoDatabase = _mdb
) -> DeleteCommandResponse:
    """
    Run a "delete" command without any referential integrity checking.

    This is the counterpart of `_run_mdb_cmd` for collections where such
    checking is not required or desired (i.e. non-schema collections). Unlike
    that function, this one neither inspects which documents refer to the
    targeted documents nor checks the collection name against the schema.

    Before deleting, each targeted document is copied, together with a
    `deleted_at` timestamp, to the same-named collection in the
    "nmdc_deleted" database.

    :param cmd: DeleteCommand to execute
    :param mdb: MongoDB database instance
    :return: DeleteCommandResponse with the result of the deletion operation
    :raises HTTPException: HTTP 500 when the backup is incomplete; HTTP 422 when
                           the database reports write errors; HTTP 418 when
                           zero documents were deleted.
    """
    timestamp = now()
    name = cmd.delete
    source = mdb[name]

    # Back up the documents matched by each delete spec before deleting anything.
    for find_kwargs in derive_delete_specs(delete_command=cmd):
        doomed_docs = list(source.find(**find_kwargs))
        if doomed_docs:
            archive_result = mdb.client["nmdc_deleted"][name].insert_many(
                {"doc": doc, "deleted_at": timestamp} for doc in doomed_docs
            )
            if len(archive_result.inserted_ids) != len(doomed_docs):
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail="Failed to back up to-be-deleted documents. Operation aborted.",
                )

    # Send the delete command to the database and wrap the raw response.
    command_document = bson.json_util.loads(
        json.dumps(cmd.model_dump(exclude_unset=True))
    )
    raw_response: dict = mdb.command(command_document)
    response = DeleteCommandResponse(**raw_response)

    # A response that is not "ok" is handed back to the caller as is.
    if not response.ok:
        return response

    # Report any write errors to the client.
    write_errors = response.writeErrors
    if isinstance(write_errors, list) and write_errors:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=write_errors,
        )

    # A deletion that removed nothing is treated as a probable mistake in the request.
    if response.n == 0:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "delete command modified zero documents."
                " I'm guessing that's not what you expected. Check the syntax of your request."
                " But what do I know? I'm just a teapot.",
            ),
        )

    return response