nmdc-runtime 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (100) hide show
  1. nmdc_runtime/api/__init__.py +0 -0
  2. nmdc_runtime/api/analytics.py +70 -0
  3. nmdc_runtime/api/boot/__init__.py +0 -0
  4. nmdc_runtime/api/boot/capabilities.py +9 -0
  5. nmdc_runtime/api/boot/object_types.py +126 -0
  6. nmdc_runtime/api/boot/triggers.py +84 -0
  7. nmdc_runtime/api/boot/workflows.py +116 -0
  8. nmdc_runtime/api/core/__init__.py +0 -0
  9. nmdc_runtime/api/core/auth.py +208 -0
  10. nmdc_runtime/api/core/idgen.py +170 -0
  11. nmdc_runtime/api/core/metadata.py +788 -0
  12. nmdc_runtime/api/core/util.py +109 -0
  13. nmdc_runtime/api/db/__init__.py +0 -0
  14. nmdc_runtime/api/db/mongo.py +447 -0
  15. nmdc_runtime/api/db/s3.py +37 -0
  16. nmdc_runtime/api/endpoints/__init__.py +0 -0
  17. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  18. nmdc_runtime/api/endpoints/find.py +794 -0
  19. nmdc_runtime/api/endpoints/ids.py +192 -0
  20. nmdc_runtime/api/endpoints/jobs.py +143 -0
  21. nmdc_runtime/api/endpoints/lib/__init__.py +0 -0
  22. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  23. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  24. nmdc_runtime/api/endpoints/metadata.py +260 -0
  25. nmdc_runtime/api/endpoints/nmdcschema.py +581 -0
  26. nmdc_runtime/api/endpoints/object_types.py +38 -0
  27. nmdc_runtime/api/endpoints/objects.py +277 -0
  28. nmdc_runtime/api/endpoints/operations.py +105 -0
  29. nmdc_runtime/api/endpoints/queries.py +679 -0
  30. nmdc_runtime/api/endpoints/runs.py +98 -0
  31. nmdc_runtime/api/endpoints/search.py +38 -0
  32. nmdc_runtime/api/endpoints/sites.py +229 -0
  33. nmdc_runtime/api/endpoints/triggers.py +25 -0
  34. nmdc_runtime/api/endpoints/users.py +214 -0
  35. nmdc_runtime/api/endpoints/util.py +774 -0
  36. nmdc_runtime/api/endpoints/workflows.py +353 -0
  37. nmdc_runtime/api/main.py +401 -0
  38. nmdc_runtime/api/middleware.py +43 -0
  39. nmdc_runtime/api/models/__init__.py +0 -0
  40. nmdc_runtime/api/models/capability.py +14 -0
  41. nmdc_runtime/api/models/id.py +92 -0
  42. nmdc_runtime/api/models/job.py +37 -0
  43. nmdc_runtime/api/models/lib/__init__.py +0 -0
  44. nmdc_runtime/api/models/lib/helpers.py +78 -0
  45. nmdc_runtime/api/models/metadata.py +11 -0
  46. nmdc_runtime/api/models/minter.py +0 -0
  47. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  48. nmdc_runtime/api/models/object.py +180 -0
  49. nmdc_runtime/api/models/object_type.py +20 -0
  50. nmdc_runtime/api/models/operation.py +66 -0
  51. nmdc_runtime/api/models/query.py +246 -0
  52. nmdc_runtime/api/models/query_continuation.py +111 -0
  53. nmdc_runtime/api/models/run.py +161 -0
  54. nmdc_runtime/api/models/site.py +87 -0
  55. nmdc_runtime/api/models/trigger.py +13 -0
  56. nmdc_runtime/api/models/user.py +140 -0
  57. nmdc_runtime/api/models/util.py +253 -0
  58. nmdc_runtime/api/models/workflow.py +15 -0
  59. nmdc_runtime/api/openapi.py +242 -0
  60. nmdc_runtime/config.py +55 -4
  61. nmdc_runtime/core/db/Database.py +1 -3
  62. nmdc_runtime/infrastructure/database/models/user.py +0 -9
  63. nmdc_runtime/lib/extract_nmdc_data.py +0 -8
  64. nmdc_runtime/lib/nmdc_dataframes.py +3 -7
  65. nmdc_runtime/lib/nmdc_etl_class.py +1 -7
  66. nmdc_runtime/minter/adapters/repository.py +1 -2
  67. nmdc_runtime/minter/config.py +2 -0
  68. nmdc_runtime/minter/domain/model.py +35 -1
  69. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  70. nmdc_runtime/mongo_util.py +1 -2
  71. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  72. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  73. nmdc_runtime/site/export/ncbi_xml.py +1 -2
  74. nmdc_runtime/site/export/ncbi_xml_utils.py +1 -1
  75. nmdc_runtime/site/graphs.py +33 -28
  76. nmdc_runtime/site/ops.py +97 -237
  77. nmdc_runtime/site/repair/database_updater.py +8 -0
  78. nmdc_runtime/site/repository.py +7 -117
  79. nmdc_runtime/site/resources.py +4 -4
  80. nmdc_runtime/site/translation/gold_translator.py +22 -21
  81. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  82. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  83. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  84. nmdc_runtime/site/translation/submission_portal_translator.py +64 -54
  85. nmdc_runtime/site/translation/translator.py +63 -1
  86. nmdc_runtime/site/util.py +8 -3
  87. nmdc_runtime/site/validation/util.py +10 -5
  88. nmdc_runtime/util.py +9 -321
  89. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/METADATA +57 -6
  90. nmdc_runtime-2.10.0.dist-info/RECORD +138 -0
  91. nmdc_runtime/site/translation/emsl.py +0 -43
  92. nmdc_runtime/site/translation/gold.py +0 -53
  93. nmdc_runtime/site/translation/jgi.py +0 -32
  94. nmdc_runtime/site/translation/util.py +0 -132
  95. nmdc_runtime/site/validation/jgi.py +0 -43
  96. nmdc_runtime-2.8.0.dist-info/RECORD +0 -84
  97. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/WHEEL +0 -0
  98. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/entry_points.txt +0 -0
  99. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/licenses/LICENSE +0 -0
  100. {nmdc_runtime-2.8.0.dist-info → nmdc_runtime-2.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,679 @@
1
+ import json
2
+ import logging
3
+ from typing import Annotated
4
+
5
+ import bson.json_util
6
+ from fastapi import APIRouter, Depends, Query, status, HTTPException
7
+ from pymongo.database import Database as MongoDatabase
8
+ from toolz import assoc_in
9
+ from refscan.lib.Finder import Finder
10
+ from refscan.scanner import identify_referring_documents
11
+
12
+ from nmdc_runtime.api.core.util import now
13
+ from nmdc_runtime.api.db.mongo import (
14
+ get_mongo_db,
15
+ get_nonempty_nmdc_schema_collection_names,
16
+ OverlayDB,
17
+ validate_json,
18
+ )
19
+ from nmdc_runtime.api.endpoints.lib.helpers import simulate_updates_and_check_references
20
+ from nmdc_runtime.api.endpoints.util import (
21
+ check_action_permitted,
22
+ strip_oid,
23
+ )
24
+ import nmdc_runtime.api.models.query_continuation as qc
25
+ from nmdc_runtime.api.models.query import (
26
+ DeleteCommand,
27
+ DeleteCommandResponse,
28
+ CommandResponse,
29
+ command_response_for,
30
+ QueryCmd,
31
+ UpdateCommand,
32
+ UpdateCommandResponse,
33
+ AggregateCommand,
34
+ FindCommand,
35
+ GetMoreCommand,
36
+ CommandResponseOptions,
37
+ Cmd,
38
+ CursorYieldingCommandResponse,
39
+ CursorYieldingCommand,
40
+ DeleteSpecs,
41
+ UpdateSpecs,
42
+ )
43
+ from nmdc_runtime.api.models.lib.helpers import derive_delete_specs, derive_update_specs
44
+ from nmdc_runtime.api.models.user import get_current_active_user, User
45
+ from nmdc_runtime.util import (
46
+ get_allowed_references,
47
+ nmdc_schema_view,
48
+ )
49
+
50
# Router on which this module's endpoint(s) are registered via the `@router.post(...)` decorator.
router = APIRouter()
51
+
52
+
53
def check_can_update_and_delete(user: User):
    """
    Ensure the specified user is permitted to issue "update" and "delete" commands.

    Both kinds of commands require the same level of permissions, so a single
    action (the one named after `DeleteCommand`) is checked for both of them.

    :param user: The user whose `username` is checked
    :raises HTTPException: HTTP 403, if the action is not permitted for that username
    """
    required_action = "/queries:run(query_cmd:DeleteCommand)"
    if check_action_permitted(user.username, required_action):
        return

    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Only specific users are allowed to issue update and delete commands.",
    )
62
+
63
+
64
def check_can_aggregate(user: User):
    """
    Ensure the specified user is permitted to issue "aggregate" commands.

    :param user: The user whose `username` is checked
    :raises HTTPException: HTTP 403, if the action is not permitted for that username
    """
    required_action = "/queries:run(query_cmd:AggregateCommand)"
    if check_action_permitted(user.username, required_action):
        return

    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Only specific users are allowed to issue aggregate commands.",
    )
72
+
73
+
74
+ # Note: We set `response_model_exclude_unset=True` so that all the properties of the `CommandResponseOptions` object
75
+ # that we don't explicitly assign values to while handling the HTTP request, are omitted from the HTTP response.
76
+ # Reference: https://fastapi.tiangolo.com/tutorial/response-model/#use-the-response_model_exclude_unset-parameter
77
@router.post(
    "/queries:run",
    response_model=CommandResponseOptions,
    response_model_exclude_unset=True,
)
def run_query(
    cmd: Cmd,
    mdb: MongoDatabase = Depends(get_mongo_db),
    user: User = Depends(get_current_active_user),
    allow_broken_refs: Annotated[
        bool,
        Query(
            description="When `true`, the server will allow operations that leave behind broken references."
        ),
    ] = False,
):
    r"""
    Performs `find`, `aggregate`, `update`, `delete`, and `getMore` commands for users that have adequate permissions.

    For `find` and `aggregate` commands, the requested items will be in `cursor.batch`.
    When the response includes a non-null `cursor.id`, there _may_ be more items available.
    To retrieve the next batch of items, submit a request with `getMore` set to that non-null `cursor.id`.
    When the response includes a null `cursor.id`, there are no more items available.

    **Example request bodies:**

    Get all\* biosamples.
    ```
    {
      "find": "biosample_set",
      "filter": {}
    }
    ```

    Get all\* biosamples associated with a given study.
    ```
    {
      "find": "biosample_set",
      "filter": {"associated_studies": "nmdc:sty-11-34xj1150"}
    }
    ```

    \*<small>Up to 101, which is the default "batchSize" for the "find" command.</small>

    Get the first 200 biosamples associated with a given study.
    ```
    {
      "find": "biosample_set",
      "filter": {"associated_studies": "nmdc:sty-11-34xj1150"},
      "batchSize": 200
    }
    ```

    Delete the first biosample having a given `id`.
    ```
    {
      "delete": "biosample_set",
      "deletes": [{"q": {"id": "A_BIOSAMPLE_ID"}, "limit": 1}]
    }
    ```

    Rename the first\* embargoed biosample.
    ```
    {
      "update": "biosample_set",
      "updates": [{"q": {"embargoed": true}, "u": {"$set": {"name": "A_NEW_NAME"}}}]
    }
    ```

    \*<small>Updates at most 1 matching document, since `"multi": true` is not present.</small>

    Rename all\* embargoed biosamples.
    ```
    {
      "update": "biosample_set",
      "updates": [{"q": {"embargoed": true}, "u": {"$set": {"name": "A_NEW_NAME"}}, "multi": true}]
    }
    ```

    \*<small>Updates all matching documents, since `"multi": true` is present.</small>

    Get all\* biosamples, sorted by the number of studies associated with them (greatest to least).
    ```
    {
      "aggregate": "biosample_set",
      "pipeline": [{"$sortByCount": "$associated_studies"}]
    }
    ```

    \*<small>Up to 25, which is the default "batchSize" for the "aggregate" command.</small>

    Get the first 10 biosamples having the largest numbers of studies associated with them,
    sorted by that number of studies (greatest to least).
    ```
    {
      "aggregate": "biosample_set",
      "pipeline": [{"$sortByCount": "$associated_studies"}],
      "cursor": {"batchSize": 10}
    }
    ```

    Use the `cursor.id` from a previous response to get the next batch of results,
    whether that batch is empty or non-empty.
    ```
    {
      "getMore": "somecursorid"
    }
    ```

    **Limitations:**

    1. The maximum size of the response payload is 16 MB. You can use the "batchSize" property
       (for "find" commands) or the "cursor.batchSize" property (for "aggregate" commands)—along
       with some trial and error—to ensure the response payload size remains under that limit.
    2. When using an "aggregate" command, if any of the objects output by the pipeline lacks an
       "_id" field, the endpoint will return only the first batch of objects and will not offer
       pagination (i.e. "cursor.id" will be null).
    3. Manipulating the values of "_id" fields within an aggregation pipeline (e.g. via "$set")
       can result in pagination not working. If this impacts your use case, please contact us.
    """
    r"""
    Additional notes for developers:*
    --------------------------------
    * Note: Because this section isn't in the main docstring,
            it isn't visible on Swagger UI.

    1. The sorting part of the pagination algorithm is based upon the assumption
       that the `_id` values of the documents are "deterministically sortable."
       However, an aggregation pipeline can be used to populate the `_id` field
       with values that are _not_ "deterministically sortable." For example,
       the final stage of the pipeline could be: `{ "$set": { "_id": "potato" } }`
       References:
       - https://www.unicode.org/notes/tn9/
       - https://www.mongodb.com/docs/manual/reference/operator/aggregation/set/
    """

    # If the command is one that requires the user to have specific permissions, check for those permissions now.
    # Note: The permission-checking function will raise an exception (HTTP 403) if the user lacks those permissions.
    if isinstance(cmd, (DeleteCommand, UpdateCommand)):
        check_can_update_and_delete(user)

    # Check whether the user has permission to run aggregate commands.
    if isinstance(cmd, AggregateCommand):
        check_can_aggregate(user)

    # Run the command against the database that was injected into this endpoint, rather than
    # letting `_run_mdb_cmd` fall back to its module-level default, so that the `mdb`
    # dependency declared in this function's signature is the one that is actually used.
    return _run_mdb_cmd(cmd, mdb=mdb, allow_broken_refs=allow_broken_refs)
224
+
225
+
226
# Database handle obtained once, when this module is imported. It serves as the default value
# of the `mdb` parameter of `_run_mdb_cmd` and `_run_delete_nonschema`.
_mdb = get_mongo_db()
227
+
228
+
229
def _run_mdb_cmd(
    cmd: Cmd, mdb: MongoDatabase = _mdb, allow_broken_refs: bool = False
) -> CommandResponse:
    r"""
    Prepares the specified command, runs it against the database, and returns the adapted response.

    What happens before the command is sent to the database depends upon the kind of command:
    - "delete": The target collection is checked, would-be-broken references are looked for,
                and the to-be-deleted documents are copied into the "nmdc_deleted" database.
    - "update": The target collection is checked, the updates are previewed on an overlay database
                and the resulting documents validated, would-be-broken references are looked for,
                and the to-be-updated documents are copied into the "nmdc_updated" database.
    - "aggregate": A `$sort` stage (by `_id`) is appended to the pipeline and disk use is allowed.
    - "find": Sorting by `_id` is added to the command's sort specification.
    - "getMore": The query continuation is looked up and the command is replaced with a "find" or
                 "aggregate" command that only matches documents whose `_id` is greater than the
                 last `_id` recorded for that continuation.

    After a cursor-yielding command runs, the raw `cursor.firstBatch`/`cursor.nextBatch` field is
    exposed as `cursor.batch`, and the database's cursor id is replaced with either `None` (no more
    items) or the id of a query continuation (more items may be available).

    TODO: Consider splitting this function into multiple, smaller functions (if practical).
    TODO: How does this function behave when the "batchSize" is invalid (e.g. 0, negative, non-numeric)?

    :param cmd: The command to run (a delete, update, aggregate, find, or getMore command).
                Note: For aggregate and find commands, this object is modified in place.
    :param mdb: The database on which collection lookups and the command itself are run.
                Backups are written to other databases reachable via `mdb.client`.
    :param allow_broken_refs: Under normal circumstances, if this function determines that performing
                              the specified command would leave behind broken references, this function
                              will reject the command (i.e. raise an HTTP 422). In contrast, when the
                              `allow_broken_refs` parameter is set to `true`, this function will not
                              reject the command for that reason (however, it may reject the command
                              for other reasons).
    :return: The command response. When the database reports the command as not "ok",
             the response is returned as is, without further processing.
    :raises HTTPException: HTTP 422 for an ineligible collection, an invalid post-update document,
                           a would-be-broken reference (unless allowed), write errors, or a "getMore"
                           that resolves to an invalid command; HTTP 500 when backing up documents
                           fails; HTTP 418 when a delete/update command affects zero documents.
    """
    ran_at = now()
    cursor_id = cmd.getMore if isinstance(cmd, GetMoreCommand) else None
    logging.info(f"Command type: {type(cmd).__name__}")
    logging.info(f"Cursor ID: {cursor_id}")

    if isinstance(cmd, DeleteCommand):
        collection_name = cmd.delete
        if collection_name not in get_nonempty_nmdc_schema_collection_names(mdb):
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=(
                    "Can only delete documents from collections that are "
                    "not empty and are described by the NMDC schema."
                ),
            )
        delete_specs: DeleteSpecs = derive_delete_specs(delete_command=cmd)

        # Check whether any of the documents the user wants to delete are referenced
        # by any documents that are _not_ among those documents. If any of them are,
        # it means that performing the deletion would leave behind a broken reference(s).
        #
        # TODO: Consider accounting for the "limit" property of the delete specs.
        #       Currently, we ignore it and—instead—perform the validation as though
        #       the user wants to delete _all_ matching documents.
        #
        # TODO: Account for the fact that this validation step and the actual deletion
        #       step do not occur within a transaction; so, the database may change
        #       between the two events (i.e. there's a race condition).
        #
        target_document_descriptors = list(
            mdb[collection_name].find(
                filter={"$or": [spec["filter"] for spec in delete_specs]},
                projection={"_id": 1, "id": 1, "type": 1},
            )
        )

        # Make a set of the `_id` values of the target documents so that (later) we can
        # check whether a given _referring_ document is also one of the _target_ documents
        # (i.e. is among the documents the user wants to delete).
        target_document_object_ids = set(
            tdd["_id"] for tdd in target_document_descriptors
        )

        # For each document the user wants to delete, check whether it is referenced
        # by any documents that are _not_ among those that the user wants to delete
        # (i.e. check whether there are any references that would be broken).
        finder = Finder(database=mdb)
        for target_document_descriptor in target_document_descriptors:
            # If the document descriptor lacks the "id" field, we already know that no
            # documents reference it (since they would have to _use_ that "id" value to
            # do so). So, we don't bother trying to identify documents that reference it.
            if "id" not in target_document_descriptor:
                continue

            referring_document_descriptors = identify_referring_documents(
                document=target_document_descriptor,  # expects at least "id" and "type"
                schema_view=nmdc_schema_view(),
                references=get_allowed_references(),
                finder=finder,
            )
            # If _any_ referring document is _not_ among the documents the user wants
            # to delete, then we know that performing the deletion would leave behind a
            # broken reference(s).
            #
            # In that case, we either (a) log a warning to the server console (if broken
            # references are being allowed) or (b) abort with an HTTP 422 error response
            # (if broken references are not being allowed).
            #
            for referring_document_descriptor in referring_document_descriptors:
                if (
                    referring_document_descriptor["source_document_object_id"]
                    not in target_document_object_ids
                ):
                    source_document_id = referring_document_descriptor[
                        "source_document_id"
                    ]
                    source_collection_name = referring_document_descriptor[
                        "source_collection_name"
                    ]
                    target_document_id = target_document_descriptor["id"]
                    if allow_broken_refs:
                        logging.warning(
                            f"The document having 'id'='{target_document_id}' in "
                            f"the collection '{collection_name}' is referenced by "
                            f"the document having 'id'='{source_document_id}' in "
                            f"the collection '{source_collection_name}'. "
                            f"Deleting the former will leave behind a broken reference."
                        )
                    else:
                        # TODO: Consider reporting _all_ would-be-broken references instead of
                        #       only the _first_ one we encounter. That would make the response
                        #       more informative to the user in cases where there are multiple
                        #       such references; but it would also take longer to compute and
                        #       would increase the response size (consider the case where the
                        #       user-specified filter matches many, many documents).
                        raise HTTPException(
                            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                            detail=(
                                f"The operation was not performed, because performing it would "
                                f"have left behind one or more broken references. For example: "
                                f"The document having 'id'='{target_document_id}' in "
                                f"the collection '{collection_name}' is referenced by "
                                f"the document having 'id'='{source_document_id}' in "
                                f"the collection '{source_collection_name}'. "
                                f"Deleting the former would leave behind a broken reference. "
                                f"Update or delete referring document(s) and try again."
                            ),
                        )

        # Back up the to-be-deleted documents before the delete command is issued.
        for spec in delete_specs:
            docs = list(mdb[collection_name].find(**spec))
            if not docs:
                continue
            insert_many_result = mdb.client["nmdc_deleted"][
                collection_name
            ].insert_many({"doc": d, "deleted_at": ran_at} for d in docs)
            if len(insert_many_result.inserted_ids) != len(docs):
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail="Failed to back up to-be-deleted documents. operation aborted.",
                )
    elif isinstance(cmd, UpdateCommand):
        collection_name = cmd.update
        if collection_name not in get_nonempty_nmdc_schema_collection_names(mdb):
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=(
                    "Can only update documents in collections that are "
                    "not empty and are described by the NMDC schema."
                ),
            )
        update_specs: UpdateSpecs = derive_update_specs(update_command=cmd)
        # Execute this "update" command on a temporary "overlay" database so we can
        # validate its outcome before executing it on the real database. If its outcome
        # is invalid, we will abort and raise an "HTTP 422" exception.
        #
        # TODO: Consider wrapping this entire "preview-then-apply" sequence within a
        #       MongoDB transaction so as to avoid race conditions where the overlay
        #       database at "preview" time does not reflect the state of the database
        #       at "apply" time. This will be necessary once the "preview" step
        #       accounts for referential integrity.
        #
        with OverlayDB(mdb) as odb:
            odb.apply_updates(
                collection_name,
                [u.model_dump(mode="json", exclude="hint") for u in cmd.updates],
            )
            _ids_to_check = set()
            for spec in update_specs:
                for doc in mdb[collection_name].find(
                    filter=spec["filter"],
                    limit=spec["limit"],
                    projection={
                        "_id": 1
                    },  # unique `id` not guaranteed (see e.g. `functional_annotation_agg`)
                ):
                    _ids_to_check.add(doc["_id"])
            docs_to_check = odb._top_db[collection_name].find(
                {"_id": {"$in": list(_ids_to_check)}}
            )
            rv = validate_json(
                {collection_name: [strip_oid(d) for d in docs_to_check]}, mdb
            )
            if rv["result"] == "errors":
                raise HTTPException(
                    status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                    detail=f"Schema document(s) would be invalid after proposed update: {rv['detail']}",
                )

        # Perform referential integrity checking.
        #
        # TODO: As usual with this endpoint, the operation is susceptible to a race
        #       condition, wherein the database gets modified between this validation
        #       step and the eventual "apply" step.
        #
        violation_messages = simulate_updates_and_check_references(
            db=mdb, update_cmd=cmd
        )
        if len(violation_messages) > 0:
            detail = (
                "The operation was not performed, because performing it would "
                "have left behind one or more broken references. Details: "
                f"{', '.join(violation_messages)}"
            )
            if allow_broken_refs:
                logging.warning(detail)
            else:
                raise HTTPException(
                    status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                    detail=detail,
                )

        # Back up the to-be-updated documents before the update command is issued.
        for spec in update_specs:
            docs = list(mdb[collection_name].find(**spec))
            if not docs:
                continue
            insert_many_result = mdb.client["nmdc_updated"][
                collection_name
            ].insert_many({"doc": d, "updated_at": ran_at} for d in docs)
            if len(insert_many_result.inserted_ids) != len(docs):
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail="Failed to back up to-be-updated documents. operation aborted.",
                )
    elif isinstance(cmd, AggregateCommand):
        # Append $sort stage to pipeline and allow disk use.
        cmd.pipeline.append({"$sort": {"_id": 1}})
        cmd.allowDiskUse = True
    elif isinstance(cmd, FindCommand):
        cmd.sort = (cmd.sort or {}) | {"_id": 1}
    elif isinstance(cmd, GetMoreCommand):
        # Fetch query continuation for query, construct "getMore" equivalent, and assign `cmd` to that equivalent.
        query_continuation = qc.get_qc_by__id(cursor_id)
        # Construct "getMore" equivalent of originating "find" or "aggregate" query.
        initial_cmd_doc: dict = qc.get_initial_query_for_qc(
            query_continuation
        ).model_dump(exclude_unset=True)
        if "find" in initial_cmd_doc:
            modified_cmd_doc = assoc_in(
                initial_cmd_doc,
                ["filter", "_id", "$gt"],
                qc.get_last_doc__id_for_qc(query_continuation),
            )
            cmd = FindCommand(**modified_cmd_doc)
        elif "aggregate" in initial_cmd_doc:
            # Add exactly one `$match` stage that skips the documents already returned.
            # (Adding the same stage twice would not change the results, but it would
            # make the pipeline longer than necessary.)
            resume_stage = {
                "$match": {
                    "_id": {"$gt": qc.get_last_doc__id_for_qc(query_continuation)}
                }
            }
            modified_cmd_doc = assoc_in(
                initial_cmd_doc,
                ["pipeline"],
                initial_cmd_doc["pipeline"] + [resume_stage],
            )
            cmd = AggregateCommand(**modified_cmd_doc)
        else:
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail="The specified 'getMore' value resolved to an invalid command.",
            )

    # Build the document for `cmd` (possibly modified) that will be issued as a mongo command,
    # transforming e.g. `{"$oid": "..."}` instances in model_dump to `ObjectId("...")` instances.
    # We build it once and use it for both logging and execution.
    cmd_doc = bson.json_util.loads(json.dumps(cmd.model_dump(exclude_unset=True)))
    logging.info(f"Command JSON: {cmd_doc}")

    # Send a command to the database and get the raw response. If the command was a
    # cursor-yielding command, make a new response object in which the raw response's
    # `cursor.firstBatch`/`cursor.nextBatch` value is in a field named `cursor.batch`.
    # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/database.html#pymongo.database.Database.command
    cmd_response_raw: dict = mdb.command(cmd_doc)
    if isinstance(cmd, CursorYieldingCommand):
        batch_key = "firstBatch" if isinstance(cmd, QueryCmd) else "nextBatch"
        cmd_response_adapted = assoc_in(
            cmd_response_raw,
            ["cursor", "batch"],
            cmd_response_raw["cursor"][batch_key],
        )
        del cmd_response_adapted["cursor"][batch_key]
    else:
        cmd_response_adapted = cmd_response_raw

    cmd_response: CommandResponse = command_response_for(type(cmd))(
        **cmd_response_adapted
    )

    # Not okay? Early return.
    if not cmd_response.ok:
        return cmd_response

    # If the command response is of a kind that has a `writeErrors` attribute, and the value of that
    # attribute is a list, and that list is non-empty, we know that some errors occurred.
    # In that case, we respond with an HTTP 422 status code and the list of those errors.
    if isinstance(cmd_response, (DeleteCommandResponse, UpdateCommandResponse)):
        if (
            isinstance(cmd_response.writeErrors, list)
            and len(cmd_response.writeErrors) > 0
        ):
            raise HTTPException(
                status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
                detail=cmd_response.writeErrors,
            )

    if isinstance(cmd, (DeleteCommand, UpdateCommand)):
        # TODO `_request_dagster_run` of `ensure_alldocs`?
        if cmd_response.n == 0:
            # Note: The `detail` is a single string. There is intentionally no trailing comma
            #       inside the parentheses; a trailing comma would turn it into a tuple.
            raise HTTPException(
                status_code=status.HTTP_418_IM_A_TEAPOT,
                detail=(
                    f"{'update' if isinstance(cmd, UpdateCommand) else 'delete'} command modified zero documents."
                    " I'm guessing that's not what you expected. Check the syntax of your request."
                    " But what do I know? I'm just a teapot."
                ),
            )

    # Cursor-command response? Prep runtime-managed cursor id and replace mongo session cursor id in response.
    query_continuation = None

    # TODO: Handle empty cursor response or situations where batch < batchSize.
    #
    #       Note: This "TODO" comment has not been removed, but — based upon the
    #             results of the automated tests, which do submit "find" and
    #             "aggregation" commands that produce empty result sets and result
    #             sets smaller than one batch — I think this has been resolved.
    #
    if isinstance(cmd, CursorYieldingCommand) and cmd_response.cursor.id == "0":
        # No cursor id returned. No need to create a continuation.
        cmd_response.cursor.id = None
        return cmd_response

    # Cursor id returned. Create a continuation.
    if isinstance(cmd, AggregateCommand):
        slimmed_command_response = CursorYieldingCommandResponse.slimmed(cmd_response)

        # First, we check whether the "slimmed" command response is `None`. That can only happen
        # when some of the documents in the batch lack an `_id` field. We do not support pagination
        # in that scenario (since our pagination algorithm relies on the `_id` values).
        if slimmed_command_response is None:
            logging.warning(
                "Failed to obtain list of `_id` values. Will return batch and no pagination token."
            )
            cmd_response.cursor.id = None  # explicitly set the pagination token to null
            return cmd_response

        query_continuation = qc.create_qc(cmd, slimmed_command_response)
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )
    elif isinstance(cmd, FindCommand):
        query_continuation = qc.create_qc(
            cmd, CursorYieldingCommandResponse.slimmed(cmd_response)
        )
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )
    elif isinstance(cmd, GetMoreCommand):
        # NOTE(review): This branch looks unreachable, since the "getMore" handling above
        #               rebinds `cmd` to a `FindCommand` or `AggregateCommand` (or raises)
        #               before execution reaches this point. TODO: Confirm and remove.
        #
        # Append query run to current continuation
        query_continuation = qc.get_qc_by__id(cursor_id)
        query_continuation.cmd_responses.append(
            CursorYieldingCommandResponse.cursor_batch__ids_only(cmd_response)
        )
        cmd_response.cursor.id = (
            None if cmd_response.cursor.id == "0" else query_continuation.id
        )

    return cmd_response
610
+
611
+
612
def _run_delete_nonschema(
    cmd: DeleteCommand, mdb: MongoDatabase = _mdb
) -> DeleteCommandResponse:
    """
    Performs deletion operations similarly to `_run_mdb_cmd`, but skips
    performing referential integrity checking.

    This function is intended for deleting documents from non-schema collections
    where referential integrity checking is not required or desired.

    Before the delete command is issued, the documents matched by each delete spec
    are copied into the same-named collection of the "nmdc_deleted" database.

    :param cmd: DeleteCommand to execute
    :param mdb: MongoDB database instance
    :return: DeleteCommandResponse with the result of the deletion operation.
             When the database reports the command as not "ok", the response is
             returned as is, without further checks.
    :raises HTTPException: HTTP 500 when backing up the documents fails; HTTP 422 when
                           the response contains write errors; HTTP 418 when zero
                           documents were deleted.
    """
    ran_at = now()
    collection_name = cmd.delete

    # Derive delete specifications from the command
    delete_specs: DeleteSpecs = derive_delete_specs(delete_command=cmd)

    # Note: Unlike the "delete" handling in `_run_mdb_cmd`, we do not gather target document
    #       descriptors here and we do not perform referential integrity checking.

    # Back up the to-be-deleted documents
    for spec in delete_specs:
        docs = list(mdb[collection_name].find(**spec))
        if not docs:
            continue
        insert_many_result = mdb.client["nmdc_deleted"][collection_name].insert_many(
            {"doc": d, "deleted_at": ran_at} for d in docs
        )
        if len(insert_many_result.inserted_ids) != len(docs):
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to back up to-be-deleted documents. Operation aborted.",
            )

    # Issue the delete command to the database
    cmd_response_raw: dict = mdb.command(
        bson.json_util.loads(json.dumps(cmd.model_dump(exclude_unset=True)))
    )

    # Create the command response object (assume DeleteCommandResponse type)
    cmd_response = DeleteCommandResponse(**cmd_response_raw)

    # Check if the command was successful
    if not cmd_response.ok:
        return cmd_response

    # Handle write errors if any occurred
    if isinstance(cmd_response.writeErrors, list) and len(cmd_response.writeErrors) > 0:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=cmd_response.writeErrors,
        )

    # Check if any documents were actually deleted
    if cmd_response.n == 0:
        # Note: The `detail` is a single string. There is intentionally no trailing comma
        #       inside the parentheses; a trailing comma would turn it into a tuple.
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "delete command modified zero documents."
                " I'm guessing that's not what you expected. Check the syntax of your request."
                " But what do I know? I'm just a teapot."
            ),
        )

    return cmd_response