nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
--- a/nmdc_runtime/api/endpoints/nmdcschema.py
+++ b/nmdc_runtime/api/endpoints/nmdcschema.py
@@ -10,6 +10,10 @@ from refscan.lib.helpers import (
     get_names_of_classes_eligible_for_collection,
 )
 
+from nmdc_runtime.api.endpoints.lib.linked_instances import (
+    gather_linked_instances,
+    hydrated,
+)
 from nmdc_runtime.config import IS_LINKED_INSTANCES_ENDPOINT_ENABLED
 from nmdc_runtime.minter.config import typecodes
 from nmdc_runtime.minter.domain.model import check_valid_ids
@@ -118,7 +122,7 @@ def get_nmdc_database_collection_stats(
 @decorate_if(condition=IS_LINKED_INSTANCES_ENDPOINT_ENABLED)(
     router.get(
         "/nmdcschema/linked_instances",
-        response_model=ListResponse,
+        response_model=ListResponse[Doc],
         response_model_exclude_unset=True,
     )
 )
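
The `ListResponse[Doc]` annotation makes the paginated envelope explicit in the generated OpenAPI schema. For orientation, a generic envelope of this shape could be declared roughly as follows; this is a sketch, and field names beyond `resources` and `next_page_token` (which appear elsewhere in this diff) are assumptions, not the project's actual definition in `nmdc_runtime/api/models/util.py`:

```python
# Sketch of a generic list envelope compatible with `response_model=ListResponse[Doc]`.
# The exact fields of the real ListResponse are not shown in this diff.
from typing import Generic, List, Optional, TypeVar

from pydantic import BaseModel

ResourceT = TypeVar("ResourceT")


class ListResponse(BaseModel, Generic[ResourceT]):
    resources: List[ResourceT]
    next_page_token: Optional[str] = None
```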
@@ -147,23 +151,54 @@ def get_linked_instances(
             examples=["nmdc:bsm-11-abc123"],
         ),
     ] = None,
+    hydrate: Annotated[
+        bool,
+        Query(
+            title="Hydrate",
+            description="Whether to include full documents in the response. The default is to include slim documents.",
+        ),
+    ] = False,
+    page_token: Annotated[
+        str | None,
+        Query(
+            title="Next page token",
+            description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the
+            `next_page_token` field in a previous response from this endpoint.\n\n_Example_:
+            `nmdc:sys0zr0fbt71`""",
+            examples=[
+                "nmdc:sys0zr0fbt71",
+            ],
+        ),
+    ] = None,
+    max_page_size: Annotated[
+        int,
+        Query(
+            title="Resources per page",
+            description="How many resources you want _each page_ to contain, formatted as a positive integer.",
+            examples=[20],
+        ),
+    ] = 20,
     mdb: MongoDatabase = Depends(get_mongo_db),
 ):
     """
     Retrieves database instances that are both (a) linked to any of `ids`, and (b) of a type in `types`.
 
-    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to
-    a class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition))
-    in our database ([nmdc:Database](https://w3id.org/nmdc/Database)).
-    While a [nmdc:Database](https://w3id.org/nmdc/Database) is organized into collections,
-    every item in every database collection -- that is, every instance -- knows its `type`, so we can
-    (and here do)<sup>&dagger;</sup>
-    return a simple list of instances
-    ([a LinkML CollectionInstance](https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)),
-    which a client may use to construct a corresponding [nmdc:Database](https://w3id.org/nmdc/Database).
-
-    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively) in
-    order to collect the set of all instances linked to these `ids`.
+    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to a
+    class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition)) in our database ([
+    nmdc:Database](https://w3id.org/nmdc/Database)). While a [nmdc:Database](https://w3id.org/nmdc/Database) is
+    organized into collections, every item in every database collection -- that is, every instance -- knows its
+    `type`, so we can (and here do) return a simple list of instances ([a LinkML CollectionInstance](
+    https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)). If hydrate is `False` (the
+    default), then the returned list contains "slim" documents that include only the `id` and `type` of each
+    instance. If hydrate is `True`, then the returned list contains "full" (aka <a
+    href="https://en.wikipedia.org/wiki/Hydration_(web_development)">"hydrated"</a>) documents of each instance,
+    suitable e.g. for a client to subsequently use to construct a corresponding
+    [nmdc:Database](https://w3id.org/nmdc/Database) instance with schema-compliant documents.
+    Both "slim" and "full" documents include (optional) `_upstream_of` and `_downstream_of` fields,
+    to indicate the returned document's relationship to `ids`.
+
+    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively)
+    to collect the set of all instances linked to these `ids`.
 
     * A link "upstream" is represented by a slot ([linkml:SlotDefinition](https://w3id.org/linkml/SlotDefinition))
       for which the
@@ -186,16 +221,15 @@ def get_linked_instances(
     [nmdc:InformationObject](https://w3id.org/nmdc/InformationObject),
     [nmdc:Sample](https://w3id.org/nmdc/Sample), etc. -- may be given.
     If no value for `types` is given, then all [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing)s are returned.
-
-    <sup>&dagger;</sup>: actually, we do not (yet).
-    For now (see [microbiomedata/nmdc-runtime#1118](https://github.com/microbiomedata/nmdc-runtime/issues/1118)),
-    we return a short list of "fat" documents, each of which represents one of the `ids` and presents
-    representations of that id's downstream and upstream instances (currently just each instance's `id` and `type`)
-    as separate subdocument array fields.
     """
-    # TODO move logic from endpoint to unit-testable handler
-    # TODO ListResponse[SimplifiedNMDCDatabase]
-    # TODO ensure pagination for responses
+    if page_token is not None:
+        rv = list_resources(
+            req=ListRequest(page_token=page_token, max_page_size=max_page_size), mdb=mdb
+        )
+        rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+        rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+        return rv
+
     ids_found = [d["id"] for d in mdb.alldocs.find({"id": {"$in": ids}}, {"id": 1})]
     ids_not_found = list(set(ids) - set(ids_found))
     if ids_not_found:
@@ -217,131 +251,18 @@ def get_linked_instances(
         ),
     )
 
-    # This aggregation pipeline traverses the graph of documents in the alldocs collection, following upstream
-    # relationships (_upstream.id) to discover upstream documents for entities that originated, or helped produce,
-    # the entities with documents identified by `ids`. It unwinds the collected (via `$graphLookup`) upstream docs,
-    # filters them by given `types` of interest, projects only essential fields to reduce response latency and size,
-    # and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed upstream docs into an array for
-    # each given ID.
-    upstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_upstream.id",
-                        "connectFromField": "_upstream.id",
-                        "connectToField": "id",
-                        "as": "upstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$upstream_docs"}},
-                {"$match": {"upstream_docs._type_and_ancestors": {"$in": types}}},
-                {"$project": {"id": 1, "upstream_docs": "$upstream_docs"}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "upstream_docs": {
-                            "$addToSet": {
-                                "id": "$upstream_docs.id",
-                                "type": "$upstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "upstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    merge_into_collection_name = gather_linked_instances(
+        alldocs_collection=mdb.alldocs, ids=ids, types=types
     )
 
-    # This aggregation pipeline traverses the graph of documents in the alldocs collection, following downstream
-    # relationships (_downstream.id) to discover downstream documents for entities that originated from,
-    # or are considered part of, the entities with documents identified by `ids`. It unwinds the collected (via
-    # `$graphLookup`) downstream docs, filters them by given `types` of interest, projects only essential fields to
-    # reduce response latency and size, and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed
-    # downstream docs into an array for each given ID.
-    downstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_downstream.id",
-                        "connectFromField": "_downstream.id",
-                        "connectToField": "id",
-                        "as": "downstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$downstream_docs"}},
-                {"$match": {"downstream_docs._type_and_ancestors": {"$in": types}}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "downstream_docs": {
-                            "$addToSet": {
-                                "id": "$downstream_docs.id",
-                                "type": "$downstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "downstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    rv = list_resources(
+        ListRequest(page_token=page_token, max_page_size=max_page_size),
+        mdb,
+        merge_into_collection_name,
     )
-
-    relations_by_id = {
-        id_: {
-            "id": id_,
-            "upstream_docs": [],
-            "downstream_docs": [],
-        }
-        for id_ in ids
-    }
-
-    # For each subject document that was upstream of or downstream of any documents, create a dictionary
-    # containing that subject document's `id`, its `type`, and the list of `id`s of the
-    # documents that it was upstream of or downstream of.
-    for d in upstream_docs + downstream_docs:
-        relations_by_id[d["id"]]["type"] = d["type"]
-        relations_by_id[d["id"]]["upstream_docs"] += d.get("upstream_docs", [])
-        relations_by_id[d["id"]]["downstream_docs"] += d.get("downstream_docs", [])
-
-    return {"resources": list(relations_by_id.values())}
+    rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+    rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+    return rv
 
 
 @router.get(
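
Taken together, the rework above turns `/nmdcschema/linked_instances` into a cursor-paginated endpoint: the first request names `ids` (and optionally `types`), and subsequent pages are addressed by `page_token` alone. A client loop might look like this; it is a sketch, not code from this release, and the host URL and any auth requirements are assumptions:

```python
# Hypothetical client for the reworked /nmdcschema/linked_instances endpoint.
# The base URL is a placeholder; substitute your Runtime deployment.
import requests

BASE_URL = "https://api.microbiomedata.org"

params = {"ids": ["nmdc:bsm-11-abc123"], "hydrate": False, "max_page_size": 20}
resources = []
while True:
    body = requests.get(f"{BASE_URL}/nmdcschema/linked_instances", params=params).json()
    resources.extend(body["resources"])
    token = body.get("next_page_token")
    if token is None:
        break  # no further pages
    # Later pages are fetched by token alone; the server resolves the
    # temporary results collection from the token's stored namespace.
    params = {"page_token": token, "max_page_size": 20}
print(f"Collected {len(resources)} linked instances")
```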
--- a/nmdc_runtime/api/endpoints/objects.py
+++ b/nmdc_runtime/api/endpoints/objects.py
@@ -124,9 +124,8 @@ def get_object_info(
     )
     if object_id.startswith("sty-"):
         url_to_try = f"https://data.microbiomedata.org/api/study/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/study/nmdc:{object_id}",
@@ -134,9 +133,8 @@ def get_object_info(
     )
     elif object_id.startswith("bsm-"):
         url_to_try = f"https://data.microbiomedata.org/api/biosample/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/sample/nmdc:{object_id}",
@@ -270,8 +268,3 @@ def update_object(
     doc_object_patched = merge(doc, object_patch.model_dump(exclude_unset=True))
     mdb.operations.replace_one({"id": object_id}, doc_object_patched)
     return doc_object_patched
-
-
-@router.put("/objects/{object_id}", response_model=DrsObject)
-def replace_object():
-    pass
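
Both TODOs above point at the same follow-up: once the data-portal endpoints accept `HEAD`, the existence probe can avoid downloading response bodies. A possible shape for that follow-up, with a graceful fallback for servers that still reject the method (a sketch, not code from this release):

```python
# Hypothetical existence probe for when the upstream API supports HEAD;
# falls back to GET if the server answers 405 Method Not Allowed.
import requests


def resource_exists(url: str) -> bool:
    rv = requests.head(url, allow_redirects=True)
    if rv.status_code == 405:
        rv = requests.get(url, allow_redirects=True)
    return rv.status_code != 404
```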
--- a/nmdc_runtime/api/endpoints/operations.py
+++ b/nmdc_runtime/api/endpoints/operations.py
@@ -76,30 +76,3 @@ def update_operation(
     )
     mdb.operations.replace_one({"id": op_id}, doc_op_patched)
     return doc_op_patched
-
-
-@router.post(
-    "/operations/{op_id}:wait",
-    description=(
-        "Wait until the operation is resolved or rejected before returning the result."
-        " This is a 'blocking' alternative to client-side polling, and may not be available"
-        " for operation types know to be particularly long-running."
-    ),
-)
-def wait_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:cancel")
-def cancel_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:pause")
-def pause_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:resume")
-def resume_operation():
-    pass
--- a/nmdc_runtime/api/endpoints/queries.py
+++ b/nmdc_runtime/api/endpoints/queries.py
@@ -175,6 +175,28 @@ def run_query(
     }
     ```
 
+    Get a specific study and all the biosamples associated with that study.
+    ```
+    {
+      "aggregate": "study_set",
+      "pipeline": [
+        {
+          "$match": {
+            "id": "nmdc:sty-11-8fb6t785"
+          }
+        },
+        {
+          "$lookup": {
+            "from": "biosample_set",
+            "localField": "id",
+            "foreignField": "associated_studies",
+            "as": "biosamples_of_study"
+          }
+        }
+      ]
+    }
+    ```
+
     Use the `cursor.id` from a previous response to get the next batch of results,
     whether that batch is empty or non-empty.
     ```
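
The new docstring example can be exercised end to end. Assuming this docstring documents the `POST /queries:run` endpoint and that the caller holds a bearer token (neither is shown in this hunk), the request might look like:

```python
# Hypothetical invocation of the new $lookup example; endpoint path, host,
# and auth scheme are assumptions, not shown in this diff.
import requests

query = {
    "aggregate": "study_set",
    "pipeline": [
        {"$match": {"id": "nmdc:sty-11-8fb6t785"}},
        {
            "$lookup": {
                "from": "biosample_set",
                "localField": "id",
                "foreignField": "associated_studies",
                "as": "biosamples_of_study",
            }
        },
    ],
}
resp = requests.post(
    "https://api.microbiomedata.org/queries:run",  # placeholder base URL
    json=query,
    headers={"Authorization": "Bearer <TOKEN>"},
)
resp.raise_for_status()
print(resp.json())  # per the docstring, reuse `cursor.id` to get the next batch
```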
--- a/nmdc_runtime/api/endpoints/sites.py
+++ b/nmdc_runtime/api/endpoints/sites.py
@@ -87,30 +87,6 @@ def get_site(
     return raise404_if_none(mdb.sites.find_one({"id": site_id}))
 
 
-@router.patch("/sites/{site_id}", include_in_schema=False)
-def update_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}", include_in_schema=False)
-def replace_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.get("/sites/{site_id}/capabilities", include_in_schema=False)
-def list_site_capabilities(site_id: str):
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}/capabilities", include_in_schema=False)
-def replace_site_capabilities(site_id: str, capability_ids: List[str]):
-    """Not yet implemented"""
-    pass
-
-
 def verify_client_site_pair(
     site_id: str,
     mdb: pymongo.database.Database = Depends(get_mongo_db),
--- a/nmdc_runtime/api/endpoints/util.py
+++ b/nmdc_runtime/api/endpoints/util.py
@@ -6,7 +6,7 @@ from functools import lru_cache
 from json import JSONDecodeError
 from pathlib import Path
 from time import time_ns
-from typing import Dict, List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 from zoneinfo import ZoneInfo
 
 from bson import json_util
@@ -55,18 +55,23 @@ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
 HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
 
 
-def does_num_matching_docs_exceed_threshold(
-    collection: MongoCollection, filter_: dict, threshold: int
+def is_num_matching_docs_within_limit(
+    collection: MongoCollection, filter_: dict, limit: int
 ) -> bool:
-    """Check whether a MongoDB collection contains more than `threshold` documents matching the filter."""
-    if threshold < 0:
-        raise ValueError("Threshold must be at least 0.")
+    """
+    Check whether the number of documents in a MongoDB collection that match
+    the filter is within (i.e. is no greater than) the specified limit.
+    """
+    if limit < 0:
+        raise ValueError("Limit must be at least 0.")
 
+    # Count the number of documents matching the filter, but only count up to limit + 1,
+    # since that's enough to determine whether the number exceeds the limit.
     limited_num_matching_docs = collection.count_documents(
         filter=filter_,
-        limit=threshold + 1,
+        limit=limit + 1,
     )
-    return limited_num_matching_docs > threshold
+    return limited_num_matching_docs <= limit
 
 
 def check_filter(filter_: str):
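
The renamed helper leans on the `limit` option of `count_documents`, so MongoDB stops counting as soon as `limit + 1` matches are seen instead of scanning every matching document. In isolation, the trick looks like this (a sketch against a hypothetical local collection, not code from the diff):

```python
# Standalone illustration of the bounded-count trick used above;
# the MongoDB URI and collection name are hypothetical.
from pymongo import MongoClient

collection = MongoClient("mongodb://localhost:27017").test_db.things


def is_num_matching_docs_within_limit(collection, filter_: dict, limit: int) -> bool:
    if limit < 0:
        raise ValueError("Limit must be at least 0.")
    # Counting stops after limit + 1 documents, bounding the cost even when
    # millions of documents match the filter.
    return collection.count_documents(filter=filter_, limit=limit + 1) <= limit


print(is_num_matching_docs_within_limit(collection, {"type": "nmdc:Biosample"}, 20))
```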
@@ -87,22 +92,44 @@ def check_filter(filter_: str):
     return filter_
 
 
-def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
-    r"""
+def list_resources(
+    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
+) -> dict:
+    """
     Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.
 
-    Note: If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter
-    criteria than can fit on a page of that size, this function will paginate the resources.
+    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`. Because `page_token` is globally
+    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
+    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
+    as an argument to this function's `collection_name` parameter.
+
+    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
+    can fit on a page of that size, this function will paginate the resources.
     """
+    if collection_name == "" and req.page_token is None:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Must specify a collection name if no page token is supplied.",
+        )
+    if req.page_token:
+        doc = mdb.page_tokens.find_one({"_id": req.page_token})
+        if doc is None:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
+            )
+        collection_name = doc["ns"]
+        last_id = doc["last_id"]
+        mdb.page_tokens.delete_one({"_id": req.page_token})
+    else:
+        last_id = None
 
     id_field = "id"
     if "id_1" not in mdb[collection_name].index_information():
         logging.warning(
             f"list_resources: no index set on 'id' for collection {collection_name}"
         )
-        id_field = (
-            "_id"  # currently expected for `functional_annotation_agg` collection
-        )
+        id_field = "_id"  # expected for `functional_annotation_agg` collection
+
     max_page_size = req.max_page_size
     filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
     projection = (
@@ -110,16 +137,6 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
         if req.projection
         else None
     )
-    if req.page_token:
-        doc = mdb.page_tokens.find_one({"_id": req.page_token, "ns": collection_name})
-        if doc is None:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Bad page_token"
-            )
-        last_id = doc["last_id"]
-        mdb.page_tokens.delete_one({"_id": req.page_token})
-    else:
-        last_id = None
     if last_id is not None:
         if id_field in filter_:
             filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
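
With the token lookup moved ahead of the collection-name check, a page token alone now fully addresses the next page. Client-side, consuming any `list_resources`-backed endpoint reduces to a loop like the following; the endpoint path and host are assumptions, while `resources` and `next_page_token` are the field names used elsewhere in this diff:

```python
# Hypothetical consumer of a list_resources-backed endpoint. Note that each
# page token is single-use: the server deletes it once redeemed.
import requests

url = "https://api.microbiomedata.org/nmdcschema/biosample_set"  # placeholder
params = {"max_page_size": 100}
while True:
    page = requests.get(url, params=params).json()
    for doc in page["resources"]:
        ...  # process each document
    token = page.get("next_page_token")
    if token is None:
        break
    params = {"max_page_size": 100, "page_token": token}
```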
@@ -128,17 +145,12 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
 
     # Determine whether we will paginate the results.
     #
-    # Note: We will paginate them unless either:
-    #       - the `max_page_size` is not a positive integer
-    #       - the number of documents matching the filter does not exceed `max_page_size`
+    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
+    #       or (b) the number of documents matching the filter can fit on a single page.
     #
     will_paginate = True
-    if not isinstance(max_page_size, int):
-        will_paginate = False
-    elif max_page_size < 1:
-        will_paginate = False
-    elif not does_num_matching_docs_exceed_threshold(
-        collection=mdb[collection_name], filter_=filter_, threshold=max_page_size
+    if max_page_size < 1 or is_num_matching_docs_within_limit(
+        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
     ):
         will_paginate = False
 
@@ -304,9 +316,19 @@ def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
     if req.page:
         skip = (req.page - 1) * req.per_page
         if skip > 10_000:
+            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
+            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
+            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
+            # collections.
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Use cursor-based pagination for paging beyond 10,000 items",
+                detail=(
+                    "Use cursor-based pagination for paging beyond 10,000 items. "
+                    "That is, instead of specifying the `page` query parameter for this endpoint, "
+                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
+                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
+                    "you set `cursor` in the next request."
+                ),
             )
         limit = req.per_page
     results, db_response_time_ms = timeit(
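
The expanded error message spells out the cursor protocol; in client code it amounts to the loop below. This is a sketch: the endpoint path is illustrative, and the response envelope (`results`, `meta.next_cursor`) is assumed from the names in the message, not confirmed by this hunk.

```python
# Hypothetical cursor-based paging per the error message above.
import requests

cursor = "*"  # `*` requests the first page
while cursor is not None:
    resp = requests.get(
        "https://api.microbiomedata.org/biosamples",  # placeholder find endpoint
        params={"per_page": 200, "cursor": cursor},
    ).json()
    for result in resp["results"]:
        ...  # process each result
    cursor = resp["meta"]["next_cursor"]  # None (JSON null) on the last page
```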
--- /dev/null
+++ b/nmdc_runtime/api/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -euo pipefail
+
+exec gunicorn --worker-tmp-dir /dev/shm --workers=2 \
+    --threads=4 --worker-class gthread \
+    --log-file=- --bind 0.0.0.0:8000 nmdc_runtime.api.main:app