nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of nmdc-runtime might be problematic.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +22 -2
- nmdc_runtime/api/core/idgen.py +36 -6
- nmdc_runtime/api/db/mongo.py +0 -12
- nmdc_runtime/api/endpoints/find.py +65 -225
- nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
- nmdc_runtime/api/endpoints/objects.py +4 -11
- nmdc_runtime/api/endpoints/operations.py +0 -27
- nmdc_runtime/api/endpoints/queries.py +22 -0
- nmdc_runtime/api/endpoints/sites.py +0 -24
- nmdc_runtime/api/endpoints/util.py +57 -35
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +84 -60
- nmdc_runtime/api/models/util.py +12 -5
- nmdc_runtime/api/openapi.py +116 -180
- nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/minter/adapters/repository.py +21 -0
- nmdc_runtime/minter/domain/model.py +20 -0
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +26 -0
- nmdc_runtime/site/export/ncbi_xml.py +632 -11
- nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
- nmdc_runtime/site/graphs.py +7 -0
- nmdc_runtime/site/ops.py +92 -34
- nmdc_runtime/site/repository.py +2 -0
- nmdc_runtime/site/resources.py +16 -3
- nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +87 -1
- nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
- nmdc_runtime/api/endpoints/ids.py +0 -192
- nmdc_runtime/client/__init__.py +0 -0
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/__init__.py +0 -0
- nmdc_runtime/core/db/Database.py +0 -13
- nmdc_runtime/core/db/__init__.py +0 -0
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/__init__.py +0 -0
- nmdc_runtime/domain/users/__init__.py +0 -0
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
- nmdc_runtime/infrastructure/database/models/user.py +0 -1
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -33
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -825
- nmdc_runtime/lib/nmdc_etl_class.py +0 -396
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/__init__.py +0 -0
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
- nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
--- a/nmdc_runtime/api/endpoints/nmdcschema.py
+++ b/nmdc_runtime/api/endpoints/nmdcschema.py
@@ -10,6 +10,10 @@ from refscan.lib.helpers import (
     get_names_of_classes_eligible_for_collection,
 )
 
+from nmdc_runtime.api.endpoints.lib.linked_instances import (
+    gather_linked_instances,
+    hydrated,
+)
 from nmdc_runtime.config import IS_LINKED_INSTANCES_ENDPOINT_ENABLED
 from nmdc_runtime.minter.config import typecodes
 from nmdc_runtime.minter.domain.model import check_valid_ids
@@ -118,7 +122,7 @@ def get_nmdc_database_collection_stats(
 @decorate_if(condition=IS_LINKED_INSTANCES_ENDPOINT_ENABLED)(
     router.get(
         "/nmdcschema/linked_instances",
-        response_model=ListResponse,
+        response_model=ListResponse[Doc],
         response_model_exclude_unset=True,
     )
 )
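The response model is now parameterized. If `ListResponse` follows the usual generic-Pydantic pattern, the change corresponds roughly to this sketch (the field names and the `Doc` alias here are assumptions for illustration, not the project's actual definitions):

```python
from typing import Generic, TypeVar

from pydantic import BaseModel

Doc = dict  # hypothetical stand-in: a raw MongoDB document
ResourceT = TypeVar("ResourceT")


class ListResponse(BaseModel, Generic[ResourceT]):
    # One page of returned documents.
    resources: list[ResourceT]
    # Bookmark for fetching the next page; present only when paginated.
    next_page_token: str | None = None


# `ListResponse[Doc]` is a concrete model that FastAPI can use to validate
# and document the endpoint's JSON response.
page = ListResponse[Doc](resources=[{"id": "nmdc:bsm-11-abc123", "type": "nmdc:Biosample"}])
```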
@@ -147,23 +151,54 @@ def get_linked_instances(
             examples=["nmdc:bsm-11-abc123"],
         ),
     ] = None,
+    hydrate: Annotated[
+        bool,
+        Query(
+            title="Hydrate",
+            description="Whether to include full documents in the response. The default is to include slim documents.",
+        ),
+    ] = False,
+    page_token: Annotated[
+        str | None,
+        Query(
+            title="Next page token",
+            description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the
+            `next_page_token` field in a previous response from this endpoint.\n\n_Example_:
+            `nmdc:sys0zr0fbt71`""",
+            examples=[
+                "nmdc:sys0zr0fbt71",
+            ],
+        ),
+    ] = None,
+    max_page_size: Annotated[
+        int,
+        Query(
+            title="Resources per page",
+            description="How many resources you want _each page_ to contain, formatted as a positive integer.",
+            examples=[20],
+        ),
+    ] = 20,
     mdb: MongoDatabase = Depends(get_mongo_db),
 ):
     """
     Retrieves database instances that are both (a) linked to any of `ids`, and (b) of a type in `types`.
 
-    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to
-    […]
+    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to a
+    class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition)) in our database ([
+    nmdc:Database](https://w3id.org/nmdc/Database)). While a [nmdc:Database](https://w3id.org/nmdc/Database) is
+    organized into collections, every item in every database collection -- that is, every instance -- knows its
+    `type`, so we can (and here do) return a simple list of instances ([a LinkML CollectionInstance](
+    https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)). If hydrate is `False` (the
+    default), then the returned list contains "slim" documents that include only the `id` and `type` of each
+    instance. If hydrate is `True`, then the returned list contains "full" (aka <a
+    href="https://en.wikipedia.org/wiki/Hydration_(web_development)">"hydrated"</a>) documents of each instance,
+    suitable e.g. for a client to subsequently use to construct a corresponding
+    [nmdc:Database](https://w3id.org/nmdc/Database) instance with schema-compliant documents.
+    Both "slim" and "full" documents include (optional) `_upstream_of` and `_downstream_of` fields,
+    to indicate the returned document's relationship to `ids`.
+
+    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively)
+    to collect the set of all instances linked to these `ids`.
 
     * A link "upstream" is represented by a slot ([linkml:SlotDefinition](https://w3id.org/linkml/SlotDefinition))
       for which the
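Taken together, the new `hydrate`, `page_token`, and `max_page_size` parameters make this endpoint paginate like the other list endpoints. A sketch of a client call (the base URL is illustrative; the parameter names and the `resources`/`next_page_token` response fields are taken from this diff):

```python
import requests

BASE_URL = "https://api.microbiomedata.org"  # illustrative deployment

resp = requests.get(
    f"{BASE_URL}/nmdcschema/linked_instances",
    params={
        "ids": "nmdc:bsm-11-abc123",  # nexus instance ID(s)
        "types": "nmdc:DataObject",   # only return instances of these types
        "hydrate": "false",           # slim docs: just `id` and `type`
        "max_page_size": 20,
    },
)
body = resp.json()
for doc in body["resources"]:
    print(doc["id"], doc["type"])

# If the results were paginated, only the token is needed for the next page.
if body.get("next_page_token"):
    resp = requests.get(
        f"{BASE_URL}/nmdcschema/linked_instances",
        params={"page_token": body["next_page_token"], "max_page_size": 20},
    )
```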
@@ -186,16 +221,15 @@ def get_linked_instances(
     [nmdc:InformationObject](https://w3id.org/nmdc/InformationObject),
     [nmdc:Sample](https://w3id.org/nmdc/Sample), etc. -- may be given.
     If no value for `types` is given, then all [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing)s are returned.
-
-    <sup>†</sup>: actually, we do not (yet).
-    For now (see [microbiomedata/nmdc-runtime#1118](https://github.com/microbiomedata/nmdc-runtime/issues/1118)),
-    we return a short list of "fat" documents, each of which represents one of the `ids` and presents
-    representations of that id's downstream and upstream instances (currently just each instance's `id` and `type`)
-    as separate subdocument array fields.
     """
-    […]
+    if page_token is not None:
+        rv = list_resources(
+            req=ListRequest(page_token=page_token, max_page_size=max_page_size), mdb=mdb
+        )
+        rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+        rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+        return rv
+
     ids_found = [d["id"] for d in mdb.alldocs.find({"id": {"$in": ids}}, {"id": 1})]
     ids_not_found = list(set(ids) - set(ids_found))
     if ids_not_found:
@@ -217,131 +251,18 @@ def get_linked_instances(
             ),
         )
 
-    […]
-    # the entities with documents identified by `ids`. It unwinds the collected (via `$graphLookup`) upstream docs,
-    # filters them by given `types` of interest, projects only essential fields to reduce response latency and size,
-    # and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed upstream docs into an array for each
-    # given ID.
-    upstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_upstream.id",
-                        "connectFromField": "_upstream.id",
-                        "connectToField": "id",
-                        "as": "upstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$upstream_docs"}},
-                {"$match": {"upstream_docs._type_and_ancestors": {"$in": types}}},
-                {"$project": {"id": 1, "upstream_docs": "$upstream_docs"}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "upstream_docs": {
-                            "$addToSet": {
-                                "id": "$upstream_docs.id",
-                                "type": "$upstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "upstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    merge_into_collection_name = gather_linked_instances(
+        alldocs_collection=mdb.alldocs, ids=ids, types=types
     )
 
-    […]
-    # reduce response latency and size, and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed
-    # downstream docs into an array for each given ID.
-    downstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_downstream.id",
-                        "connectFromField": "_downstream.id",
-                        "connectToField": "id",
-                        "as": "downstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$downstream_docs"}},
-                {"$match": {"downstream_docs._type_and_ancestors": {"$in": types}}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "downstream_docs": {
-                            "$addToSet": {
-                                "id": "$downstream_docs.id",
-                                "type": "$downstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "downstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    rv = list_resources(
+        ListRequest(page_token=page_token, max_page_size=max_page_size),
+        mdb,
+        merge_into_collection_name,
     )
-    […]
-                "id": id_,
-                "upstream_docs": [],
-                "downstream_docs": [],
-            }
-            for id_ in ids
-        }
-
-    # For each subject document that was upstream of or downstream of any documents, create a dictionary
-    # containing that subject document's `id`, its `type`, and the list of `id`s of the
-    # documents that it for upstream or or downstream of.
-    for d in upstream_docs + downstream_docs:
-        relations_by_id[d["id"]]["type"] = d["type"]
-        relations_by_id[d["id"]]["upstream_docs"] += d.get("upstream_docs", [])
-        relations_by_id[d["id"]]["downstream_docs"] += d.get("downstream_docs", [])
-
-    return {"resources": list(relations_by_id.values())}
+    rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+    rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+    return rv
 
 
 @router.get(
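The heavy lifting now happens in the new `nmdc_runtime/api/endpoints/lib/linked_instances.py` module (added in this release, +180 lines, not shown in these hunks). For orientation only, a minimal sketch of what a `hydrated` helper could look like, assuming slim docs carry `id`/`type` and full docs live in the `alldocs` collection -- an illustration, not the module's actual implementation:

```python
from pymongo.database import Database


def hydrated(resources: list[dict], mdb: Database) -> list[dict]:
    """Illustrative sketch only: swap slim documents for their full
    counterparts from `alldocs`, keeping the relationship annotations."""
    ids = [d["id"] for d in resources]
    full_by_id = {d["id"]: d for d in mdb.alldocs.find({"id": {"$in": ids}})}
    hydrated_docs = []
    for slim in resources:
        full = dict(full_by_id.get(slim["id"], slim))
        # Carry over `_upstream_of` / `_downstream_of` from the slim doc.
        for key in ("_upstream_of", "_downstream_of"):
            if key in slim:
                full[key] = slim[key]
        hydrated_docs.append(full)
    return hydrated_docs
```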
--- a/nmdc_runtime/api/endpoints/objects.py
+++ b/nmdc_runtime/api/endpoints/objects.py
@@ -124,9 +124,8 @@ def get_object_info(
     )
     if object_id.startswith("sty-"):
         url_to_try = f"https://data.microbiomedata.org/api/study/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/study/nmdc:{object_id}",
@@ -134,9 +133,8 @@ def get_object_info(
     )
     elif object_id.startswith("bsm-"):
         url_to_try = f"https://data.microbiomedata.org/api/biosample/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/sample/nmdc:{object_id}",
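Both TODOs point the same way: once data.microbiomedata.org supports `HEAD` on these endpoints, the existence probe can avoid downloading the response body. A sketch of that eventual check (an assumption about future upstream behavior, not current code):

```python
import requests


def resource_exists(url: str) -> bool:
    # HEAD transfers only headers; the status code is all we need here.
    rv = requests.head(url, allow_redirects=True)
    return rv.status_code != 404
```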
@@ -270,8 +268,3 @@ def update_object(
     doc_object_patched = merge(doc, object_patch.model_dump(exclude_unset=True))
     mdb.operations.replace_one({"id": object_id}, doc_object_patched)
     return doc_object_patched
-
-
-@router.put("/objects/{object_id}", response_model=DrsObject)
-def replace_object():
-    pass
--- a/nmdc_runtime/api/endpoints/operations.py
+++ b/nmdc_runtime/api/endpoints/operations.py
@@ -76,30 +76,3 @@ def update_operation(
     )
     mdb.operations.replace_one({"id": op_id}, doc_op_patched)
     return doc_op_patched
-
-
-@router.post(
-    "/operations/{op_id}:wait",
-    description=(
-        "Wait until the operation is resolved or rejected before returning the result."
-        " This is a 'blocking' alternative to client-side polling, and may not be available"
-        " for operation types know to be particularly long-running."
-    ),
-)
-def wait_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:cancel")
-def cancel_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:pause")
-def pause_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:resume")
-def resume_operation():
-    pass
--- a/nmdc_runtime/api/endpoints/queries.py
+++ b/nmdc_runtime/api/endpoints/queries.py
@@ -175,6 +175,28 @@ def run_query(
     }
     ```
 
+    Get a specific study and all the biosamples associated with that study.
+    ```
+    {
+      "aggregate": "study_set",
+      "pipeline": [
+        {
+          "$match": {
+            "id": "nmdc:sty-11-8fb6t785"
+          }
+        },
+        {
+          "$lookup": {
+            "from": "biosample_set",
+            "localField": "id",
+            "foreignField": "associated_studies",
+            "as": "biosamples_of_study"
+          }
+        }
+      ]
+    }
+    ```
+
     Use the `cursor.id` from a previous response to get the next batch of results,
     whether that batch is empty or non-empty.
     ```
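This docstring example is a request body for the Runtime's `POST /queries:run` endpoint. A sketch of submitting it with `requests` (the base URL and token are placeholders):

```python
import requests

BASE_URL = "https://api.microbiomedata.org"  # illustrative deployment
TOKEN = "..."  # placeholder; obtain via the API's auth flow

query = {
    "aggregate": "study_set",
    "pipeline": [
        {"$match": {"id": "nmdc:sty-11-8fb6t785"}},
        {
            "$lookup": {
                "from": "biosample_set",
                "localField": "id",
                "foreignField": "associated_studies",
                "as": "biosamples_of_study",
            }
        },
    ],
}
resp = requests.post(
    f"{BASE_URL}/queries:run",
    json=query,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
# The response mirrors a MongoDB command response; the docstring above
# mentions `cursor.id` for fetching subsequent batches.
first_batch = resp.json()["cursor"]["firstBatch"]  # field name assumed
```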
--- a/nmdc_runtime/api/endpoints/sites.py
+++ b/nmdc_runtime/api/endpoints/sites.py
@@ -87,30 +87,6 @@ def get_site(
     return raise404_if_none(mdb.sites.find_one({"id": site_id}))
 
 
-@router.patch("/sites/{site_id}", include_in_schema=False)
-def update_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}", include_in_schema=False)
-def replace_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.get("/sites/{site_id}/capabilities", include_in_schema=False)
-def list_site_capabilities(site_id: str):
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}/capabilities", include_in_schema=False)
-def replace_site_capabilities(site_id: str, capability_ids: List[str]):
-    """Not yet implemented"""
-    pass
-
-
 def verify_client_site_pair(
     site_id: str,
     mdb: pymongo.database.Database = Depends(get_mongo_db),
--- a/nmdc_runtime/api/endpoints/util.py
+++ b/nmdc_runtime/api/endpoints/util.py
@@ -6,7 +6,7 @@ from functools import lru_cache
 from json import JSONDecodeError
 from pathlib import Path
 from time import time_ns
-from typing import […]
+from typing import List, Optional, Set, Tuple
 from zoneinfo import ZoneInfo
 
 from bson import json_util
@@ -55,18 +55,23 @@ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
 HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
 
 
-def does_num_matching_docs_exceed_threshold(
-    collection: MongoCollection, filter_: dict, threshold: int
+def is_num_matching_docs_within_limit(
+    collection: MongoCollection, filter_: dict, limit: int
 ) -> bool:
-    """
-    […]
+    """
+    Check whether the number of documents in a MongoDB collection that match
+    the filter is within (i.e. is no greater than) the specified limit.
+    """
+    if limit < 0:
+        raise ValueError("Limit must be at least 0.")
 
+    # Count the number of documents matching the filter, but only count up to limit + 1,
+    # since that's enough to determine whether the number exceeds the limit.
     limited_num_matching_docs = collection.count_documents(
         filter=filter_,
-        limit=threshold + 1,
+        limit=limit + 1,
     )
-    return limited_num_matching_docs > threshold
+    return limited_num_matching_docs <= limit
 
 
 def check_filter(filter_: str):
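The `limit` argument to PyMongo's `count_documents` is what keeps this check cheap: counting stops at `limit + 1` documents instead of scanning every match. A usage sketch (the connection, collection, and filter are illustrative; the import path assumes the helper lives where these hunks suggest):

```python
from pymongo import MongoClient

from nmdc_runtime.api.endpoints.util import is_num_matching_docs_within_limit

mdb = MongoClient()["nmdc"]  # illustrative connection and database name

# Will every matching biosample fit on one 20-item page? `count_documents`
# stops counting at limit + 1 (here, 21), so the check stays cheap even
# when millions of documents match.
fits_on_one_page = is_num_matching_docs_within_limit(
    collection=mdb["biosample_set"],
    filter_={"ecosystem_category": "Aquatic"},  # illustrative filter
    limit=20,
)
```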
@@ -87,22 +92,44 @@ def check_filter(filter_: str):
     return filter_
 
 
-def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
-    """
+def list_resources(
+    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
+) -> dict:
+    """
     Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.
 
-    […]
-    […]
+    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`. Because `page_token` is globally
+    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
+    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
+    as an argument to this function's `collection_name` parameter.
+
+    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
+    can fit on a page of that size, this function will paginate the resources.
     """
+    if collection_name == "" and req.page_token is None:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Must specify a collection name if no page token is supplied.",
+        )
+    if req.page_token:
+        doc = mdb.page_tokens.find_one({"_id": req.page_token})
+        if doc is None:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
+            )
+        collection_name = doc["ns"]
+        last_id = doc["last_id"]
+        mdb.page_tokens.delete_one({"_id": req.page_token})
+    else:
+        last_id = None
 
     id_field = "id"
     if "id_1" not in mdb[collection_name].index_information():
         logging.warning(
             f"list_resources: no index set on 'id' for collection {collection_name}"
         )
-        id_field = […]
-        […]
-        )
+        id_field = "_id"  # expected for `functional_annotation_agg` collection
+
     max_page_size = req.max_page_size
     filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
     projection = (
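Note that tokens are single-use (`delete_one` removes each token as it is redeemed) and now carry their own collection name, so callers need `collection_name` only on the first request. A sketch of the resulting calling pattern (a hypothetical wrapper; the `ListRequest` import path and the `next_page_token` key in the returned dict are assumptions consistent with this diff):

```python
from nmdc_runtime.api.models.util import ListRequest  # assumed location
from nmdc_runtime.api.endpoints.util import list_resources


def iter_all_resources(mdb, collection_name: str, max_page_size: int = 100):
    """Hypothetical generator: walk every page of a collection via page tokens."""
    rv = list_resources(ListRequest(max_page_size=max_page_size), mdb, collection_name)
    yield from rv["resources"]
    while rv.get("next_page_token"):
        # Later requests omit the collection name; the server recovers it
        # from the token's "ns" field.
        rv = list_resources(
            ListRequest(page_token=rv["next_page_token"], max_page_size=max_page_size),
            mdb,
        )
        yield from rv["resources"]
```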
@@ -110,16 +137,6 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
         if req.projection
         else None
     )
-    if req.page_token:
-        doc = mdb.page_tokens.find_one({"_id": req.page_token, "ns": collection_name})
-        if doc is None:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Bad page_token"
-            )
-        last_id = doc["last_id"]
-        mdb.page_tokens.delete_one({"_id": req.page_token})
-    else:
-        last_id = None
     if last_id is not None:
         if id_field in filter_:
             filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
@@ -128,17 +145,12 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
 
     # Determine whether we will paginate the results.
     #
-    # Note: We will paginate them unless either
-    #
-    # - the number of documents matching the filter does not exceed `max_page_size`
+    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
+    # or (b) the number of documents matching the filter can fit on a single page.
     #
     will_paginate = True
-    if […]
-    […]
-    elif max_page_size < 1:
-        will_paginate = False
-    elif not does_num_matching_docs_exceed_threshold(
-        collection=mdb[collection_name], filter_=filter_, threshold=max_page_size
+    if max_page_size < 1 or is_num_matching_docs_within_limit(
+        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
     ):
         will_paginate = False
 
@@ -304,9 +316,19 @@ def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
     if req.page:
         skip = (req.page - 1) * req.per_page
         if skip > 10_000:
+            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
+            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
+            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
+            # collections.
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
-                detail=[…]
+                detail=(
+                    "Use cursor-based pagination for paging beyond 10,000 items. "
+                    "That is, instead of specifying the `page` query parameter for this endpoint, "
+                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
+                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
+                    "you set `cursor` in the next request."
+                ),
             )
         limit = req.per_page
         results, db_response_time_ms = timeit(