nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
--- a/nmdc_runtime/api/endpoints/nmdcschema.py
+++ b/nmdc_runtime/api/endpoints/nmdcschema.py
@@ -10,6 +10,10 @@ from refscan.lib.helpers import (
     get_names_of_classes_eligible_for_collection,
 )
 
+from nmdc_runtime.api.endpoints.lib.linked_instances import (
+    gather_linked_instances,
+    hydrated,
+)
 from nmdc_runtime.config import IS_LINKED_INSTANCES_ENDPOINT_ENABLED
 from nmdc_runtime.minter.config import typecodes
 from nmdc_runtime.minter.domain.model import check_valid_ids
@@ -118,7 +122,7 @@ def get_nmdc_database_collection_stats(
 @decorate_if(condition=IS_LINKED_INSTANCES_ENDPOINT_ENABLED)(
     router.get(
         "/nmdcschema/linked_instances",
-        response_model=ListResponse,
+        response_model=ListResponse[Doc],
         response_model_exclude_unset=True,
     )
 )
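
The `ListResponse[Doc]` annotation makes the paginated envelope explicit in the generated OpenAPI schema. For orientation, a generic envelope of this shape could be declared roughly as follows; this is a sketch, and field names beyond `resources` and `next_page_token` (which appear elsewhere in this diff) are assumptions, not the project's actual definition in `nmdc_runtime/api/models/util.py`:

```python
# Sketch of a generic list envelope compatible with `response_model=ListResponse[Doc]`.
# The exact fields of the real ListResponse are not shown in this diff.
from typing import Generic, List, Optional, TypeVar

from pydantic import BaseModel

ResourceT = TypeVar("ResourceT")


class ListResponse(BaseModel, Generic[ResourceT]):
    resources: List[ResourceT]
    next_page_token: Optional[str] = None
```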
@@ -147,23 +151,54 @@ def get_linked_instances(
             examples=["nmdc:bsm-11-abc123"],
         ),
     ] = None,
+    hydrate: Annotated[
+        bool,
+        Query(
+            title="Hydrate",
+            description="Whether to include full documents in the response. The default is to include slim documents.",
+        ),
+    ] = False,
+    page_token: Annotated[
+        str | None,
+        Query(
+            title="Next page token",
+            description="""A bookmark you can use to fetch the _next_ page of resources. You can get this from the
+            `next_page_token` field in a previous response from this endpoint.\n\n_Example_:
+            `nmdc:sys0zr0fbt71`""",
+            examples=[
+                "nmdc:sys0zr0fbt71",
+            ],
+        ),
+    ] = None,
+    max_page_size: Annotated[
+        int,
+        Query(
+            title="Resources per page",
+            description="How many resources you want _each page_ to contain, formatted as a positive integer.",
+            examples=[20],
+        ),
+    ] = 20,
     mdb: MongoDatabase = Depends(get_mongo_db),
 ):
     """
     Retrieves database instances that are both (a) linked to any of `ids`, and (b) of a type in `types`.
 
-    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to
-    a class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition))
-    in our database ([nmdc:Database](https://w3id.org/nmdc/Database)).
-    While a [nmdc:Database](https://w3id.org/nmdc/Database) is organized into collections,
-    every item in every database collection -- that is, every instance -- knows its `type`, so we can
-    (and here do)<sup>&dagger;</sup>
-    return a simple list of instances
-    ([a LinkML CollectionInstance](https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)),
-    which a client may use to construct a corresponding [nmdc:Database](https://w3id.org/nmdc/Database).
-
-    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively) in
-    order to collect the set of all instances linked to these `ids`.
+    An [instance](https://linkml.io/linkml-model/latest/docs/specification/02instances/) is an object conforming to a
+    class definition ([linkml:ClassDefinition](https://w3id.org/linkml/ClassDefinition)) in our database ([
+    nmdc:Database](https://w3id.org/nmdc/Database)). While a [nmdc:Database](https://w3id.org/nmdc/Database) is
+    organized into collections, every item in every database collection -- that is, every instance -- knows its
+    `type`, so we can (and here do) return a simple list of instances ([a LinkML CollectionInstance](
+    https://linkml.io/linkml-model/latest/docs/specification/02instances/#collections)). If hydrate is `False` (the
+    default), then the returned list contains "slim" documents that include only the `id` and `type` of each
+    instance. If hydrate is `True`, then the returned list contains "full" (aka <a
+    href="https://en.wikipedia.org/wiki/Hydration_(web_development)">"hydrated"</a>) documents of each instance,
+    suitable e.g. for a client to subsequently use to construct a corresponding
+    [nmdc:Database](https://w3id.org/nmdc/Database) instance with schema-compliant documents.
+    Both "slim" and "full" documents include (optional) `_upstream_of` and `_downstream_of` fields,
+    to indicate the returned document's relationship to `ids`.
+
+    From the nexus instance IDs given in `ids`, both "upstream" and "downstream" links are followed (transitively)
+    to collect the set of all instances linked to these `ids`.
 
     * A link "upstream" is represented by a slot ([linkml:SlotDefinition](https://w3id.org/linkml/SlotDefinition))
       for which the
@@ -186,16 +221,15 @@ def get_linked_instances(
     [nmdc:InformationObject](https://w3id.org/nmdc/InformationObject),
     [nmdc:Sample](https://w3id.org/nmdc/Sample), etc. -- may be given.
     If no value for `types` is given, then all [nmdc:NamedThing](https://w3id.org/nmdc/NamedThing)s are returned.
-
-    <sup>&dagger;</sup>: actually, we do not (yet).
-    For now (see [microbiomedata/nmdc-runtime#1118](https://github.com/microbiomedata/nmdc-runtime/issues/1118)),
-    we return a short list of "fat" documents, each of which represents one of the `ids` and presents
-    representations of that id's downstream and upstream instances (currently just each instance's `id` and `type`)
-    as separate subdocument array fields.
     """
-    # TODO move logic from endpoint to unit-testable handler
-    # TODO ListResponse[SimplifiedNMDCDatabase]
-    # TODO ensure pagination for responses
+    if page_token is not None:
+        rv = list_resources(
+            req=ListRequest(page_token=page_token, max_page_size=max_page_size), mdb=mdb
+        )
+        rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+        rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+        return rv
+
     ids_found = [d["id"] for d in mdb.alldocs.find({"id": {"$in": ids}}, {"id": 1})]
     ids_not_found = list(set(ids) - set(ids_found))
     if ids_not_found:
@@ -217,131 +251,18 @@ def get_linked_instances(
         ),
     )
 
-    # This aggregation pipeline traverses the graph of documents in the alldocs collection, following upstream
-    # relationships (_upstream.id) to discover upstream documents for entities that originated, or helped produce,
-    # the entities with documents identified by `ids`. It unwinds the collected (via `$graphLookup`) upstream docs,
-    # filters them by given `types` of interest, projects only essential fields to reduce response latency and size,
-    # and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed upstream docs into an array for
-    # each given ID.
-    upstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_upstream.id",
-                        "connectFromField": "_upstream.id",
-                        "connectToField": "id",
-                        "as": "upstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$upstream_docs"}},
-                {"$match": {"upstream_docs._type_and_ancestors": {"$in": types}}},
-                {"$project": {"id": 1, "upstream_docs": "$upstream_docs"}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "upstream_docs": {
-                            "$addToSet": {
-                                "id": "$upstream_docs.id",
-                                "type": "$upstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "upstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    merge_into_collection_name = gather_linked_instances(
+        alldocs_collection=mdb.alldocs, ids=ids, types=types
     )
 
-    # This aggregation pipeline traverses the graph of documents in the alldocs collection, following downstream
-    # relationships (_downstream.id) to discover downstream documents for entities that originated from,
-    # or are considered part of, the entities with documents identified by `ids`. It unwinds the collected (via
-    # `$graphLookup`) downstream docs, filters them by given `types` of interest, projects only essential fields to
-    # reduce response latency and size, and groups them by each of the given `ids`, i.e. re-winding the `$unwind`-ed
-    # downstream docs into an array for each given ID.
-    downstream_docs = list(
-        mdb.alldocs.aggregate(
-            [
-                {"$match": {"id": {"$in": ids}}},
-                {
-                    "$graphLookup": {
-                        "from": "alldocs",
-                        "startWith": "$_downstream.id",
-                        "connectFromField": "_downstream.id",
-                        "connectToField": "id",
-                        "as": "downstream_docs",
-                    }
-                },
-                {"$unwind": {"path": "$downstream_docs"}},
-                {"$match": {"downstream_docs._type_and_ancestors": {"$in": types}}},
-                {
-                    "$group": {
-                        "_id": "$id",
-                        "downstream_docs": {
-                            "$addToSet": {
-                                "id": "$downstream_docs.id",
-                                "type": "$downstream_docs.type",
-                            }
-                        },
-                    }
-                },
-                {
-                    "$lookup": {
-                        "from": "alldocs",
-                        "localField": "_id",
-                        "foreignField": "id",
-                        "as": "selves",
-                    }
-                },
-                {
-                    "$project": {
-                        "_id": 0,
-                        "id": "$_id",
-                        "downstream_docs": 1,
-                        "type": {"$arrayElemAt": ["$selves.type", 0]},
-                    }
-                },
-            ],
-            allowDiskUse=True,
-        )
+    rv = list_resources(
+        ListRequest(page_token=page_token, max_page_size=max_page_size),
+        mdb,
+        merge_into_collection_name,
     )
-
-    relations_by_id = {
-        id_: {
-            "id": id_,
-            "upstream_docs": [],
-            "downstream_docs": [],
-        }
-        for id_ in ids
-    }
-
-    # For each subject document that was upstream of or downstream of any documents, create a dictionary
-    # containing that subject document's `id`, its `type`, and the list of `id`s of the
-    # documents that it was upstream of or downstream of.
-    for d in upstream_docs + downstream_docs:
-        relations_by_id[d["id"]]["type"] = d["type"]
-        relations_by_id[d["id"]]["upstream_docs"] += d.get("upstream_docs", [])
-        relations_by_id[d["id"]]["downstream_docs"] += d.get("downstream_docs", [])
-
-    return {"resources": list(relations_by_id.values())}
+    rv["resources"] = hydrated(rv["resources"], mdb) if hydrate else rv["resources"]
+    rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+    return rv
 
 
 @router.get(
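
Taken together, the rework above turns `/nmdcschema/linked_instances` into a cursor-paginated endpoint: the first request names `ids` (and optionally `types`), and subsequent pages are addressed by `page_token` alone. A client loop might look like this; it is a sketch, not code from this release, and the host URL and any auth requirements are assumptions:

```python
# Hypothetical client for the reworked /nmdcschema/linked_instances endpoint.
# The base URL is a placeholder; substitute your Runtime deployment.
import requests

BASE_URL = "https://api.microbiomedata.org"

params = {"ids": ["nmdc:bsm-11-abc123"], "hydrate": False, "max_page_size": 20}
resources = []
while True:
    body = requests.get(f"{BASE_URL}/nmdcschema/linked_instances", params=params).json()
    resources.extend(body["resources"])
    token = body.get("next_page_token")
    if token is None:
        break  # no further pages
    # Later pages are fetched by token alone; the server resolves the
    # temporary results collection from the token's stored namespace.
    params = {"page_token": token, "max_page_size": 20}
print(f"Collected {len(resources)} linked instances")
```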
--- a/nmdc_runtime/api/endpoints/objects.py
+++ b/nmdc_runtime/api/endpoints/objects.py
@@ -124,9 +124,8 @@ def get_object_info(
     )
     if object_id.startswith("sty-"):
         url_to_try = f"https://data.microbiomedata.org/api/study/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/study/nmdc:{object_id}",
@@ -134,9 +133,8 @@ def get_object_info(
     )
     elif object_id.startswith("bsm-"):
         url_to_try = f"https://data.microbiomedata.org/api/biosample/nmdc:{object_id}"
-        rv = requests.get(
-            url_to_try, allow_redirects=True
-        )  # TODO use HEAD when enabled upstream
+        # TODO: Update this HTTP request to use the HTTP "HEAD" method once the upstream endpoint supports that method.
+        rv = requests.get(url_to_try, allow_redirects=True)
         if rv.status_code != 404:
             return RedirectResponse(
                 f"https://data.microbiomedata.org/details/sample/nmdc:{object_id}",
@@ -270,8 +268,3 @@ def update_object(
     doc_object_patched = merge(doc, object_patch.model_dump(exclude_unset=True))
     mdb.operations.replace_one({"id": object_id}, doc_object_patched)
     return doc_object_patched
-
-
-@router.put("/objects/{object_id}", response_model=DrsObject)
-def replace_object():
-    pass
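
Both TODOs above point at the same follow-up: once the data-portal endpoints accept `HEAD`, the existence probe can avoid downloading response bodies. A possible shape for that follow-up, with a graceful fallback for servers that still reject the method (a sketch, not code from this release):

```python
# Hypothetical existence probe for when the upstream API supports HEAD;
# falls back to GET if the server answers 405 Method Not Allowed.
import requests


def resource_exists(url: str) -> bool:
    rv = requests.head(url, allow_redirects=True)
    if rv.status_code == 405:
        rv = requests.get(url, allow_redirects=True)
    return rv.status_code != 404
```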
--- a/nmdc_runtime/api/endpoints/operations.py
+++ b/nmdc_runtime/api/endpoints/operations.py
@@ -76,30 +76,3 @@ def update_operation(
     )
     mdb.operations.replace_one({"id": op_id}, doc_op_patched)
     return doc_op_patched
-
-
-@router.post(
-    "/operations/{op_id}:wait",
-    description=(
-        "Wait until the operation is resolved or rejected before returning the result."
-        " This is a 'blocking' alternative to client-side polling, and may not be available"
-        " for operation types know to be particularly long-running."
-    ),
-)
-def wait_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:cancel")
-def cancel_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:pause")
-def pause_operation():
-    pass
-
-
-@router.post("/operations/{op_id}:resume")
-def resume_operation():
-    pass
--- a/nmdc_runtime/api/endpoints/queries.py
+++ b/nmdc_runtime/api/endpoints/queries.py
@@ -175,6 +175,28 @@ def run_query(
     }
     ```
 
+    Get a specific study and all the biosamples associated with that study.
+    ```
+    {
+      "aggregate": "study_set",
+      "pipeline": [
+        {
+          "$match": {
+            "id": "nmdc:sty-11-8fb6t785"
+          }
+        },
+        {
+          "$lookup": {
+            "from": "biosample_set",
+            "localField": "id",
+            "foreignField": "associated_studies",
+            "as": "biosamples_of_study"
+          }
+        }
+      ]
+    }
+    ```
+
     Use the `cursor.id` from a previous response to get the next batch of results,
     whether that batch is empty or non-empty.
     ```
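
The new docstring example can be exercised end to end. Assuming this docstring documents the `POST /queries:run` endpoint and that the caller holds a bearer token (neither is shown in this hunk), the request might look like:

```python
# Hypothetical invocation of the new $lookup example; endpoint path, host,
# and auth scheme are assumptions, not shown in this diff.
import requests

query = {
    "aggregate": "study_set",
    "pipeline": [
        {"$match": {"id": "nmdc:sty-11-8fb6t785"}},
        {
            "$lookup": {
                "from": "biosample_set",
                "localField": "id",
                "foreignField": "associated_studies",
                "as": "biosamples_of_study",
            }
        },
    ],
}
resp = requests.post(
    "https://api.microbiomedata.org/queries:run",  # placeholder base URL
    json=query,
    headers={"Authorization": "Bearer <TOKEN>"},
)
resp.raise_for_status()
print(resp.json())  # per the docstring, reuse `cursor.id` to get the next batch
```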
--- a/nmdc_runtime/api/endpoints/sites.py
+++ b/nmdc_runtime/api/endpoints/sites.py
@@ -87,30 +87,6 @@ def get_site(
     return raise404_if_none(mdb.sites.find_one({"id": site_id}))
 
 
-@router.patch("/sites/{site_id}", include_in_schema=False)
-def update_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}", include_in_schema=False)
-def replace_site():
-    """Not yet implemented"""
-    pass
-
-
-@router.get("/sites/{site_id}/capabilities", include_in_schema=False)
-def list_site_capabilities(site_id: str):
-    """Not yet implemented"""
-    pass
-
-
-@router.put("/sites/{site_id}/capabilities", include_in_schema=False)
-def replace_site_capabilities(site_id: str, capability_ids: List[str]):
-    """Not yet implemented"""
-    pass
-
-
 def verify_client_site_pair(
     site_id: str,
     mdb: pymongo.database.Database = Depends(get_mongo_db),
--- a/nmdc_runtime/api/endpoints/util.py
+++ b/nmdc_runtime/api/endpoints/util.py
@@ -6,7 +6,7 @@ from functools import lru_cache
 from json import JSONDecodeError
 from pathlib import Path
 from time import time_ns
-from typing import Dict, List, Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 from zoneinfo import ZoneInfo
 
 from bson import json_util
@@ -55,18 +55,23 @@ BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
 HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
 
 
-def does_num_matching_docs_exceed_threshold(
-    collection: MongoCollection, filter_: dict, threshold: int
+def is_num_matching_docs_within_limit(
+    collection: MongoCollection, filter_: dict, limit: int
 ) -> bool:
-    """Check whether a MongoDB collection contains more than `threshold` documents matching the filter."""
-    if threshold < 0:
-        raise ValueError("Threshold must be at least 0.")
+    """
+    Check whether the number of documents in a MongoDB collection that match
+    the filter is within (i.e. is no greater than) the specified limit.
+    """
+    if limit < 0:
+        raise ValueError("Limit must be at least 0.")
 
+    # Count the number of documents matching the filter, but only count up to limit + 1,
+    # since that's enough to determine whether the number exceeds the limit.
     limited_num_matching_docs = collection.count_documents(
         filter=filter_,
-        limit=threshold + 1,
+        limit=limit + 1,
     )
-    return limited_num_matching_docs > threshold
+    return limited_num_matching_docs <= limit
 
 
 def check_filter(filter_: str):
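
The renamed helper leans on the `limit` option of `count_documents`, so MongoDB stops counting as soon as `limit + 1` matches are seen instead of scanning every matching document. In isolation, the trick looks like this (a sketch against a hypothetical local collection, not code from the diff):

```python
# Standalone illustration of the bounded-count trick used above;
# the MongoDB URI and collection name are hypothetical.
from pymongo import MongoClient

collection = MongoClient("mongodb://localhost:27017").test_db.things


def is_num_matching_docs_within_limit(collection, filter_: dict, limit: int) -> bool:
    if limit < 0:
        raise ValueError("Limit must be at least 0.")
    # Counting stops after limit + 1 documents, bounding the cost even when
    # millions of documents match the filter.
    return collection.count_documents(filter=filter_, limit=limit + 1) <= limit


print(is_num_matching_docs_within_limit(collection, {"type": "nmdc:Biosample"}, 20))
```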
@@ -87,22 +92,44 @@ def check_filter(filter_: str):
     return filter_
 
 
-def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
-    r"""
+def list_resources(
+    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
+) -> dict:
+    """
     Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.
 
-    Note: If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter
-    criteria than can fit on a page of that size, this function will paginate the resources.
+    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`. Because `page_token` is globally
+    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
+    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
+    as an argument to this function's `collection_name` parameter.
+
+    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
+    can fit on a page of that size, this function will paginate the resources.
     """
+    if collection_name == "" and req.page_token is None:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Must specify a collection name if no page token is supplied.",
+        )
+    if req.page_token:
+        doc = mdb.page_tokens.find_one({"_id": req.page_token})
+        if doc is None:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
+            )
+        collection_name = doc["ns"]
+        last_id = doc["last_id"]
+        mdb.page_tokens.delete_one({"_id": req.page_token})
+    else:
+        last_id = None
 
     id_field = "id"
     if "id_1" not in mdb[collection_name].index_information():
         logging.warning(
             f"list_resources: no index set on 'id' for collection {collection_name}"
         )
-        id_field = (
-            "_id"  # currently expected for `functional_annotation_agg` collection
-        )
+        id_field = "_id"  # expected for `functional_annotation_agg` collection
+
     max_page_size = req.max_page_size
     filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
     projection = (
@@ -110,16 +137,6 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
         if req.projection
         else None
     )
-    if req.page_token:
-        doc = mdb.page_tokens.find_one({"_id": req.page_token, "ns": collection_name})
-        if doc is None:
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST, detail="Bad page_token"
-            )
-        last_id = doc["last_id"]
-        mdb.page_tokens.delete_one({"_id": req.page_token})
-    else:
-        last_id = None
     if last_id is not None:
         if id_field in filter_:
             filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
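
With the token lookup moved ahead of the collection-name check, a page token alone now fully addresses the next page. Client-side, consuming any `list_resources`-backed endpoint reduces to a loop like the following; the endpoint path and host are assumptions, while `resources` and `next_page_token` are the field names used elsewhere in this diff:

```python
# Hypothetical consumer of a list_resources-backed endpoint. Note that each
# page token is single-use: the server deletes it once redeemed.
import requests

url = "https://api.microbiomedata.org/nmdcschema/biosample_set"  # placeholder
params = {"max_page_size": 100}
while True:
    page = requests.get(url, params=params).json()
    for doc in page["resources"]:
        ...  # process each document
    token = page.get("next_page_token")
    if token is None:
        break
    params = {"max_page_size": 100, "page_token": token}
```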
@@ -128,17 +145,12 @@ def list_resources(req: ListRequest, mdb: MongoDatabase, collection_name: str):
 
     # Determine whether we will paginate the results.
     #
-    # Note: We will paginate them unless either:
-    #       - the `max_page_size` is not a positive integer
-    #       - the number of documents matching the filter does not exceed `max_page_size`
+    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
+    #       or (b) the number of documents matching the filter can fit on a single page.
     #
     will_paginate = True
-    if not isinstance(max_page_size, int):
-        will_paginate = False
-    elif max_page_size < 1:
-        will_paginate = False
-    elif not does_num_matching_docs_exceed_threshold(
-        collection=mdb[collection_name], filter_=filter_, threshold=max_page_size
+    if max_page_size < 1 or is_num_matching_docs_within_limit(
+        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
     ):
         will_paginate = False
 
@@ -304,9 +316,19 @@ def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
     if req.page:
         skip = (req.page - 1) * req.per_page
         if skip > 10_000:
+            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
+            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
+            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
+            # collections.
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Use cursor-based pagination for paging beyond 10,000 items",
+                detail=(
+                    "Use cursor-based pagination for paging beyond 10,000 items. "
+                    "That is, instead of specifying the `page` query parameter for this endpoint, "
+                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
+                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
+                    "you set `cursor` in the next request."
+                ),
             )
         limit = req.per_page
     results, db_response_time_ms = timeit(
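
The expanded error message spells out the cursor protocol; in client code it amounts to the loop below. This is a sketch: the endpoint path is illustrative, and the response envelope (`results`, `meta.next_cursor`) is assumed from the names in the message, not confirmed by this hunk.

```python
# Hypothetical cursor-based paging per the error message above.
import requests

cursor = "*"  # `*` requests the first page
while cursor is not None:
    resp = requests.get(
        "https://api.microbiomedata.org/biosamples",  # placeholder find endpoint
        params={"per_page": 200, "cursor": cursor},
    ).json()
    for result in resp["results"]:
        ...  # process each result
    cursor = resp["meta"]["next_cursor"]  # None (JSON null) on the last page
```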
--- /dev/null
+++ b/nmdc_runtime/api/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -euo pipefail
+
+exec gunicorn --worker-tmp-dir /dev/shm --workers=2 \
+    --threads=4 --worker-class gthread \
+    --log-file=- --bind 0.0.0.0:8000 nmdc_runtime.api.main:app