nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.
Files changed (77)
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/Dockerfile
@@ -0,0 +1,167 @@
+ # Note: Most of the steps for the `base` image were copied verbatim from either `fastapi.Dockerfile`,
+ # `dagster.Dockerfile`, or `test.Dockerfile` (indeed, most of the steps were present in all three files).
+ # Reference: https://docs.docker.com/get-started/docker-concepts/building-images/multi-stage-builds/
+ #
+ # Base this image upon a variant of the official Python 3.10 image that is, in turn,
+ # based upon a minimal (slim) variant of the Debian 11 (bullseye) image.
+ # Reference: https://hub.docker.com/_/python
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM python:3.10-slim-bullseye AS base
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Install and upgrade system-level software in a non-interactive way, then delete temporary files.
+ # Note: Setting `DEBIAN_FRONTEND=noninteractive` and passing `-y` to `apt-get` makes things non-interactive.
+ RUN export DEBIAN_FRONTEND=noninteractive && \
+     apt-get update && \
+     apt-get -y upgrade && \
+     apt-get install -y --no-install-recommends \
+         tini \
+         procps \
+         net-tools \
+         build-essential \
+         git \
+         make \
+         zip \
+         curl \
+         wget \
+         gnupg && \
+     apt-get -y clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Enable Python's "fault handler" feature, so, when low-level errors occur (e.g. segfaults), Python prints lots of info.
+ # Reference: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONFAULTHANDLER
+ ENV PYTHONFAULTHANDLER=1
+
+ # Configure Git to consider the `/code` directory to be "safe", so that, when a Git repository
+ # created outside of the container gets mounted at that path within the container, the
+ # `uv-dynamic-versioning` tool running within the container does not fail with the error:
+ # > "Detected Git repository, but failed because of dubious ownership"
+ # Reference: https://git-scm.com/docs/git-config#Documentation/git-config.txt-safedirectory
+ RUN git config --global --add safe.directory /code
+
+ # Install `uv`.
+ # Reference: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
+ ADD https://astral.sh/uv/install.sh /uv-installer.sh
+ RUN sh /uv-installer.sh && \
+     rm /uv-installer.sh
+ ENV PATH="/root/.local/bin/:$PATH"
+
+ # Install Python dependencies (production dependencies only).
+ #
+ # Note: We copy only the files that `uv` needs in order to install dependencies. That way,
+ #       we minimize the number of files whose changes would invalidate cached image layers.
+ #
+ # Note: We use the `VIRTUAL_ENV` environment variable to specify the path to the Python virtual
+ #       environment that we want the `uv` program inside the container to create and use.
+ #
+ # Q: Why don't we use `./.venv` in the repository file tree?
+ # A: If we were to do that, then, whenever a developer would mount (via our Docker Compose file)
+ #    the repository file tree from their host machine (which may include a `.venv/` directory
+ #    created by their host machine) into the container, it would overwrite the Python virtual
+ #    environment that the `uv` program inside the container is using.
+ #
+ # Q: What is special about the `VIRTUAL_ENV` environment variable?
+ # A: When using `uv`'s `--active` option (as we do in later stages of this Dockerfile),
+ #    `uv` determines which virtual environment is active by looking at `VIRTUAL_ENV`. This
+ #    is the case, even though the documentation of the `venv` module (in Python's standard
+ #    library) specifically says: "`VIRTUAL_ENV` cannot be relied upon to determine whether
+ #    a virtual environment is being used."
+ #
+ # References:
+ # - https://docs.astral.sh/uv/pip/environments/#using-arbitrary-python-environments (RE: `VIRTUAL_ENV`)
+ # - https://docs.astral.sh/uv/reference/environment/#virtual_env (RE: `VIRTUAL_ENV`, from uv's perspective)
+ # - https://docs.python.org/3/library/venv.html#how-venvs-work (RE: `VIRTUAL_ENV`, from venv's perspective)
+ # - https://docs.astral.sh/uv/concepts/projects/sync/#partial-installations (RE: `--no-install-project`)
+ #
+ # Note: In the `RUN` command, we use a "cache mount" (a feature of Docker) to cache production dependencies
+ #       across builds. This is a performance optimization technique shown in the `uv` docs.
+ # Reference:
+ # - https://docs.astral.sh/uv/guides/integration/docker/#caching (RE: the technique)
+ # - https://docs.docker.com/build/cache/optimize/#use-cache-mounts (RE: the feature)
+ # - https://docs.astral.sh/uv/reference/settings/#link-mode (RE: `UV_LINK_MODE`)
+ # - https://docs.astral.sh/uv/reference/cli/#uv-sync--no-install-project (RE: `--no-install-project`)
+ #
+ # Note: We use `--compile-bytecode` so that Python compiles `.py` files to `.pyc` files now,
+ #       instead of when the container is running. By default, `uv` defers this compilation
+ #       to "import time," whereas `pip` (by default) performs it at "install time" (like this).
+ #
+ ENV VIRTUAL_ENV="/venv"
+ RUN mkdir -p "${VIRTUAL_ENV}"
+ COPY ./pyproject.toml /code/pyproject.toml
+ COPY ./uv.lock /code/uv.lock
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     UV_LINK_MODE=copy uv sync --active --no-dev --no-install-project --compile-bytecode
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS fastapi
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy repository contents into image.
+ COPY . /code
+
+ # Install the project in editable mode.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     uv sync --active --no-dev
+
+ # Use Uvicorn to serve the FastAPI app on port 8000.
+ EXPOSE 8000
+ WORKDIR /code
+ CMD ["uv", "run", "--active", "uvicorn", "nmdc_runtime.api.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS dagster
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy repository contents into image.
+ #
+ # Note: This path (i.e. "/opt/dagster/lib/") is hard-coded in a few places in `nmdc_runtime/site/ops.py`. That's why
+ #       this image does not store the repository contents in `/code`, unlike the other images in this Dockerfile.
+ #
+ COPY . /opt/dagster/lib
+
+ # Install the project in editable mode.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /opt/dagster/lib && \
+     uv sync --active --no-dev
+
+ # Move Dagster configuration files to the place Dagster expects.
+ ENV DAGSTER_HOME="/opt/dagster/dagster_home/"
+ RUN mkdir -p "${DAGSTER_HOME}" && \
+     cp /opt/dagster/lib/nmdc_runtime/site/dagster.yaml "${DAGSTER_HOME}" && \
+     cp /opt/dagster/lib/nmdc_runtime/site/workspace.yaml "${DAGSTER_HOME}"
+
+ # Use Tini to run Dagit.
+ #
+ # Notes:
+ # - The port number (i.e. "3000") is hard-coded in `nmdc_runtime/site/entrypoint-dagit.sh`.
+ # - Dagster daemon (versus Dagit) can be launched by overriding the `ENTRYPOINT` defined here.
+ #
+ # Reference: https://github.com/krallin/tini
+ #
+ EXPOSE 3000
+ WORKDIR /opt/dagster/dagster_home/
+ ENTRYPOINT ["tini", "--", "../lib/nmdc_runtime/site/entrypoint-dagit.sh"]
+
+ # ────────────────────────────────────────────────────────────────────────────┐
+ FROM base AS test
+ # ────────────────────────────────────────────────────────────────────────────┘
+
+ # Copy all repository contents into image.
+ COPY . /code
+
+ # Install the project in editable mode, and install development dependencies.
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     cd /code && \
+     uv sync --active
+
+ # Make `wait-for-it.sh` executable.
+ RUN chmod +x /code/.docker/wait-for-it.sh
+
+ WORKDIR /code
+
+ # Ensure started container does not exit, so that a subsequent `docker exec` command can run tests.
+ # For an example `docker exec` command, see `Makefile`'s `run-test` target.
+ # Such a command should use `wait-for-it.sh` to run `pytest` no earlier than when the FastAPI server is accessible.
+ ENTRYPOINT ["tail", "-f", "/dev/null"]

nmdc_runtime/api/analytics.py
@@ -16,25 +16,42 @@ from toolz import merge
 
  from nmdc_runtime.api.db.mongo import get_mongo_db
 
+ # This is a queue of the "request descriptors" that we will eventually insert into the database.
  _requests = []
  _last_posted = datetime.now()
 
 
  def _post_requests(collection: str, requests_data: List[Dict], source: str):
+     """Inserts the specified request descriptors into the specified MongoDB collection."""
      mdb = get_mongo_db()
      mdb[collection].insert_many([merge(d, {"source": source}) for d in requests_data])
 
 
  def log_request(collection: str, request_data: Dict, source: str = "FastAPI"):
+     """Flushes the queue of request descriptors to the database if enough time has passed since the previous time."""
      global _requests, _last_posted
      _requests.append(request_data)
      now = datetime.now()
      # flush queue every minute at most
      if (now - _last_posted).total_seconds() > 60.0:
+         # Note: This use of threading is an attempt to avoid blocking the current thread
+         #       while performing the insertion(s).
+         #
+         # TODO: Is there a race condition here? If multiple requests arrive at approximately
+         #       the same time, is it possible that each one causes a different thread to be
+         #       started, each with a different (and possibly overlapping) set of requests to
+         #       insert?
+         #
+         # TODO: If the insertion fails, will the requests be lost?
+         #
+         # Note: The author of this function said it may have been a "standard" solution copied
+         #       from some documentation. Indeed, the comment at the top of this module contains
+         #       a link to code on which it was based.
+         #
          threading.Thread(
              target=_post_requests, args=(collection, _requests, source)
          ).start()
-         _requests = []
+         _requests = []  # empties the queue
          _last_posted = now
 
 
@@ -49,6 +66,9 @@ class Analytics(BaseHTTPMiddleware):
          start = time()
          response = await call_next(request)
 
+         # Use a fallback IP address value (currently an empty string) if we can't derive one from the request.
+         ip_address: str = "" if request.client is None else request.client.host
+
          # Build a dictionary that describes the incoming request.
          #
          # Note: `request.headers` is an instance of `MultiDict`. References:
@@ -57,7 +77,7 @@
          #
          request_data = {
              "hostname": request.url.hostname,
-             "ip_address": request.client.host,
+             "ip_address": ip_address,
              "path": request.url.path,
              "user_agent": request.headers.get("user-agent"),
              "method": request.method,

nmdc_runtime/api/core/idgen.py
@@ -89,7 +89,35 @@ def generate_ids(
      shoulder: str = "fk4",
  ) -> List[str]:
      r"""
-     TODO: Document this function.
+     Generate the specified number of identifiers, storing them in a MongoDB collection
+     whose name is derived from the specified Name-Assigning Authority (NAA) and Shoulder.
+
+     :param mdb: Handle to a MongoDB database
+     :param owner: String that will go in the "__ao" field of the identifier record.
+                   Callers will oftentimes set this to the name of a Runtime "site"
+                   (as in, a "site client" site, not a "Dagster" site).
+     :param populator: String that will go in the "who" field of the identifier record.
+                       Indicates "who generated this ID." Callers will oftentimes set
+                       this to the name of a Runtime "site" (as in, a "site client" site,
+                       not a "Dagster" site).
+     :param ns: Namespace (see Minter docs); e.g. "changesheets"
+     :param naa: Name-Assigning Authority (see Minter docs); e.g. "nmdc"
+     :param shoulder: String that will go in the "how" field (see Minter docs); e.g. "sys0"
+
+     This function was written the way it was in an attempt to mirror the ARK spec:
+     https://www.ietf.org/archive/id/draft-kunze-ark-41.html (found via: https://arks.org/specs/)
+
+     Deviations from the ARK spec include:
+     1. The inclusion of a typecode.
+        The inclusion of a typecode came out of discussions with team members,
+        who wanted identifiers to include some non-opaque substring that could be used
+        to determine what type of resource a given identifier refers to.
+     2. Making hyphens mandatory.
+        We decided to make the hyphens mandatory, whereas the spec says they are optional.
+        > "Hyphens are considered to be insignificant and are always ignored in ARKs."
+        > Reference: https://www.ietf.org/archive/id/draft-kunze-ark-41.html#name-character-repertoires
+        In our case, we require that users include an identifier's hyphens whenever
+        they are using that identifier.
      """
      collection = mdb.get_collection(collection_name(naa, shoulder))
      estimated_document_count = collection.estimated_document_count()
@@ -119,7 +147,9 @@
      if not_taken:
          # All attribute names beginning with "__a" are reserved...
          # https://github.com/jkunze/n2t-eggnog/blob/0f0f4c490e6dece507dba710d3557e29b8f6627e/egg#L1882
-         # XXX mongo is a pain with '.'s in field names, so not using e.g. "_.e" names.
+         # The author of this function opted to refrain from using property names beginning with "_.e",
+         # because he thought it would complicate MongoDB queries involving those properties, given that
+         # the "." is used as a field delimiter in MongoDB syntax (e.g. "foo.bar.baz").
          docs = [
              {
                  "@context": "https://n2t.net/e/n2t_apidoc.html#identifier-metadata",
@@ -145,9 +175,9 @@
 
 
  def generate_one_id(
-     mdb: MongoDatabase = None,
+     mdb: MongoDatabase,
      ns: str = "",
-     shoulder: str = "sys0",
+     shoulder: str = "sys0",  # "sys0" represents the Runtime
  ) -> str:
      """Generate unique Crockford Base32-encoded ID for mdb repository.
 
@@ -156,8 +186,8 @@
      """
      return generate_ids(
          mdb,
-         owner="_system",
-         populator="_system",
+         owner="_system",  # "_system" represents the Runtime
+         populator="_system",  # "_system" represents the Runtime
          number=1,
          ns=ns,
          naa="nmdc",

nmdc_runtime/api/db/mongo.py
@@ -10,7 +10,6 @@ import bson
  from jsonschema import Draft7Validator
  from nmdc_schema.nmdc import Database as NMDCDatabase
  from pymongo.errors import AutoReconnect, OperationFailure
- from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
  from refscan.lib.Finder import Finder
  from refscan.scanner import scan_outgoing_references
  from tenacity import wait_random_exponential, retry, retry_if_exception_type
@@ -83,17 +82,6 @@ def get_session_bound_mongo_db(session=None) -> MongoDatabase:
      return SessionBoundDatabase(mdb, session) if session is not None else mdb
 
 
- @lru_cache
- def get_async_mongo_db() -> AsyncIOMotorDatabase:
-     _client = AsyncIOMotorClient(
-         host=os.getenv("MONGO_HOST"),
-         username=os.getenv("MONGO_USERNAME"),
-         password=os.getenv("MONGO_PASSWORD"),
-         directConnection=True,
-     )
-     return _client[os.getenv("MONGO_DBNAME")]
-
-
  def get_nonempty_nmdc_schema_collection_names(mdb: MongoDatabase) -> Set[str]:
      """
      Returns the names of the collections that (a) exist in the database,

nmdc_runtime/api/endpoints/find.py
@@ -1,21 +1,17 @@
- from operator import itemgetter
- from typing import List, Annotated
+ import logging
+ from typing import Annotated
 
  from fastapi import APIRouter, Depends, Path, Query
- from jinja2 import Environment, PackageLoader, select_autoescape
- from nmdc_runtime.util import get_nmdc_jsonschema_dict
  from pymongo.database import Database as MongoDatabase
- from starlette.responses import HTMLResponse
- from toolz import merge, assoc_in
 
  from nmdc_schema.get_nmdc_view import ViewGetter
  from nmdc_runtime.api.core.util import raise404_if_none
  from nmdc_runtime.api.db.mongo import (
      get_mongo_db,
-     activity_collection_names,
      get_planned_process_collection_names,
      get_nonempty_nmdc_schema_collection_names,
  )
+ from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
  from nmdc_runtime.api.endpoints.util import (
      find_resources,
      strip_oid,
@@ -25,9 +21,8 @@ from nmdc_runtime.api.models.metadata import Doc
  from nmdc_runtime.api.models.util import (
      FindResponse,
      FindRequest,
-     entity_attributes_to_index,
  )
- from nmdc_runtime.util import get_class_names_from_collection_spec
+
 
  router = APIRouter()
 
@@ -178,133 +173,71 @@ def find_data_objects_for_study(
      is a list of the `DataObject`s associated with that `Biosample`.
      """
      biosample_data_objects = []
-     study = raise404_if_none(
-         mdb.study_set.find_one({"id": study_id}, ["id"]), detail="Study not found"
-     )
-
-     # Note: With nmdc-schema v10 (legacy schema), we used the field named `part_of` here.
-     # With nmdc-schema v11 (Berkeley schema), we use the field named `associated_studies` here.
-     biosamples = mdb.biosample_set.find({"associated_studies": study["id"]}, ["id"])
-     biosample_ids = [biosample["id"] for biosample in biosamples]
-
-     # SchemaView interface to NMDC Schema
-     nmdc_view = ViewGetter()
-     nmdc_sv = nmdc_view.get_view()
-     dg_descendants = [
-         (f"nmdc:{t}" if ":" not in t else t)
-         for t in nmdc_sv.class_descendants("DataGeneration")
-     ]
-
-     def collect_data_objects(doc_ids, collected_objects, unique_ids):
-         """Helper function to collect data objects from `has_input` and `has_output` references."""
-         for doc_id in doc_ids:
-             # Check if this is a DataObject by looking at the document's type directly
-             doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
-             if (
-                 doc
-                 and doc.get("type") == "nmdc:DataObject"
-                 and doc_id not in unique_ids
-             ):
-                 data_obj = mdb.data_object_set.find_one({"id": doc_id})
-                 if data_obj:
-                     collected_objects.append(strip_oid(data_obj))
-                     unique_ids.add(doc_id)
-
-     # Another way in which DataObjects can be related to Biosamples is through the
-     # `was_informed_by` key/slot. We need to link records from the `workflow_execution_set`
-     # collection that are "informed" by the same DataGeneration records that created
-     # the outputs above. Then we need to get additional DataObject records that are
-     # created by this linkage.
-     def process_informed_by_docs(doc, collected_objects, unique_ids):
-         """Process documents linked by `was_informed_by` and collect relevant data objects."""
-         # Note: As of nmdc-schema 11.9.0, the `was_informed_by` field, if defined,
-         #       will contain a list of strings. In MongoDB, the `{k: v}` filter
-         #       can be used to check whether either (a) the value of field `f` is
-         #       an array containing `v` as one of its elements, or (b) the value
-         #       of field `f` is exactly equal to `v`. We rely on behavior (a) here.
-         informed_by_docs = mdb.workflow_execution_set.find(
-             {"was_informed_by": doc["id"]}
-         )
-         for informed_doc in informed_by_docs:
-             collect_data_objects(
-                 informed_doc.get("has_input", []), collected_objects, unique_ids
-             )
-             collect_data_objects(
-                 informed_doc.get("has_output", []), collected_objects, unique_ids
-             )
 
-     biosample_data_objects = []
+     # Respond with an error if the specified `Study` does not exist.
+     # Note: We project only the `_id` field, to minimize data transfer.
+     raise404_if_none(
+         mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
+         detail="Study not found",
+     )
 
-     for biosample_id in biosample_ids:
-         current_ids = [biosample_id]
-         collected_data_objects = []
-         unique_ids = set()
-
-         # Iterate over records in the `alldocs` collection. Look for
-         # records that have the given biosample_id as value on the
-         # `has_input` key/slot. The retrieved documents might also have a
-         # `has_output` key/slot associated with them. Get the value of the
-         # `has_output` key and check if it's type is `nmdc:DataObject`. If
-         # it's not, repeat the process till it is.
-         while current_ids:
-             new_current_ids = []
-             for current_id in current_ids:
-                 # Query to find all documents with current_id as the value on
-                 # `has_input` slot
-                 for doc in mdb.alldocs.find({"has_input": current_id}):
-                     has_output = doc.get("has_output", [])
-
-                     # Process `DataGeneration` type documents linked by `was_informed_by`
-                     if not has_output and any(
-                         t in dg_descendants for t in doc.get("_type_and_ancestors", [])
-                     ):
-                         process_informed_by_docs(
-                             doc, collected_data_objects, unique_ids
-                         )
-                         continue
-
-                     collect_data_objects(has_output, collected_data_objects, unique_ids)
-                     # Add non-DataObject outputs to continue the chain
-                     for op in has_output:
-                         doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                         if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                             new_current_ids.append(op)
-
-                     if any(
-                         t in dg_descendants for t in doc.get("_type_and_ancestors", [])
-                     ):
-                         process_informed_by_docs(
-                             doc, collected_data_objects, unique_ids
-                         )
-
-                 # Also check if current_id is a DataObject that serves as input to other processes
-                 current_doc_type = mdb.alldocs.find_one({"id": current_id}, {"type": 1})
-                 if (
-                     current_doc_type
-                     and current_doc_type.get("type") == "nmdc:DataObject"
-                 ):
-                     # Find all documents in alldocs that have this DataObject as input
-                     for doc in mdb.alldocs.find({"has_input": current_id}):
-                         has_output = doc.get("has_output", [])
-                         # Process outputs from these documents
-                         collect_data_objects(
-                             has_output, collected_data_objects, unique_ids
-                         )
-                         # Add non-DataObject outputs to continue the chain
-                         for op in has_output:
-                             doc_check = mdb.alldocs.find_one({"id": op}, {"type": 1})
-                             if doc_check and doc_check.get("type") != "nmdc:DataObject":
-                                 new_current_ids.append(op)
-
-             current_ids = new_current_ids
-
-         if collected_data_objects:
-             result = {
+     # Use the `get_linked_instances` function—which is the function that
+     # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
+     # the `Biosample`s that are downstream of the specified `Study`.
+     #
+     # Note: The `get_linked_instances` function requires that a `max_page_size`
+     #       integer argument be passed in. In our case, we want to get _all_ of
+     #       the instances. Python has no "infinity" integer; and, even if it did,
+     #       if we were to specify too large of an integer, we'd get this error:
+     #       > "OverflowError: MongoDB can only handle up to 8-byte ints"
+     #       So, as a workaround, we pass in a number that is large enough that we
+     #       think it will account for all cases in practice (e.g., a study having
+     #       a trillion biosamples or a trillion data objects).
+     #
+     # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
+     #
+     large_max_page_size: int = 1_000_000_000_000
+     linked_biosamples_result: dict = get_linked_instances(
+         ids=[study_id],
+         types=["nmdc:Biosample"],
+         hydrate=False,  # we'll only use their `id` values
+         page_token=None,
+         max_page_size=large_max_page_size,
+         mdb=mdb,
+     )
+     biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
+     logging.debug(f"Found {len(biosample_ids)} Biosamples for Study {study_id}")
+
+     # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
+     data_objects_by_biosample_id = {}
+     linked_data_objects_result: dict = get_linked_instances(
+         ids=biosample_ids,
+         types=["nmdc:DataObject"],
+         hydrate=True,  # we want the full `DataObject` documents
+         page_token=None,
+         max_page_size=large_max_page_size,
+         mdb=mdb,
+     )
+     for data_object in linked_data_objects_result.get("resources", []):
+         upstream_biosample_id = data_object["_downstream_of"][0]
+         if upstream_biosample_id not in data_objects_by_biosample_id.keys():
+             data_objects_by_biosample_id[upstream_biosample_id] = []
+
+         # Strip away the metadata fields injected by `get_linked_instances()`.
+         data_object.pop("_upstream_of", None)
+         data_object.pop("_downstream_of", None)
+         data_objects_by_biosample_id[upstream_biosample_id].append(data_object)
+
+     # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
+     # i.e., into the format returned by the initial version of this API endpoint,
+     # which did not use the `get_linked_instances` function under the hood.
+     for biosample_id, data_objects in data_objects_by_biosample_id.items():
+         biosample_data_objects.append(
+             {
                  "biosample_id": biosample_id,
-                 "data_objects": collected_data_objects,
+                 "data_objects": data_objects,
              }
-         biosample_data_objects.append(result)
-
+         )
      return biosample_data_objects
 
 
@@ -699,96 +632,3 @@ def find_related_objects_for_workflow_execution(
      }
 
      return response
-
-
- jinja_env = Environment(
-     loader=PackageLoader("nmdc_runtime"), autoescape=select_autoescape()
- )
-
-
- def attr_index_sort_key(attr):
-     return "_" if attr == "id" else attr
-
-
- def documentation_links(jsonschema_dict, collection_names) -> dict:
-     """This function constructs a hierarchical catalog of (links to) schema classes and their slots.
-
-     The returned dictionary `doc_links` is used as input to the Jinja template `nmdc_runtime/templates/search.html`
-     in order to support user experience for `GET /search`.
-     """
-
-     # Note: All documentation URLs generated within this function will begin with this.
-     base_url = r"https://w3id.org/nmdc"
-
-     # Initialize dictionary in which to associate key/value pairs via the following for loop.
-     doc_links = {}
-
-     for collection_name in collection_names:
-         # Since a given collection can be associated with multiple classes, the `doc_links` dictionary
-         # will have a _list_ of values for each collection.
-         class_descriptors = []
-
-         # If the collection name is one that the `search.html` page has a dedicated section for,
-         # give it a top-level key; otherwise, nest it under `activity_set`.
-         key_hierarchy: List[str] = ["activity_set", collection_name]
-         if collection_name in ("biosample_set", "study_set", "data_object_set"):
-             key_hierarchy = [collection_name]
-
-         # Process the name of each class that the schema associates with this collection.
-         collection_spec = jsonschema_dict["$defs"]["Database"]["properties"][
-             collection_name
-         ]
-         class_names = get_class_names_from_collection_spec(collection_spec)
-         for idx, class_name in enumerate(class_names):
-             # Make a list of dictionaries, each of which describes one attribute of this class.
-             entity_attrs = list(jsonschema_dict["$defs"][class_name]["properties"])
-             entity_attr_descriptors = [
-                 {"url": f"{base_url}/{attr_name}", "attr_name": attr_name}
-                 for attr_name in entity_attrs
-             ]
-
-             # Make a dictionary describing this class.
-             class_descriptor = {
-                 "collection_name": collection_name,
-                 "entity_url": f"{base_url}/{class_name}",
-                 "entity_name": class_name,
-                 "entity_attrs": sorted(
-                     entity_attr_descriptors, key=itemgetter("attr_name")
-                 ),
-             }
-
-             # Add that descriptor to this collection's list of class descriptors.
-             class_descriptors.append(class_descriptor)
-
-         # Add a key/value pair describing this collection to the `doc_links` dictionary.
-         # Reference: https://toolz.readthedocs.io/en/latest/api.html#toolz.dicttoolz.assoc_in
-         doc_links = assoc_in(doc_links, keys=key_hierarchy, value=class_descriptors)
-
-     return doc_links
-
-
- @router.get("/search", response_class=HTMLResponse, include_in_schema=False)
- def search_page(
-     mdb: MongoDatabase = Depends(get_mongo_db),
- ):
-     template = jinja_env.get_template("search.html")
-     indexed_entity_attributes = merge(
-         {n: {"id"} for n in activity_collection_names(mdb)},
-         {
-             coll: sorted(attrs | {"id"}, key=attr_index_sort_key)
-             for coll, attrs in entity_attributes_to_index.items()
-         },
-     )
-     doc_links = documentation_links(
-         get_nmdc_jsonschema_dict(),
-         (
-             list(activity_collection_names(mdb))
-             + ["biosample_set", "study_set", "data_object_set"]
-         )
-     )
-     html_content = template.render(
-         activity_collection_names=sorted(activity_collection_names(mdb)),
-         indexed_entity_attributes=indexed_entity_attributes,
-         doc_links=doc_links,
-     )
-     return HTMLResponse(content=html_content, status_code=200)
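
A minimal sketch of exercising the `get_linked_instances` helper that the rewritten `find_data_objects_for_study` handler above now relies on, assuming a pymongo `Database` handle and a hypothetical study ID; the keyword arguments mirror the call shown in that hunk.

    from nmdc_runtime.api.db.mongo import get_mongo_db
    from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances

    mdb = get_mongo_db()

    # Find Biosamples linked (downstream) to a given Study; with `hydrate=False`,
    # only lightweight records come back, from which we read the `id` field.
    result = get_linked_instances(
        ids=["nmdc:sty-00-000001"],  # hypothetical study ID
        types=["nmdc:Biosample"],
        hydrate=False,
        page_token=None,
        max_page_size=1_000,
        mdb=mdb,
    )
    biosample_ids = [doc["id"] for doc in result.get("resources", [])]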