nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131):
  1. nmdc_runtime/Dockerfile +167 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +208 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +788 -0
  10. nmdc_runtime/api/core/util.py +109 -0
  11. nmdc_runtime/api/db/mongo.py +435 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +143 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +270 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +796 -0
  31. nmdc_runtime/api/endpoints/workflows.py +353 -0
  32. nmdc_runtime/api/entrypoint.sh +7 -0
  33. nmdc_runtime/api/main.py +425 -0
  34. nmdc_runtime/api/middleware.py +43 -0
  35. nmdc_runtime/api/models/capability.py +14 -0
  36. nmdc_runtime/api/models/id.py +92 -0
  37. nmdc_runtime/api/models/job.py +37 -0
  38. nmdc_runtime/api/models/lib/helpers.py +78 -0
  39. nmdc_runtime/api/models/metadata.py +11 -0
  40. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  41. nmdc_runtime/api/models/object.py +180 -0
  42. nmdc_runtime/api/models/object_type.py +20 -0
  43. nmdc_runtime/api/models/operation.py +66 -0
  44. nmdc_runtime/api/models/query.py +246 -0
  45. nmdc_runtime/api/models/query_continuation.py +111 -0
  46. nmdc_runtime/api/models/run.py +161 -0
  47. nmdc_runtime/api/models/site.py +87 -0
  48. nmdc_runtime/api/models/trigger.py +13 -0
  49. nmdc_runtime/api/models/user.py +140 -0
  50. nmdc_runtime/api/models/util.py +260 -0
  51. nmdc_runtime/api/models/workflow.py +15 -0
  52. nmdc_runtime/api/openapi.py +178 -0
  53. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  54. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  55. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  56. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  57. nmdc_runtime/config.py +7 -8
  58. nmdc_runtime/minter/adapters/repository.py +22 -2
  59. nmdc_runtime/minter/config.py +2 -0
  60. nmdc_runtime/minter/domain/model.py +55 -1
  61. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  62. nmdc_runtime/mongo_util.py +1 -2
  63. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  64. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  65. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  66. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  67. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  68. nmdc_runtime/site/dagster.yaml +53 -0
  69. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  70. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  71. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  72. nmdc_runtime/site/export/ncbi_xml.py +633 -13
  73. nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
  74. nmdc_runtime/site/graphs.py +8 -22
  75. nmdc_runtime/site/ops.py +147 -181
  76. nmdc_runtime/site/repository.py +2 -112
  77. nmdc_runtime/site/resources.py +16 -3
  78. nmdc_runtime/site/translation/gold_translator.py +4 -12
  79. nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
  80. nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
  81. nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
  82. nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
  83. nmdc_runtime/site/translation/translator.py +63 -1
  84. nmdc_runtime/site/util.py +8 -3
  85. nmdc_runtime/site/validation/util.py +10 -5
  86. nmdc_runtime/site/workspace.yaml +13 -0
  87. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  88. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  89. nmdc_runtime/static/README.md +5 -0
  90. nmdc_runtime/static/favicon.ico +0 -0
  91. nmdc_runtime/util.py +90 -48
  92. nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
  93. nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
  94. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
  95. nmdc_runtime/containers.py +0 -14
  96. nmdc_runtime/core/db/Database.py +0 -15
  97. nmdc_runtime/core/exceptions/__init__.py +0 -23
  98. nmdc_runtime/core/exceptions/base.py +0 -47
  99. nmdc_runtime/core/exceptions/token.py +0 -13
  100. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  101. nmdc_runtime/domain/users/userSchema.py +0 -37
  102. nmdc_runtime/domain/users/userService.py +0 -14
  103. nmdc_runtime/infrastructure/database/db.py +0 -3
  104. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  105. nmdc_runtime/lib/__init__.py +0 -1
  106. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  107. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  108. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  109. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  110. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  111. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  112. nmdc_runtime/site/drsobjects/registration.py +0 -131
  113. nmdc_runtime/site/translation/emsl.py +0 -43
  114. nmdc_runtime/site/translation/gold.py +0 -53
  115. nmdc_runtime/site/translation/jgi.py +0 -32
  116. nmdc_runtime/site/translation/util.py +0 -132
  117. nmdc_runtime/site/validation/jgi.py +0 -43
  118. nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
  119. nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
  120. nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
  121. /nmdc_runtime/{client → api}/__init__.py +0 -0
  122. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  123. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  124. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  125. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  126. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  127. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  128. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  129. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  130. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
  131. {nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,634 @@
1
+ import logging
2
+ from typing import Annotated
3
+
4
+ from fastapi import APIRouter, Depends, Path, Query
5
+ from pymongo.database import Database as MongoDatabase
6
+
7
+ from nmdc_schema.get_nmdc_view import ViewGetter
8
+ from nmdc_runtime.api.core.util import raise404_if_none
9
+ from nmdc_runtime.api.db.mongo import (
10
+ get_mongo_db,
11
+ get_planned_process_collection_names,
12
+ get_nonempty_nmdc_schema_collection_names,
13
+ )
14
+ from nmdc_runtime.api.endpoints.nmdcschema import get_linked_instances
15
+ from nmdc_runtime.api.endpoints.util import (
16
+ find_resources,
17
+ strip_oid,
18
+ find_resources_spanning,
19
+ )
20
+ from nmdc_runtime.api.models.metadata import Doc
21
+ from nmdc_runtime.api.models.util import (
22
+ FindResponse,
23
+ FindRequest,
24
+ )
25
+
26
+
27
# Router collecting all "find"-style endpoints in this module; it is mounted
# onto the main FastAPI application elsewhere in the project.
router = APIRouter()
28
+
29
+
30
@router.get(
    "/studies",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_studies(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The `GET /studies` endpoint is a general purpose way to retrieve NMDC studies based on parameters provided by the user.
    Studies can be filtered and sorted based on the applicable [Study attributes](https://microbiomedata.github.io/nmdc-schema/Study/).
    """
    # All filtering, sorting, and pagination is handled by the shared helper;
    # this endpoint only names the collection to search.
    target_collection = "study_set"
    return find_resources(req, mdb, target_collection)
44
+
45
+
46
@router.get(
    "/studies/{study_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_study_by_id(
    study_id: Annotated[
        str,
        Path(
            title="Study ID",
            description="The `id` of the `Study` you want to find.\n\n_Example_: `nmdc:sty-11-abc123`",
            examples=["nmdc:sty-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the study identifier is known, a study can be retrieved directly using the GET /studies/{study_id} endpoint.
    \n Note that only one study can be retrieved at a time using this method.
    """
    # Fetch the single matching document; an HTTP 404 is raised when no
    # study has the given `id`.
    study_doc = mdb["study_set"].find_one({"id": study_id})
    return strip_oid(raise404_if_none(study_doc))
67
+
68
+
69
@router.get(
    "/biosamples",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_biosamples(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The GET /biosamples endpoint is a general purpose way to retrieve biosample metadata using user-provided filter and sort criteria.
    Please see the applicable [Biosample attributes](https://microbiomedata.github.io/nmdc-schema/Biosample/).
    """
    # Delegate the query work to the shared resource finder, scoped to the
    # biosample collection.
    target_collection = "biosample_set"
    return find_resources(req, mdb, target_collection)
83
+
84
+
85
@router.get(
    "/biosamples/{sample_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_biosample_by_id(
    sample_id: Annotated[
        str,
        Path(
            title="Biosample ID",
            description="The `id` of the `Biosample` you want to find.\n\n_Example_: `nmdc:bsm-11-abc123`",
            examples=["nmdc:bsm-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the biosample identifier is known, a biosample can be retrieved directly using the GET /biosamples/{sample_id}.
    \n Note that only one biosample metadata record can be retrieved at a time using this method.
    """
    # Look up the biosample document; respond with HTTP 404 when absent.
    biosample_doc = mdb["biosample_set"].find_one({"id": sample_id})
    return strip_oid(raise404_if_none(biosample_doc))
106
+
107
+
108
@router.get(
    "/data_objects",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_data_objects(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    To retrieve metadata about NMDC data objects (such as files, records, or omics data) the GET /data_objects endpoint
    may be used along with various parameters. Please see the applicable [Data Object](https://microbiomedata.github.io/nmdc-schema/DataObject/)
    attributes.
    """
    # The shared finder applies the request's filter/sort/pagination against
    # the data-object collection.
    target_collection = "data_object_set"
    return find_resources(req, mdb, target_collection)
123
+
124
+
125
@router.get(
    "/data_objects/study/{study_id}",
    response_model_exclude_unset=True,
    #
    # Customize the name that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the name of the API endpoint from the name of the decorated function. Here, we
    # are using a custom name that matches the derived one, except that the custom one ends with `(delayed)`.
    #
    # Note: Each word in the name will appear capitalized on Swagger UI.
    #
    name="Find data objects for study (delayed)",
    #
    # Customize the description that Swagger UI displays for the API endpoint.
    #
    # Note: By default, FastAPI derives the description of the API endpoint from the docstring of the decorated
    # function. Here, we are using a custom description that was written for an audience of API consumers,
    # as opposed to the derived description that was written for an audience of `nmdc-runtime` developers.
    #
    description=(
        "Gets all `DataObject`s related to all `Biosample`s related to the specified `Study`."
        "<br /><br />"  # newlines
        "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
        "with respect to the NMDC database. That's because the cache that underlies this API "
        "endpoint gets refreshed to match the NMDC database once every 24 hours."
    ),
)
def find_data_objects_for_study(
    study_id: Annotated[
        str,
        Path(
            title="Study ID",
            description="""The `id` of the `Study` having `Biosample`s with which you want to find
            associated `DataObject`s.\n\n_Example_: `nmdc:sty-11-abc123`""",
            examples=["nmdc:sty-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """This API endpoint is used to retrieve data objects associated with
    all the biosamples associated with a given study. This endpoint makes
    use of the `alldocs` collection for its implementation.

    :param study_id: NMDC study id for which data objects are to be retrieved
    :param mdb: PyMongo connection, defaults to Depends(get_mongo_db)
    :return: List of dictionaries, each of which has a `biosample_id` entry
        and a `data_objects` entry. The value of the `biosample_id` entry
        is the `Biosample`'s `id`. The value of the `data_objects` entry
        is a list of the `DataObject`s associated with that `Biosample`.
    """
    biosample_data_objects = []

    # Respond with an error if the specified `Study` does not exist.
    # Note: We project only the `_id` field, to minimize data transfer.
    raise404_if_none(
        mdb["study_set"].find_one({"id": study_id}, projection={"_id": 1}),
        detail="Study not found",
    )

    # Use the `get_linked_instances` function—which is the function that
    # underlies the `/nmdcschema/linked_instances` API endpoint—to get all
    # the `Biosample`s that are downstream of the specified `Study`.
    #
    # Note: The `get_linked_instances` function requires that a `max_page_size`
    # integer argument be passed in. In our case, we want to get _all_ of
    # the instances. Python has no "infinity" integer; and, even if it did,
    # if we were to specify too large of an integer, we'd get this error:
    # > "OverflowError: MongoDB can only handle up to 8-byte ints"
    # So, as a workaround, we pass in a number that is large enough that we
    # think it will account for all cases in practice (e.g., a study having
    # a trillion biosamples or a trillion data objects).
    #
    # TODO: Update the `get_linked_instances` function to optionally impose _no_ limit.
    #
    large_max_page_size: int = 1_000_000_000_000
    linked_biosamples_result: dict = get_linked_instances(
        ids=[study_id],
        types=["nmdc:Biosample"],
        hydrate=False,  # we'll only use their `id` values
        page_token=None,
        max_page_size=large_max_page_size,
        mdb=mdb,
    )
    biosample_ids = [d["id"] for d in linked_biosamples_result.get("resources", [])]
    # Note: Lazy %-style args avoid building the message when DEBUG is disabled.
    logging.debug("Found %d Biosamples for Study %s", len(biosample_ids), study_id)

    # Get all the `DataObject`s that are downstream from any of those `Biosample`s.
    data_objects_by_biosample_id = {}
    linked_data_objects_result: dict = get_linked_instances(
        ids=biosample_ids,
        types=["nmdc:DataObject"],
        hydrate=True,  # we want the full `DataObject` documents
        page_token=None,
        max_page_size=large_max_page_size,
        mdb=mdb,
    )
    for data_object in linked_data_objects_result.get("resources", []):
        upstream_biosample_id = data_object["_downstream_of"][0]

        # Strip away the metadata fields injected by `get_linked_instances()`.
        data_object.pop("_upstream_of", None)
        data_object.pop("_downstream_of", None)

        # Group the `DataObject` under its upstream `Biosample`'s `id`,
        # creating the group's list on first encounter.
        data_objects_by_biosample_id.setdefault(upstream_biosample_id, []).append(
            data_object
        )

    # Convert the `data_objects_by_biosample_id` dictionary into a list of dicts;
    # i.e., into the format returned by the initial version of this API endpoint,
    # which did not use the `get_linked_instances` function under the hood.
    for biosample_id, data_objects in data_objects_by_biosample_id.items():
        biosample_data_objects.append(
            {
                "biosample_id": biosample_id,
                "data_objects": data_objects,
            }
        )
    return biosample_data_objects
242
+
243
+
244
@router.get(
    "/data_objects/{data_object_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_data_object_by_id(
    data_object_id: Annotated[
        str,
        Path(
            title="DataObject ID",
            description="The `id` of the `DataObject` you want to find.\n\n_Example_: `nmdc:dobj-11-abc123`",
            examples=["nmdc:dobj-11-abc123"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    If the data object identifier is known, the metadata can be retrieved using the GET /data_objects/{data_object_id} endpoint.
    \n Note that only one data object metadata record may be retrieved at a time using this method.
    """
    # Retrieve the single matching document, translating "not found" into an
    # HTTP 404 response.
    data_object_doc = mdb["data_object_set"].find_one({"id": data_object_id})
    return strip_oid(raise404_if_none(data_object_doc))
267
+
268
+
269
@router.get(
    "/planned_processes",
    response_model=FindResponse,
    response_model_exclude_unset=True,
)
def find_planned_processes(
    req: Annotated[FindRequest, Query()],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """
    The GET /planned_processes endpoint is a general way to fetch metadata about various planned processes (e.g.
    workflow execution, material processing, etc.). Any "slot" (a.k.a. attribute) for
    [`PlannedProcess`](https://w3id.org/nmdc/PlannedProcess) may be used in the filter
    and sort parameters, including attributes of subclasses of *PlannedProcess*.

    For example, attributes used in subclasses such as [`Extraction`](https://w3id.org/nmdc/Extraction)
    (subclass of *PlannedProcess*), can be used as input criteria for the filter and sort parameters of this endpoint.
    """
    # Search only the planned-process collections that actually contain
    # documents (set intersection of the two name sets).
    searchable_collections = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )
    return find_resources_spanning(req, mdb, searchable_collections)
293
+
294
+
295
@router.get(
    "/planned_processes/{planned_process_id}",
    response_model=Doc,
    response_model_exclude_unset=True,
)
def find_planned_process_by_id(
    planned_process_id: Annotated[
        str,
        Path(
            title="PlannedProcess ID",
            description="The `id` of the document that represents an instance of "
            "the `PlannedProcess` class or any of its subclasses",
            examples=[r"nmdc:wfmag-11-00jn7876.1"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    r"""
    Returns the document that has the specified `id` and represents an instance of the `PlannedProcess` class
    or any of its subclasses. If no such document exists, returns an HTTP 404 response.
    """
    # Note: Empty collections are excluded as a performance optimization
    # (we already know they don't contain the document).
    candidate_collections = (
        get_planned_process_collection_names()
        & get_nonempty_nmdc_schema_collection_names(mdb)
    )

    # Search each candidate collection in turn, returning as soon as a
    # document having the specified `id` turns up.
    for collection_name in candidate_collections:
        matching_doc = mdb[collection_name].find_one({"id": planned_process_id})
        if matching_doc is not None:
            return strip_oid(matching_doc)

    # No collection contained the document, so respond with HTTP 404.
    return raise404_if_none(None)
333
+
334
+
335
@router.get(
    "/workflow_executions/{workflow_execution_id}/related_resources",
    response_model_exclude_unset=True,
    name="Find resources related to the specified WorkflowExecution",
    description=(
        "Finds `DataObject`s, `Biosample`s, `Study`s, and other `WorkflowExecution`s "
        "related to the specified `WorkflowExecution`."
        "<br /><br />"  # newlines
        "This endpoint returns a JSON object that contains "
        "(a) the specified `WorkflowExecution`, "
        "(b) all the `DataObject`s that are inputs to — or outputs from — the specified `WorkflowExecution`, "
        "(c) all the `Biosample`s that were inputs to those `DataGeneration`s, "
        "(d) all the `Study`s with which those `Biosample`s are associated, and "
        "(e) all the other `WorkflowExecution`s that are part of the same processing pipeline "
        "as the specified `WorkflowExecution`."
        "<br /><br />"  # newlines
        "**Note:** The data returned by this API endpoint can be up to 24 hours out of date "
        "with respect to the NMDC database. That's because the cache that underlies this API "
        "endpoint gets refreshed to match the NMDC database once every 24 hours."
    ),
)
def find_related_objects_for_workflow_execution(
    workflow_execution_id: Annotated[
        str,
        Path(
            title="Workflow Execution ID",
            description=(
                "The `id` of the `WorkflowExecution` to which you want to find related resources."
                "\n\n"
                "_Example_: `nmdc:wfmgan-11-wdx72h27.1`"
            ),
            examples=["nmdc:wfmgan-11-wdx72h27.1"],
        ),
    ],
    mdb: MongoDatabase = Depends(get_mongo_db),
):
    """This API endpoint retrieves resources related to the specified WorkflowExecution,
    including DataObjects that are inputs to — or outputs from — it, other WorkflowExecution
    instances that are part of the same pipeline, and related Biosamples and Studies.

    :param workflow_execution_id: id of workflow_execution_set instance for which related objects are to be retrieved
    :param mdb: A PyMongo `Database` instance that can be used to access the MongoDB database
    :return: Dictionary with data_objects, related_workflow_executions, biosamples, and studies lists
    """
    # Get the specified `WorkflowExecution` document from the database.
    workflow_execution = raise404_if_none(
        mdb.workflow_execution_set.find_one({"id": workflow_execution_id}),
        detail="Workflow execution not found",
    )

    # Create empty lists that will contain the related documents we find.
    # Note: The nested helper functions below close over these lists and
    # mutate them in place as the traversal proceeds.
    data_objects = []
    related_workflow_executions = []
    biosamples = []
    studies = []

    # Create empty sets that we'll use to avoid processing a given document multiple times.
    unique_data_object_ids = set()
    unique_workflow_execution_ids = set()
    unique_biosample_ids = set()
    unique_study_ids = set()

    # Add the ID of the specified `WorkflowExecution` document, to the set of unique `WorkflowExecution` IDs.
    unique_workflow_execution_ids.add(workflow_execution_id)

    # Get a `SchemaView` that is bound to the NMDC schema.
    nmdc_view = ViewGetter()
    nmdc_sv = nmdc_view.get_view()
    # Build the list of CURIEs for `DataGeneration` and all its schema
    # descendants, prefixing bare class names with `nmdc:`.
    dg_descendants = [
        (f"nmdc:{t}" if ":" not in t else t)
        for t in nmdc_sv.class_descendants("DataGeneration")
    ]

    def add_data_object(doc_id: str) -> bool:
        r"""
        Helper function that adds the `DataObject` having the specified `id`
        to our list of `DataObjects`, if it isn't already in there.

        :return: True if a new `DataObject` was added; False otherwise.
        """
        # Check if this is a DataObject by looking at the document's type directly
        doc = mdb.alldocs.find_one({"id": doc_id}, {"type": 1})
        if (
            doc
            and doc.get("type") == "nmdc:DataObject"
            and doc_id not in unique_data_object_ids
        ):
            data_obj = mdb.data_object_set.find_one({"id": doc_id})
            if data_obj:
                data_objects.append(strip_oid(data_obj))
                unique_data_object_ids.add(doc_id)
                return True
        return False

    def add_workflow_execution(wfe: dict) -> None:
        r"""
        Helper function that adds the specified `WorkflowExecution`
        to our list of `WorkflowExecution`s, if it isn't already in there;
        and adds its related `DataObjects` to our list of `DataObject`s.
        """
        if wfe["id"] not in unique_workflow_execution_ids:
            related_workflow_executions.append(strip_oid(wfe))
            unique_workflow_execution_ids.add(wfe["id"])

            # Add data objects related to this workflow execution.
            ids_of_inputs = wfe.get("has_input", [])
            ids_of_outputs = wfe.get("has_output", [])
            for doc_id in ids_of_inputs + ids_of_outputs:
                add_data_object(doc_id)

    def add_biosample(biosample_id: str) -> bool:
        r"""
        Helper function that adds the specified `Biosample`
        to our list of `Biosample`s, if it isn't already in there;
        and adds its related `Study`s to our list of `Study`s.

        :return: True if a new `Biosample` was added; False otherwise.
        """
        if biosample_id not in unique_biosample_ids:
            biosample = mdb.biosample_set.find_one({"id": biosample_id})
            if biosample:
                biosamples.append(strip_oid(biosample))
                unique_biosample_ids.add(biosample_id)

                # Add studies related to this biosample.
                for study_id in biosample.get("associated_studies", []):
                    add_study(study_id)
                return True
        return False

    def add_study(study_id: str) -> bool:
        r"""
        Helper function that adds the specified `Study`
        to our list of `Study`s, if it isn't already in there.

        :return: True if a new `Study` was added; False otherwise.
        """
        if study_id not in unique_study_ids:
            study = mdb.study_set.find_one({"id": study_id})
            if study:
                studies.append(strip_oid(study))
                unique_study_ids.add(study_id)
                return True
        return False

    def find_biosamples_recursively(start_id: str) -> None:
        r"""
        Recursive helper function that traverses the database in search of relevant `Biosample`s.

        This function searches for biosamples starting from the "input" to a DataGeneration record by
        traversing the data provenance graph – which is the bipartite graph formed by the
        `has_input` / `has_output` relationships in the schema. It uses the ids asserted on
        `has_input` and `has_output` slots on documents in the `alldocs` collection to tie related documents
        in the chain together.

        Note: The function uses an internal nested recursive function (`process_id()`) to avoid cycles
        in the graph and tracks processed IDs to prevent infinite recursion.

        :param start_id: The ID of the document to start the search from. This will typically
            be the input to a `DataGeneration` record, which may be a `Biosample` directly or a
            `ProcessedSample`.
        """
        # Create an empty set we can use to track the `id`s of documents we've already processed,
        # in order to avoid processing the same documents multiple times (i.e. cycling in the graph).
        processed_ids = set()

        def process_id(current_id: str) -> None:
            r"""
            Recursive helper function that processes a single document ID and follows
            connections to discover related biosamples.

            This function:
            1. Checks if the current ID is already processed to prevent cycles
            2. Directly adds the document if it's a `Biosample`
            3. For non-Biosample documents (type of `PlannedProcess`), it:
               - Processes input (`has_input`) IDs of the current document
               - Finds documents that have the current ID as output (`has_output`) and processes their inputs

            This recursive approach allows traversing the provenance graph in both directions.

            :param current_id: The ID of the document to process in this recursive step
            """
            if current_id in processed_ids:
                return

            processed_ids.add(current_id)

            # If it's a `Biosample`, i.e., "type" == "nmdc:Biosample"
            doc = mdb.alldocs.find_one({"id": current_id}, {"type": 1})
            if doc and doc.get("type") == "nmdc:Biosample":
                add_biosample(current_id)
                return

            # Find the document with this ID to see what it is
            current_doc = mdb.alldocs.find_one({"id": current_id})
            if current_doc:
                # Check if this document has inputs - if so, process them
                for input_id in current_doc.get("has_input", []):
                    if input_id not in processed_ids:
                        process_id(input_id)

            # Also find documents that have this ID as an output
            # This is the key to walking backward through the chain
            for doc in mdb.alldocs.find({"has_output": current_id}):
                # Process all inputs of this document
                for input_id in doc.get("has_input", []):
                    if input_id not in processed_ids:
                        process_id(input_id)

        # Start the recursive search
        process_id(start_id)

    # Get the DataObject `id`s that are inputs (`has_input`) to and
    # outputs (`has_output`) from the user-specified WorkflowExecution.
    input_ids = workflow_execution.get("has_input", [])
    output_ids = workflow_execution.get("has_output", [])

    # Add those DataObjects to our list of DataObjects.
    for doc_id in input_ids + output_ids:
        add_data_object(doc_id)

    # Find WorkflowExecutions whose inputs are outputs of this WorkflowExecution.
    # Add those to our list of related WorkflowExecutions.
    for output_id in output_ids:
        related_wfes = mdb.workflow_execution_set.find({"has_input": output_id})
        for wfe in related_wfes:
            add_workflow_execution(wfe)

    # Find WorkflowExecutions whose outputs are inputs of this WorkflowExecution.
    # Add those, too, to our list of related WorkflowExecutions.
    for input_id in input_ids:
        related_wfes = mdb.workflow_execution_set.find({"has_output": input_id})
        for wfe in related_wfes:
            add_workflow_execution(wfe)

    # Find WorkflowExecutions whose `was_informed_by` list contains that of the user-specified WorkflowExecution.
    # Add those, too, to our list of related WorkflowExecutions.
    if "was_informed_by" in workflow_execution:
        was_informed_by = workflow_execution["was_informed_by"]

        # Note: We added this assertion in an attempt to facilitate debugging
        # the system in the situation where a `WorkflowExecution` document
        # has a `was_informed_by` field whose value is not a list (which
        # would be a violation of NMDC schema 11.9.0).
        assert isinstance(was_informed_by, list), (
            "A WorkflowExecution's `was_informed_by` field contained "
            f"a {type(was_informed_by)} instead of a list."
        )

        # Get all WorkflowExecutions that were informed by any of the
        # things that informed the user-specified WorkflowExecution.
        related_wfes = mdb.workflow_execution_set.find(
            {"was_informed_by": {"$in": was_informed_by}}
        )
        for wfe in related_wfes:
            if wfe["id"] != workflow_execution_id:
                add_workflow_execution(wfe)

        # Get all `DataGeneration`s that informed the user-specified `WorkflowExecution`, then
        # get all `Biosample`s and `Study`s associated with each of those `DataGeneration`s.
        dg_docs = mdb.alldocs.find({"id": {"$in": was_informed_by}})
        for dg_doc in dg_docs:
            if any(t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])):
                # Get Biosamples from the DataGeneration's `has_input` field by recursively walking up the chain.
                # While we recursively walk up the chain, we'll add those Biosamples to our list of Biosamples.
                for input_id in dg_doc.get("has_input", []):
                    find_biosamples_recursively(input_id)

                # Get Studies associated with the DataGeneration,
                # and add them to our list of Studies.
                for study_id in dg_doc.get("associated_studies", []):
                    add_study(study_id)

                # If the DataGeneration has no associated Studies, but has related Biosamples,
                # add the Studies associated with those Biosamples to our list of Studies.
                if not dg_doc.get("associated_studies") and len(biosamples) > 0:
                    for bs in biosamples:
                        for study_id in bs.get("associated_studies", []):
                            add_study(study_id)

    # For all data objects we collected, check if they have a `was_generated_by` reference
    # This is a supplementary path to find more relationships
    for data_obj in data_objects:
        if "was_generated_by" in data_obj:
            gen_id = data_obj["was_generated_by"]
            dg_doc = mdb.alldocs.find_one({"id": gen_id})

            if dg_doc and any(
                t in dg_descendants for t in dg_doc.get("_type_and_ancestors", [])
            ):
                # Get Studies directly associated with the DataGeneration
                for study_id in dg_doc.get("associated_studies", []):
                    add_study(study_id)

    response = {
        "workflow_execution_id": workflow_execution_id,  # `WorkflowExecution` `id` provided by user
        "workflow_execution": strip_oid(
            workflow_execution
        ),  # the specified `WorkflowExecution`
        "data_objects": data_objects,  # related `DataObject`s
        "related_workflow_executions": related_workflow_executions,  # related `WorkflowExecution`s
        "biosamples": biosamples,  # related `Biosample`s
        "studies": studies,  # related `Study`s
    }

    return response