nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -0
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +30 -4
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +1331 -0
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
  77. nmdc_runtime/site/export/study_metadata.py +27 -4
  78. nmdc_runtime/site/graphs.py +294 -45
  79. nmdc_runtime/site/ops.py +1008 -230
  80. nmdc_runtime/site/repair/database_updater.py +451 -0
  81. nmdc_runtime/site/repository.py +368 -133
  82. nmdc_runtime/site/resources.py +154 -80
  83. nmdc_runtime/site/translation/gold_translator.py +235 -83
  84. nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
  85. nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
  86. nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
  87. nmdc_runtime/site/translation/neon_utils.py +24 -7
  88. nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
  89. nmdc_runtime/site/translation/translator.py +73 -3
  90. nmdc_runtime/site/util.py +26 -7
  91. nmdc_runtime/site/validation/emsl.py +1 -0
  92. nmdc_runtime/site/validation/gold.py +1 -0
  93. nmdc_runtime/site/validation/util.py +16 -12
  94. nmdc_runtime/site/workspace.yaml +13 -0
  95. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  96. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  97. nmdc_runtime/static/README.md +5 -0
  98. nmdc_runtime/static/favicon.ico +0 -0
  99. nmdc_runtime/util.py +236 -192
  100. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  101. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  102. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  103. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
  104. nmdc_runtime/containers.py +0 -14
  105. nmdc_runtime/core/db/Database.py +0 -15
  106. nmdc_runtime/core/exceptions/__init__.py +0 -23
  107. nmdc_runtime/core/exceptions/base.py +0 -47
  108. nmdc_runtime/core/exceptions/token.py +0 -13
  109. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  110. nmdc_runtime/domain/users/userSchema.py +0 -37
  111. nmdc_runtime/domain/users/userService.py +0 -14
  112. nmdc_runtime/infrastructure/database/db.py +0 -3
  113. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  114. nmdc_runtime/lib/__init__.py +0 -1
  115. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  116. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  117. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  118. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  119. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  120. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  121. nmdc_runtime/site/drsobjects/registration.py +0 -131
  122. nmdc_runtime/site/terminusdb/generate.py +0 -198
  123. nmdc_runtime/site/terminusdb/ingest.py +0 -44
  124. nmdc_runtime/site/terminusdb/schema.py +0 -1671
  125. nmdc_runtime/site/translation/emsl.py +0 -42
  126. nmdc_runtime/site/translation/gold.py +0 -53
  127. nmdc_runtime/site/translation/jgi.py +0 -31
  128. nmdc_runtime/site/translation/util.py +0 -132
  129. nmdc_runtime/site/validation/jgi.py +0 -42
  130. nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
  131. nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
  132. nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
  133. /nmdc_runtime/{client → api}/__init__.py +0 -0
  134. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  135. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  136. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  137. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  138. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  139. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  140. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  141. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  142. /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
  143. {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
nmdc_runtime/api/endpoints/jobs.py
@@ -0,0 +1,206 @@
+ from datetime import datetime, timezone
+ import json
+ import logging
+ from typing import Optional, Annotated
+
+ from pymongo.database import Database
+ from fastapi import APIRouter, Depends, Query, HTTPException, Path
+ from pymongo.errors import ConnectionFailure, OperationFailure
+ from starlette import status
+ from nmdc_runtime.api.core.util import (
+     raise404_if_none,
+ )
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.api.core.idgen import generate_one_id
+ from nmdc_runtime.api.endpoints.util import list_resources, _claim_job, strip_oid
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobIn
+ from nmdc_runtime.api.models.metadata import Doc
+ from nmdc_runtime.api.models.operation import Operation, MetadataT
+ from nmdc_runtime.api.models.site import (
+     Site,
+     maybe_get_current_client_site,
+     get_current_client_site,
+ )
+ from nmdc_runtime.api.models.util import ListRequest, ListResponse, ResultT
+
+ router = APIRouter()
+
+
+ # Note: We use the generic `Doc` class—instead of the `Job` class—to describe the response
+ # because this endpoint (via `ListRequest`) supports projection, which can be used to omit
+ # fields from the response, even fields the `Job` class says are required.
+ @router.get(
+     "/jobs", response_model=ListResponse[Doc], response_model_exclude_unset=True
+ )
+ def list_jobs(
+     req: Annotated[ListRequest, Query()],
+     mdb: Database = Depends(get_mongo_db),
+     maybe_site: Optional[Site] = Depends(maybe_get_current_client_site),
+ ):
+     """List pre-configured workflow jobs.
+
+     If authenticated as a site client, `req.filter` defaults to fetch unclaimed jobs
+     that are claimable by the site client. This default can be overridden to view all jobs
+     by explicitly passing a `req.filter` of `{}`.
+     """
+     if isinstance(maybe_site, Site) and req.filter is None:
+         req.filter = json.dumps({"claims.site_id": {"$ne": maybe_site.id}})
+     rv = list_resources(req, mdb, "jobs")
+     rv["resources"] = [strip_oid(d) for d in rv["resources"]]
+     return rv
+
+
+ @router.post(
+     "/jobs",
+     status_code=status.HTTP_201_CREATED,
+     response_model_exclude_unset=True,
+ )
+ def create_job(
+     job_in: JobIn,
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ) -> Job:
+     """
+     Create a workflow job.
+
+     A workflow job is a resource that decouples the configuration of a workflow from the execution of that workflow.
+
+     **Permissions:** This endpoint is only accessible to site clients.
+     """
+
+     _ = site  # must be authenticated
+
+     # Generate a unique ID for the job.
+     job_id = generate_one_id(mdb, "jobs")
+
+     # Generate a timestamp for the job's `created_at` field.
+     created_at = datetime.now(timezone.utc)
+
+     # Validate the request payload, combined with the generated ID and timestamp.
+     job_in_dict: dict = job_in.model_dump(exclude_unset=True)
+     try:
+         validated_job = Job(**job_in_dict, id=job_id, created_at=created_at)
+     except Exception as e:
+         error_message = f"Invalid job. Details: {str(e)}"
+         logging.warning(error_message)
+         raise HTTPException(
+             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
+             detail=error_message,
+         )
+
+     # Insert the validated job into the database.
+     validated_job_dict: dict = validated_job.model_dump(exclude_unset=True)
+     try:
+         result = mdb.jobs.insert_one(validated_job_dict)
+         if not result.inserted_id:
+             raise Exception("Failed to insert job into database.")
+     except Exception as e:
+         logging.exception(e)
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail="Failed to create job.",
+         )
+
+     # Return the job that was created (i.e. inserted into the database).
+     return validated_job
+
+
+ @router.get("/jobs/{job_id}", response_model=Job, response_model_exclude_unset=True)
+ def get_job_info(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+ ):
+     return raise404_if_none(mdb.jobs.find_one({"id": job_id}))
+
+
+ @router.post("/jobs/{job_id}:claim", response_model=Operation[ResultT, MetadataT])
+ def claim_job(
+     job_id: str,
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ):
+     return _claim_job(job_id, mdb, site)
+
+
+ @router.post("/jobs/{job_id}:release")
+ def release_job(
+     job_id: Annotated[
+         str,
+         Path(
+             title="Job ID",
+             description="The `id` of the job.\n\n_Example_: `nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6`",
+             examples=["nmdc:f81d4fae-7dec-11d0-a765-00a0c91e6bf6"],
+         ),
+     ],
+     mdb: Database = Depends(get_mongo_db),
+     site: Site = Depends(get_current_client_site),
+ ) -> Optional[Job]:
+     r"""
+     Release the specified job.
+
+     Releasing a job cancels all the unfinished operations (of that job)
+     claimed by the `site` associated with the logged-in site client.
+
+     Return the updated job, reflecting that the aforementioned operations have been cancelled.
+     """
+     job = Job(**raise404_if_none(mdb.jobs.find_one({"id": job_id})))
+     active_job_claims_by_this_site = list(
+         mdb.operations.find(
+             {
+                 "metadata.job.id": job_id,
+                 "metadata.site_id": site.id,
+                 "done": False,
+             },
+             ["id"],
+         )
+     )
+     job_claims_by_this_site_post_release = [
+         JobClaim(op_id=claim["id"], site_id=site.id, done=True, cancelled=True)
+         for claim in active_job_claims_by_this_site
+     ]
+     job_claims_not_by_this_site = [
+         claim for claim in job.claims if (claim.site_id != site.id)
+     ]
+
+     # Execute MongoDB transaction to ensure atomic change of job document plus relevant set of operations documents.
+     def transactional_update(session):
+         mdb.operations.update_many(
+             {"id": {"$in": [claim["id"] for claim in active_job_claims_by_this_site]}},
+             {"$set": {"metadata.cancelled": True, "metadata.done": True}},
+             session=session,
+         )
+         job_claim_subdocuments_post_release = [
+             claim.model_dump(exclude_unset=True)
+             for claim in (
+                 job_claims_not_by_this_site + job_claims_by_this_site_post_release
+             )
+         ]
+         mdb.jobs.update_one(
+             {"id": job_id},
+             {"$set": {"claims": job_claim_subdocuments_post_release}},
+             session=session,
+         )
+
+     try:
+         with mdb.client.start_session() as session:
+             with session.start_transaction():
+                 transactional_update(session)
+     except (ConnectionFailure, OperationFailure) as e:
+         raise HTTPException(
+             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+             detail=f"Transaction failed: {e}",
+         )
+
+     # Return the updated `jobs` document.
+     #
+     # TODO: Consider retrieving the document within the transaction
+     #       to ensure it still exists.
+     #
+     updated_job = mdb.jobs.find_one({"id": job_id})
+     if updated_job is None:
+         # Note: We return `None` in this case because that's what the
+         #       endpoint originally did in this case, and we don't want
+         #       to introduce a breaking change as part of this refactor.
+         return None
+     else:
+         return Job(**updated_job)
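Taken together, the endpoints above define a claim/release lifecycle for site clients. The following sketch (not part of the diff) shows how a site client might drive that lifecycle over HTTP; the `API_BASE` URL and bearer token are assumptions, while the paths and response shapes come from the router definitions above.

```python
import requests

API_BASE = "https://api.example.org"  # assumption: your Runtime deployment
HEADERS = {"Authorization": "Bearer <site-client-token>"}  # assumption: token already obtained

# List jobs. An authenticated site client gets a default filter of
# jobs not yet claimed by that site (see `list_jobs` above).
jobs = requests.get(f"{API_BASE}/jobs", headers=HEADERS).json()["resources"]

if jobs:
    job_id = jobs[0]["id"]

    # Claim the job; the response is an Operation document the site can poll and update.
    operation = requests.post(f"{API_BASE}/jobs/{job_id}:claim", headers=HEADERS).json()

    # Release the job; unfinished operations claimed by this site are cancelled,
    # and the updated job document (or null) is returned.
    released = requests.post(f"{API_BASE}/jobs/{job_id}:release", headers=HEADERS).json()
```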
nmdc_runtime/api/endpoints/lib/helpers.py
@@ -0,0 +1,274 @@
+ import json
+ import bson.json_util
+ from typing import List
+
+ from pymongo.database import Database
+ from refscan.lib.Finder import Finder
+ from refscan.lib.helpers import derive_schema_class_name_from_document
+ from refscan.scanner import identify_referring_documents, scan_outgoing_references
+
+ from nmdc_runtime.api.models.lib.helpers import derive_update_specs
+ from nmdc_runtime.api.models.query import UpdateCommand, UpdateSpecs
+ from nmdc_runtime.util import get_allowed_references, nmdc_schema_view
+
+
+ def make_violation_message(
+     collection_name: str,
+     source_document_id: str,
+     source_field_name: str,
+     target_document_id: str,
+ ) -> str:
+     r"""
+     Constructs a violation message that indicates that a document would contain a broken reference.
+
+     :param collection_name: The name of the collection containing the document containing the broken reference
+     :param source_document_id: The `id` of the document containing the broken reference
+     :param source_field_name: The name of the field containing the broken reference
+     :param target_document_id: The `id` of the document that is being referenced
+
+     :return: A formatted string describing the violation
+     """
+     return (
+         f"The document having id='{source_document_id}' in "
+         f"the collection '{collection_name}' contains a "
+         f"reference (in its '{source_field_name}' field, "
+         f"referring to the document having id='{target_document_id}') "
+         f"which would be broken."
+     )
+
+
+ def simulate_updates_and_check_references(
+     db: Database, update_cmd: UpdateCommand
+ ) -> List[str]:
+     r"""
+     Checks whether, if the specified updates were performed on the specified database,
+     both of the following things would be true afterward:
+     1. (Regarding outgoing references): The updated documents do not contain any
+        broken references.
+     2. (Regarding incoming references): The documents that originally _referenced_
+        any of the updated documents do not contain any broken references.
+     This check is necessary because update operations can currently change `id`
+     and `type` values, which can affect what can legally reference those documents.
+
+     This function checks those things by performing the updates within a MongoDB
+     transaction, leaving the transaction in the _pending_ (i.e. not committed) state,
+     and then performing various checks on the database in that _pending_ state.
+
+     :param db: The database on which to simulate performing the updates
+     :param update_cmd: The command that specifies the updates
+
+     :return: List of violation messages. If the list is empty, it means that—if
+              the updates had been performed (instead of only simulated) here—they
+              would not have left behind any broken references.
+     """
+
+     # Initialize the list of violation messages that we will return.
+     violation_messages: List[str] = []
+
+     # Instantiate a `Finder` bound to the Mongo database. This will be
+     # used later, to identify and check inter-document references.
+     finder = Finder(database=db)
+
+     # Extract the collection name from the command.
+     collection_name = update_cmd.update
+
+     # Derive the update specifications from the command.
+     update_specs: UpdateSpecs = derive_update_specs(update_cmd)
+
+     # Get a reference to a `SchemaView` bound to the NMDC schema, so we can
+     # use it to, for example, map `type` field values to schema class names.
+     schema_view = nmdc_schema_view()
+
+     # Get some data structures that indicate which fields of which documents
+     # can legally contain references, according to the NMDC schema.
+     legal_references = get_allowed_references()
+     reference_field_names_by_source_class_name = (
+         legal_references.get_reference_field_names_by_source_class_name()
+     )
+
+     # Start a "throwaway" MongoDB transaction so we can simulate the updates.
+     with db.client.start_session() as session:
+         with session.start_transaction():
+
+             # Make a list of the `_id`, `id`, and `type` values of the documents that
+             # the user wants to update.
+             projection = {"_id": 1, "id": 1, "type": 1}
+             subject_document_summaries_pre_update = list(
+                 db[collection_name].find(
+                     filter={"$or": [spec["filter"] for spec in update_specs]},
+                     projection=projection,
+                     session=session,
+                 )
+             )
+
+             # Make a set of the `_id` values of the subject documents so that (later) we can
+             # check whether a given _referring_ document is also one of the _subject_
+             # documents (i.e. is among the documents the user wants to update).
+             subject_document_object_ids = set(
+                 tdd["_id"] for tdd in subject_document_summaries_pre_update
+             )
+
+             # Identify _all_ documents that reference any of the subject documents.
+             all_referring_document_descriptors_pre_update = []
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 # If the document summary lacks the "id" field, we already know that no
+                 # documents reference it (since they would have to _use_ that "id" value to
+                 # do so); so, we abort this iteration and move on to the next subject document.
+                 if "id" not in subject_document_summary:
+                     continue
+
+                 referring_document_descriptors = identify_referring_documents(
+                     document=subject_document_summary,  # expects at least "id" and "type"
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,
+                 )
+                 all_referring_document_descriptors_pre_update.extend(
+                     referring_document_descriptors
+                 )
+
+             # Simulate the updates (i.e. apply them within the context of the transaction).
+             db.command(
+                 # Note: This expression was copied from the `_run_mdb_cmd` function in `queries.py`.
+                 # TODO: Document this expression (i.e. the Pydantic->JSON->BSON chain).
+                 bson.json_util.loads(
+                     json.dumps(update_cmd.model_dump(exclude_unset=True))
+                 ),
+                 session=session,
+             )
+             # For each referring document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for descriptor in all_referring_document_descriptors_pre_update:
+                 referring_document_oid = descriptor["source_document_object_id"]
+                 referring_document_id = descriptor["source_document_id"]
+                 referring_collection_name = descriptor["source_collection_name"]
+                 # If the referring document is among the documents that the user wanted to
+                 # update, we skip it for now. We will check its outgoing references later
+                 # (i.e. when we check the outgoing references of _all_ updated documents).
+                 if referring_document_oid in subject_document_object_ids:
+                     continue
+                 # Get the referring document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 referring_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         descriptor["source_class_name"]
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in referring_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 referring_document = db[referring_collection_name].find_one(
+                     {"_id": referring_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the referring document exists (to satisfy the type checker).
+                 assert (
+                     referring_document is not None
+                 ), "A referring document has vanished."
+                 violations = scan_outgoing_references(
+                     document=referring_document,
+                     source_collection_name=referring_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 #
+                 # TODO: The violation might not involve a reference to one of the
+                 #       subject documents. The `scan_outgoing_references` function
+                 #       scans _all_ references emanating from the document.
+                 #
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=referring_collection_name,
+                             source_document_id=referring_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # For each updated document, check whether any of its outgoing references
+             # is broken (in the context of the transaction).
+             for subject_document_summary in subject_document_summaries_pre_update:
+                 subject_document_oid = subject_document_summary["_id"]
+                 subject_document_id = subject_document_summary["id"]
+                 subject_document_class_name = derive_schema_class_name_from_document(
+                     document=subject_document_summary,
+                     schema_view=schema_view,
+                 )
+                 assert (
+                     subject_document_class_name is not None
+                 ), "The updated document does not represent a valid schema class instance."
+                 subject_collection_name = (
+                     collection_name  # makes a disambiguating alias
+                 )
+                 # Get the updated document, so we can check its outgoing references.
+                 # Note: We project only the fields that can legally contain references,
+                 #       plus other fields involved in referential integrity checking.
+                 updated_document_reference_field_names = (
+                     reference_field_names_by_source_class_name[
+                         subject_document_class_name
+                     ]
+                 )
+                 projection = {
+                     field_name: 1
+                     for field_name in updated_document_reference_field_names
+                 } | {
+                     "_id": 1,
+                     "id": 1,
+                     "type": 1,
+                 }  # note: `|` unions the dicts
+                 updated_document = db[subject_collection_name].find_one(
+                     {"_id": subject_document_oid},
+                     projection=projection,
+                     session=session,
+                 )
+                 # Note: We assert that the updated document exists (to satisfy the type checker).
+                 assert updated_document is not None, "An updated document has vanished."
+                 violations = scan_outgoing_references(
+                     document=updated_document,
+                     source_collection_name=subject_collection_name,
+                     schema_view=schema_view,
+                     references=legal_references,
+                     finder=finder,
+                     client_session=session,  # so it uses the pending transaction's session
+                 )
+                 # For each violation (i.e. broken reference) that exists, add a violation message
+                 # to the list of violation messages.
+                 for violation in violations:
+                     source_field_name = violation.source_field_name
+                     target_id = violation.target_id
+                     violation_messages.append(
+                         make_violation_message(
+                             collection_name=subject_collection_name,
+                             source_document_id=subject_document_id,
+                             source_field_name=source_field_name,
+                             target_document_id=target_id,
+                         )
+                     )
+
+             # Whatever happens (i.e. whether there are violations or not), abort the transaction.
+             #
+             # Note: If an exception was raised within this `with` block, the transaction
+             #       will already have been aborted automatically (and execution will not
+             #       have reached this statement). On the other hand, if no exception
+             #       was raised, we explicitly abort the transaction so that the updates
+             #       that we "simulated" in this block do not get applied to the real database.
+             # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/client_session.html
+             #
+             session.abort_transaction()
+
+     return violation_messages
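For orientation, here is a minimal sketch of how a caller (such as the `/queries:run` handler) might use `simulate_updates_and_check_references` to vet an update before applying it. It assumes a MongoDB deployment that supports transactions (i.e. a replica set), and it assumes `UpdateCommand` mirrors MongoDB's raw `update` command shape (an `update` collection name plus a list of `updates` with `q`/`u` clauses), which is what the `db.command(...)` call above implies; the collection name and document `id`s are hypothetical.

```python
from pymongo import MongoClient

from nmdc_runtime.api.endpoints.lib.helpers import simulate_updates_and_check_references
from nmdc_runtime.api.models.query import UpdateCommand

# Assumption: a local replica set (multi-document transactions require one).
db = MongoClient("mongodb://localhost:27017")["nmdc"]

# Assumption: `UpdateCommand` serializes to a raw MongoDB `update` command document.
update_cmd = UpdateCommand(
    update="biosample_set",  # hypothetical collection
    updates=[
        # Changing an `id` is exactly the kind of edit that can break incoming references.
        {"q": {"id": "nmdc:bsm-11-abc123"}, "u": {"$set": {"id": "nmdc:bsm-11-def456"}}}
    ],
)

violation_messages = simulate_updates_and_check_references(db, update_cmd)
if violation_messages:
    # The update would leave broken references behind; reject it and report why.
    print("\n".join(violation_messages))
# Otherwise, the update is referentially safe to run for real.
```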
nmdc_runtime/api/endpoints/lib/linked_instances.py
@@ -0,0 +1,193 @@
+ """
+
+ This module houses logic for the `GET /nmdcschema/linked_instances` endpoint, defined as
+ `nmdc_runtime.api.endpoints.nmdcschema.linked_instances`, to avoid (further) bloating the
+ `nmdc_runtime.api.endpoints.nmdcschema` module.
+
+ """
+
+ from datetime import timedelta
+ from typing import Literal, Any
+
+ from bson import ObjectId
+ from pymongo.collection import Collection as MongoCollection
+ from pymongo.database import Database as MongoDatabase
+ from toolz import merge
+
+ from nmdc_runtime.api.core.util import hash_from_str, now
+ from nmdc_runtime.api.db.mongo import get_mongo_db
+ from nmdc_runtime.util import get_class_name_to_collection_names_map, nmdc_schema_view
+
+
+ def hash_from_ids_and_types(ids: list[str], types: list[str]) -> str:
+     """A quick hash as a function of `ids` and `types`.
+
+     This will serve as part of a temporary mongo collection name.
+     Because it will only be "part of" the name, avoiding hash collisions isn't a priority.
+
+     Returns a hex digest truncated to 8 characters, so 16**8 ≈ 4.3 billion possible values.
+     """
+     return hash_from_str(
+         ",".join(sorted(ids)) + "." + ",".join(sorted(types)), algo="md5"
+     )[:8]
+
+
+ def temp_linked_instances_collection_name(ids: list[str], types: list[str]) -> str:
+     """A name for a temporary mongo collection to store linked instances in service of an API request."""
+     return f"_runtime.tmp.linked_instances.{hash_from_ids_and_types(ids=ids, types=types)}.{ObjectId()}"
+
+
+ def drop_stale_temp_linked_instances_collections() -> None:
+     """Drop any temporary linked-instances collections that were generated earlier than one day ago."""
+     mdb = get_mongo_db()
+     one_day_ago = now() - timedelta(days=1)
+     for collection_name in mdb.list_collection_names(
+         filter={"name": {"$regex": r"^_runtime.tmp.linked_instances\..*"}}
+     ):
+         if ObjectId(collection_name.split(".")[-1]).generation_time < one_day_ago:
+             mdb.drop_collection(collection_name)
+
+
+ def gather_linked_instances(
+     alldocs_collection: MongoCollection,
+     ids: list[str],
+     types: list[str],
+ ) -> str:
+     """Collect linked instances and store them in a new temporary collection.
+
+     Run an aggregation pipeline over `alldocs_collection` that collects ∈`types` instances linked to `ids`.
+     The pipeline is run twice, once for each of the {"downstream", "upstream"} directions.
+     """
+     merge_into_collection_name = temp_linked_instances_collection_name(
+         ids=ids, types=types
+     )
+     for direction in ["downstream", "upstream"]:
+         _ = list(
+             alldocs_collection.aggregate(
+                 pipeline_for_direction(
+                     ids=ids,
+                     types=types,
+                     direction=direction,
+                     merge_into_collection_name=merge_into_collection_name,
+                 ),
+                 allowDiskUse=True,
+             )
+         )
+     return merge_into_collection_name
+
+
+ def pipeline_for_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     merge_into_collection_name: str,
+     alldocs_collection_name: str = "alldocs",
+ ) -> list:
+     """A pure function that returns the aggregation pipeline for `direction`.
+
+     The pipeline
+     - collects ∈`types` instances linked to `ids` along `direction`,
+     - retains only those document fields essential to the caller, and
+     - ensures the collected instances are present, and properly updated if applicable, in a merge-target collection.
+     """
+     return pipeline_for_instances_linked_to_ids_by_direction(
+         ids=ids,
+         types=types,
+         direction=direction,
+         alldocs_collection_name=alldocs_collection_name,
+     ) + [
+         {"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}},
+         pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+             merge_into_collection_name=merge_into_collection_name, direction=direction
+         ),
+     ]
+
+
+ def pipeline_for_instances_linked_to_ids_by_direction(
+     ids: list[str],
+     types: list[str],
+     direction: Literal["downstream", "upstream"],
+     alldocs_collection_name: str = "alldocs",
+     slim: bool = True,
+ ) -> list[dict[str, Any]]:
+     """
+     Returns an aggregation pipeline that:
+     - traverses the graph of documents in the alldocs collection, following `direction`-specific relationships
+       to discover documents linked to the documents given by `ids`,
+     - `$unwind`s the collected (via `$graphLookup`) docs,
+     - filters them by the given `types` of interest,
+     - adds bookkeeping information about `direction`ality, and
+     - (optionally) projects only essential fields to reduce response latency and size.
+     """
+     return [
+         {"$match": {"id": {"$in": ids}}},
+         {
+             "$graphLookup": {
+                 "from": alldocs_collection_name,
+                 "startWith": f"$_{direction}.id",
+                 "connectFromField": f"_{direction}.id",
+                 "connectToField": "id",
+                 "as": f"{direction}_docs",
+             }
+         },
+         {"$unwind": {"path": f"${direction}_docs"}},
+         {"$match": {f"{direction}_docs._type_and_ancestors": {"$in": types}}},
+         {"$addFields": {f"{direction}_docs._{direction}_of": ["$id"]}},
+         {"$replaceRoot": {"newRoot": f"${direction}_docs"}},
+     ] + ([{"$project": {"id": 1, "type": 1, f"_{direction}_of": 1}}] if slim else [])
+
+
+ def pipeline_stage_for_merging_instances_and_grouping_link_provenance_by_direction(
+     merge_into_collection_name: str,
+     direction: Literal["downstream", "upstream"],
+ ) -> dict[str, Any]:
+     """
+     Returns an aggregation-pipeline stage that merges its input document stream into a collection dedicated to
+     serving the caller in a manner amenable to pagination across multiple HTTP requests.
+     """
+     return {
+         "$merge": {
+             "into": merge_into_collection_name,
+             "on": "_id",
+             "whenMatched": [
+                 {
+                     "$set": {
+                         f"_{direction}_of": {
+                             "$setUnion": [
+                                 f"$_{direction}_of",
+                                 f"$$new._{direction}_of",
+                             ]
+                         }
+                     }
+                 }
+             ],
+             "whenNotMatched": "insert",
+         }
+     }
+
+
+ def hydrated(resources: list[dict], mdb: MongoDatabase) -> list[dict]:
+     """Replace each `dict` in `resources` with a hydrated version.
+
+     Instead of returning the retrieved "full" documents as is, we merge each one with (a copy of) the corresponding
+     original document in `resources`, which includes additional fields, e.g. `_upstream_of` and `_downstream_of`.
+     """
+     class_name_to_collection_names_map = get_class_name_to_collection_names_map(
+         nmdc_schema_view()
+     )
+     types_of_resources = {r["type"] for r in resources}
+     full_docs_by_id = {}
+
+     for type in types_of_resources:
+         resource_ids_of_type = [d["id"] for d in resources if d["type"] == type]
+         schema_collection = mdb.get_collection(
+             # Note: We are assuming that documents of a given type are only allowed (by the schema) to reside in one
+             #       collection. Based on that assumption, we will query only the _first_ collection whose name we get
+             #       from the map. This assumption is continuously verified prior to code deployment via
+             #       `test_get_class_name_to_collection_names_map_has_one_and_only_one_collection_name_per_class_name`.
+             class_name_to_collection_names_map[type.removeprefix("nmdc:")][0]
+         )
+         for doc in schema_collection.find({"id": {"$in": resource_ids_of_type}}):
+             full_docs_by_id[doc["id"]] = doc
+
+     return [merge(r, full_docs_by_id[r["id"]]) for r in resources]
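The intended call sequence, as a sketch under stated assumptions: `gather_linked_instances` materializes slim `{id, type, _<direction>_of}` documents into a temporary collection, which the endpoint can then page through and pass to `hydrated` to obtain full documents. The connection string and the study `id` below are hypothetical.

```python
from pymongo import MongoClient

from nmdc_runtime.api.endpoints.lib.linked_instances import (
    drop_stale_temp_linked_instances_collections,
    gather_linked_instances,
    hydrated,
)

mdb = MongoClient("mongodb://localhost:27017")["nmdc"]  # assumption: the Runtime's database

# Collect biosamples and data objects linked (upstream or downstream) to a study.
tmp_collection_name = gather_linked_instances(
    alldocs_collection=mdb["alldocs"],
    ids=["nmdc:sty-11-abc123"],  # hypothetical study `id`
    types=["nmdc:Biosample", "nmdc:DataObject"],
)

# Page through the slim documents, then hydrate the page into full schema documents.
page = list(mdb[tmp_collection_name].find().sort("_id", 1).limit(20))
full_docs = hydrated(page, mdb)

# Housekeeping: temp collections older than one day are dropped by this call.
drop_stale_temp_linked_instances_collections()
```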