nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +90 -0
  3. nmdc_runtime/api/boot/capabilities.py +9 -0
  4. nmdc_runtime/api/boot/object_types.py +126 -0
  5. nmdc_runtime/api/boot/triggers.py +84 -0
  6. nmdc_runtime/api/boot/workflows.py +116 -0
  7. nmdc_runtime/api/core/auth.py +212 -0
  8. nmdc_runtime/api/core/idgen.py +200 -0
  9. nmdc_runtime/api/core/metadata.py +777 -0
  10. nmdc_runtime/api/core/util.py +114 -0
  11. nmdc_runtime/api/db/mongo.py +436 -0
  12. nmdc_runtime/api/db/s3.py +37 -0
  13. nmdc_runtime/api/endpoints/capabilities.py +25 -0
  14. nmdc_runtime/api/endpoints/find.py +634 -0
  15. nmdc_runtime/api/endpoints/jobs.py +206 -0
  16. nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
  17. nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
  18. nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
  19. nmdc_runtime/api/endpoints/metadata.py +260 -0
  20. nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
  21. nmdc_runtime/api/endpoints/object_types.py +38 -0
  22. nmdc_runtime/api/endpoints/objects.py +277 -0
  23. nmdc_runtime/api/endpoints/operations.py +78 -0
  24. nmdc_runtime/api/endpoints/queries.py +701 -0
  25. nmdc_runtime/api/endpoints/runs.py +98 -0
  26. nmdc_runtime/api/endpoints/search.py +38 -0
  27. nmdc_runtime/api/endpoints/sites.py +205 -0
  28. nmdc_runtime/api/endpoints/triggers.py +25 -0
  29. nmdc_runtime/api/endpoints/users.py +214 -0
  30. nmdc_runtime/api/endpoints/util.py +817 -0
  31. nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
  32. nmdc_runtime/api/endpoints/workflows.py +353 -0
  33. nmdc_runtime/api/entrypoint.sh +7 -0
  34. nmdc_runtime/api/main.py +495 -0
  35. nmdc_runtime/api/middleware.py +43 -0
  36. nmdc_runtime/api/models/capability.py +14 -0
  37. nmdc_runtime/api/models/id.py +92 -0
  38. nmdc_runtime/api/models/job.py +57 -0
  39. nmdc_runtime/api/models/lib/helpers.py +78 -0
  40. nmdc_runtime/api/models/metadata.py +11 -0
  41. nmdc_runtime/api/models/nmdc_schema.py +146 -0
  42. nmdc_runtime/api/models/object.py +180 -0
  43. nmdc_runtime/api/models/object_type.py +20 -0
  44. nmdc_runtime/api/models/operation.py +66 -0
  45. nmdc_runtime/api/models/query.py +246 -0
  46. nmdc_runtime/api/models/query_continuation.py +111 -0
  47. nmdc_runtime/api/models/run.py +161 -0
  48. nmdc_runtime/api/models/site.py +87 -0
  49. nmdc_runtime/api/models/trigger.py +13 -0
  50. nmdc_runtime/api/models/user.py +207 -0
  51. nmdc_runtime/api/models/util.py +260 -0
  52. nmdc_runtime/api/models/wfe_file_stages.py +122 -0
  53. nmdc_runtime/api/models/workflow.py +15 -0
  54. nmdc_runtime/api/openapi.py +178 -0
  55. nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
  56. nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
  57. nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
  58. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  59. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  60. nmdc_runtime/config.py +56 -1
  61. nmdc_runtime/minter/adapters/repository.py +22 -2
  62. nmdc_runtime/minter/config.py +2 -0
  63. nmdc_runtime/minter/domain/model.py +55 -1
  64. nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
  65. nmdc_runtime/mongo_util.py +89 -0
  66. nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
  67. nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
  68. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  69. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  70. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  71. nmdc_runtime/site/dagster.yaml +53 -0
  72. nmdc_runtime/site/entrypoint-daemon.sh +29 -0
  73. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  74. nmdc_runtime/site/entrypoint-dagit.sh +29 -0
  75. nmdc_runtime/site/export/ncbi_xml.py +731 -40
  76. nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
  77. nmdc_runtime/site/graphs.py +80 -29
  78. nmdc_runtime/site/ops.py +522 -183
  79. nmdc_runtime/site/repair/database_updater.py +210 -1
  80. nmdc_runtime/site/repository.py +108 -117
  81. nmdc_runtime/site/resources.py +72 -36
  82. nmdc_runtime/site/translation/gold_translator.py +22 -21
  83. nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
  84. nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
  85. nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
  86. nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
  87. nmdc_runtime/site/translation/translator.py +64 -1
  88. nmdc_runtime/site/util.py +8 -3
  89. nmdc_runtime/site/validation/util.py +16 -12
  90. nmdc_runtime/site/workspace.yaml +13 -0
  91. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  92. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  93. nmdc_runtime/static/README.md +5 -0
  94. nmdc_runtime/static/favicon.ico +0 -0
  95. nmdc_runtime/util.py +175 -348
  96. nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
  97. nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
  98. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
  99. nmdc_runtime/containers.py +0 -14
  100. nmdc_runtime/core/db/Database.py +0 -15
  101. nmdc_runtime/core/exceptions/__init__.py +0 -23
  102. nmdc_runtime/core/exceptions/base.py +0 -47
  103. nmdc_runtime/core/exceptions/token.py +0 -13
  104. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  105. nmdc_runtime/domain/users/userSchema.py +0 -37
  106. nmdc_runtime/domain/users/userService.py +0 -14
  107. nmdc_runtime/infrastructure/database/db.py +0 -3
  108. nmdc_runtime/infrastructure/database/models/user.py +0 -10
  109. nmdc_runtime/lib/__init__.py +0 -1
  110. nmdc_runtime/lib/extract_nmdc_data.py +0 -41
  111. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  112. nmdc_runtime/lib/nmdc_dataframes.py +0 -829
  113. nmdc_runtime/lib/nmdc_etl_class.py +0 -402
  114. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  115. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  116. nmdc_runtime/site/drsobjects/registration.py +0 -131
  117. nmdc_runtime/site/translation/emsl.py +0 -43
  118. nmdc_runtime/site/translation/gold.py +0 -53
  119. nmdc_runtime/site/translation/jgi.py +0 -32
  120. nmdc_runtime/site/translation/util.py +0 -132
  121. nmdc_runtime/site/validation/jgi.py +0 -43
  122. nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
  123. nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
  124. nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
  125. /nmdc_runtime/{client → api}/__init__.py +0 -0
  126. /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
  127. /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
  128. /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
  129. /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
  130. /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
  131. /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
  132. /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
  133. /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
  134. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
  135. {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,817 @@
1
+ import logging
2
+ import os
3
+ import tempfile
4
+ from datetime import datetime
5
+ from functools import lru_cache
6
+ from json import JSONDecodeError
7
+ from pathlib import Path
8
+ from time import time_ns
9
+ from typing import List, Optional, Set, Tuple
10
+ from zoneinfo import ZoneInfo
11
+
12
+ from bson import json_util
13
+ from dagster import DagsterRunStatus
14
+ from dagster_graphql import DagsterGraphQLClientError
15
+ from fastapi import HTTPException
16
+ from gridfs import GridFS
17
+ from nmdc_runtime.api.core.idgen import generate_one_id, local_part
18
+ from nmdc_runtime.api.core.util import (
19
+ dotted_path_for,
20
+ expiry_dt_from_now,
21
+ raise404_if_none,
22
+ )
23
+ from nmdc_runtime.api.db.mongo import get_mongo_db
24
+ from nmdc_runtime.api.models.job import Job, JobClaim, JobOperationMetadata
25
+ from nmdc_runtime.api.models.object import (
26
+ DrsId,
27
+ DrsObject,
28
+ DrsObjectIn,
29
+ PortableFilename,
30
+ )
31
+ from nmdc_runtime.api.models.operation import Operation
32
+ from nmdc_runtime.api.models.run import (
33
+ RunUserSpec,
34
+ _add_run_fail_event,
35
+ _add_run_requested_event,
36
+ _add_run_started_event,
37
+ get_dagster_graphql_client,
38
+ )
39
+ from nmdc_runtime.api.models.site import Site
40
+ from nmdc_runtime.api.models.user import User
41
+ from nmdc_runtime.api.models.util import (
42
+ FindRequest,
43
+ ListRequest,
44
+ ResultT,
45
+ )
46
+ from nmdc_runtime.util import drs_metadata_for
47
+ from pymongo.collection import Collection as MongoCollection
48
+ from pymongo.database import Database as MongoDatabase
49
+ from pymongo.errors import DuplicateKeyError
50
+ from starlette import status
51
+ from toolz import assoc_in, concat, dissoc, get_in, merge
52
+
53
# Base URL used for requests made from inside the deployment (e.g. container-to-container).
BASE_URL_INTERNAL = os.getenv("API_HOST")
# Base URL advertised to external clients; also used below to build DRS `self_uri` values.
BASE_URL_EXTERNAL = os.getenv("API_HOST_EXTERNAL")
# Hostname portion of the external base URL (scheme stripped, if one is present).
# NOTE(review): if API_HOST_EXTERNAL is unset, this raises AttributeError at import
# time (`None.split`) — confirm the deployment always provides this env var.
HOSTNAME_EXTERNAL = BASE_URL_EXTERNAL.split("://", 1)[-1]
56
+
57
+
58
def is_num_matching_docs_within_limit(
    collection: MongoCollection, filter_: dict, limit: int
) -> bool:
    """
    Report whether the number of documents in `collection` matching `filter_`
    is no greater than `limit`.

    Raises `ValueError` when `limit` is negative. Only counts up to
    `limit + 1` documents — just enough to decide the question cheaply.
    """
    if limit < 0:
        raise ValueError("Limit must be at least 0.")

    # A capped count of at most `limit` means the matches fit within the limit;
    # a count of `limit + 1` means they exceed it.
    capped_count = collection.count_documents(filter=filter_, limit=limit + 1)
    return capped_count <= limit
75
+
76
+
77
def check_filter(filter_: str):
    """A pass-through check that `filter_` parses as a JSON object; returns the stripped string.

    Raises HTTP 422 when the trimmed string is not brace-delimited or fails
    (extended-)JSON parsing via `bson.json_util`.
    """
    trimmed = filter_.strip()
    looks_like_object = trimmed.startswith("{") and trimmed.endswith("}")
    if not looks_like_object:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="The given `filter` is not a valid JSON object, which must start with '{' and end with '}'.",
        )
    try:
        json_util.loads(trimmed)
    except JSONDecodeError as e:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"Given `filter` is not valid JSON: {e}",
        )
    return trimmed
93
+
94
+
95
def list_resources(
    req: ListRequest, mdb: MongoDatabase, collection_name: str = ""
) -> dict:
    """
    Returns a dictionary containing the requested MongoDB documents, maybe alongside pagination information.

    `mdb.page_tokens` docs are `{"_id": req.page_token, "ns": collection_name}`, Because `page_token` is globally
    unique, and because the `mdb.page_tokens.find_one({"_id": req.page_token})` document stores `collection_name` in
    the "ns" (namespace) field, the value for `collection_name` stored there takes precedence over any value supplied
    as an argument to this function's `collection_name` parameter.

    If the specified page size (`req.max_page_size`) is non-zero and more documents match the filter criteria than
    can fit on a page of that size, this function will paginate the resources.
    """
    # A collection must be identifiable either directly or via a page token.
    if collection_name == "" and req.page_token is None:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Must specify a collection name if no page token is supplied.",
        )
    if req.page_token:
        # Resume a paginated listing. The token doc records the collection ("ns")
        # and the last-seen id; tokens are single-use, so delete after lookup.
        doc = mdb.page_tokens.find_one({"_id": req.page_token})
        if doc is None:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST, detail="`page_token` not found"
            )
        collection_name = doc["ns"]
        last_id = doc["last_id"]
        mdb.page_tokens.delete_one({"_id": req.page_token})
    else:
        last_id = None

    # Pagination orders by "id" when that field is indexed; otherwise fall back to "_id".
    id_field = "id"
    if "id_1" not in mdb[collection_name].index_information():
        # Note: This warning is displayed for the "functional_annotation_agg" and
        # "users" collections, for example.
        logging.warning(
            f"list_resources: no index set on 'id' for collection {collection_name}"
        )
        id_field = "_id"

    max_page_size = req.max_page_size
    # `req.filter` is (extended) JSON; `check_filter` raises HTTP 422 if malformed.
    filter_ = json_util.loads(check_filter(req.filter)) if req.filter else {}
    # Always include the id field in the projection so pagination can read
    # the final document's id below.
    projection = (
        list(set(comma_separated_values(req.projection)) | {id_field})
        if req.projection
        else None
    )
    if last_id is not None:
        # Continue strictly after the last-seen id, preserving any
        # user-supplied constraint already present on the id field.
        if id_field in filter_:
            filter_[id_field] = merge(filter_[id_field], {"$gt": last_id})
        else:
            filter_ = merge(filter_, {id_field: {"$gt": last_id}})

    # Determine whether we will paginate the results.
    #
    # Note: We will paginate them unless either (a) the `max_page_size` is less than 1,
    # or (b) the number of documents matching the filter can fit on a single page.
    #
    will_paginate = True
    if max_page_size < 1 or is_num_matching_docs_within_limit(
        collection=mdb[collection_name], filter_=filter_, limit=max_page_size
    ):
        will_paginate = False

    if not will_paginate:
        rv = {
            "resources": list(
                mdb[collection_name].find(filter=filter_, projection=projection)
            )
        }
        return rv
    else:
        resources = list(
            mdb[collection_name].find(
                filter=filter_,
                projection=projection,
                limit=max_page_size,
                sort=[(id_field, 1)],
                allow_disk_use=True,
            )
        )
        # `resources` is non-empty here: the within-limit check above implies more
        # than `max_page_size` (>= 1) documents match the filter.
        last_id = resources[-1][id_field]
        token = generate_one_id(mdb, "page_tokens")
        # TODO unify with `/queries:run` query continuation model
        #   => {_id: cursor/token, query: <full query>, last_id: <>, last_modified: <>}
        mdb.page_tokens.insert_one(
            {"_id": token, "ns": collection_name, "last_id": last_id}
        )
        return {"resources": resources, "next_page_token": token}
184
+
185
+
186
def coerce_to_float_if_possible(val):
    r"""
    Converts the specified value into a floating-point number if possible;
    returning the value unchanged if not possible.

    Note: only `ValueError` (e.g. a non-numeric string) counts as "not
    possible" here; a `TypeError` (e.g. from `None`) still propagates.
    """
    # Bug fix (docs): the previous docstring claimed a ValueError was raised
    # when conversion failed, but the function has always returned `val` as-is.
    try:
        return float(val)
    except ValueError:
        return val
195
+
196
+
197
def comma_separated_values(s: str):
    r"""
    Returns a list of the comma-delimited substrings of the specified string. Discards any whitespace
    surrounding each substring.

    Reference: https://docs.python.org/3/library/stdtypes.html#str.split

    >>> comma_separated_values("apple, banana, cherry")
    ['apple', 'banana', 'cherry']
    """
    # Doc fix: the previous docstring referenced `re.split`, but the
    # implementation uses plain `str.split`.
    return [v.strip() for v in s.split(",")]
208
+
209
+
210
def get_mongo_filter(filter_str):
    r"""
    Convert a str in the domain-specific language (DSL) solicited by `nmdc_runtime.api.models.util.FindRequest.filter`
    -- i.e., a comma-separated list of `attribute:value` pairs, where the `value` can include a comparison operator
    (e.g. `>=`) and where if the attribute is of type _string_ and has the suffix `.search` appended to its name
    then the server should perform a full-text search
    -- to a corresponding MongoDB filter representation for e.g. passing to a collection `find` call.

    Raises HTTP 400 when any pair lacks the `attribute:spec` form.
    """
    filter_ = {}
    if not filter_str:
        return filter_

    pairs = comma_separated_values(filter_str)
    if not all(len(split) == 2 for split in (p.split(":", maxsplit=1) for p in pairs)):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Filter must be of form: attribute:spec[,attribute:spec]*",
        )

    for attr, spec in (p.split(":", maxsplit=1) for p in pairs):
        if attr.endswith(".search"):
            # `<attr>.search:<spec>` requests a regex ("full-text") match on <attr>.
            actual_attr = attr[: -len(".search")]
            filter_[actual_attr] = {"$regex": spec}
        else:
            # Bug fix: this was previously an unordered *set* of operator prefixes.
            # Because "<" is a prefix of "<=" (and ">" of ">="), set iteration order
            # could match "<" first and parse e.g. "<=5" as {"$lt": "=5"}. Check the
            # two-character operators first, in a fixed order.
            for op, key in (("<=", "$lte"), (">=", "$gte"), ("<", "$lt"), (">", "$gt")):
                if spec.startswith(op):
                    filter_[attr] = {key: coerce_to_float_if_possible(spec[len(op) :])}
                    break
            else:
                # No comparison operator: exact-match on the raw spec string.
                filter_[attr] = spec
    return filter_
241
+
242
+
243
def get_mongo_sort(sort_str) -> Optional[List[Tuple[str, int]]]:
    """
    Parse `sort_str`, a str of the form "attribute:spec[,attribute:spec]*",
    where spec is `asc` (ascending -- the default if no spec) or `desc` (descending),
    and return a value suitable to pass as a `sort` kwarg to a mongo collection `find` call.
    Returns None when `sort_str` is empty.

    Raises HTTP 400 when a spec is neither empty, `asc`, nor `desc`.
    """
    if not sort_str:
        return None

    # Accepted specs mapped to Mongo sort directions (empty spec defaults to ascending).
    directions = {"": 1, "asc": 1, "desc": -1}

    sort_ = []
    for p in comma_separated_values(sort_str):
        components = p.split(":", maxsplit=1)
        attr = components[0]
        spec = components[1] if len(components) == 2 else ""
        if spec not in directions:
            # Bug fix: `detail` was previously a 1-tuple (stray trailing comma
            # inside the parentheses), not a string.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Sort must be of form: attribute:spec[,attribute:spec]* "
                    "where spec is `asc` (ascending -- the default if no spec) "
                    "or `desc` (descending)."
                ),
            )
        sort_.append((attr, directions[spec]))
    return sort_
274
+
275
+
276
def strip_oid(doc: dict) -> dict:
    r"""
    Returns a shallow copy of the specified dictionary, that has no `_id` key.
    (A plain shallow copy when no `_id` key is present.)
    """
    # Idiom: a builtin dict comprehension replaces `toolz.dissoc` — same result,
    # one fewer third-party dependency for this module to lean on.
    return {k: v for k, v in doc.items() if k != "_id"}
281
+
282
+
283
def timeit(cursor):
    """Exhaust `cursor` into a list; return (results, elapsed time in milliseconds)."""
    started = time_ns()
    collected = list(cursor)
    elapsed_ms = int(round((time_ns() - started) / 1e6))
    return collected, elapsed_ms
289
+
290
+
291
def find_resources(req: FindRequest, mdb: MongoDatabase, collection_name: str):
    """Find nmdc schema collection entities that match the FindRequest.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).

    Supports two pagination styles: page-number-based (`req.page`, via skip/limit,
    capped at 10,000 skipped items) and cursor-based (`req.cursor`, via `mdb.page_tokens`).

    TODO: Add type hint for function's return value (see `nmdc_runtime.api.models.util.FindResponse`).
    """
    if req.group_by:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail="I don't yet know how to ?group_by=",
        )
    if req.search:
        raise HTTPException(
            status_code=status.HTTP_418_IM_A_TEAPOT,
            detail=(
                "I don't yet know how to ?search=. "
                "Use ?filter=<attribute>.search:<spec> instead."
            ),
        )

    filter_ = get_mongo_filter(req.filter)
    # Always project "id" so cursor pagination can read the final document's id.
    projection = (
        list(set(comma_separated_values(req.fields)) | {"id"}) if req.fields else None
    )
    sort_ = get_mongo_sort(req.sort)

    total_count = mdb[collection_name].count_documents(filter=filter_)

    if req.page:
        skip = (req.page - 1) * req.per_page
        if skip > 10_000:
            # Note: because _page number_-based pagination is currently implemented via MongoDB's `skip` and `limit`
            # parameters, a full (slow) collection scan is performed to skip to the requested page. This scan takes
            # longer and longer as `skip` increases, which is why cursor-based pagination is preferred for large
            # collections.
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=(
                    "Use cursor-based pagination for paging beyond 10,000 items. "
                    "That is, instead of specifying the `page` query parameter for this endpoint, "
                    "specify the `cursor` query parameter. In particular, set `cursor` to `*` to get the first page, "
                    "and use the value of `meta.next_cursor` in the response, if not `null`, as the value to which "
                    "you set `cursor` in the next request."
                ),
            )
        limit = req.per_page
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_,
                skip=skip,
                limit=limit,
                sort=sort_,
                projection=projection,
            )
        )
        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": [[a, s] for a, s in sort_] if sort_ else None,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields

    else:  # req.cursor is not None
        if req.cursor != "*":
            # Resume from a previously issued (single-use) cursor token.
            doc = mdb.page_tokens.find_one({"_id": req.cursor, "ns": collection_name})
            if doc is None:
                raise HTTPException(
                    status_code=status.HTTP_400_BAD_REQUEST, detail="Bad cursor value"
                )
            last_id = doc["last_id"]
            mdb.page_tokens.delete_one({"_id": req.cursor})
        else:
            last_id = None

        if last_id is not None:
            # Continue strictly after the last-seen id, preserving any
            # user-supplied constraint already on "id".
            if "id" in filter_:
                filter_["id"] = merge(filter_["id"], {"$gt": last_id})
            else:
                filter_ = merge(filter_, {"id": {"$gt": last_id}})

        # Cursor pagination requires an index on "id" for efficient ordered scans.
        if "id_1" not in mdb[collection_name].index_information():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="Cursor-based pagination is not enabled for this resource.",
            )

        limit = req.per_page
        sort_for_cursor = (sort_ or []) + [("id", 1)]
        results, db_response_time_ms = timeit(
            mdb[collection_name].find(
                filter=filter_, limit=limit, sort=sort_for_cursor, projection=projection
            )
        )

        # Check whether there are any results. If there aren't any, we refrain from
        # trying to access the `id` of the final one (since it doesn't exist).
        if len(results) > 0:
            last_id = results[-1]["id"]
        else:
            last_id = None

        # If we have a `last_id` value other than `None`, we check whether it belongs
        # to the final document overall (not just the final one on this page). On the
        # other hand, if `last_id` is `None`, we set the token to `None` (since there
        # is no "next page" of results to be retrieved).
        if last_id is not None:
            # Bug fix: previously `filter_eager = filter_` aliased the dict and then
            # mutated it in the `"id" in filter_` branch, corrupting the `filter_`
            # later reported in `meta.mongo_filter_dict`. Build a fresh dict instead.
            if "id" in filter_:
                filter_eager = merge(
                    filter_, {"id": merge(filter_["id"], {"$gt": last_id})}
                )
            else:
                filter_eager = merge(filter_, {"id": {"$gt": last_id}})
            more_results = (
                mdb[collection_name].count_documents(filter=filter_eager, limit=limit)
                > 0
            )
            # If the `last_id` does not belong to the final document overall, generate
            # a new pagination token and persist it to the database. Otherwise (i.e. if
            # the `last_id` _does_ belong to the final document overall), set the token
            # to `None` (since there is no "next page" to be retrieved).
            if more_results:
                token = generate_one_id(mdb, "page_tokens")
                mdb.page_tokens.insert_one(
                    {"_id": token, "ns": collection_name, "last_id": last_id}
                )
            else:
                token = None
        else:
            token = None

        rv = {
            "meta": {
                "mongo_filter_dict": filter_,
                "mongo_sort_list": sort_for_cursor,
                "count": total_count,
                "db_response_time_ms": db_response_time_ms,
                "page": None,
                "per_page": req.per_page,
                "next_cursor": token,
            },
            "results": [strip_oid(d) for d in results],
            "group_by": [],
        }
        if req.fields:
            rv["meta"]["fields"] = req.fields
    return rv
445
+
446
+
447
def find_resources_spanning(
    req: FindRequest, mdb: MongoDatabase, collection_names: Set[str]
):
    """Find nmdc schema collection entities -- here, across multiple collections -- that match the FindRequest.

    This is useful for collections that house documents that are subclasses of a common ancestor class.

    "resources" is used generically here, as in "Web resources", e.g. Uniform Resource Identifiers (URIs).
    """
    # Spanning multiple collections only supports page-number-based pagination.
    if req.cursor or not req.page:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="This resource only supports page-based pagination",
        )

    if len(collection_names) == 0:
        # No collections to query: return an empty-but-well-formed response.
        return {
            "meta": {
                "mongo_filter_dict": get_mongo_filter(req.filter),
                "count": 0,
                "db_response_time_ms": 0,
                "page": req.page,
                "per_page": req.per_page,
            },
            "results": [],
            "group_by": [],
        }

    # Query each collection independently, then merge the per-collection responses.
    responses = {name: find_resources(req, mdb, name) for name in collection_names}
    metas = [r["meta"] for r in responses.values()]
    combined_results = list(concat(r["results"] for r in responses.values()))
    return {
        "meta": {
            # Every response carries the same filter; take it from the first.
            "mongo_filter_dict": metas[0]["mongo_filter_dict"],
            "count": sum(m["count"] for m in metas),
            "db_response_time_ms": sum(m["db_response_time_ms"] for m in metas),
            "page": req.page,
            "per_page": req.per_page,
        },
        "results": combined_results,
        "group_by": [],
    }
492
+
493
+
494
def exists(collection: MongoCollection, filter_: dict):
    r"""
    Report whether at least one document in the collection meets the filter requirements.
    """
    matching_count = collection.count_documents(filter_)
    return matching_count > 0
499
+
500
+
501
def persist_content_and_get_drs_object(
    content: str,
    description: str,
    username="(anonymous)",
    filename=None,
    content_type="application/json",
    id_ns="json-metadata-in",
    exists_ok=False,
):
    """Persist a Data Repository Service (DRS) object.

    An object may be a blob, analogous to a file, or a bundle, analogous to a folder. Sites register objects,
    and sites must ensure that these objects are accessible to the NMDC data broker.
    An object may be associated with one or more object types, useful for triggering workflows.

    Stores `content` in GridFS under a freshly minted DRS id, then registers a
    corresponding DRS object document via `_create_object` and returns it.

    Reference: https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.1.0/docs/#_drs_datatypes
    """
    mdb = get_mongo_db()
    # Mint an id in the given namespace; shoulder "gfs0" marks GridFS-backed storage.
    drs_id = local_part(generate_one_id(mdb, ns=id_ns, shoulder="gfs0"))
    filename = filename or drs_id
    PortableFilename(filename)  # validates
    DrsId(drs_id)  # validates

    # Persist the raw content in GridFS, keyed by the DRS id.
    mdb_fs = GridFS(mdb)
    mdb_fs.put(
        content,
        _id=drs_id,
        filename=filename,
        content_type=content_type,
        encoding="utf-8",
    )
    # Write the content to a temporary file so `drs_metadata_for` can derive
    # file-based metadata (e.g. size/checksums) from an actual file path.
    with tempfile.TemporaryDirectory() as save_dir:
        filepath = str(Path(save_dir).joinpath(filename))
        with open(filepath, "w") as f:
            f.write(content)
        # Minute-resolution timestamp; note the hard-coded Pacific timezone.
        now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat(
            timespec="minutes"
        )
        object_in = DrsObjectIn(
            **drs_metadata_for(
                filepath,
                base={
                    "description": (
                        description
                        + f" (created by/for {username}"
                        + f" at {now_to_the_minute})"
                    ),
                    "access_methods": [{"access_id": drs_id}],
                },
                timestamp=now_to_the_minute,
            )
        )
    # The object's canonical DRS URI, addressed via the externally visible hostname.
    self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}"
    return _create_object(
        mdb,
        object_in,
        mgr_site="nmdc-runtime",
        drs_id=drs_id,
        self_uri=self_uri,
        exists_ok=exists_ok,
    )
562
+
563
+
564
def _create_object(
    mdb: MongoDatabase,
    object_in: DrsObjectIn,
    mgr_site,
    drs_id,
    self_uri,
    exists_ok=False,
):
    """Helper function for creating a Data Repository Service (DRS) object.

    Inserts the object into `mdb.objects` and returns the inserted document.
    If a document with the same checksum already exists: returns the existing
    document when `exists_ok` is True, otherwise raises HTTP 409. A duplicate
    key on any other index raises HTTP 500.
    """
    drs_obj = DrsObject(
        **object_in.model_dump(exclude_unset=True),
        id=drs_id,
        self_uri=self_uri,
    )
    doc = drs_obj.model_dump(exclude_unset=True)
    doc["_mgr_site"] = mgr_site  # manager site
    try:
        mdb.objects.insert_one(doc)
    except DuplicateKeyError as e:
        # A violation of the unique index on (checksums.type, checksums.checksum)
        # means content with this checksum was already registered.
        if e.details["keyPattern"] == {"checksums.type": 1, "checksums.checksum": 1}:
            if exists_ok:
                return mdb.objects.find_one(
                    {
                        "checksums": {
                            "$elemMatch": {
                                "type": e.details["keyValue"]["checksums.type"],
                                "checksum": e.details["keyValue"]["checksums.checksum"],
                            }
                        }
                    }
                )
            else:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"provided checksum matches existing object: {e.details['keyValue']}",
                )
        else:
            # Duplicate on some other unique index — unexpected, so surface a 500.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="duplicate key error",
            )
    return doc
606
+
607
+
608
def _claim_job(job_id: str, mdb: MongoDatabase, site: Site):
    r"""
    Claim the job with id `job_id` on behalf of `site`.

    Verifies the site has every capability the job's workflow requires
    (HTTP 403 otherwise; HTTP 404 if the job does not exist), appends a new
    `JobClaim` to the job document, and creates a 30-day `Operation` through
    which the site can report progress and results. Returns the created
    operation as a dict.
    """
    job_doc = raise404_if_none(mdb.jobs.find_one({"id": job_id}))
    job = Job(**job_doc)
    # check that site satisfies the job's workflow's required capabilities.
    capabilities_required = job.workflow.capability_ids or []
    for cid in capabilities_required:
        if cid not in site.capability_ids:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail=f"client site does not have capability {cid} required to claim job",
            )

    # For now, allow site to claim same job multiple times,
    # to re-submit results given same job input config.
    job_op_for_site = mdb.operations.find_one(
        {"metadata.job.id": job.id, "metadata.site_id": site.id}
    )
    if job_op_for_site is not None:
        # Deliberately disabled: rejecting duplicate claims with a 409.
        # Kept for reference in case the policy above is reversed.
        # raise HTTPException(
        #     status_code=status.HTTP_409_CONFLICT,
        #     detail={
        #         "msg": (
        #             f"client site already claimed job -- "
        #             f"see operation {job_op_for_site['id']}"
        #         ),
        #         "id": job_op_for_site["id"],
        #     },
        # )
        pass

    op_id = generate_one_id(mdb, "op")
    job.claims = (job.claims or []) + [JobClaim(op_id=op_id, site_id=site.id)]
    # The operation embeds a slimmed-down copy of the job (id/workflow/config)
    # plus the claiming site, typed via `JobOperationMetadata`.
    op = Operation[ResultT, JobOperationMetadata](
        **{
            "id": op_id,
            "expire_time": expiry_dt_from_now(days=30),
            "metadata": {
                "job": Job(
                    **{
                        "id": job.id,
                        "workflow": job.workflow,
                        "config": job.config,
                    }
                ).model_dump(exclude_unset=True),
                "site_id": site.id,
                "model": dotted_path_for(JobOperationMetadata),
            },
        }
    )
    mdb.operations.insert_one(op.model_dump())
    # Persist the updated claims list on the job document.
    mdb.jobs.replace_one({"id": job.id}, job.model_dump(exclude_unset=True))

    return op.model_dump(exclude_unset=True)
664
+
665
+
666
@lru_cache
def map_nmdc_workflow_id_to_dagster_job_name():
    """Returns a dictionary mapping nmdc_workflow_id to dagster_job_name (built once, then cached)."""
    workflow_to_job = [
        ("metadata-in-1.0.0", "apply_metadata_in"),
        ("export-study-biosamples-as-csv-1.0.0", "export_study_biosamples_metadata"),
        ("gold_study_to_database", "gold_study_to_database"),
    ]
    return dict(workflow_to_job)
674
+
675
+
676
def ensure_run_config_data(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
):
    r"""
    Ensures that run_config_data has entries for certain nmdc workflow ids.
    Returns run_config_data (possibly augmented; unchanged for other workflow ids).

    Doc fix: the previous docstring said "return_config_data".
    """
    # `assoc_in` returns a new dict, so the caller's dict is never mutated.
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        # Needs the target study id and the requesting username.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_study_biosamples_metadata", "config", "username"],
            user.username,
        )
    elif nmdc_workflow_id == "gold_study_to_database":
        # Needs the GOLD study id, plus the username for the DRS export op.
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
            nmdc_workflow_inputs[0],
        )
        run_config_data = assoc_in(
            run_config_data,
            ["ops", "export_json_to_drs", "config", "username"],
            user.username,
        )
    return run_config_data
713
+
714
+
715
def inputs_for(nmdc_workflow_id, run_config_data):
    """Return a single-element list containing the input URI path for the given
    nmdc_workflow_id, constructed from run_config_data.

    Doc fix: this returns a *list* (not a bare path), and returns None for
    workflow ids with no known input mapping (previously an undocumented,
    implicit None).
    """
    if nmdc_workflow_id == "metadata-in-1.0.0":
        return [
            "/objects/"
            + get_in(["ops", "get_json_in", "config", "object_id"], run_config_data)
        ]
    if nmdc_workflow_id == "export-study-biosamples-as-csv-1.0.0":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_study_biosamples_metadata", "config", "study_id"],
                run_config_data,
            )
        ]
    if nmdc_workflow_id == "gold_study_to_database":
        return [
            "/studies/"
            + get_in(
                ["ops", "get_gold_study_pipeline_inputs", "config", "study_id"],
                run_config_data,
            )
        ]
    # Unknown workflow id: no inputs mapping.
    return None
739
+
740
def _request_dagster_run(
    nmdc_workflow_id: str,
    nmdc_workflow_inputs: List[str],
    extra_run_config_data: dict,
    mdb: MongoDatabase,
    user: User,
    repository_location_name=None,
    repository_name=None,
):
    r"""
    Requests a Dagster run using the specified parameters.
    Returns a json dictionary indicating the job's success or failure.
    This is a generic wrapper.

    Emits run events in order: REQUESTED always; then STARTED on successful
    submission to Dagster, or FAIL if the GraphQL submission raises.
    """
    # KeyError here means the workflow id has no registered Dagster job.
    dagster_job_name = map_nmdc_workflow_id_to_dagster_job_name()[nmdc_workflow_id]

    # Fill in any op-config entries this workflow requires (e.g. study id, username).
    extra_run_config_data = ensure_run_config_data(
        nmdc_workflow_id, nmdc_workflow_inputs, extra_run_config_data, mdb, user
    )

    # add REQUESTED RunEvent
    nmdc_run_id = _add_run_requested_event(
        run_spec=RunUserSpec(
            job_id=nmdc_workflow_id,
            run_config=extra_run_config_data,
            inputs=inputs_for(nmdc_workflow_id, extra_run_config_data),
        ),
        mdb=mdb,
        user=user,
    )

    dagster_client = get_dagster_graphql_client()
    try:
        dagster_run_id: str = dagster_client.submit_job_execution(
            dagster_job_name,
            repository_location_name=repository_location_name,
            repository_name=repository_name,
            run_config=extra_run_config_data,
        )

        # add STARTED RunEvent
        _add_run_started_event(run_id=nmdc_run_id, mdb=mdb)
        # Attach the Dagster run id to the most recent STARTED event for this run.
        mdb.run_events.find_one_and_update(
            filter={"run.id": nmdc_run_id, "type": "STARTED"},
            update={"$set": {"run.facets.nmdcRuntime_dagsterRunId": dagster_run_id}},
            sort=[("time", -1)],
        )

        return {"type": "success", "detail": {"run_id": nmdc_run_id}}
    except DagsterGraphQLClientError as exc:
        # add FAIL RunEvent
        _add_run_fail_event(run_id=nmdc_run_id, mdb=mdb)

        return {
            "type": "error",
            "detail": {"run_id": nmdc_run_id, "error_detail": str(exc)},
        }
797
+
798
+
799
def _get_dagster_run_status(run_id: str):
    r"""
    Look up the status of a previously requested Dagster run.

    Returns {"type": "success", "detail": <status value>} when the lookup
    succeeds, or {"type": "error", "detail": <error message>} when the
    Dagster GraphQL client raises.
    """
    client = get_dagster_graphql_client()
    try:
        run_status: DagsterRunStatus = client.get_run_status(run_id)
    except DagsterGraphQLClientError as exc:
        return {"type": "error", "detail": str(exc)}
    return {"type": "success", "detail": str(run_status.value)}
809
+
810
+
811
def check_action_permitted(username: str, action: str):
    """Returns True if a Mongo database action is "allowed" and "not denied"."""
    mdb: MongoDatabase = get_mongo_db()
    criteria = {"username": username, "action": action}
    # Denial takes precedence; otherwise the pair must be explicitly allow-listed.
    if mdb["_runtime.api.deny"].find_one(criteria) is not None:
        return False
    return mdb["_runtime.api.allow"].find_one(criteria) is not None