nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg ADDED
@@ -0,0 +1,4 @@
+<svg width="32" height="32" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path fill-rule="evenodd" clip-rule="evenodd" d="M32 16c0 8.837-7.163 16-16 16-8.838 0-16-7.163-16-16C0 7.162 7.162 0 16 0c8.837 0 16 7.162 16 16Z" fill="#A6CE39"/>
+<path fill-rule="evenodd" clip-rule="evenodd" d="M18.813 9.637h-5.45v13.9h5.474c4.555 0 7.35-3.378 7.35-6.95 0-1.635-.562-3.372-1.77-4.704-1.215-1.336-3.065-2.246-5.605-2.246ZM18.6 21.3h-2.813v-9.425H18.5c1.823 0 3.12.552 3.96 1.4.842.849 1.252 2.021 1.252 3.312 0 .784-.239 1.967-.993 2.948-.745.969-2.01 1.765-4.119 1.765Zm5.311-4.026c-.251 1.74-1.494 4.276-5.311 4.276h-3.063H18.6c3.817 0 5.06-2.536 5.311-4.276Zm1.812-2.405c-.657-2.601-2.85-4.982-6.91-4.982h-5.2 5.2c4.06 0 6.253 2.38 6.91 4.982Zm.215 1.718ZM8.363 9.675v13.887h2.425V9.675H8.363Zm2.175 13.637H8.612h1.925ZM9.575 8.65c.84 0 1.513-.689 1.513-1.513 0-.823-.673-1.512-1.513-1.512-.838 0-1.512.674-1.512 1.513 0 .823.672 1.512 1.512 1.512Z" fill="#fff"/>
+</svg>
nmdc_runtime/static/README.md ADDED
@@ -0,0 +1,5 @@
+# Static
+
+This document contains information about the origins of the other files in this directory.
+
+- `ORCID-iD_icon_vector.svg`: On September 27, 2025, we downloaded this SVG file from ORCID's [Brand Library](https://orcid.filecamp.com/s/o/3CCuLloCl73Knntn/VU19wHSMUnX9TD4R), which we found a link to on the [Brand Guidelines](https://info.orcid.org/brand-guidelines/) page of ORCID's website.

nmdc_runtime/static/favicon.ico ADDED
Binary file
nmdc_runtime/util.py CHANGED
@@ -1,80 +1,34 @@
+import importlib.resources
 import json
 import mimetypes
 import os
-import
+from collections import defaultdict
 from collections.abc import Iterable
-from contextlib import AbstractContextManager
-from copy import deepcopy
 from datetime import datetime, timezone
 from functools import lru_cache
-from io import BytesIO
 from itertools import chain
 from pathlib import Path
-from
-from typing import List, Optional, Set, Dict
+from typing import Callable, List, Optional, Set, Dict
 
-import fastjsonschema
 import requests
+from bson.son import SON
 from frozendict import frozendict
-from
-from
-from linkml_runtime
-from nmdc_schema
+from linkml.validator import Validator
+from linkml.validator.plugins import JsonschemaValidationPlugin
+from linkml_runtime import SchemaView
+from nmdc_schema import NmdcSchemaValidationPlugin
 from nmdc_schema.get_nmdc_view import ViewGetter
-from pydantic import Field, BaseModel
 from pymongo.database import Database as MongoDatabase
 from pymongo.errors import OperationFailure
-from refscan.lib.helpers import
-
+from refscan.lib.helpers import (
+    identify_references,
+    get_collection_name_to_class_names_map,
+)
 from refscan.lib.ReferenceList import ReferenceList
-from
-from toolz import merge, unique
+from toolz import merge
 
 from nmdc_runtime.api.core.util import sha256hash_from_file
 from nmdc_runtime.api.models.object import DrsObjectIn
-from typing_extensions import Annotated
-
-
-def get_names_of_classes_in_effective_range_of_slot(
-    schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
-) -> List[str]:
-    r"""
-    Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
-
-    Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
-    induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
-    of applying those additional constraints, so we do it manually here (if any are defined).
-    Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
-
-    Reference: https://linkml.io/linkml-model/latest/docs/any_of/
-    """
-
-    # Initialize the list to be empty.
-    names_of_eligible_target_classes = []
-
-    # If the `any_of` constraint is defined on this slot, use that instead of the `range`.
-    if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
-        for slot_expression in slot_definition.any_of:
-            # Use the slot expression's `range` to get the specified eligible class name
-            # and the names of all classes that inherit from that eligible class.
-            if slot_expression.range in schema_view.all_classes():
-                own_and_descendant_class_names = schema_view.class_descendants(
-                    slot_expression.range
-                )
-                names_of_eligible_target_classes.extend(own_and_descendant_class_names)
-    else:
-        # Use the slot's `range` to get the specified eligible class name
-        # and the names of all classes that inherit from that eligible class.
-        if slot_definition.range in schema_view.all_classes():
-            own_and_descendant_class_names = schema_view.class_descendants(
-                slot_definition.range
-            )
-            names_of_eligible_target_classes.extend(own_and_descendant_class_names)
-
-    # Remove duplicate class names.
-    names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
-
-    return names_of_eligible_target_classes
 
 
 def get_class_names_from_collection_spec(
@@ -157,41 +111,23 @@ def get_type_collections() -> dict:
     return mappings
 
 
-def without_id_patterns(nmdc_jsonschema):
-    rv = deepcopy(nmdc_jsonschema)
-    for cls_, spec in rv["$defs"].items():
-        if "properties" in spec:
-            if "id" in spec["properties"]:
-                spec["properties"]["id"].pop("pattern", None)
-    return rv
-
-
 @lru_cache
-def
-    """Get NMDC JSON Schema
-
-
-
-
-        .getvalue()
-        .decode("utf-8")
-    )
-    return d if enforce_id_patterns else without_id_patterns(d)
+def get_nmdc_jsonschema_path() -> Path:
+    """Get path to NMDC JSON Schema file."""
+    with importlib.resources.path(
+        "nmdc_schema", "nmdc_materialized_patterns.schema.json"
+    ) as p:
+        return p
 
 
-@lru_cache
-def
-
-
-
+@lru_cache()
+def get_nmdc_jsonschema_dict() -> dict:
+    """Get NMDC JSON Schema with materialized patterns (for identifier regexes)."""
+    with open(get_nmdc_jsonschema_path(), "r") as f:
+        return json.load(f)
 
 
 nmdc_jsonschema = get_nmdc_jsonschema_dict()
-nmdc_jsonschema_validator = get_nmdc_jsonschema_validator()
-nmdc_jsonschema_noidpatterns = get_nmdc_jsonschema_dict(enforce_id_patterns=False)
-nmdc_jsonschema_validator_noidpatterns = get_nmdc_jsonschema_validator(
-    enforce_id_patterns=False
-)
 
 REPO_ROOT_DIR = Path(__file__).parent.parent
 
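For orientation, a minimal usage sketch of the two helpers added above, assuming nmdc-runtime 2.12.0 and its bundled `nmdc-schema` package are installed and that the loaded file is an ordinary JSON Schema document with a `$defs` section:

```python
# Minimal sketch, assuming nmdc-runtime >= 2.12.0 is installed.
from nmdc_runtime.util import get_nmdc_jsonschema_path, get_nmdc_jsonschema_dict

print(get_nmdc_jsonschema_path())    # path to the schema file packaged with nmdc-schema
schema = get_nmdc_jsonschema_dict()  # @lru_cache: repeated calls reuse the same dict
print(len(schema.get("$defs", {})))  # number of class definitions, if present
```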
@@ -332,9 +268,9 @@ def find_one(k_v: dict, entities: Iterable[dict]):
     """Find the first entity with key-value pair k_v, if any?
 
     >>> find_one({"id": "foo"}, [{"id": "foo"}])
+    {'id': 'foo'}
+    >>> find_one({"id": "foo"}, [{"id": "bar"}]) is None
     True
-    >>> find_one({"id": "foo"}, [{"id": "bar"}])
-    False
     """
     if len(k_v) > 1:
         raise Exception("Supports only one key-value pair")
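The doctest fix above reflects `find_one`'s actual contract: it returns the first matching document (or `None`), not a boolean. A standalone sketch of that contract (this helper is an illustration, not the package's implementation):

```python
# Illustrative sketch only; mirrors the semantics the corrected doctest encodes.
def find_one(k_v: dict, entities):
    (k, v), = k_v.items()  # supports only one key-value pair
    return next((e for e in entities if e.get(k) == v), None)

assert find_one({"id": "foo"}, [{"id": "foo"}]) == {"id": "foo"}
assert find_one({"id": "foo"}, [{"id": "bar"}]) is None
```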
@@ -360,6 +296,49 @@ def nmdc_schema_view():
     return ViewGetter().get_view()
 
 
+@lru_cache()
+def get_nmdc_schema_validator() -> Validator:
+    schema_view = nmdc_schema_view()
+    return Validator(
+        schema_view.schema,
+        validation_plugins=[
+            JsonschemaValidationPlugin(
+                closed=True,
+                # Since the `nmdc-schema` package exports a pre-built JSON Schema file, use that
+                # instead of relying on the plugin to generate one on the fly.
+                json_schema_path=get_nmdc_jsonschema_path(),
+            ),
+            NmdcSchemaValidationPlugin(),
+        ],
+    )
+
+
+@lru_cache
+def get_class_name_to_collection_names_map(
+    schema_view: SchemaView,
+) -> Dict[str, List[str]]:
+    """
+    Returns a mapping of class names to the names of the collections that can store instances of those classes/types,
+    according to the specified `SchemaView`.
+
+    Example output:
+    ```
+    {
+        "Study": ["study_set"],
+        "Biosample": ["biosample_set"],
+        ...
+    }
+    ```
+    """
+    class_name_to_collection_names = defaultdict(list)
+    for collection_name, class_names in get_collection_name_to_class_names_map(
+        schema_view
+    ).items():
+        for class_name in class_names:
+            class_name_to_collection_names[class_name].append(collection_name)
+    return class_name_to_collection_names
+
+
 @lru_cache
 def nmdc_database_collection_instance_class_names():
     names = []
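A sketch of how the new `get_nmdc_schema_validator` might be exercised, assuming linkml's `Validator.validate(instance, target_class)` API and a hypothetical (deliberately minimal, likely invalid) document:

```python
# Hedged sketch; the document below is hypothetical and probably fails validation,
# which is the point: the report's results list the problems found.
from nmdc_runtime.util import get_nmdc_schema_validator

validator = get_nmdc_schema_validator()  # cached, like the other schema helpers
report = validator.validate({"id": "nmdc:bsm-00-000001"}, target_class="Biosample")
for result in report.results:
    print(result.severity, result.message)
```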
@@ -378,7 +357,7 @@ def nmdc_database_collection_names():
     TODO: Document this function.
 
     TODO: Assuming this function was designed to return a list of names of all Database slots that represents database
-    collections, use the function named `get_collection_names_from_schema`
+    collections, import/use the function named `get_collection_names_from_schema` from `refscan.lib.helpers`
     instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
     maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
     """
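A sketch of the replacement the updated TODO suggests, assuming `get_collection_names_from_schema` takes a `SchemaView` as its argument (the signature is an assumption; only the import path is stated in the diff):

```python
# Sketch of the suggested replacement; the helper's signature is assumed.
from refscan.lib.helpers import get_collection_names_from_schema
from nmdc_runtime.util import nmdc_schema_view

collection_names = get_collection_names_from_schema(nmdc_schema_view())
print(sorted(collection_names)[:5])  # e.g. ['biosample_set', ...]
```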
@@ -414,6 +393,12 @@ def all_docs_have_unique_id(coll) -> bool:
 
 
 def specialize_activity_set_docs(docs):
+    """
+    TODO: Document this function.
+
+    TODO: Check whether this function is still necessary, given that the `Database` class
+    in `nmdc-schema` does not have a slot named `activity_set`.
+    """
     validation_errors = {}
     type_collections = get_type_collections()
     if "activity_set" in docs:
@@ -497,8 +482,56 @@ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[
     return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
 
 
+def does_collection_have_unique_index_on_id_field(
+    collection_name: str, db: MongoDatabase
+) -> bool:
+    """Check whether the specified MongoDB collection has a unique index on its `id` field (not `_id`).
+
+    Note: If the specified MongoDB collection either does not exist or is a _view_ instead of a collection,
+    this function will return `False`.
+
+    References:
+    - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes
+    - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.index_information
+    """
+    # Check whether the specified collection actually exists in the database; and, if it does,
+    # whether it is really a _collection_ (as opposed to being a _view_). If it doesn't exist,
+    # or it is a view, return `False` right away.
+    collection_infos_cursor = db.list_collections(filter={"name": collection_name})
+    collection_infos = list(collection_infos_cursor)
+    if len(collection_infos) == 0:
+        return False
+    collection_info = collection_infos[0]
+    if collection_info["type"] != "collection":
+        return False
+
+    # Now that we know we're dealing with a collection, get information about each of its indexes.
+    collection = db.get_collection(collection_name)
+    for index_information in collection.list_indexes():
+        # Get the "field_name-direction" pairs that make up this index.
+        field_name_and_direction_pairs: SON = index_information["key"]
+
+        # If this index involves a number of fields other than one, skip it.
+        # We're only interested in indexes that involve the `id` field by itself.
+        if len(field_name_and_direction_pairs.keys()) != 1:
+            continue
+
+        # Check whether the field this index involves is the `id` field,
+        # and whether this index is `unique`.
+        field_name = list(field_name_and_direction_pairs.keys())[0]
+        if field_name == "id" and index_information.get("unique", False):
+            return True
+
+    return False
+
+
 def ensure_unique_id_indexes(mdb: MongoDatabase):
     """Ensure that any collections with an "id" field have an index on "id"."""
+
+    # Note: The pipe (i.e. `|`) operator performs a union of the two sets. In this case,
+    # it creates a set (i.e. `candidate_names`) consisting of the names of both
+    # (a) all collections in the real database, and (b) all collections that
+    # the NMDC schema says can contain instances of classes that have an "id" slot.
     candidate_names = (
         set(mdb.list_collection_names()) | schema_collection_names_with_id_field()
     )
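A sketch of the behavior the new helper encodes, using plain pymongo against a hypothetical local database (`nmdc_test` and the reachable `mongod` are assumptions):

```python
# Hedged sketch; assumes a reachable mongod at localhost:27017.
from pymongo import MongoClient
from nmdc_runtime.util import does_collection_have_unique_index_on_id_field

mdb = MongoClient("mongodb://localhost:27017")["nmdc_test"]
mdb["biosample_set"].insert_one({"id": "nmdc:bsm-00-000001"})  # collection now exists
assert not does_collection_have_unique_index_on_id_field("biosample_set", mdb)
mdb["biosample_set"].create_index("id", unique=True)
assert does_collection_have_unique_index_on_id_field("biosample_set", mdb)
```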
@@ -506,273 +539,67 @@ def ensure_unique_id_indexes(mdb: MongoDatabase):
         if collection_name.startswith("system."):  # reserved by mongodb
             continue
 
+        # If the collection already has a unique index on `id`, there's no need
+        # to check anything else about the collection.
+        if does_collection_have_unique_index_on_id_field(collection_name, mdb):
+            continue
+
         if (
             collection_name in schema_collection_names_with_id_field()
             or all_docs_have_unique_id(mdb[collection_name])
         ):
-
-
-
-
-
-
-            upsert: bool = False
-            multi: bool = False
-
-
-class DeleteStatement(BaseModel):
-    q: dict
-    limit: Annotated[int, Field(ge=0, le=1)] = 1
-
-
-class OverlayDBError(Exception):
-    pass
-
-
-class OverlayDB(AbstractContextManager):
-    """Provides a context whereby a base Database is overlaid with a temporary one.
-
-    If you need to run basic simulations of updates to a base database,
-    you don't want to actually commit transactions to the base database.
-
-    For example, to insert or replace (matching on "id") many documents into a collection in order
-    to then validate the resulting total set of collection documents, an OverlayDB writes to
-    an overlay collection that "shadows" the base collection during a "find" query
-    (the "merge_find" method of an OverlayDB object): if a document with `id0` is found in the
-    overlay collection, that id is marked as "seen" and will not also be returned when
-    subsequently scanning the (unmodified) base-database collection.
-
-    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
-    database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
-    `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
-    the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
-    "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
-    of the `merge_find` method, which internally accesses both the real database and the overlaying database.
-
-    Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
-    documents from a base collection to the overlay, and then applying the updates to the overlay,
-    so that again, base collections are unmodified, and a "merge_find" call will produce a result
-    *as if* the base collection(s) were modified.
+            # Check if index already exists, and if so, drop it if not unique
+            try:
+                existing_indexes = list(mdb[collection_name].list_indexes())
+                id_index = next(
+                    (idx for idx in existing_indexes if idx["name"] == "id_1"), None
+                )
 
-
-
-
+                if id_index:
+                    # If index exists but isn't unique, drop it so we can recreate
+                    if not id_index.get("unique", False):
+                        mdb[collection_name].drop_index("id_1")
+
+                # Create index with unique constraint
+                mdb[collection_name].create_index("id", unique=True)
+            except OperationFailure as e:
+                # If error is about index with same name, just continue
+                if "An existing index has the same name" in str(e):
+                    continue
+                else:
+                    # Re-raise other errors
+                    raise
 
-    Usage:
-    ```
-    with OverlayDB(mdb) as odb:
-        # do stuff, e.g. `odb.replace_or_insert_many(...)`
-    ```
-    """
 
-
-        self._bottom_db = mdb
-        self._top_db = self._bottom_db.client.get_database(f"overlay-{uuid4()}")
-        ensure_unique_id_indexes(self._top_db)
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self._bottom_db.client.drop_database(self._top_db.name)
-
-    def replace_or_insert_many(self, coll_name, documents: list):
-        try:
-            self._top_db[coll_name].insert_many(documents)
-        except OperationFailure as e:
-            raise OverlayDBError(str(e.details))
-
-    def apply_updates(self, coll_name, updates: list):
-        """prepare overlay db and apply updates to it."""
-        assert all(UpdateStatement(**us) for us in updates)
-        for update_spec in updates:
-            for bottom_doc in self._bottom_db[coll_name].find(update_spec["q"]):
-                self._top_db[coll_name].insert_one(bottom_doc)
-        try:
-            self._top_db.command({"update": coll_name, "updates": updates})
-        except OperationFailure as e:
-            raise OverlayDBError(str(e.details))
-
-    def delete(self, coll_name, deletes: list):
-        """ "apply" delete command by flagging docs in overlay database"""
-        assert all(DeleteStatement(**us) for us in deletes)
-        for delete_spec in deletes:
-            for bottom_doc in self._bottom_db[coll_name].find(
-                delete_spec["q"], limit=delete_spec.get("limit", 1)
-            ):
-                bottom_doc["_deleted"] = True
-                self._top_db[coll_name].insert_one(bottom_doc)
-
-    def merge_find(self, coll_name, find_spec: dict):
-        """Yield docs first from overlay and then from base db, minding deletion flags."""
-        # ensure projection of "id" and "_deleted"
-        if "projection" in find_spec:
-            proj = find_spec["projection"]
-            if isinstance(proj, dict):
-                proj = merge(proj, {"id": 1, "_deleted": 1})
-            elif isinstance(proj, list):
-                proj = list(unique(proj + ["id", "_deleted"]))
-
-        top_docs = self._top_db[coll_name].find(**find_spec)
-        bottom_docs = self._bottom_db[coll_name].find(**find_spec)
-        top_seen_ids = set()
-        for doc in top_docs:
-            if not doc.get("_deleted"):
-                yield doc
-            top_seen_ids.add(doc["id"])
-
-        for doc in bottom_docs:
-            if doc["id"] not in top_seen_ids:
-                yield doc
-
-
-def validate_json(
-    in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
-):
+def decorate_if(condition: bool = False) -> Callable:
     r"""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    is referenced by any of the documents passed in would, indeed, exist
-    in the database, if the documents passed in were to be inserted into
-    the database. In other words, set this to `True` if you want this
-    function to perform referential integrity checks.
+    Decorator that applies another decorator only when `condition` is `True`.
+
+    Note: We implemented this so we could conditionally register
+    endpoints with FastAPI's `@router`.
+
+    Example usages:
+    A. Apply the `@router.get` decorator:
+    ```python
+    @decorate_if(True)(router.get("/me"))
+    def get_me(...):
+        ...
+    ```
+    B. Bypass the `@router.get` decorator:
+    ```python
+    @decorate_if(False)(router.get("/me"))
+    def get_me(...):
+        ...
+    ```
     """
-    validator = Draft7Validator(get_nmdc_jsonschema_dict())
-    docs = deepcopy(in_docs)
-    validation_errors = {}
 
-
-
-
-
-        # See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
-        if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
-            continue
+    def apply_original_decorator(original_decorator: Callable) -> Callable:
+        def check_condition(original_function: Callable) -> Callable:
+            if condition:
+                return original_decorator(original_function)
             else:
-
-                f"'{coll_name}' is not a known schema collection name"
-            ]
-            continue
+                return original_function
 
-
-        validation_errors[coll_name] = [e.message for e in errors]
-        if coll_docs:
-            if not isinstance(coll_docs, list):
-                validation_errors[coll_name].append("value must be a list")
-            elif not all(isinstance(d, dict) for d in coll_docs):
-                validation_errors[coll_name].append(
-                    "all elements of list must be dicts"
-                )
-            if not validation_errors[coll_name]:
-                try:
-                    with OverlayDB(mdb) as odb:
-                        odb.replace_or_insert_many(coll_name, coll_docs)
-                except OverlayDBError as e:
-                    validation_errors[coll_name].append(str(e))
-
-    if all(len(v) == 0 for v in validation_errors.values()):
-        # Second pass. Try instantiating linkml-sourced dataclass
-        in_docs.pop("@type", None)
-        try:
-            NMDCDatabase(**in_docs)
-        except Exception as e:
-            return {"result": "errors", "detail": str(e)}
-
-        # Third pass (if enabled): Check inter-document references.
-        if check_inter_document_references is True:
-            # Prepare to use `refscan`.
-            #
-            # Note: We check the inter-document references in two stages, which are:
-            #     1. For each document in the JSON payload, check whether each document it references already exists
-            #        (in the collections the schema says it can exist in) in the database. We use the
-            #        `refscan` package to do this, which returns violation details we'll use in the second stage.
-            #     2. For each violation found in the first stage (i.e. each reference to a not-found document), we
-            #        check whether that document exists (in the collections the schema says it can exist in) in the
-            #        JSON payload. If it does, then we "waive" (i.e. discard) that violation.
-            #     The violations that remain after those two stages are the ones we return to the caller.
-            #
-            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
-            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
-            #       is not a drop-in replacement for a pymongo's `Database` class, which is the only thing that
-            #       `refscan`'s `Finder` class accepts.
-            #
-            finder = Finder(database=mdb)
-            references = get_allowed_references()
-            reference_field_names_by_source_class_name = (
-                references.get_reference_field_names_by_source_class_name()
-            )
+            return check_condition
 
-
-            for source_collection_name, documents in in_docs.items():
-                for document in documents:
-                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
-                    source_document = dict(document, _id=None)
-                    violations = scan_outgoing_references(
-                        document=source_document,
-                        schema_view=nmdc_schema_view(),
-                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
-                        references=references,
-                        finder=finder,
-                        collection_names=nmdc_database_collection_names(),
-                        source_collection_name=source_collection_name,
-                        user_wants_to_locate_misplaced_documents=False,
-                    )
-
-                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
-                    for violation in violations:
-                        can_waive_violation = False
-                        # Determine which collections can contain the referenced document, based upon
-                        # the schema class of which this source document is an instance.
-                        target_collection_names = (
-                            references.get_target_collection_names(
-                                source_class_name=violation.source_class_name,
-                                source_field_name=violation.source_field_name,
-                            )
-                        )
-                        # Check whether the referenced document exists in any of those collections in the JSON payload.
-                        for json_coll_name, json_coll_docs in in_docs.items():
-                            if json_coll_name in target_collection_names:
-                                for json_coll_doc in json_coll_docs:
-                                    if json_coll_doc["id"] == violation.target_id:
-                                        can_waive_violation = True
-                                        break  # stop checking
-                            if can_waive_violation:
-                                break  # stop checking
-                        if not can_waive_violation:
-                            violation_as_str = (
-                                f"Document '{violation.source_document_id}' "
-                                f"in collection '{violation.source_collection_name}' "
-                                f"has a field '{violation.source_field_name}' that "
-                                f"references a document having id "
-                                f"'{violation.target_id}', but the latter document "
-                                f"does not exist in any of the collections the "
-                                f"NMDC Schema says it can exist in."
-                            )
-                            validation_errors[source_collection_name].append(
-                                violation_as_str
-                            )
-
-        # If any collection's error list is not empty, return an error response.
-        if any(len(v) > 0 for v in validation_errors.values()):
-            return {"result": "errors", "detail": validation_errors}
-
-        return {"result": "All Okay!"}
-    else:
-        return {"result": "errors", "detail": validation_errors}
+    return apply_original_decorator
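Beyond the docstring's own examples, a runnable sketch of conditional endpoint registration with the new `decorate_if` (the `IS_SCALAR_ENABLED` flag and route path are hypothetical):

```python
# Hedged sketch; flag name and route are hypothetical, the mechanism is decorate_if's.
from fastapi import APIRouter
from nmdc_runtime.util import decorate_if

IS_SCALAR_ENABLED = False  # e.g. read from an environment variable
router = APIRouter()

@decorate_if(IS_SCALAR_ENABLED)(router.get("/scalar"))
def get_scalar_page():
    # Registered with the router only when the flag is True;
    # otherwise the function is returned undecorated.
    return {"detail": "only reachable when the flag is True"}
```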
nmdc_runtime-2.12.0.dist-info/METADATA ADDED
@@ -0,0 +1,45 @@
+Metadata-Version: 2.4
+Name: nmdc-runtime
+Version: 2.12.0
+Summary: A runtime system for NMDC data management and orchestration
+Project-URL: Changelog, https://github.com/microbiomedata/nmdc-runtime/releases
+Project-URL: Documentation, https://docs.microbiomedata.org/runtime
+Project-URL: Issues, https://github.com/microbiomedata/nmdc-runtime/issues
+Project-URL: Repository, https://github.com/microbiomedata/nmdc-runtime
+License-File: LICENSE
+Requires-Python: >=3.10
+Requires-Dist: base32-lib
+Requires-Dist: boto3
+Requires-Dist: click
+Requires-Dist: dagit
+Requires-Dist: dagster
+Requires-Dist: dagster-graphql
+Requires-Dist: dagster-postgres
+Requires-Dist: fastapi>=0.115.0
+Requires-Dist: frozendict
+Requires-Dist: git-root
+Requires-Dist: jq
+Requires-Dist: jsonasobj2
+Requires-Dist: linkml
+Requires-Dist: linkml-runtime
+Requires-Dist: lxml
+Requires-Dist: nmdc-schema==11.13.0
+Requires-Dist: ontology-loader==0.2.2
+Requires-Dist: pandas
+Requires-Dist: passlib[bcrypt]
+Requires-Dist: pydantic[email]>=1.10.0
+Requires-Dist: pyinstrument
+Requires-Dist: pymongo
+Requires-Dist: python-dotenv
+Requires-Dist: python-jose[cryptography]
+Requires-Dist: python-multipart>=0.0.18
+Requires-Dist: pyyaml
+Requires-Dist: refscan==0.3.2
+Requires-Dist: requests
+Requires-Dist: requests-cache
+Requires-Dist: scalar-fastapi<2.0.0,>=1.4.1
+Requires-Dist: tenacity
+Requires-Dist: toolz
+Requires-Dist: tqdm
+Requires-Dist: unidecode
+Requires-Dist: uvicorn[standard]