nmdc-runtime 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


nmdc_runtime/minter/config.py CHANGED
@@ -2,8 +2,9 @@ import os
 from functools import lru_cache
 from typing import List
 
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
+from nmdc_schema.id_helpers import get_typecode_for_future_ids
 
+from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
 
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
 
 
-def extract_typecode_from_pattern(pattern: str) -> str:
-    r"""
-    Returns the typecode portion of the specified string.
-
-    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
-    'foo'
-    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
-    'foo'
-    """
-
-    # Get the portion of the pattern preceding the first hyphen.
-    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
-    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
-
-    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
-    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
-    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
-        inner_pattern = typecode_sub_pattern[1:-1]
-
-        # Finally, get everything before the first `|`, if any.
-        # e.g. "apple|banana|carrot" → "apple"
-        # e.g. "apple" → "apple"
-        typecode = inner_pattern.split("|", maxsplit=1)[0]
-    else:
-        # Note: This is the original behavior, before we added support for multi-typecode patterns.
-        # e.g. "apple" → "apple"
-        typecode = typecode_sub_pattern
-
-    return typecode
-
-
 @lru_cache()
 def typecodes() -> List[dict]:
     r"""
     Returns a list of dictionaries containing typecodes and associated information derived from the schema.
 
-    Preconditions about the schema:
-    - The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
-    - The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
-      or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
-    - The typecode portion of the pattern does not, itself, contain any hyphens.
-
-    TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
-          Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
-          in a dedicated property of a class; for example, one named `typecode`).
+    Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
+          class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
+          that class _today_; regardless of what it may have used in the past.
+
+    >>> typecode_descriptors = typecodes()
+    # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
+    >>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
+    True
+    # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
+    >>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    True
+    >>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    False
     """
     id_pattern_prefix = r"^(nmdc):"
 
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
             case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
-                # Get the portion of the pattern following the prefix.
-                # e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
-                index_of_first_character_following_prefix = len(id_pattern_prefix)
-                pattern_without_prefix = p[index_of_first_character_following_prefix:]
+                # Extract the typecode from the pattern.
+                typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
 
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": extract_typecode_from_pattern(pattern_without_prefix),
+                        "name": typecode_for_future_ids,
                     }
                 )
             case _:
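
The doctests above indicate that, for a class whose `id` pattern carries multiple typecodes (e.g. `nmdc:MassSpectrometry`), the helper returns the typecode the minter uses today (`dgms`) rather than the legacy one (`omprc`). A minimal sketch of that behavior, assuming the pattern has the shape `^(nmdc):(omprc|dgms)-...` and that the last pipe-delimited alternative is the current one (the real implementation lives in `nmdc_schema.id_helpers`):

def sketch_get_typecode_for_future_ids(slot_pattern: str) -> str:
    # "^(nmdc):(omprc|dgms)-123$" -> "(omprc|dgms)" -> "dgms"
    typecode_part = slot_pattern.split(":", maxsplit=1)[1].split("-", maxsplit=1)[0]
    alternatives = typecode_part.strip("()").split("|")
    return alternatives[-1]  # assumption: the last alternative is the one minted today

assert sketch_get_typecode_for_future_ids("^(nmdc):(omprc|dgms)-123$") == "dgms"
assert sketch_get_typecode_for_future_ids("^(nmdc):sty-123$") == "sty"
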
nmdc_runtime/site/graphs.py CHANGED
@@ -2,6 +2,7 @@ from dagster import graph
 
 from nmdc_runtime.site.ops import (
     build_merged_db,
+    generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
     nmdc_schema_object_to_dict,
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    get_database_updater_inputs,
+    post_submission_portal_biosample_ingest_record_stitching_filename,
+    generate_data_generation_set_post_biosample_ingest,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
@@ -117,12 +121,14 @@ def apply_changesheet():
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
 def apply_metadata_in():
     outputs = perform_mongo_updates(get_json_in())
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
         all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
+
+
+@graph
+def generate_data_generation_set_for_biosamples_in_nmdc_study():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_data_generation_set_post_biosample_ingest(
+        study_id, gold_nmdc_instrument_map_df
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def generate_biosample_set_from_samples_in_gold():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_biosample_set_for_nmdc_study_from_gold(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
nmdc_runtime/site/ops.py CHANGED
@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
+from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+from nmdc_runtime.site.util import (
+    run_and_log,
+    schema_collection_has_index_on_id,
+    nmdc_study_id_to_filename,
+)
 from nmdc_runtime.util import (
     drs_object_in_for,
     get_names_of_classes_in_effective_range_of_slot,
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
-    context.log.info(f"{collection_names=}")
-
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
+    context.log.info(f"constructing `alldocs` collection using {collection_names=}")
 
     document_class_names = set(
         chain.from_iterable(collection_name_to_class_names.values())
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
         for cls_name in document_class_names
     }
 
-    # Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.
+    # Any ancestor of a document class is a document-referencable range,
+    # i.e., a valid range of a document-reference-ranged slot.
     document_referenceable_ranges = set(
         chain.from_iterable(
            schema_view.class_ancestors(cls_name) for cls_name in document_class_names
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
     ):
         document_reference_ranged_slots[cls_name].append(slot_name)
 
-    # Drop any existing `alldocs` collection (e.g. from previous use of this op).
-    #
-    # FIXME: This "nuke and pave" approach introduces a race condition.
-    #        For example, if someone were to visit an API endpoint that uses the "alldocs" collection,
-    #        the endpoint would fail to perform its job since the "alldocs" collection is temporarily missing.
-    #
-    mdb.alldocs.drop()
+    # Build `alldocs` to a temporary collection for atomic replacement
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
 
     for coll_name in collection_names:
         context.log.info(f"{coll_name=}")
-        requests = []
+        write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            requests.append(InsertOne(new_doc))
-            if len(requests) == BULK_WRITE_BATCH_SIZE:
-                _ = mdb.alldocs.bulk_write(requests, ordered=False)
-                requests.clear()
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
-        if len(requests) > 0:
-            _ = mdb.alldocs.bulk_write(requests, ordered=False)
-            documents_processed_counter += len(requests)
+        if len(write_operations) > 0:
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
         context.log.info(
             f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
 
     context.log.info(
-        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
     )
 
-    # Re-idx for `alldocs` collection
-    mdb.alldocs.create_index("id", unique=True)
-    # The indexes were added to improve the performance of the
-    # /data_objects/study/{study_id} endpoint
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
-    [mdb.alldocs.create_index(slot) for slot in slots_to_index]
-
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+
     return mdb.alldocs.estimated_document_count()
 
 
@@ -1240,3 +1246,94 @@ def ncbi_submission_xml_from_nmdc_study(
         all_instruments,
     )
     return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+    },
+)
+def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+    )
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
nmdc_runtime/site/repair/database_updater.py ADDED
@@ -0,0 +1,230 @@
+from functools import lru_cache
+from typing import Any, Dict, List
+import pandas as pd
+from nmdc_runtime.site.resources import (
+    RuntimeApiUserClient,
+    RuntimeApiSiteClient,
+    GoldApiClient,
+)
+from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
+from nmdc_schema import nmdc
+
+
+class DatabaseUpdater:
+    def __init__(
+        self,
+        runtime_api_user_client: RuntimeApiUserClient,
+        runtime_api_site_client: RuntimeApiSiteClient,
+        gold_api_client: GoldApiClient,
+        study_id: str,
+        gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+    ):
+        """This class serves as an API for repairing connections in the database by
+        adding records that are essentially missing "links"/"connections". As we identify
+        common use cases for adding missing records to the database, we can
+        add helper methods to this class.
+
+        :param runtime_api_user_client: An object of RuntimeApiUserClient which can be
+            used to retrieve instance records from the NMDC database.
+        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
+            used to mint new IDs for the repaired records that need to be added into the NMDC database.
+        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
+            records from GOLD via the GOLD API.
+        :param study_id: NMDC study ID for which the missing records need to be added.
+        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the
+            NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records.
+        """
+        self.runtime_api_user_client = runtime_api_user_client
+        self.runtime_api_site_client = runtime_api_site_client
+        self.gold_api_client = gold_api_client
+        self.study_id = study_id
+        self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+
+    @lru_cache
+    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
+        """Fetch response from GOLD /biosamples API for a given biosample id.
+
+        :param gold_biosample_id: GOLD biosample ID.
+        :return: Dictionary containing the response from the GOLD /biosamples API.
+        """
+        return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
+
+    @lru_cache
+    def _fetch_gold_projects(self, gold_biosample_id: str):
+        """Fetch response from GOLD /projects API for a given biosample id.
+
+        :param gold_biosample_id: GOLD biosample ID
+        :return: Dictionary containing the response from the GOLD /projects API.
+        """
+        return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
+
+    def generate_data_generation_set_records_from_gold_api_for_study(
+        self,
+    ) -> nmdc.Database:
+        """This method creates missing data generation records for a given study in the NMDC database using
+        metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated
+        with the study from the NMDC database. Then, it fetches all the biosample and project data
+        associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id
+        mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class
+        to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based
+        on the number of GOLD sequencing projects, and then reimplement only the part of logic from that
+        class which is responsible for making data_generation_set records.
+
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        all_gold_biosamples = []
+        all_gold_projects = []
+        for biosample in biosample_set:
+            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers")
+            if gold_biosample_identifiers:
+                for gold_biosample_id in gold_biosample_identifiers:
+                    gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0]
+                    gold_projects = self._fetch_gold_projects(gold_biosample_id)
+                    gold_biosample["projects"] = gold_projects
+
+                    all_gold_biosamples.append(gold_biosample)
+                    all_gold_projects.extend(gold_projects)
+
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=all_gold_biosamples,
+            projects=all_gold_projects,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+
+        # The GoldStudyTranslator class has some pre-processing logic which filters out
+        # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
+        filtered_biosamples = gold_study_translator.biosamples
+        filtered_projects = gold_study_translator.projects
+
+        gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
+        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:NucleotideSequencing", len(gold_project_ids)
+        ).json()
+        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
+            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
+        )
+
+        gold_to_nmdc_biosample_ids = {}
+
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                gold_id_stripped = gold_id.replace("gold:", "")
+                gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"]
+
+        database.data_generation_set = []
+        # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records
+        # created is based on the number of GOLD sequencing projects
+        for project in filtered_projects:
+            # map the projectGoldId to the NMDC biosample ID
+            biosample_gold_id = next(
+                (
+                    biosample["biosampleGoldId"]
+                    for biosample in filtered_biosamples
+                    if any(
+                        p["projectGoldId"] == project["projectGoldId"]
+                        for p in biosample.get("projects", [])
+                    )
+                ),
+                None,
+            )
+
+            if biosample_gold_id:
+                nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
+                if nmdc_biosample_id:
+                    database.data_generation_set.append(
+                        gold_study_translator._translate_nucleotide_sequencing(
+                            project,
+                            nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
+                                project["projectGoldId"]
+                            ],
+                            nmdc_biosample_id=nmdc_biosample_id,
+                            nmdc_study_id=self.study_id,
+                        )
+                    )
+
+        return database
+
+    def generate_biosample_set_from_gold_api_for_study(self) -> nmdc.Database:
+        """This method creates biosample_set records for a given study in the NMDC database using
+        metadata from GOLD. The logic works by first fetching the biosampleGoldId values of all
+        biosamples associated with the study. Then, it fetches the list of all biosamples associated
+        with the GOLD study using the GOLD API. There's pre-processing logic in the GoldStudyTranslator
+        to filter out biosamples based on `sequencingStrategy` and `projectStatus`. On this list of
+        filtered biosamples, we compute a "set difference" (conceptually) between the list of
+        filtered samples and ones that are already in the NMDC database, i.e., we ignore biosamples
+        that are already present in the database, and continue on to create biosample_set records for
+        those that do not have records in the database already.
+
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+
+        # get a list of all biosamples associated with a given NMDC study id
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        # get a list of GOLD biosample ids (`biosampleGoldId` values) by iterating
+        # over all the biosample_set records retrieved using the above logic
+        nmdc_gold_ids = set()
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                nmdc_gold_ids.add(gold_id.replace("gold:", ""))
+
+        # retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
+        # on the NMDC study record
+        nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
+        gold_study_id = nmdc_study.get("gold_study_identifiers", [])[0].replace(
+            "gold:", ""
+        )
+
+        # use the GOLD study id to fetch all biosample records associated with the study
+        gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
+            gold_study_id
+        )
+
+        # part of the code where we are (conceptually) computing a set difference between
+        # the list of filtered samples and ones that are already in the NMDC database
+        missing_gold_biosamples = [
+            gbs
+            for gbs in gold_biosamples_for_study
+            if gbs.get("biosampleGoldId") not in nmdc_gold_ids
+        ]
+
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=missing_gold_biosamples,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+
+        translated_biosamples = gold_study_translator.biosamples
+
+        # mint new NMDC biosample IDs for the "missing" biosamples
+        gold_biosample_ids = [
+            biosample["biosampleGoldId"] for biosample in translated_biosamples
+        ]
+        nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:Biosample", len(translated_biosamples)
+        ).json()
+        gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
+
+        database.biosample_set = [
+            gold_study_translator._translate_biosample(
+                biosample,
+                nmdc_biosample_id=gold_to_nmdc_biosample_ids[
+                    biosample["biosampleGoldId"]
+                ],
+                nmdc_study_id=self.study_id,
+                nmdc_field_site_id=None,
+            )
+            for biosample in translated_biosamples
+        ]
+
+        return database
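
A hedged usage sketch of the new class outside Dagster, assuming the three client objects have already been constructed and authenticated (the study ID and the `mapping_tsv_url` variable are illustrative):

import pandas as pd

updater = DatabaseUpdater(
    runtime_api_user_client,   # RuntimeApiUserClient, constructed elsewhere
    runtime_api_site_client,   # RuntimeApiSiteClient, constructed elsewhere
    gold_api_client,           # GoldApiClient, constructed elsewhere
    study_id="nmdc:sty-00-000001",  # hypothetical study ID
    gold_nmdc_instrument_map_df=pd.read_csv(mapping_tsv_url, sep="\t"),
)
database = updater.generate_biosample_set_from_gold_api_for_study()
print(f"{len(database.biosample_set)} missing biosample records to add")
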
nmdc_runtime/site/repository.py CHANGED
@@ -25,6 +25,7 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
 from nmdc_runtime.api.models.trigger import Trigger
 from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
 from nmdc_runtime.site.graphs import (
+    generate_biosample_set_from_samples_in_gold,
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
@@ -44,6 +45,7 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_surface_water_metadata,
     ensure_alldocs,
     nmdc_study_to_ncbi_submission_export,
+    generate_data_generation_set_for_biosamples_in_nmdc_study,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -113,6 +115,13 @@ housekeeping_weekly = ScheduleDefinition(
     job=housekeeping.to_job(**preset_normal),
 )
 
+ensure_alldocs_daily = ScheduleDefinition(
+    name="daily_ensure_alldocs",
+    cron_schedule="0 3 * * *",
+    execution_timezone="America/New_York",
+    job=ensure_alldocs.to_job(**preset_normal),
+)
+
 
 def asset_materialization_metadata(asset_event, key):
     """Get metadata from an asset materialization event.
@@ -453,7 +462,7 @@ def repo():
         export_study_biosamples_metadata.to_job(**preset_normal),
         ensure_alldocs.to_job(**preset_normal),
     ]
-    schedules = [housekeeping_weekly]
+    schedules = [housekeeping_weekly, ensure_alldocs_daily]
     sensors = [
         done_object_put_ops,
         ensure_gold_translation_job,
@@ -915,6 +924,97 @@ def biosample_export():
     ]
 
 
+@repository
+def database_records_stitching():
+    normal_resources = run_config_frozen__normal_env["resources"]
+    return [
+        generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
+            description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+        generate_biosample_set_from_samples_in_gold.to_job(
+            description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+    ]
+
+
 # @repository
 # def validation():
 #     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
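
Both new jobs are registered with blank `nmdc_study_id` and `username` values, to be filled in by an operator at launch time. A sketch of the op-level run config an operator would supply, e.g. in the Dagit launchpad (the study ID and username are placeholders):

run_config = {
    "ops": {
        "get_database_updater_inputs": {
            "config": {
                "nmdc_study_id": "nmdc:sty-00-000001",  # placeholder
                "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
            }
        },
        "export_json_to_drs": {"config": {"username": "someuser"}},  # placeholder
    }
}
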
nmdc_runtime/site/resources.py CHANGED
@@ -129,16 +129,23 @@ class RuntimeApiUserClient(RuntimeApiClient):
         return response.json()["cursor"]["firstBatch"]
 
     def get_biosamples_for_study(self, study_id: str):
+        # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
+        #       The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
+        #       but the tradeoff there is that we would need to make multiple requests to step through
+        #       each of the pages. By picking a large number for max_page_size, we can get all the results
+        #       in a single request.
+        #       This method previously used the /queries:run endpoint, but the problem with that was that
+        #       it used to truncate the number of results returned to 100.
         response = self.request(
-            "POST",
-            f"/queries:run",
+            "GET",
+            f"/nmdcschema/biosample_set",
             {
-                "find": "biosample_set",
-                "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["resources"]
 
     def get_omics_processing_by_name(self, name: str):
         response = self.request(
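
The TODO above trades pagination for one large request. If a result set ever exceeded `max_page_size`, the `page_token` mechanism the comment mentions would look roughly like this (a sketch; the `next_page_token` response field name is an assumption about the endpoint's response shape):

import json
import requests

def get_all_biosamples_for_study(base_url: str, study_id: str, page_size: int = 1000):
    params = {
        "filter": json.dumps({"associated_studies": study_id}),
        "max_page_size": page_size,
    }
    resources = []
    while True:
        payload = requests.get(f"{base_url}/nmdcschema/biosample_set", params=params).json()
        resources.extend(payload["resources"])
        token = payload.get("next_page_token")  # assumption: token field name
        if not token:
            break
        params["page_token"] = token  # request the next page
    return resources
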
@@ -152,6 +159,18 @@ class RuntimeApiUserClient(RuntimeApiClient):
         response.raise_for_status()
         return response.json()["cursor"]["firstBatch"]
 
+    def get_study(self, study_id: str):
+        response = self.request(
+            "POST",
+            f"/queries:run",
+            {
+                "find": "study_set",
+                "filter": {"id": study_id},
+            },
+        )
+        response.raise_for_status()
+        return response.json()["cursor"]["firstBatch"]
+
 
 class RuntimeApiSiteClient(RuntimeApiClient):
     def __init__(
@@ -370,6 +389,18 @@ class GoldApiClient(BasicAuthClient):
             return None
         return results[0]
 
+    def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/projects", params={"biosampleGoldId": id})
+        return results
+
+    def fetch_biosample_by_biosample_id(
+        self, biosample_id: str
+    ) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/biosamples", params={"biosampleGoldId": id})
+        return results
+
 
 @resource(
     config_schema={
nmdc_runtime/site/translation/gold_translator.py CHANGED
@@ -12,6 +12,29 @@ from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
 SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
 
 
+def _is_valid_project(project: dict) -> bool:
+    """A project is considered valid if:
+    1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
+    2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
+       `projectStatus` must be in ("Permanent Draft", "Complete and Published")
+    3. otherwise, no `projectStatus` filter is applied
+
+    :param project: GOLD project object (structurally similar to response
+        from `/projects` endpoint)
+    :return: True if the project is valid, False otherwise
+    """
+    if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
+        return False
+
+    if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
+        return project.get("projectStatus") in (
+            "Permanent Draft",
+            "Complete and Published",
+        )
+
+    return True
+
+
 class GoldStudyTranslator(Translator):
     def __init__(
         self,
@@ -36,16 +59,15 @@ class GoldStudyTranslator(Translator):
             biosample
             for biosample in biosamples
             if any(
-                project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
-                for project in biosample.get("projects", [])
+                _is_valid_project(project) for project in biosample.get("projects", [])
             )
         ]
         # Fetch the valid projectGoldIds that are associated with filtered
         # biosamples on their `projects` field
         valid_project_ids = {
             project.get("projectGoldId")
-            for biosample in self.biosamples
-            for project in biosample.get("projects", [])
+            for project in projects
+            if _is_valid_project(project)
         }
         # Filter projects to only those with `projectGoldId` in valid_project_ids
         self.projects = [
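
Worked examples of the three docstring rules above (the project dicts and the "Awaiting Sample" status are hypothetical values used only for illustration):

jgi_draft = {
    "sequencingStrategy": "Metagenome",
    "sequencingCenters": "DOE Joint Genome Institute (JGI)",
    "projectStatus": "Permanent Draft",
}
jgi_unfinished = dict(jgi_draft, projectStatus="Awaiting Sample")  # hypothetical status
non_jgi = {"sequencingStrategy": "Metatranscriptome", "sequencingCenters": "Other Center"}

assert _is_valid_project(jgi_draft) is True        # rule 2, accepted status
assert _is_valid_project(jgi_unfinished) is False  # rule 2, rejected status
assert _is_valid_project(non_jgi) is True          # rule 3, no status filter
assert _is_valid_project({"sequencingStrategy": "Amplicon"}) is False  # rule 1
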
nmdc_runtime/site/util.py CHANGED
@@ -1,8 +1,9 @@
 import os
-from functools import lru_cache
-from subprocess import Popen, PIPE, STDOUT, CalledProcessError
 
+from dagster import op
+from functools import lru_cache
 from pymongo.database import Database as MongoDatabase
+from subprocess import Popen, PIPE, STDOUT, CalledProcessError
 
 from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
@@ -47,3 +48,7 @@ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
 
 def get_basename(filename: str) -> str:
     return os.path.basename(filename)
+
+
+def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
+    return nmdc_study_id.replace(":", "_").replace("-", "_")
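
Combined with the new `post_submission_portal_biosample_ingest_record_stitching_filename` op in `ops.py`, this yields DRS filenames like the following (study ID is illustrative):

assert nmdc_study_id_to_filename("nmdc:sty-00-000001") == "nmdc_sty_00_000001"
# -> "missing_database_records_for_nmdc_sty_00_000001.json"
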
nmdc_runtime/util.py CHANGED
@@ -24,6 +24,10 @@ from nmdc_schema.get_nmdc_view import ViewGetter
 from pydantic import Field, BaseModel
 from pymongo.database import Database as MongoDatabase
 from pymongo.errors import OperationFailure
+from refscan.lib.helpers import identify_references
+from refscan.lib.Finder import Finder
+from refscan.lib.ReferenceList import ReferenceList
+from refscan.scanner import scan_outgoing_references
 from toolz import merge, unique
 
 from nmdc_runtime.api.core.util import sha256hash_from_file
@@ -120,6 +124,23 @@ def get_class_names_from_collection_spec(
     return class_names
 
 
+@lru_cache
+def get_allowed_references() -> ReferenceList:
+    r"""
+    Returns a `ReferenceList` of all the inter-document references that
+    the NMDC Schema allows a schema-compliant MongoDB database to contain.
+    """
+
+    # Identify the inter-document references that the schema allows a database to contain.
+    print("Identifying schema-allowed references.")
+    references = identify_references(
+        schema_view=nmdc_schema_view(),
+        collection_name_to_class_names=collection_name_to_class_names,
+    )
+
+    return references
+
+
 @lru_cache
 def get_type_collections() -> dict:
     """Returns a dictionary mapping class names to Mongo collection names."""
@@ -353,6 +374,14 @@ def nmdc_database_collection_instance_class_names():
 
 @lru_cache
 def nmdc_database_collection_names():
+    r"""
+    TODO: Document this function.
+
+    TODO: Assuming this function was designed to return a list of names of all Database slots that represent database
+          collections, use the function named `get_collection_names_from_schema` in `nmdc_runtime/api/db/mongo.py`
+          instead, since (a) it includes documentation and (b) it performs the additional checks the lead schema
+          maintainer expects (e.g. checking whether a slot is `multivalued` and `inlined_as_list`).
+    """
     names = []
     view = nmdc_schema_view()
     all_classes = set(view.all_classes())
@@ -513,6 +542,13 @@ class OverlayDB(AbstractContextManager):
     overlay collection, that id is marked as "seen" and will not also be returned when
     subsequently scanning the (unmodified) base-database collection.
 
+    Note: The OverlayDB object does not provide a means to perform arbitrary MongoDB queries on the virtual "merged"
+          database. Callers can access the real database via `overlay_db._bottom_db` and the overlaying database via
+          `overlay_db._top_db` and perform arbitrary MongoDB queries on the individual databases that way. Access to
+          the virtual "merged" database is limited to the methods of the `OverlayDB` class, which simulates the
+          "merging" just-in-time to process the method invocation. You can see an example of this in the implementation
+          of the `merge_find` method, which internally accesses both the real database and the overlaying database.
+
     Mongo "update" commands (as the "apply_updates" method) are simulated by first copying affected
     documents from a base collection to the overlay, and then applying the updates to the overlay,
     so that again, base collections are unmodified, and a "merge_find" call will produce a result
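
A sketch of the access pattern the new note describes (the method names come from the docstring above, but their exact signatures are assumptions; the update spec and filter are illustrative):

with OverlayDB(mdb) as odb:
    # Simulated update: affected documents are copied into the overlay first,
    # so the base collections stay unmodified.
    odb.apply_updates("study_set", [{"q": {"id": "nmdc:sty-00-000001"},
                                     "u": {"$set": {"name": "renamed"}}}])
    # The virtual "merged" view is only reachable through OverlayDB methods:
    merged_docs = list(odb.merge_find("study_set", {"filter": {}}))
    # Arbitrary pymongo queries must target one layer at a time:
    n_base = odb._bottom_db["study_set"].count_documents({})
    n_overlay = odb._top_db["study_set"].count_documents({})
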
@@ -591,7 +627,33 @@ class OverlayDB(AbstractContextManager):
                 yield doc
 
 
-def validate_json(in_docs: dict, mdb: MongoDatabase):
+def validate_json(
+    in_docs: dict, mdb: MongoDatabase, check_inter_document_references: bool = False
+):
+    r"""
+    Checks whether the specified dictionary represents a valid instance of the `Database` class
+    defined in the NMDC Schema. Referential integrity checking is performed on an opt-in basis.
+
+    Example dictionary:
+        {
+            "biosample_set": [
+                {"id": "nmdc:bsm-00-000001", ...},
+                {"id": "nmdc:bsm-00-000002", ...}
+            ],
+            "study_set": [
+                {"id": "nmdc:sty-00-000001", ...},
+                {"id": "nmdc:sty-00-000002", ...}
+            ]
+        }
+
+    :param in_docs: The dictionary you want to validate
+    :param mdb: A reference to a MongoDB database
+    :param check_inter_document_references: Whether you want this function to check whether every document that
+                                            is referenced by any of the documents passed in would, indeed, exist
+                                            in the database, if the documents passed in were to be inserted into
+                                            the database. In other words, set this to `True` if you want this
+                                            function to perform referential integrity checks.
+    """
     validator = Draft7Validator(get_nmdc_jsonschema_dict())
     docs = deepcopy(in_docs)
     validation_errors = {}
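
A hedged usage sketch of the new opt-in flag, given a pymongo `Database` handle `mdb` (the payload follows the docstring's example shape; other required `Study` slots are elided, so a real call would report schema errors for them):

payload = {
    "study_set": [
        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}
    ]
}
result = validate_json(payload, mdb)  # schema validation only (the default)
result = validate_json(payload, mdb, check_inter_document_references=True)
# -> {"result": "All Okay!"} on success, or {"result": "errors", "detail": {...}}
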
@@ -599,6 +661,8 @@ def validate_json(
     known_coll_names = set(nmdc_database_collection_names())
     for coll_name, coll_docs in docs.items():
         if coll_name not in known_coll_names:
+            # FIXME: Document what `@type` is (conceptually; e.g., why this function accepts it as a collection name).
+            #        See: https://github.com/microbiomedata/nmdc-runtime/discussions/858
             if coll_name == "@type" and coll_docs in ("Database", "nmdc:Database"):
                 continue
             else:
@@ -631,6 +695,84 @@ def validate_json(
             except Exception as e:
                 return {"result": "errors", "detail": str(e)}
 
+        # Third pass (if enabled): Check inter-document references.
+        if check_inter_document_references is True:
+            # Prepare to use `refscan`.
+            #
+            # Note: We check the inter-document references in two stages, which are:
+            #       1. For each document in the JSON payload, check whether each document it references already exists
+            #          (in the collections the schema says it can exist in) in the database. We use the
+            #          `refscan` package to do this, which returns violation details we'll use in the second stage.
+            #       2. For each violation found in the first stage (i.e. each reference to a not-found document), we
+            #          check whether that document exists (in the collections the schema says it can exist in) in the
+            #          JSON payload. If it does, then we "waive" (i.e. discard) that violation.
+            #       The violations that remain after those two stages are the ones we return to the caller.
+            #
+            # Note: The reason we do not insert documents into an `OverlayDB` and scan _that_, is that the `OverlayDB`
+            #       does not provide a means to perform arbitrary queries against its virtual "merged" database. It
+            #       is not a drop-in replacement for pymongo's `Database` class, which is the only thing that
+            #       `refscan`'s `Finder` class accepts.
+            #
+            finder = Finder(database=mdb)
+            references = get_allowed_references()
+            reference_field_names_by_source_class_name = (
+                references.get_reference_field_names_by_source_class_name()
+            )
+
+            # Iterate over the collections in the JSON payload.
+            for source_collection_name, documents in in_docs.items():
+                for document in documents:
+                    # Add an `_id` field to the document, since `refscan` requires the document to have one.
+                    source_document = dict(document, _id=None)
+                    violations = scan_outgoing_references(
+                        document=source_document,
+                        schema_view=nmdc_schema_view(),
+                        reference_field_names_by_source_class_name=reference_field_names_by_source_class_name,
+                        references=references,
+                        finder=finder,
+                        collection_names=nmdc_database_collection_names(),
+                        source_collection_name=source_collection_name,
+                        user_wants_to_locate_misplaced_documents=False,
+                    )
+
+                    # For each violation, check whether the misplaced document is in the JSON payload, itself.
+                    for violation in violations:
+                        can_waive_violation = False
+                        # Determine which collections can contain the referenced document, based upon
+                        # the schema class of which this source document is an instance.
+                        target_collection_names = (
+                            references.get_target_collection_names(
+                                source_class_name=violation.source_class_name,
+                                source_field_name=violation.source_field_name,
+                            )
+                        )
+                        # Check whether the referenced document exists in any of those collections in the JSON payload.
+                        for json_coll_name, json_coll_docs in in_docs.items():
+                            if json_coll_name in target_collection_names:
+                                for json_coll_doc in json_coll_docs:
+                                    if json_coll_doc["id"] == violation.target_id:
+                                        can_waive_violation = True
+                                        break  # stop checking
+                            if can_waive_violation:
+                                break  # stop checking
+                        if not can_waive_violation:
+                            violation_as_str = (
+                                f"Document '{violation.source_document_id}' "
+                                f"in collection '{violation.source_collection_name}' "
+                                f"has a field '{violation.source_field_name}' that "
+                                f"references a document having id "
+                                f"'{violation.target_id}', but the latter document "
+                                f"does not exist in any of the collections the "
+                                f"NMDC Schema says it can exist in."
+                            )
+                            validation_errors[source_collection_name].append(
+                                violation_as_str
+                            )
+
+            # If any collection's error list is not empty, return an error response.
+            if any(len(v) > 0 for v in validation_errors.values()):
+                return {"result": "errors", "detail": validation_errors}
+
         return {"result": "All Okay!"}
     else:
         return {"result": "errors", "detail": validation_errors}
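
To illustrate the two-stage check above: a payload whose biosample references a study that exists only in the same payload triggers a stage-1 violation that stage 2 then waives, because `study_set` in the payload contains the target id (slot and id values follow the docstring example; other required slots are elided):

payload = {
    "biosample_set": [
        {"id": "nmdc:bsm-00-000001", "type": "nmdc:Biosample",
         "associated_studies": ["nmdc:sty-00-000001"]}  # target not yet in Mongo...
    ],
    "study_set": [
        {"id": "nmdc:sty-00-000001", "type": "nmdc:Study"}  # ...but present here
    ],
}
# Stage 1 (refscan) flags the reference as unresolved against `mdb`;
# stage 2 finds the target in the payload's study_set and waives the violation.
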
nmdc_runtime-2.3.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: nmdc_runtime
-Version: 2.2.1
+Version: 2.3.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -11,6 +11,14 @@ Classifier: License :: OSI Approved :: Apache Software License
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-python
+Dynamic: summary
 
 A runtime system for NMDC data management and orchestration.
 
nmdc_runtime-2.3.0.dist-info/RECORD CHANGED
@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
 nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
 nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/util.py,sha256=aMzS8eATEjpXOiuyAFYthx92fb_cgIzWWd5ZQU6ZlAY,22931
+nmdc_runtime/util.py,sha256=HzQsNMYG6Pb-IuBEE9HBzX_lNkII7jiNe65UFk34ZYA,31414
 nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -28,7 +28,7 @@ nmdc_runtime/lib/nmdc_etl_class.py,sha256=tVh3rKVMkBHQE65_LhKeIjCsaCZQk_HJzbc9K4
 nmdc_runtime/lib/transform_nmdc_data.py,sha256=hij4lR3IMQRJQdL-rsP_I-m_WyFPsBMchV2MNFUkh0M,39906
 nmdc_runtime/minter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/bootstrap.py,sha256=5Ej6pJVBRryRIi0ZwEloY78Zky7iE2okF6tPwRI2axM,822
-nmdc_runtime/minter/config.py,sha256=WrxX9WmyN7Ft4INRAQbd31jmlm5qwaDDaNS9AktieYA,4112
+nmdc_runtime/minter/config.py,sha256=gsXZropDeeTO5tmLAtRuoocwqL3HgfgqVAENyCbX-Gc,2739
 nmdc_runtime/minter/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/adapters/repository.py,sha256=I-jmGP38-9kPhkogrwUht_Ir0CfHA9_5ZImw5I_wbcw,8323
 nmdc_runtime/minter/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=mu4bE8799TItWXaPBfOeFB2XMyYwPZcj-VJQmadN2MA,14171
-nmdc_runtime/site/ops.py,sha256=T9_WrwDaySGnu6olwOHQizHQfeofMOaqMcq_vYEIzO0,43140
-nmdc_runtime/site/repository.py,sha256=JtHlp6l3UVo0QhV670TGns9bMfht7NOQrNWQtvsYr2g,39183
-nmdc_runtime/site/resources.py,sha256=6bmvplgql3KdEXKI49BibSk0Sug96SFJi8eOs2zeKK0,18252
-nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
+nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
+nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
+nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
+nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
+nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
 nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -56,10 +56,12 @@ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZ
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
+nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
-nmdc_runtime/site/translation/gold_translator.py,sha256=RfAB68dJ9hDep20wETmCNBc0gugZbEKqVimT8h2t0uM,31470
+nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
@@ -73,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.2.1.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.2.1.dist-info/METADATA,sha256=yIkwZWVw8J1xDqhwVQy2Rxfz7cIc42yT4JkRBdsRBr4,7256
-nmdc_runtime-2.2.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-nmdc_runtime-2.2.1.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.2.1.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.2.1.dist-info/RECORD,,
+nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
+nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.3.0.dist-info/RECORD,,
nmdc_runtime-2.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.6.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 