nmdc-runtime 2.2.1-py3-none-any.whl → 2.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/minter/config.py +18 -50
- nmdc_runtime/site/export/ncbi_xml.py +23 -2
- nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
- nmdc_runtime/site/graphs.py +39 -0
- nmdc_runtime/site/ops.py +131 -31
- nmdc_runtime/site/repair/__init__.py +0 -0
- nmdc_runtime/site/repair/database_updater.py +230 -0
- nmdc_runtime/site/repository.py +109 -9
- nmdc_runtime/site/resources.py +36 -5
- nmdc_runtime/site/translation/gold_translator.py +26 -4
- nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
- nmdc_runtime/site/util.py +7 -2
- nmdc_runtime/util.py +143 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA +11 -3
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD +19 -17
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL +1 -1
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt +0 -0
nmdc_runtime/minter/config.py
CHANGED

@@ -2,8 +2,9 @@ import os
 from functools import lru_cache
 from typing import List
 
-from …
+from nmdc_schema.id_helpers import get_typecode_for_future_ids
 
+from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from nmdc_runtime.api.db.mongo import get_mongo_db
 
 
@@ -12,55 +13,24 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
 
 
-def extract_typecode_from_pattern(pattern: str) -> str:
-    r"""
-    Returns the typecode portion of the specified string.
-
-    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
-    'foo'
-    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
-    'foo'
-    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
-    'foo'
-    """
-
-    # Get the portion of the pattern preceding the first hyphen.
-    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
-    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
-
-    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
-    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
-    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
-        inner_pattern = typecode_sub_pattern[1:-1]
-
-        # Finally, get everything before the first `|`, if any.
-        # e.g. "apple|banana|carrot" → "apple"
-        # e.g. "apple" → "apple"
-        typecode = inner_pattern.split("|", maxsplit=1)[0]
-    else:
-        # Note: This is the original behavior, before we added support for multi-typecode patterns.
-        # e.g. "apple" → "apple"
-        typecode = typecode_sub_pattern
-
-    return typecode
-
-
 @lru_cache()
 def typecodes() -> List[dict]:
     r"""
     Returns a list of dictionaries containing typecodes and associated information derived from the schema.
 
-    …
+    Note: In this function, we rely on a helper function provided by the `nmdc-schema` package to extract—from a given
+          class's `id` slot's pattern—the typecode that the minter would use when generating an ID for an instance of
+          that class _today_; regardless of what it may have used in the past.
+
+    >>> typecode_descriptors = typecodes()
+    # Test #1: We get the typecode we expect, for a class whose pattern contains only one typecode.
+    >>> any((td["name"] == "sty" and td["schema_class"] == "nmdc:Study") for td in typecode_descriptors)
+    True
+    # Tests #2 and #3: We get only the typecode we expect, for a class whose pattern contains multiple typecodes.
+    >>> any((td["name"] == "dgms" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    True
+    >>> any((td["name"] == "omprc" and td["schema_class"] == "nmdc:MassSpectrometry") for td in typecode_descriptors)
+    False
     """
     id_pattern_prefix = r"^(nmdc):"
 
@@ -69,16 +39,14 @@ def typecodes() -> List[dict]:
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
             case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
-                # …
-
-                index_of_first_character_following_prefix = len(id_pattern_prefix)
-                pattern_without_prefix = p[index_of_first_character_following_prefix:]
+                # Extract the typecode from the pattern.
+                typecode_for_future_ids = get_typecode_for_future_ids(slot_pattern=p)
 
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": …
+                        "name": typecode_for_future_ids,
                     }
                 )
             case _:
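For reference, the logic of the removed helper (take the portion of the pattern before the first hyphen; if that portion is parenthesized, take the first `|`-separated alternative) condenses to the sketch below. It reproduces only the old behavior: the replacement, `get_typecode_for_future_ids`, instead returns the typecode the minter would assign today, which for multi-typecode patterns need not be the first alternative (per the doctests above, `nmdc:MassSpectrometry` yields `dgms`, not `omprc`).

def extract_typecode(pattern: str) -> str:
    """Sketch of the removed helper's behavior (not the new nmdc-schema helper).

    >>> extract_typecode("foo-123-456$")
    'foo'
    >>> extract_typecode("(foo|bar|baz)-123-456$")
    'foo'
    """
    # Portion of the pattern preceding the first hyphen, e.g. "(foo|bar)".
    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
    # If parenthesized, the typecode is the first "|"-separated alternative.
    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
        return typecode_sub_pattern[1:-1].split("|", maxsplit=1)[0]
    return typecode_sub_pattern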
nmdc_runtime/site/export/ncbi_xml.py
CHANGED

@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 
-from typing import Any
+from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
                 )
                 # Currently, we are making the assumption that only one instrument
                 # is used to sequence a Biosample
-                …
+                instrument_used: List[str] = ntseq.get(
+                    "instrument_used", []
+                )
+                if not instrument_used:
+                    instrument_id = None
+                else:
+                    instrument_id = instrument_used[0]
+
                 instrument = all_instruments.get(instrument_id, {})
                 instrument_vendor = instrument.get("vendor", "")
                 instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
                         "Attribute", "NextSeq 550", {"name": "instrument_model"}
                    )
                )
+            elif instrument_model == "novaseq_6000":
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute",
+                        "NovaSeq 6000",
+                        {"name": "instrument_model"},
+                    )
+                )
+            elif instrument_model == "hiseq":
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute", "HiSeq", {"name": "instrument_model"}
+                    )
+                )
 
             if analyte_category == "metagenome":
                 sra_attributes.append(
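The new branches extend a growing `elif` chain keyed on the raw instrument model string. The same mapping could live in a lookup table; a sketch under the assumption that the pre-existing branch tests `instrument_model == "nextseq_550"` (only its body is visible in this diff), with the NCBI-facing labels taken verbatim from the `set_element` calls:

from typing import Optional

# Raw model key -> SRA "instrument_model" attribute value.
# The "nextseq_550" key is assumed; the diff shows only that branch's body.
INSTRUMENT_MODEL_LABELS = {
    "nextseq_550": "NextSeq 550",
    "novaseq_6000": "NovaSeq 6000",
    "hiseq": "HiSeq",
}

def instrument_model_label(instrument_model: str) -> Optional[str]:
    """Return the SRA attribute value for a raw model key, or None if unmapped."""
    return INSTRUMENT_MODEL_LABELS.get(instrument_model)

A table keeps the supported models greppable and makes adding the next sequencer a one-line change.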
nmdc_runtime/site/export/ncbi_xml_utils.py
CHANGED

@@ -1,6 +1,10 @@
 from io import BytesIO, StringIO
+from typing import Any, Dict, List, Union
+
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
 from lxml import etree
+from pymongo.collection import Collection
 
 import csv
 import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
         raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 
 
-def fetch_data_objects_from_biosamples(…
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" (derived from/products of)
+    with their respective biosamples by iterating over the alldocs collection recursively.
+    The methods returns a dictionary with biosample ids as keys and the associated list of
+    data objects as values.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
+
     biosample_data_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
 
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                …
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
 
-                …
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                        else:
-                            new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
 
             current_ids = new_current_ids
 
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
 
 
-def fetch_nucleotide_sequencing_from_biosamples(…
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+        process objects as values
+    """
+    biosample_ntseq_objects = []
 
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
-        …
+        collected_ntseq_objects = []
 
         while current_ids:
             new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
 
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = …
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                         if nucleotide_sequencing_doc:
-                            …
+                            collected_ntseq_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                     else:
                         new_current_ids.append(output_id)
 
             current_ids = new_current_ids
 
-        if …
-        …
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+
+    return biosample_ntseq_objects
 
-    return biosample_data_objects
 
+
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
+    for biosamples by iterating over the alldocs collection recursively.
 
-    …
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation process
+        objects as values
+    """
     biosample_lib_prep = []
 
     for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                     "has_input": output_id,
                     "type": {"$in": ["LibraryPreparation"]},
                 }
-                lib_prep_doc = …
+                lib_prep_doc = material_processing_set.find_one(lib_prep_query)
 
                 if lib_prep_doc:
-                    biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                    biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                     break  # Stop at the first document that meets the criteria
 
     return biosample_lib_prep
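All three fetch_* helpers share the same breadth-first walk over `alldocs`: keep a frontier of ids, find documents whose `has_input` includes an id on the frontier, harvest the `has_output` ids that resolve to the class of interest, and requeue the rest. A condensed sketch of that shared shape for the DataObject case, assuming the module's own `get_classname_from_typecode` and the imported `strip_oid` are in scope:

from typing import Any, Dict, List

from pymongo.collection import Collection

def walk_outputs_to_data_objects(
    all_docs_collection: Collection,
    data_object_set: Collection,
    start_id: str,
) -> List[Dict[str, Any]]:
    """Breadth-first walk from one biosample id to its derived DataObjects."""
    collected: List[Dict[str, Any]] = []
    seen: set = set()
    frontier = [start_id]
    while frontier:
        next_frontier = []
        for current_id in frontier:
            # Any process that consumed `current_id` may have produced outputs.
            for doc in all_docs_collection.find({"has_input": current_id}):
                for output_id in doc.get("has_output", []):
                    if get_classname_from_typecode(output_id) == "DataObject":
                        if output_id not in seen:
                            data_obj = data_object_set.find_one({"id": output_id})
                            if data_obj:
                                collected.append(strip_oid(data_obj))
                                seen.add(output_id)
                    else:
                        # Non-DataObject outputs extend the walk to the next hop.
                        next_frontier.append(output_id)
        frontier = next_frontier
    return collected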
nmdc_runtime/site/graphs.py
CHANGED

@@ -2,6 +2,7 @@ from dagster import graph
 
 from nmdc_runtime.site.ops import (
     build_merged_db,
+    generate_biosample_set_for_nmdc_study_from_gold,
     nmdc_schema_database_export_filename,
     nmdc_schema_database_from_gold_study,
     nmdc_schema_object_to_dict,
@@ -57,6 +58,9 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    get_database_updater_inputs,
+    post_submission_portal_biosample_ingest_record_stitching_filename,
+    generate_data_generation_set_post_biosample_ingest,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
 
@@ -117,12 +121,14 @@ def apply_changesheet():
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
 def apply_metadata_in():
     outputs = perform_mongo_updates(get_json_in())
     add_output_run_event(outputs)
+    materialize_alldocs()
 
 
 @graph
@@ -465,3 +471,36 @@ def nmdc_study_to_ncbi_submission_export():
         all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)
+
+
+@graph
+def generate_data_generation_set_for_biosamples_in_nmdc_study():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_data_generation_set_post_biosample_ingest(
+        study_id, gold_nmdc_instrument_map_df
+    )
+
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
+
+
+@graph
+def generate_biosample_set_from_samples_in_gold():
+    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+
+    database = generate_biosample_set_for_nmdc_study_from_gold(
+        study_id, gold_nmdc_instrument_map_df
+    )
+    database_dict = nmdc_schema_object_to_dict(database)
+    filename = post_submission_portal_biosample_ingest_record_stitching_filename(
+        study_id
+    )
+    outputs = export_json_to_drs(database_dict, filename)
+    add_output_run_event(outputs)
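Both new graphs take their study id and instrument-mapping URL from op config on `get_database_updater_inputs` (its `config_schema` appears in the ops.py diff below). A hedged sketch of the run config a job built from either graph would accept; the values are placeholders, and any resource config (mongo, API clients) is omitted:

# Sketch only: keys follow the config_schema of get_database_updater_inputs;
# the study id and URL below are hypothetical placeholders.
run_config = {
    "ops": {
        "get_database_updater_inputs": {
            "config": {
                "nmdc_study_id": "nmdc:sty-11-xxxxxxxx",  # placeholder study id
                "gold_nmdc_instrument_mapping_file_url": (
                    "https://example.org/gold_instrument_map.tsv"  # placeholder URL
                ),
            }
        }
    }
}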
nmdc_runtime/site/ops.py
CHANGED

@@ -91,7 +91,12 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
 from nmdc_runtime.site.translation.submission_portal_translator import (
     SubmissionPortalTranslator,
 )
-from nmdc_runtime.site.…
+from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
+from nmdc_runtime.site.util import (
+    run_and_log,
+    schema_collection_has_index_on_id,
+    nmdc_study_id_to_filename,
+)
 from nmdc_runtime.util import (
     drs_object_in_for,
     get_names_of_classes_in_effective_range_of_slot,
@@ -1054,10 +1059,7 @@ def materialize_alldocs(context) -> int:
     # TODO include functional_annotation_agg for "real-time" ref integrity checking.
     # For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
     collection_names = populated_schema_collection_names_with_id_field(mdb)
-    context.log.info(f"{collection_names=}")
-
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
+    context.log.info(f"constructing `alldocs` collection using {collection_names=}")
 
     document_class_names = set(
         chain.from_iterable(collection_name_to_class_names.values())
@@ -1070,7 +1072,8 @@ def materialize_alldocs(context) -> int:
         for cls_name in document_class_names
     }
 
-    # Any ancestor of a document class is a document-…
+    # Any ancestor of a document class is a document-referencable range,
+    # i.e., a valid range of a document-reference-ranged slot.
     document_referenceable_ranges = set(
         chain.from_iterable(
             schema_view.class_ancestors(cls_name) for cls_name in document_class_names
@@ -1086,17 +1089,15 @@ def materialize_alldocs(context) -> int:
     ):
         document_reference_ranged_slots[cls_name].append(slot_name)
 
-    # …
-    # …
-
-    # …
-    mdb.alldocs.drop()
+    # Build `alldocs` to a temporary collection for atomic replacement
+    # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
+    temp_alldocs_collection_name = f"tmp.alldocs.{ObjectId()}"
+    temp_alldocs_collection = mdb[temp_alldocs_collection_name]
+    context.log.info(f"constructing `{temp_alldocs_collection.name}` collection")
 
     for coll_name in collection_names:
         context.log.info(f"{coll_name=}")
-        …
+        write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
             doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
@@ -1105,30 +1106,35 @@ def materialize_alldocs(context) -> int:
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
             new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
-            …
-            if len(…
-                _ = …
+            write_operations.append(InsertOne(new_doc))
+            if len(write_operations) == BULK_WRITE_BATCH_SIZE:
+                _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+                write_operations.clear()
                 documents_processed_counter += BULK_WRITE_BATCH_SIZE
-        if len(…
-            _ = …
-            documents_processed_counter += len(…
+        if len(write_operations) > 0:
+            _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
+            documents_processed_counter += len(write_operations)
         context.log.info(
             f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
 
     context.log.info(
-        f"…
+        f"produced `{temp_alldocs_collection.name}` collection with"
+        f" {temp_alldocs_collection.estimated_document_count()} docs."
     )
 
-    …
-    # …
-    …
+    context.log.info(f"creating indexes on `{temp_alldocs_collection.name}` ...")
+    # Ensure unique index on "id". Index creation here is blocking (i.e. background=False),
+    # so that `temp_alldocs_collection` will be "good to go" on renaming.
+    temp_alldocs_collection.create_index("id", unique=True)
+    # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
     slots_to_index = ["has_input", "has_output", "was_informed_by"]
-    […
+    [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
     context.log.info(f"created indexes on id, {slots_to_index}.")
+
+    context.log.info(f"renaming `{temp_alldocs_collection.name}` to `alldocs`...")
+    temp_alldocs_collection.rename("alldocs", dropTarget=True)
+
     return mdb.alldocs.estimated_document_count()
 
 
@@ -1182,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
 
@@ -1194,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
 
@@ -1206,8 +1214,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep
 
@@ -1240,3 +1249,94 @@ def ncbi_submission_xml_from_nmdc_study(
         all_instruments,
     )
     return ncbi_xml
+
+
+@op
+def post_submission_portal_biosample_ingest_record_stitching_filename(
+    nmdc_study_id: str,
+) -> str:
+    filename = nmdc_study_id_to_filename(nmdc_study_id)
+    return f"missing_database_records_for_{filename}.json"
+
+
+@op(
+    config_schema={
+        "nmdc_study_id": str,
+        "gold_nmdc_instrument_mapping_file_url": str,
+    },
+    out={
+        "nmdc_study_id": Out(str),
+        "gold_nmdc_instrument_mapping_file_url": Out(str),
+    },
+)
+def get_database_updater_inputs(context: OpExecutionContext) -> Tuple[str, str]:
+    return (
+        context.op_config["nmdc_study_id"],
+        context.op_config["gold_nmdc_instrument_mapping_file_url"],
+    )
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_data_generation_set_post_biosample_ingest(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = (
+        database_updater.generate_data_generation_set_records_from_gold_api_for_study()
+    )
+
+    return database
+
+
+@op(
+    required_resource_keys={
+        "runtime_api_user_client",
+        "runtime_api_site_client",
+        "gold_api_client",
+    }
+)
+def generate_biosample_set_for_nmdc_study_from_gold(
+    context: OpExecutionContext,
+    nmdc_study_id: str,
+    gold_nmdc_instrument_map_df: pd.DataFrame,
+) -> nmdc.Database:
+    runtime_api_user_client: RuntimeApiUserClient = (
+        context.resources.runtime_api_user_client
+    )
+    runtime_api_site_client: RuntimeApiSiteClient = (
+        context.resources.runtime_api_site_client
+    )
+    gold_api_client: GoldApiClient = context.resources.gold_api_client
+
+    database_updater = DatabaseUpdater(
+        runtime_api_user_client,
+        runtime_api_site_client,
+        gold_api_client,
+        nmdc_study_id,
+        gold_nmdc_instrument_map_df,
+    )
+    database = database_updater.generate_biosample_set_from_gold_api_for_study()
+
+    return database
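The reworked `materialize_alldocs` replaces drop-then-rebuild with build-then-rename: documents are bulk-written in unordered batches to a uniquely named temporary collection, indexes are created there while nothing reads it, and the temporary collection is renamed over `alldocs` with `dropTarget=True` so readers never observe a half-built collection. A minimal standalone pymongo sketch of that pattern; `BATCH_SIZE` stands in for the module's `BULK_WRITE_BATCH_SIZE`:

from bson import ObjectId
from pymongo import InsertOne

BATCH_SIZE = 1000  # stand-in for BULK_WRITE_BATCH_SIZE

def rebuild_collection(mdb, source_docs, target_name: str) -> int:
    """Build a collection under a temporary name, then atomically swap it in."""
    temp = mdb[f"tmp.{target_name}.{ObjectId()}"]  # unique, disposable name
    batch = []
    for doc in source_docs:
        batch.append(InsertOne(doc))
        if len(batch) == BATCH_SIZE:
            temp.bulk_write(batch, ordered=False)
            batch.clear()
    if batch:  # flush the final partial batch
        temp.bulk_write(batch, ordered=False)
    # Blocking index build, so the collection is ready the moment it is renamed.
    temp.create_index("id", unique=True)
    # renameCollection is atomic from the client's perspective; dropTarget=True
    # discards the old collection in the same step.
    temp.rename(target_name, dropTarget=True)
    return mdb[target_name].estimated_document_count()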