nmdc-runtime 1.7.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nmdc-runtime might be problematic. See the registry's advisory page for this release for more details.

nmdc_runtime/config.py ADDED
@@ -0,0 +1 @@
1
+ DATABASE_CLASS_NAME = "Database"
@@ -66,7 +66,7 @@ class NCBISubmissionXML:
66
66
  element.append(child)
67
67
  return element
68
68
 
69
- def set_description(self, email, user, first, last, org, date=None):
69
+ def set_description(self, email, first, last, org, date=None):
70
70
  date = date or datetime.datetime.now().strftime("%Y-%m-%d")
71
71
  description = self.set_element(
72
72
  "Description",
@@ -74,7 +74,6 @@ class NCBISubmissionXML:
74
74
  self.set_element(
75
75
  "Comment", f"NMDC Submission for {self.nmdc_study_id}"
76
76
  ),
77
- self.set_element("Submitter", attrib={"user_name": user}),
78
77
  self.set_element(
79
78
  "Organization",
80
79
  attrib={"role": "owner", "type": "center"},
@@ -159,7 +158,6 @@ class NCBISubmissionXML:
159
158
  org,
160
159
  bioproject_id,
161
160
  nmdc_biosamples,
162
- nmdc_omics_processing,
163
161
  ):
164
162
  attribute_mappings, slot_range_mappings = load_mappings(
165
163
  self.nmdc_ncbi_attribute_mapping_file_url
@@ -206,7 +204,7 @@ class NCBISubmissionXML:
206
204
  children=[
207
205
  self.set_element(
208
206
  "Title",
209
- f"NMDC Biosample {sample_id_value} from {organism_name} part of {self.nmdc_study_id} study",
207
+ f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
210
208
  ),
211
209
  ],
212
210
  ),
@@ -230,6 +228,13 @@ class NCBISubmissionXML:
230
228
  "Attribute", attributes[key], {"attribute_name": key}
231
229
  )
232
230
  for key in sorted(attributes)
231
+ ]
232
+ + [
233
+ self.set_element(
234
+ "Attribute",
235
+ "National Microbiome Data Collaborative",
236
+ {"attribute_name": "broker name"},
237
+ )
233
238
  ],
234
239
  ),
235
240
  ]
@@ -278,29 +283,63 @@ class NCBISubmissionXML:
278
283
  biosample_data_objects: list,
279
284
  bioproject_id: str,
280
285
  org: str,
286
+ nmdc_omics_processing: list,
287
+ nmdc_biosamples: list,
288
+ nmdc_library_preparation: list,
281
289
  ):
290
+ bsm_id_name_dict = {
291
+ biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
292
+ }
293
+
282
294
  for entry in biosample_data_objects:
283
295
  fastq_files = []
284
296
  biosample_ids = []
297
+ omics_processing_ids = {}
298
+ lib_prep_protocol_names = {}
299
+ instrument_name = ""
300
+ omics_type = ""
301
+ library_name = ""
285
302
 
286
303
  for biosample_id, data_objects in entry.items():
287
304
  biosample_ids.append(biosample_id)
288
305
  for data_object in data_objects:
289
306
  if "url" in data_object:
290
307
  url = urlparse(data_object["url"])
291
- file_path = os.path.join(
292
- os.path.basename(os.path.dirname(url.path)),
293
- os.path.basename(url.path),
294
- )
308
+ file_path = os.path.basename(url.path)
295
309
  fastq_files.append(file_path)
296
310
 
311
+ for omprc_dict in nmdc_omics_processing:
312
+ if biosample_id in omprc_dict:
313
+ for omprc in omprc_dict[biosample_id]:
314
+ omics_processing_ids[biosample_id] = omprc.get("id", "")
315
+ instrument_name = omprc.get("instrument_name", "")
316
+ omics_type = (
317
+ omprc.get("omics_type", {})
318
+ .get("has_raw_value", "")
319
+ .lower()
320
+ )
321
+ library_name = bsm_id_name_dict.get(biosample_id, "")
322
+
323
+ for lib_prep_dict in nmdc_library_preparation:
324
+ if biosample_id in lib_prep_dict:
325
+ lib_prep_protocol_names[biosample_id] = (
326
+ lib_prep_dict[biosample_id]
327
+ .get("protocol_link", {})
328
+ .get("name", "")
329
+ )
330
+
297
331
  if fastq_files:
298
332
  files_elements = [
299
333
  self.set_element(
300
334
  "File",
301
335
  "",
302
336
  {"file_path": f},
303
- [self.set_element("DataType", "generic-data")],
337
+ [
338
+ self.set_element(
339
+ "DataType",
340
+ "sra-run-fastq" if ".fastq" in f else "generic-data",
341
+ )
342
+ ],
304
343
  )
305
344
  for f in fastq_files
306
345
  ]
@@ -344,35 +383,122 @@ class NCBISubmissionXML:
344
383
  )
345
384
  )
346
385
 
347
- identifier_element = self.set_element(
348
- "Identifier",
349
- children=[
386
+ sra_attributes = []
387
+ if instrument_name.lower().startswith("illumina"):
388
+ sra_attributes.append(
389
+ self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
390
+ )
391
+ if "nextseq550" in instrument_name.lower():
392
+ sra_attributes.append(
393
+ self.set_element(
394
+ "Attribute", "NextSeq 550", {"name": "instrument_model"}
395
+ )
396
+ )
397
+
398
+ if omics_type == "metagenome":
399
+ sra_attributes.append(
350
400
  self.set_element(
351
- "SPUID", bioproject_id, {"spuid_namespace": org}
401
+ "Attribute", "WGS", {"name": "library_strategy"}
352
402
  )
353
- ],
354
- )
403
+ )
404
+ sra_attributes.append(
405
+ self.set_element(
406
+ "Attribute", "METAGENOMIC", {"name": "library_source"}
407
+ )
408
+ )
409
+ sra_attributes.append(
410
+ self.set_element(
411
+ "Attribute", "RANDOM", {"name": "library_selection"}
412
+ )
413
+ )
355
414
 
356
- action = self.set_element(
357
- "Action",
358
- children=[
415
+ if omics_type == "metatranscriptome":
416
+ sra_attributes.append(
359
417
  self.set_element(
360
- "AddFiles",
361
- attrib={"target_db": "SRA"},
362
- children=files_elements
363
- + attribute_elements
364
- + [identifier_element],
365
- ),
366
- ],
418
+ "Attribute",
419
+ "METATRANSCRIPTOMIC",
420
+ {"name": "library_source"},
421
+ )
422
+ )
423
+
424
+ has_paired_reads = any(
425
+ data_object.get("data_object_type", "").lower()
426
+ == "metagenome raw reads"
427
+ for data_object in data_objects
428
+ ) or (
429
+ any(
430
+ data_object.get("data_object_type", "").lower()
431
+ == "metagenome raw read 1"
432
+ for data_object in data_objects
433
+ )
434
+ and any(
435
+ data_object.get("data_object_type", "").lower()
436
+ == "metagenome raw read 2"
437
+ for data_object in data_objects
438
+ )
367
439
  )
368
440
 
369
- self.root.append(action)
441
+ if has_paired_reads:
442
+ sra_attributes.append(
443
+ self.set_element(
444
+ "Attribute", "paired", {"name": "library_layout"}
445
+ )
446
+ )
447
+ else:
448
+ sra_attributes.append(
449
+ self.set_element(
450
+ "Attribute", "single", {"name": "library_layout"}
451
+ )
452
+ )
453
+
454
+ if library_name:
455
+ sra_attributes.append(
456
+ self.set_element(
457
+ "Attribute", library_name, {"name": "library_name"}
458
+ )
459
+ )
460
+
461
+ for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
462
+ sra_attributes.append(
463
+ self.set_element(
464
+ "Attribute",
465
+ lib_prep_name,
466
+ {"name": "library_construction_protocol"},
467
+ )
468
+ )
469
+
470
+ for biosample_id, omics_processing_id in omics_processing_ids.items():
471
+ identifier_element = self.set_element(
472
+ "Identifier",
473
+ children=[
474
+ self.set_element(
475
+ "SPUID", omics_processing_id, {"spuid_namespace": org}
476
+ )
477
+ ],
478
+ )
479
+
480
+ action = self.set_element(
481
+ "Action",
482
+ children=[
483
+ self.set_element(
484
+ "AddFiles",
485
+ attrib={"target_db": "SRA"},
486
+ children=files_elements
487
+ + attribute_elements
488
+ + sra_attributes
489
+ + [identifier_element],
490
+ ),
491
+ ],
492
+ )
493
+
494
+ self.root.append(action)
370
495
 
371
496
  def get_submission_xml(
372
497
  self,
373
498
  biosamples_list: list,
374
499
  biosample_omics_processing_list: list,
375
500
  biosample_data_objects_list: list,
501
+ biosample_library_preparation_list: list,
376
502
  ):
377
503
  data_type = None
378
504
  ncbi_project_id = None
@@ -387,7 +513,6 @@ class NCBISubmissionXML:
387
513
 
388
514
  self.set_description(
389
515
  email=self.nmdc_pi_email,
390
- user="National Microbiome Data Collaborative (NMDC)",
391
516
  first=self.first_name,
392
517
  last=self.last_name,
393
518
  org=self.ncbi_submission_metadata.get("organization", ""),
@@ -407,13 +532,15 @@ class NCBISubmissionXML:
407
532
  org=self.ncbi_submission_metadata.get("organization", ""),
408
533
  bioproject_id=ncbi_project_id,
409
534
  nmdc_biosamples=biosamples_list,
410
- nmdc_omics_processing=biosample_omics_processing_list,
411
535
  )
412
536
 
413
537
  self.set_fastq(
414
538
  biosample_data_objects=biosample_data_objects_list,
415
539
  bioproject_id=ncbi_project_id,
416
540
  org=self.ncbi_submission_metadata.get("organization", ""),
541
+ nmdc_omics_processing=biosample_omics_processing_list,
542
+ nmdc_biosamples=biosamples_list,
543
+ nmdc_library_preparation=biosample_library_preparation_list,
417
544
  )
418
545
 
419
546
  rough_string = ET.tostring(self.root, "unicode")
@@ -96,6 +96,38 @@ def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list)
96
96
  return biosample_data_objects
97
97
 
98
98
 
99
+ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
100
+ biosample_lib_prep = []
101
+
102
+ for biosample in biosamples_list:
103
+ biosample_id = biosample["id"]
104
+
105
+ # Step 1: Find any document with biosample id as has_input
106
+ initial_query = {"has_input": biosample_id}
107
+ initial_document = all_docs_collection.find_one(initial_query)
108
+
109
+ if not initial_document:
110
+ continue
111
+
112
+ initial_output = initial_document.get("has_output")
113
+ if not initial_output:
114
+ continue
115
+
116
+ # Step 2: Use has_output to find the library preparation document
117
+ for output_id in initial_output:
118
+ lib_prep_query = {
119
+ "has_input": output_id,
120
+ "designated_class": "nmdc:LibraryPreparation",
121
+ }
122
+ lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
123
+
124
+ if lib_prep_doc:
125
+ biosample_lib_prep.append({biosample_id: lib_prep_doc})
126
+ break # Stop at the first document that meets the criteria
127
+
128
+ return biosample_lib_prep
129
+
130
+
99
131
  def handle_quantity_value(slot_value):
100
132
  if "has_numeric_value" in slot_value and "has_unit" in slot_value:
101
133
  return f"{slot_value['has_numeric_value']} {slot_value['has_unit']}"
@@ -48,9 +48,11 @@ from nmdc_runtime.site.ops import (
48
48
  get_neon_pipeline_inputs,
49
49
  get_df_from_url,
50
50
  site_code_mapping,
51
+ materialize_alldocs,
51
52
  get_ncbi_export_pipeline_study,
52
53
  get_data_objects_from_biosamples,
53
54
  get_omics_processing_from_biosamples,
55
+ get_library_preparation_from_biosamples,
54
56
  get_ncbi_export_pipeline_inputs,
55
57
  ncbi_submission_xml_from_nmdc_study,
56
58
  ncbi_submission_xml_asset,
@@ -98,6 +100,11 @@ def housekeeping():
98
100
  delete_operations(list_operations(filter_ops_undone_expired()))
99
101
 
100
102
 
103
+ @graph
104
+ def ensure_alldocs():
105
+ materialize_alldocs()
106
+
107
+
101
108
  @graph
102
109
  def ensure_jobs():
103
110
  jobs = construct_jobs()
@@ -384,12 +391,14 @@ def nmdc_study_to_ncbi_submission_export():
384
391
  ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
385
392
  biosamples = get_biosamples_by_study_id(nmdc_study)
386
393
  omics_processing_records = get_omics_processing_from_biosamples(biosamples)
387
- data_objects = get_data_objects_from_biosamples(biosamples)
394
+ data_object_records = get_data_objects_from_biosamples(biosamples)
395
+ library_preparation_records = get_library_preparation_from_biosamples(biosamples)
388
396
  xml_data = ncbi_submission_xml_from_nmdc_study(
389
397
  nmdc_study,
390
398
  ncbi_submission_metadata,
391
399
  biosamples,
392
400
  omics_processing_records,
393
- data_objects,
401
+ data_object_records,
402
+ library_preparation_records,
394
403
  )
395
404
  ncbi_submission_xml_asset(xml_data)
nmdc_runtime/site/ops.py CHANGED
@@ -13,6 +13,7 @@ from zipfile import ZipFile
13
13
  import pandas as pd
14
14
  import requests
15
15
 
16
+
16
17
  from bson import ObjectId, json_util
17
18
  from dagster import (
18
19
  Any,
@@ -65,6 +66,7 @@ from nmdc_runtime.site.export.ncbi_xml import NCBISubmissionXML
65
66
  from nmdc_runtime.site.export.ncbi_xml_utils import (
66
67
  fetch_data_objects_from_biosamples,
67
68
  fetch_omics_processing_from_biosamples,
69
+ fetch_library_preparation_from_biosamples,
68
70
  )
69
71
  from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
70
72
  from nmdc_runtime.site.resources import (
@@ -73,6 +75,7 @@ from nmdc_runtime.site.resources import (
73
75
  RuntimeApiSiteClient,
74
76
  RuntimeApiUserClient,
75
77
  NeonApiClient,
78
+ MongoDB as MongoDBResource,
76
79
  )
77
80
  from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
78
81
  from nmdc_runtime.site.translation.neon_soil_translator import NeonSoilDataTranslator
@@ -85,15 +88,19 @@ from nmdc_runtime.site.translation.neon_surface_water_translator import (
85
88
  from nmdc_runtime.site.translation.submission_portal_translator import (
86
89
  SubmissionPortalTranslator,
87
90
  )
88
- from nmdc_runtime.site.util import collection_indexed_on_id, run_and_log
91
+ from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
89
92
  from nmdc_runtime.util import (
90
93
  drs_object_in_for,
91
94
  pluralize,
92
95
  put_object,
93
96
  validate_json,
94
97
  specialize_activity_set_docs,
98
+ collection_name_to_class_names,
99
+ class_hierarchy_as_list,
100
+ populated_schema_collection_names_with_id_field,
95
101
  )
96
102
  from nmdc_schema import nmdc
103
+ from nmdc_schema.nmdc import Database as NMDCDatabase
97
104
  from pydantic import BaseModel
98
105
  from pymongo.database import Database as MongoDatabase
99
106
  from starlette import status
@@ -521,29 +528,45 @@ def perform_mongo_updates(context, json_in):
521
528
  if rv["result"] == "errors":
522
529
  raise Failure(str(rv["detail"]))
523
530
 
524
- coll_has_id_index = collection_indexed_on_id(mongo.db)
525
- if all(coll_has_id_index[coll] for coll in docs.keys()):
531
+ # TODO containing op `perform_mongo_updates` needs test coverage, as below line had trivial bug.
532
+ # ref: https://github.com/microbiomedata/nmdc-runtime/issues/631
533
+ add_docs_result = _add_schema_docs_with_or_without_replacement(mongo, docs)
534
+ op_patch = UpdateOperationRequest(
535
+ done=True,
536
+ result=add_docs_result,
537
+ metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
538
+ )
539
+ op_doc = client.update_operation(op_id, op_patch).json()
540
+ return ["/operations/" + op_doc["id"]]
541
+
542
+
543
+ def _add_schema_docs_with_or_without_replacement(
544
+ mongo: MongoDBResource, docs: Dict[str, list]
545
+ ):
546
+ coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
547
+ if all(coll_index_on_id_map[coll] for coll in docs.keys()):
526
548
  replace = True
527
- elif all(not coll_has_id_index[coll] for coll in docs.keys()):
549
+ elif all(not coll_index_on_id_map[coll] for coll in docs.keys()):
550
+ # FIXME: XXX: This is a hack because e.g. <https://w3id.org/nmdc/FunctionalAnnotationAggMember>
551
+ # documents should be unique with compound key (metagenome_annotation_id, gene_function_id)
552
+ # and yet this is not explicit in the schema. One potential solution is to auto-generate an `id`
553
+ # as a deterministic hash of the compound key.
554
+ #
555
+ # For now, decision is to potentially re-insert "duplicate" documents, i.e. to interpret
556
+ # lack of `id` as lack of unique document identity for de-duplication.
528
557
  replace = False # wasting time trying to upsert by `id`.
529
558
  else:
530
559
  colls_not_id_indexed = [
531
- coll for coll in docs.keys() if not coll_has_id_index[coll]
560
+ coll for coll in docs.keys() if not coll_index_on_id_map[coll]
532
561
  ]
533
- colls_id_indexed = [coll for coll in docs.keys() if coll_has_id_index[coll]]
562
+ colls_id_indexed = [coll for coll in docs.keys() if coll_index_on_id_map[coll]]
534
563
  raise Failure(
535
564
  "Simultaneous addition of non-`id`ed collections and `id`-ed collections"
536
565
  " is not supported at this time."
537
566
  f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
538
567
  )
539
568
  op_result = mongo.add_docs(docs, validate=False, replace=replace)
540
- op_patch = UpdateOperationRequest(
541
- done=True,
542
- result=mongo_add_docs_result_as_dict(op_result),
543
- metadata={"done_at": datetime.now(timezone.utc).isoformat(timespec="seconds")},
544
- )
545
- op_doc = client.update_operation(op_id, op_patch).json()
546
- return ["/operations/" + op_doc["id"]]
569
+ return mongo_add_docs_result_as_dict(op_result)
547
570
 
548
571
 
549
572
  @op(required_resource_keys={"mongo"})
@@ -659,7 +682,6 @@ def translate_portal_submission_to_nmdc_schema_database(
659
682
  study_category: Optional[str],
660
683
  study_doi_category: Optional[str],
661
684
  study_doi_provider: Optional[str],
662
- study_funding_sources: Optional[List[str]],
663
685
  study_pi_image_url: Optional[str],
664
686
  biosample_extras: Optional[list[dict]],
665
687
  biosample_extras_slot_mapping: Optional[list[dict]],
@@ -678,7 +700,6 @@ def translate_portal_submission_to_nmdc_schema_database(
678
700
  study_category=study_category,
679
701
  study_doi_category=study_doi_category,
680
702
  study_doi_provider=study_doi_provider,
681
- study_funding_sources=study_funding_sources,
682
703
  study_pi_image_url=study_pi_image_url,
683
704
  biosample_extras=biosample_extras,
684
705
  biosample_extras_slot_mapping=biosample_extras_slot_mapping,
@@ -973,6 +994,61 @@ def site_code_mapping() -> dict:
973
994
  )
974
995
 
975
996
 
997
+ @op(required_resource_keys={"mongo"})
998
+ def materialize_alldocs(context) -> int:
999
+ mdb = context.resources.mongo.db
1000
+ collection_names = populated_schema_collection_names_with_id_field(mdb)
1001
+
1002
+ for name in collection_names:
1003
+ assert (
1004
+ len(collection_name_to_class_names[name]) == 1
1005
+ ), f"{name} collection has class name of {collection_name_to_class_names[name]} and len {len(collection_name_to_class_names[name])}"
1006
+
1007
+ context.log.info(f"{collection_names=}")
1008
+
1009
+ # Drop any existing `alldocs` collection (e.g. from previous use of this op).
1010
+ mdb.alldocs.drop()
1011
+
1012
+ # Build alldocs
1013
+ context.log.info("constructing `alldocs` collection")
1014
+
1015
+ for collection in collection_names:
1016
+ # Calculate class_hierarchy_as_list once per collection, using the first document in list
1017
+ try:
1018
+ nmdcdb = NMDCDatabase(
1019
+ **{collection: [dissoc(mdb[collection].find_one(), "_id")]}
1020
+ )
1021
+ exemplar = getattr(nmdcdb, collection)[0]
1022
+ newdoc_type: list[str] = class_hierarchy_as_list(exemplar)
1023
+ except ValueError as e:
1024
+ context.log.info(f"Collection {collection} does not exist.")
1025
+ raise e
1026
+
1027
+ context.log.info(
1028
+ f"Found {mdb[collection].estimated_document_count()} estimated documents for {collection=}."
1029
+ )
1030
+ # For each document in this collection, replace the value of the `type` field with
1031
+ # a _list_ of the document's own class and ancestor classes, remove the `_id` field,
1032
+ # and insert the resulting document into the `alldocs` collection.
1033
+
1034
+ inserted_many_result = mdb.alldocs.insert_many(
1035
+ [
1036
+ assoc(dissoc(doc, "type", "_id"), "type", newdoc_type)
1037
+ for doc in mdb[collection].find()
1038
+ ]
1039
+ )
1040
+ context.log.info(
1041
+ f"Inserted {len(inserted_many_result.inserted_ids)} documents for {collection=}."
1042
+ )
1043
+
1044
+ # Re-idx for `alldocs` collection
1045
+ mdb.alldocs.create_index("id", unique=True)
1046
+ context.log.info(
1047
+ f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
1048
+ )
1049
+ return mdb.alldocs.estimated_document_count()
1050
+
1051
+
976
1052
  @op(config_schema={"nmdc_study_id": str}, required_resource_keys={"mongo"})
977
1053
  def get_ncbi_export_pipeline_study(context: OpExecutionContext) -> Any:
978
1054
  nmdc_study = find_study_by_id(
@@ -1039,6 +1115,18 @@ def get_omics_processing_from_biosamples(context: OpExecutionContext, biosamples
1039
1115
  return biosample_omics_processing
1040
1116
 
1041
1117
 
1118
+ @op(required_resource_keys={"mongo"})
1119
+ def get_library_preparation_from_biosamples(
1120
+ context: OpExecutionContext, biosamples: list
1121
+ ):
1122
+ mdb = context.resources.mongo.db
1123
+ alldocs_collection = mdb["alldocs"]
1124
+ biosample_lib_prep = fetch_library_preparation_from_biosamples(
1125
+ alldocs_collection, biosamples
1126
+ )
1127
+ return biosample_lib_prep
1128
+
1129
+
1042
1130
  @op
1043
1131
  def ncbi_submission_xml_from_nmdc_study(
1044
1132
  context: OpExecutionContext,
@@ -1046,10 +1134,14 @@ def ncbi_submission_xml_from_nmdc_study(
1046
1134
  ncbi_exporter_metadata: dict,
1047
1135
  biosamples: list,
1048
1136
  omics_processing_records: list,
1049
- data_objects: list,
1137
+ data_object_records: list,
1138
+ library_preparation_records: list,
1050
1139
  ) -> str:
1051
1140
  ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
1052
1141
  ncbi_xml = ncbi_exporter.get_submission_xml(
1053
- biosamples, omics_processing_records, data_objects
1142
+ biosamples,
1143
+ omics_processing_records,
1144
+ data_object_records,
1145
+ library_preparation_records,
1054
1146
  )
1055
1147
  return ncbi_xml
@@ -42,6 +42,7 @@ from nmdc_runtime.site.graphs import (
42
42
  ingest_neon_soil_metadata,
43
43
  ingest_neon_benthic_metadata,
44
44
  ingest_neon_surface_water_metadata,
45
+ ensure_alldocs,
45
46
  nmdc_study_to_ncbi_submission_export,
46
47
  )
47
48
  from nmdc_runtime.site.resources import (
@@ -450,6 +451,7 @@ def repo():
450
451
  ensure_jobs.to_job(**preset_normal),
451
452
  apply_metadata_in.to_job(**preset_normal),
452
453
  export_study_biosamples_metadata.to_job(**preset_normal),
454
+ ensure_alldocs.to_job(**preset_normal),
453
455
  ]
454
456
  schedules = [housekeeping_weekly]
455
457
  sensors = [
@@ -537,7 +539,6 @@ def biosample_submission_ingest():
537
539
  "study_category": None,
538
540
  "study_doi_category": None,
539
541
  "study_doi_provider": None,
540
- "study_funding_sources": None,
541
542
  "study_pi_image_url": None,
542
543
  }
543
544
  },
@@ -576,7 +577,6 @@ def biosample_submission_ingest():
576
577
  "study_category": None,
577
578
  "study_doi_category": None,
578
579
  "study_doi_provider": None,
579
- "study_funding_sources": None,
580
580
  "study_pi_image_url": None,
581
581
  }
582
582
  },
@@ -404,7 +404,9 @@ class SubmissionPortalTranslator(Translator):
404
404
  description=self._get_from(
405
405
  metadata_submission, ["studyForm", "description"]
406
406
  ),
407
- funding_sources=self.study_funding_sources,
407
+ funding_sources=self._get_from(
408
+ metadata_submission, ["studyForm", "fundingSources"]
409
+ ),
408
410
  # emsl_proposal_identifier=self._get_from(
409
411
  # metadata_submission, ["multiOmicsForm", "studyNumber"]
410
412
  # ),
nmdc_runtime/site/util.py CHANGED
@@ -4,6 +4,7 @@ from subprocess import Popen, PIPE, STDOUT, CalledProcessError
4
4
 
5
5
  from pymongo.database import Database as MongoDatabase
6
6
 
7
+ from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
7
8
  from nmdc_runtime.site.resources import mongo_resource
8
9
 
9
10
  mode_test = {
@@ -34,12 +35,13 @@ def run_and_log(shell_cmd, context):
34
35
 
35
36
 
36
37
  @lru_cache
37
- def collection_indexed_on_id(mdb: MongoDatabase) -> dict:
38
- set_collection_names = [
39
- name for name in mdb.list_collection_names() if name.endswith("_set")
40
- ]
38
+ def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
39
+ present_collection_names = set(mdb.list_collection_names())
41
40
  return {
42
- name: ("id_1" in mdb[name].index_information()) for name in set_collection_names
41
+ name: (
42
+ name in present_collection_names and "id_1" in mdb[name].index_information()
43
+ )
44
+ for name in get_collection_names_from_schema()
43
45
  }
44
46
 
45
47
 
nmdc_runtime/util.py CHANGED
@@ -8,6 +8,7 @@ from copy import deepcopy
8
8
  from datetime import datetime, timezone
9
9
  from functools import lru_cache
10
10
  from io import BytesIO
11
+ from itertools import chain
11
12
  from pathlib import Path
12
13
  from uuid import uuid4
13
14
  from typing import List, Optional, Set, Dict
@@ -369,13 +370,38 @@ def specialize_activity_set_docs(docs):
369
370
 
370
371
  # Define a mapping from collection name to a list of class names allowable for that collection's documents.
371
372
  collection_name_to_class_names: Dict[str, List[str]] = {
372
- collection_name: get_class_names_from_collection_spec(spec)
373
+ collection_name: list(
374
+ set(
375
+ chain.from_iterable(
376
+ nmdc_schema_view().class_descendants(cls_name)
377
+ for cls_name in get_class_names_from_collection_spec(spec)
378
+ )
379
+ )
380
+ )
373
381
  for collection_name, spec in nmdc_jsonschema["$defs"]["Database"][
374
382
  "properties"
375
383
  ].items()
376
384
  }
377
385
 
378
386
 
387
+ def class_hierarchy_as_list(obj) -> list[str]:
388
+ """
389
+ get list of inherited classes for each concrete class
390
+ """
391
+ rv = []
392
+ current_class = obj.__class__
393
+
394
+ def recurse_through_bases(cls):
395
+ if cls.__name__ == "YAMLRoot":
396
+ return rv
397
+ rv.append(cls.__name__)
398
+ for base in cls.__bases__:
399
+ recurse_through_bases(base)
400
+ return rv
401
+
402
+ return recurse_through_bases(current_class)
403
+
404
+
379
405
  @lru_cache
380
406
  def schema_collection_names_with_id_field() -> Set[str]:
381
407
  """
@@ -393,6 +419,11 @@ def schema_collection_names_with_id_field() -> Set[str]:
393
419
  return target_collection_names
394
420
 
395
421
 
422
+ def populated_schema_collection_names_with_id_field(mdb: MongoDatabase) -> List[str]:
423
+ collection_names = sorted(schema_collection_names_with_id_field())
424
+ return [n for n in collection_names if mdb[n].find_one({"id": {"$exists": True}})]
425
+
426
+
396
427
  def ensure_unique_id_indexes(mdb: MongoDatabase):
397
428
  """Ensure that any collections with an "id" field have an index on "id"."""
398
429
  candidate_names = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: nmdc_runtime
3
- Version: 1.7.0
3
+ Version: 1.9.0
4
4
  Summary: A runtime system for NMDC data management and orchestration
5
5
  Home-page: https://github.com/microbiomedata/nmdc-runtime
6
6
  Author: Donny Winston
@@ -1,7 +1,8 @@
1
1
  nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
2
3
  nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
3
4
  nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- nmdc_runtime/util.py,sha256=3mHVEUdMOv73XgT6NTuzMuMCL5Gs6NJ4Mk0bkgQQaQU,19844
5
+ nmdc_runtime/util.py,sha256=Wd2GuuskyUqf1eV5mHLZws8BHAOsqnc0Qj7_4WhSvAM,20736
5
6
  nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
7
  nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
8
  nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -35,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
35
36
  nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
37
  nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
37
38
  nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
- nmdc_runtime/site/graphs.py,sha256=_vCyQnICis4OQGH91i1ZwpvHYcXOG6Nfg04f5DVdy2M,12040
39
- nmdc_runtime/site/ops.py,sha256=G6X3YgSmDNxOnsMEByLUMfB0peY4o21o0_Ig3V7v6M4,35835
40
- nmdc_runtime/site/repository.py,sha256=-dOk9BEnLSrmAN6bZoIu_WnFSqriIpO0c5P76PuHW1M,37472
39
+ nmdc_runtime/site/graphs.py,sha256=jqfwhrCVUBszt9168au_DVvZBtgIfpUf1OXFiyPHI6U,12304
40
+ nmdc_runtime/site/ops.py,sha256=DchVsC0v7J3noZMhVXUZgSGrm_sC78Y9_z_Nfhuq21E,39632
41
+ nmdc_runtime/site/repository.py,sha256=ge3LW_5izCgL6x1Ios8z2Hrt--aY6LXqhGjnAjcIJkI,37422
41
42
  nmdc_runtime/site/resources.py,sha256=ZSH1yvA-li0R7Abc22_v0XLbjBYf5igETr2G01J3hnc,17557
42
- nmdc_runtime/site/util.py,sha256=6hyVPpb6ZkWEG8Nm7uQxnZ-QmuPOG9hgWvl0mUBr5JU,1303
43
+ nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
43
44
  nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
45
  nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
45
46
  nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -50,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
50
51
  nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
51
52
  nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
52
53
  nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
53
- nmdc_runtime/site/export/ncbi_xml.py,sha256=Z2qsaGIBvY2OdOkf8kJEZl1T_8R_YzhAlXxJ1gMQwnk,16946
54
- nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=CqrtjwzmUbZXEW8aD-KpnCV_PlXVH-Gqp309nw3vbeo,6464
54
+ nmdc_runtime/site/export/ncbi_xml.py,sha256=KMKHZJEjTGECI2N2Hp0yDSMGrkjEC7GmlOnptaZCy2E,22297
55
+ nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=jY4YJt5P7EMsy8gSOPI33K6VcEfOXaVR_zQINZOBUKU,7561
55
56
  nmdc_runtime/site/export/study_metadata.py,sha256=WRU0F1ksWfNX3k9LD91Pn2DuLA-IOpGvYPJd6DnguEs,4819
56
57
  nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
58
  nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -64,7 +65,7 @@ nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=e_7tXFrP0PpdhqUC
64
65
  nmdc_runtime/site/translation/neon_soil_translator.py,sha256=cJJ_QPva5G5SIT_7DjCSsqbDvgbiKGqUYrxK3nx7_Lw,37634
65
66
  nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=6LaFwBnVx6TN9v1D-G6LFrDxY0TK05AvMklx0E1tTeQ,26590
66
67
  nmdc_runtime/site/translation/neon_utils.py,sha256=mdxJVPb3zbD4DiKW3Fwgk22kjczKMwkcozvy7fwteTE,5203
67
- nmdc_runtime/site/translation/submission_portal_translator.py,sha256=KiVO1vohhrJGfwzLJOumRfyHjcbYfswBIBvkYIdFxv8,28097
68
+ nmdc_runtime/site/translation/submission_portal_translator.py,sha256=aNGIXTiJEXGC_29qWeol2C426bAt5VlY3In_YhplPU0,28169
68
69
  nmdc_runtime/site/translation/translator.py,sha256=xM9dM-nTgSWwu5HFoUVNHf8kqk9iiH4PgWdSx4OKxEk,601
69
70
  nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
70
71
  nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,9 +73,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
72
73
  nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
73
74
  nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
74
75
  nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
75
- nmdc_runtime-1.7.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
76
- nmdc_runtime-1.7.0.dist-info/METADATA,sha256=FnoXHNgR6o5PEe6XhqRGdqOjbIX_ry-SKY5uMtZJQXY,7302
77
- nmdc_runtime-1.7.0.dist-info/WHEEL,sha256=mguMlWGMX-VHnMpKOjjQidIo1ssRlCFu4a4mBpz1s2M,91
78
- nmdc_runtime-1.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
79
- nmdc_runtime-1.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
80
- nmdc_runtime-1.7.0.dist-info/RECORD,,
76
+ nmdc_runtime-1.9.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
77
+ nmdc_runtime-1.9.0.dist-info/METADATA,sha256=6PtNVNbnAQR1l8MWaC6jtXv9YcJzoaoumZkDpvAQ7jE,7302
78
+ nmdc_runtime-1.9.0.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
79
+ nmdc_runtime-1.9.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
80
+ nmdc_runtime-1.9.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
81
+ nmdc_runtime-1.9.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (70.1.1)
2
+ Generator: setuptools (73.0.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5