PyPI - nmdc-runtime - Versions diffs - 2.1.1__py3-none-any.whl → 2.2.0__py3-none-any.whl - Mend

nmdc-runtime 2.1.1__py3-none-any.whl → 2.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (15) hide show

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
+import re
 import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
@@ -6,6 +7,7 @@ import xml.dom.minidom
 from typing import Any
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
+    get_instruments,
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
     handle_geolocation_value,
@@ -170,7 +172,39 @@ class NCBISubmissionXML:
             for json_key, value in biosample.items():
                 if isinstance(value, list):
-                    continue  # Skip processing for list values
+                    for item in value:
+                        if json_key not in attribute_mappings:
+                            continue
+                        xml_key = attribute_mappings[json_key]
+                        value_type = slot_range_mappings.get(json_key, "string")
+                        handler = self.type_handlers.get(
+                            value_type, handle_string_value
+                        )
+                        # Special handling for "elev" key
+                        if json_key == "elev":
+                            value = f"{float(value)} m"  # Convert to float if possible
+                            attributes[xml_key] = value
+                            continue  # Skip applying the handler to this key
+                        # Special handling for "host_taxid"
+                        if json_key == "host_taxid" and isinstance(value, dict):
+                            if "term" in value and "id" in value["term"]:
+                                value = re.findall(
+                                    r"\d+", value["term"]["id"].split(":")[1]
+                                )[0]
+                            attributes[xml_key] = value
+                            continue  # Skip applying the handler to this key
+                        formatted_value = handler(item)
+                        # Combine multiple values with a separator for list elements
+                        if xml_key in attributes:
+                            attributes[xml_key] += f"| {formatted_value}"
+                        else:
+                            attributes[xml_key] = formatted_value
+                    continue
                 if json_key == "env_package":
                     env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
@@ -187,6 +221,20 @@ class NCBISubmissionXML:
                 value_type = slot_range_mappings.get(json_key, "string")
                 handler = self.type_handlers.get(value_type, handle_string_value)
+                # Special handling for "elev" key
+                if json_key == "elev":
+                    value = f"{float(value)} m"  # Convert to float if possible
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+                # Special handling for "host_taxid"
+                if json_key == "host_taxid" and isinstance(value, dict):
+                    if "term" in value and "id" in value["term"]:
+                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
+                    attributes[xml_key] = value
+                    continue  # Skip applying the handler to this key
+                # Default processing for other keys
                 formatted_value = handler(value)
                 attributes[xml_key] = formatted_value
@@ -286,6 +334,7 @@ class NCBISubmissionXML:
         nmdc_nucleotide_sequencing: list,
         nmdc_biosamples: list,
         nmdc_library_preparation: list,
+        all_instruments: dict,
     ):
         bsm_id_name_dict = {
             biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
@@ -296,9 +345,10 @@ class NCBISubmissionXML:
             biosample_ids = []
             nucleotide_sequencing_ids = {}
             lib_prep_protocol_names = {}
-            instrument_name = ""
             analyte_category = ""
             library_name = ""
+            instrument_vendor = ""
+            instrument_model = ""
             for biosample_id, data_objects in entry.items():
                 biosample_ids.append(biosample_id)
@@ -316,7 +366,11 @@ class NCBISubmissionXML:
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_name = ntseq.get("instrument_used", "")[0]
+                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument = all_instruments.get(instrument_id, {})
+                            instrument_vendor = instrument.get("vendor", "")
+                            instrument_model = instrument.get("model", "")
                             analyte_category = ntseq.get("analyte_category", "")
                             library_name = bsm_id_name_dict.get(biosample_id, "")
@@ -353,9 +407,9 @@ class NCBISubmissionXML:
                                 "RefId",
                                 children=[
                                     self.set_element(
-                                        "SPUID",
+                                        "PrimaryId",
                                         bioproject_id,
-                                        {"spuid_namespace": org},
+                                        {"db": "BioProject"},
                                     )
                                 ],
                             )
@@ -384,11 +438,11 @@ class NCBISubmissionXML:
                     )
                 sra_attributes = []
-                if instrument_name.lower().startswith("illumina"):
+                if instrument_vendor == "illumina":
                     sra_attributes.append(
                         self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
                     )
-                    if "nextseq550" in instrument_name.lower():
+                    if instrument_model == "nextseq_550":
                         sra_attributes.append(
                             self.set_element(
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
@@ -501,6 +555,7 @@ class NCBISubmissionXML:
         biosample_nucleotide_sequencing_list: list,
         biosample_data_objects_list: list,
         biosample_library_preparation_list: list,
+        instruments_dict: dict,
     ):
         data_type = None
         ncbi_project_id = None
@@ -545,6 +600,7 @@ class NCBISubmissionXML:
             nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
             nmdc_biosamples=biosamples_list,
             nmdc_library_preparation=biosample_library_preparation_list,
+            all_instruments=instruments_dict,
         )
         rough_string = ET.tostring(self.root, "unicode")

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -20,6 +20,31 @@ def get_classname_from_typecode(doc_id):
     return class_map.get(typecode)
+def get_instruments(instrument_set_collection):
+    # dictionary to capture a list of all instruments
+    # Structure of dict:
+    # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
+    all_instruments = {}
+    try:
+        query = {"type": "nmdc:Instrument"}
+        cursor = instrument_set_collection.find(query)
+        for document in cursor:
+            instrument_id = document.get("id")
+            vendor = document.get("vendor")
+            model = document.get("model")
+            if not instrument_id or not vendor or not model:
+                continue
+            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
+        return all_instruments
+    except Exception as e:
+        raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     biosample_data_objects = []

nmdc_runtime/site/graphs.py CHANGED Viewed

@@ -53,6 +53,7 @@ from nmdc_runtime.site.ops import (
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
+    get_all_instruments,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
@@ -126,9 +127,12 @@ def apply_metadata_in():
 @graph
 def gold_study_to_database():
-    (study_id, study_type, gold_nmdc_instrument_mapping_file_url) = (
-        get_gold_study_pipeline_inputs()
-    )
+    (
+        study_id,
+        study_type,
+        gold_nmdc_instrument_mapping_file_url,
+        include_field_site_info,
+    ) = get_gold_study_pipeline_inputs()
     projects = gold_projects_by_study(study_id)
     biosamples = gold_biosamples_by_study(study_id)
@@ -143,6 +147,7 @@ def gold_study_to_database():
         biosamples,
         analysis_projects,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
     )
     database_dict = nmdc_schema_object_to_dict(database)
     filename = nmdc_schema_database_export_filename(study)
@@ -449,6 +454,7 @@ def nmdc_study_to_ncbi_submission_export():
     )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
+    all_instruments = get_all_instruments()
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
         ncbi_submission_metadata,
@@ -456,5 +462,6 @@ def nmdc_study_to_ncbi_submission_export():
         nucleotide_sequencing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     ncbi_submission_xml_asset(xml_data)

nmdc_runtime/site/ops.py CHANGED Viewed

@@ -7,6 +7,7 @@ import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
 from io import BytesIO, StringIO
+from toolz.dicttoolz import keyfilter
 from typing import Tuple
 from zipfile import ZipFile
 from itertools import chain
@@ -68,6 +69,7 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_data_objects_from_biosamples,
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
+    get_instruments,
 )
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
@@ -92,17 +94,20 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 from nmdc_runtime.site.util import run_and_log, schema_collection_has_index_on_id
 from nmdc_runtime.util import (
     drs_object_in_for,
+    get_names_of_classes_in_effective_range_of_slot,
     pluralize,
     put_object,
     validate_json,
     specialize_activity_set_docs,
     collection_name_to_class_names,
     class_hierarchy_as_list,
+    nmdc_schema_view,
     populated_schema_collection_names_with_id_field,
 )
 from nmdc_schema import nmdc
 from nmdc_schema.nmdc import Database as NMDCDatabase
 from pydantic import BaseModel
+from pymongo import InsertOne
 from pymongo.database import Database as MongoDatabase
 from starlette import status
 from toolz import assoc, dissoc, get_in, valfilter, identity
@@ -588,18 +593,23 @@ def add_output_run_event(context: OpExecutionContext, outputs: List[str]):
         "study_id": str,
         "study_type": str,
         "gold_nmdc_instrument_mapping_file_url": str,
+        "include_field_site_info": bool,
     },
     out={
         "study_id": Out(str),
         "study_type": Out(str),
         "gold_nmdc_instrument_mapping_file_url": Out(str),
+        "include_field_site_info": Out(bool),
     },
 )
-def get_gold_study_pipeline_inputs(context: OpExecutionContext) -> Tuple[str, str, str]:
+def get_gold_study_pipeline_inputs(
+    context: OpExecutionContext,
+) -> Tuple[str, str, str, bool]:
     return (
         context.op_config["study_id"],
         context.op_config["study_type"],
         context.op_config["gold_nmdc_instrument_mapping_file_url"],
+        context.op_config["include_field_site_info"],
     )
@@ -642,6 +652,7 @@ def nmdc_schema_database_from_gold_study(
     biosamples: List[Dict[str, Any]],
     analysis_projects: List[Dict[str, Any]],
     gold_nmdc_instrument_map_df: pd.DataFrame,
+    include_field_site_info: bool,
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
@@ -656,6 +667,7 @@ def nmdc_schema_database_from_gold_study(
         projects,
         analysis_projects,
         gold_nmdc_instrument_map_df,
+        include_field_site_info,
         id_minter=id_minter,
     )
     database = translator.get_database()
@@ -1029,23 +1041,51 @@ def site_code_mapping() -> dict:
 @op(required_resource_keys={"mongo"})
 def materialize_alldocs(context) -> int:
+    """
+    This function re-creates the alldocs collection to reflect the current state of the Mongo database.
+    See nmdc-runtime/docs/nb/bulk_validation_referential_integrity_check.ipynb for more details.
+    """
     mdb = context.resources.mongo.db
-    collection_names = populated_schema_collection_names_with_id_field(mdb)
+    schema_view = nmdc_schema_view()
-    # Insert a no-op as an anchor point for this comment.
-    #
-    # Note: There used to be code here that `assert`-ed that each collection could only contain documents of a single
-    #       type. With the legacy schema, that assertion was true. With the Berkeley schema, it is false. That code was
-    #       in place because subsequent code (further below) used a single document in a collection as the source of the
-    #       class ancestry information of _all_ documents in that collection; an optimization that spared us from
-    #       having to do the same for every single document in that collection. With the Berkeley schema, we have
-    #       eliminated that optimization (since it is inadequate; it would produce some incorrect class ancestries
-    #       for descendants of `PlannedProcess`, for example).
-    #
-    pass
+    # batch size for writing documents to alldocs
+    BULK_WRITE_BATCH_SIZE = 2000
+    # TODO include functional_annotation_agg  for "real-time" ref integrity checking.
+    #   For now, production use cases for materialized `alldocs` are limited to `id`-having collections.
+    collection_names = populated_schema_collection_names_with_id_field(mdb)
     context.log.info(f"{collection_names=}")
+    # Build alldocs
+    context.log.info("constructing `alldocs` collection")
+    document_class_names = set(
+        chain.from_iterable(collection_name_to_class_names.values())
+    )
+    cls_slot_map = {
+        cls_name: {
+            slot.name: slot for slot in schema_view.class_induced_slots(cls_name)
+        }
+        for cls_name in document_class_names
+    }
+    # Any ancestor of a document class is a document-referenceable range, i.e., a valid range of a document-reference-ranged slot.
+    document_referenceable_ranges = set(
+        chain.from_iterable(
+            schema_view.class_ancestors(cls_name) for cls_name in document_class_names
+        )
+    )
+    document_reference_ranged_slots = defaultdict(list)
+    for cls_name, slot_map in cls_slot_map.items():
+        for slot_name, slot in slot_map.items():
+            if (
+                set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
+                & document_referenceable_ranges
+            ):
+                document_reference_ranged_slots[cls_name].append(slot_name)
     # Drop any existing `alldocs` collection (e.g. from previous use of this op).
     #
     # FIXME: This "nuke and pave" approach introduces a race condition.
@@ -1054,90 +1094,41 @@ def materialize_alldocs(context) -> int:
     #
     mdb.alldocs.drop()
-    # Build alldocs
-    context.log.info("constructing `alldocs` collection")
-    # For each collection, group its documents by their `type` value, transform them, and load them into `alldocs`.
-    for collection_name in collection_names:
+    for coll_name in collection_names:
+        context.log.info(f"{coll_name=}")
+        requests = []
+        documents_processed_counter = 0
+        for doc in mdb[coll_name].find():
+            doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
+            slots_to_include = ["id", "type"] + document_reference_ranged_slots[
+                doc_type
+            ]
+            new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
+            new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
+            requests.append(InsertOne(new_doc))
+            if len(requests) == BULK_WRITE_BATCH_SIZE:
+                _ = mdb.alldocs.bulk_write(requests, ordered=False)
+                requests.clear()
+                documents_processed_counter += BULK_WRITE_BATCH_SIZE
+        if len(requests) > 0:
+            _ = mdb.alldocs.bulk_write(requests, ordered=False)
+            documents_processed_counter += len(requests)
         context.log.info(
-            f"Found {mdb[collection_name].estimated_document_count()} estimated documents for {collection_name=}."
+            f"Inserted {documents_processed_counter} documents from {coll_name=} "
         )
-        # Process all the distinct `type` values (i.e. value in the `type` field) of the documents in this collection.
-        #
-        # References:
-        # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.distinct
-        #
-        distinct_type_values = mdb[collection_name].distinct(key="type")
-        context.log.info(
-            f"Found {len(distinct_type_values)} distinct `type` values in {collection_name=}: {distinct_type_values=}"
-        )
-        for type_value in distinct_type_values:
-            # Process all the documents in this collection that have this value in their `type` field.
-            #
-            # References:
-            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.count_documents
-            # - https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.find
-            #
-            filter_ = {"type": type_value}
-            num_docs_having_type = mdb[collection_name].count_documents(filter=filter_)
-            docs_having_type = mdb[collection_name].find(filter=filter_)
-            context.log.info(
-                f"Found {num_docs_having_type} documents having {type_value=} in {collection_name=}."
-            )
-            # Get a "representative" document from the result.
-            #
-            # Note: Since all of the documents in this batch have the same class ancestry, we will save time by
-            #       determining the class ancestry of only _one_ of them (we call this the "representative") and then
-            #       (later) attributing that class ancestry to all of them.
-            #
-            representative_doc = next(docs_having_type)
-            # Instantiate the Python class represented by the "representative" document.
-            db_dict = {
-                # Shed the `_id` attribute, since the constructor doesn't allow it.
-                collection_name: [dissoc(representative_doc, "_id")]
-            }
-            nmdc_db = NMDCDatabase(**db_dict)
-            representative_instance = getattr(nmdc_db, collection_name)[0]
-            # Get the class ancestry of that instance, as a list of class names (including its own class name).
-            ancestor_class_names = class_hierarchy_as_list(representative_instance)
-            # Store the documents belonging to this group, in the `alldocs` collection, setting their `type` field
-            # to the list of class names obtained from the "representative" document above.
-            #
-            # TODO: Document why clobbering the existing contents of the `type` field is OK.
-            #
-            # Note: The reason we `chain()` our "representative" document (in an iterable) with the `docs_having_type`
-            #       iterator here is that, when we called `next(docs_having_type)` above, we "consumed" our
-            #       "representative" document from that iterator. We use `chain()` here so that that document gets
-            #       inserted alongside its cousins (i.e. the documents _still_ accessible via `docs_having_type`).
-            #       Reference: https://docs.python.org/3/library/itertools.html#itertools.chain
-            #
-            inserted_many_result = mdb.alldocs.insert_many(
-                [
-                    assoc(dissoc(doc, "type", "_id"), "type", ancestor_class_names)
-                    for doc in chain([representative_doc], docs_having_type)
-                ]
-            )
-            context.log.info(
-                f"Inserted {len(inserted_many_result.inserted_ids)} documents from {collection_name=} "
-                f"originally having {type_value=}."
-            )
+    context.log.info(
+        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
+    )
     # Re-idx for `alldocs` collection
     mdb.alldocs.create_index("id", unique=True)
     # The indexes were added to improve the performance of the
     # /data_objects/study/{study_id} endpoint
-    mdb.alldocs.create_index("has_input")
-    mdb.alldocs.create_index("has_output")
-    mdb.alldocs.create_index("was_informed_by")
-    context.log.info(
-        f"refreshed {mdb.alldocs} collection with {mdb.alldocs.estimated_document_count()} docs."
-    )
+    slots_to_index = ["has_input", "has_output", "was_informed_by"]
+    [mdb.alldocs.create_index(slot) for slot in slots_to_index]
+    context.log.info(f"created indexes on id, {slots_to_index}.")
     return mdb.alldocs.estimated_document_count()
@@ -1221,6 +1212,14 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
+@op(required_resource_keys={"mongo"})
+def get_all_instruments(context: OpExecutionContext):
+    mdb = context.resources.mongo.db
+    instrument_set_collection = mdb["instrument_set"]
+    all_instruments = get_instruments(instrument_set_collection)
+    return all_instruments
 @op
 def ncbi_submission_xml_from_nmdc_study(
     context: OpExecutionContext,
@@ -1230,6 +1229,7 @@ def ncbi_submission_xml_from_nmdc_study(
     omics_processing_records: list,
     data_object_records: list,
     library_preparation_records: list,
+    all_instruments: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1237,5 +1237,6 @@ def ncbi_submission_xml_from_nmdc_study(
         omics_processing_records,
         data_object_records,
         library_preparation_records,
+        all_instruments,
     )
     return ncbi_xml

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -506,6 +506,7 @@ def biosample_submission_ingest():
                             "study_id": "",
                             "study_type": "research_study",
                             "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
                         },
                     },
                     "export_json_to_drs": {"config": {"username": ""}},

nmdc_runtime/site/resources.py CHANGED Viewed

@@ -331,9 +331,26 @@ class GoldApiClient(BasicAuthClient):
         """
         return id.replace("gold:", "")
-    def fetch_biosamples_by_study(self, study_id: str) -> List[Dict[str, Any]]:
+    def fetch_biosamples_by_study(
+        self, study_id: str, include_project=True
+    ) -> List[Dict[str, Any]]:
         id = self._normalize_id(study_id)
         results = self.request("/biosamples", params={"studyGoldId": id})
+        if include_project:
+            projects = self.fetch_projects_by_study(id)
+            biosamples_by_id = {
+                biosample["biosampleGoldId"]: biosample for biosample in results
+            }
+            for project in projects:
+                sample_id = project.get("biosampleGoldId")
+                if not sample_id:
+                    continue
+                if sample_id not in biosamples_by_id:
+                    continue
+                biosample = biosamples_by_id[sample_id]
+                if "projects" not in biosample:
+                    biosample["projects"] = []
+                biosample["projects"].append(project)
         return results
     def fetch_projects_by_study(self, study_id: str) -> List[Dict[str, Any]]:

nmdc_runtime/site/translation/gold_translator.py CHANGED Viewed

@@ -7,6 +7,10 @@ import pandas as pd
 from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
+# Dictionary of sequencing strategies from GOLD that we are filtering on
+# based on the kind of samples that are required for NMDC
+SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
 class GoldStudyTranslator(Translator):
     def __init__(
@@ -17,6 +21,7 @@ class GoldStudyTranslator(Translator):
         projects: List[JSON_OBJECT] = [],
         analysis_projects: List[JSON_OBJECT] = [],
         gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
         *args,
         **kwargs,
     ) -> None:
@@ -24,9 +29,39 @@ class GoldStudyTranslator(Translator):
         self.study = study
         self.study_type = nmdc.StudyCategoryEnum(study_type)
-        self.biosamples = biosamples
-        self.projects = projects
-        self.analysis_projects = analysis_projects
+        self.include_field_site_info = include_field_site_info
+        # Filter biosamples to only those with `sequencingStrategy` of
+        # "Metagenome" or "Metatranscriptome"
+        self.biosamples = [
+            biosample
+            for biosample in biosamples
+            if any(
+                project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
+                for project in biosample.get("projects", [])
+            )
+        ]
+        # Fetch the valid projectGoldIds that are associated with filtered
+        # biosamples on their `projects` field
+        valid_project_ids = {
+            project.get("projectGoldId")
+            for biosample in self.biosamples
+            for project in biosample.get("projects", [])
+        }
+        # Filter projects to only those with `projectGoldId` in valid_project_ids
+        self.projects = [
+            project
+            for project in projects
+            if project.get("projectGoldId") in valid_project_ids
+        ]
+        # Filter analysis_projects to only those with all `projects` in valid_project_ids
+        self.analysis_projects = [
+            analysis_project
+            for analysis_project in analysis_projects
+            if all(
+                project_id in valid_project_ids
+                for project_id in analysis_project.get("projects", [])
+            )
+        ]
         self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
         self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
@@ -596,7 +631,11 @@ class GoldStudyTranslator(Translator):
             principal_investigator=self._get_pi(gold_project),
             processing_institution=self._get_processing_institution(gold_project),
             instrument_used=self._get_instrument(gold_project),
-            analyte_category="metagenome",
+            analyte_category=(
+                gold_project.get("sequencingStrategy").lower()
+                if gold_project.get("sequencingStrategy")
+                else None
+            ),
             associated_studies=[nmdc_study_id],
         )
@@ -621,21 +660,24 @@ class GoldStudyTranslator(Translator):
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
         gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
-        gold_field_site_names = sorted(
-            {self._get_field_site_name(biosample) for biosample in self.biosamples}
-        )
-        nmdc_field_site_ids = self._id_minter(
-            "nmdc:FieldResearchSite", len(gold_field_site_names)
-        )
-        gold_name_to_nmdc_field_site_ids = dict(
-            zip(gold_field_site_names, nmdc_field_site_ids)
-        )
-        gold_biosample_to_nmdc_field_site_ids = {
-            biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
-                self._get_field_site_name(biosample)
-            ]
-            for biosample in self.biosamples
-        }
+        if self.include_field_site_info:
+            gold_field_site_names = sorted(
+                {self._get_field_site_name(biosample) for biosample in self.biosamples}
+            )
+            nmdc_field_site_ids = self._id_minter(
+                "nmdc:FieldResearchSite", len(gold_field_site_names)
+            )
+            gold_name_to_nmdc_field_site_ids = dict(
+                zip(gold_field_site_names, nmdc_field_site_ids)
+            )
+            gold_biosample_to_nmdc_field_site_ids = {
+                biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
+                    self._get_field_site_name(biosample)
+                ]
+                for biosample in self.biosamples
+            }
+        else:
+            gold_biosample_to_nmdc_field_site_ids = {}
         gold_project_ids = [project["projectGoldId"] for project in self.projects]
         nmdc_nucleotide_sequencing_ids = self._id_minter(
@@ -653,16 +695,17 @@ class GoldStudyTranslator(Translator):
                     biosample["biosampleGoldId"]
                 ],
                 nmdc_study_id=nmdc_study_id,
-                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids[
-                    biosample["biosampleGoldId"]
-                ],
+                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
+                    biosample["biosampleGoldId"], None
+                ),
             )
             for biosample in self.biosamples
         ]
-        database.field_research_site_set = [
-            nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
-            for name, id in gold_name_to_nmdc_field_site_ids.items()
-        ]
+        if self.include_field_site_info:
+            database.field_research_site_set = [
+                nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
+                for name, id in gold_name_to_nmdc_field_site_ids.items()
+            ]
         database.data_generation_set = [
             self._translate_nucleotide_sequencing(
                 project,

nmdc_runtime/site/translation/submission_portal_translator.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import logging
 import re
 from datetime import datetime
+from enum import Enum
 from functools import lru_cache
 from importlib import resources
 from typing import Any, List, Optional, Union
@@ -8,14 +9,36 @@ from typing import Any, List, Optional, Union
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SlotDefinition
 from nmdc_schema import nmdc
-from toolz import get_in, groupby, concat, valmap, dissoc
+from toolz import concat, dissoc, get_in, groupby, valmap
 from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
 BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
+class EnvironmentPackage(Enum):  # values are the strings written into each sample's "env_package" by the translate step
+    r"""
+    Enumeration of all possible environmental packages.
+    >>> EnvironmentPackage.AIR.value
+    'air'
+    >>> EnvironmentPackage.SEDIMENT.value
+    'sediment'
+    """
+    AIR = "air"  # member NAMES are matched against sampleData keys after removing "_data" and upper-casing
+    BIOFILM = "microbial mat_biofilm"
+    BUILT_ENV = "built environment"
+    HCR_CORES = "hydrocarbon resources-cores"
+    HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"  # NOTE(review): "HRC" here vs "HCR" in HCR_CORES - possible transposition; a name that does not match its sampleData key is silently skipped (KeyError is swallowed) - confirm
+    HOST_ASSOCIATED = "host-associated"
+    MISC_ENVS = "miscellaneous natural or artificial environment"
+    PLANT_ASSOCIATED = "plant-associated"
+    SEDIMENT = "sediment"
+    SOIL = "soil"
+    WATER = "water"
 @lru_cache
 def _get_schema_view():
     """Return a SchemaView instance representing the NMDC schema"""
@@ -550,7 +573,6 @@ class SubmissionPortalTranslator(Translator):
         sample_data: List[JSON_OBJECT],
         nmdc_biosample_id: str,
         nmdc_study_id: str,
-        default_env_package: str,
     ) -> nmdc.Biosample:
         """Translate sample data from portal submission into an `nmdc:Biosample` object.
@@ -565,18 +587,23 @@ class SubmissionPortalTranslator(Translator):
                             from each applicable submission portal tab
         :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
         :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
-        :param default_env_package: Default value for `env_package` slot
         :return: nmdc:Biosample
         """
-        biosample_key = sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
+        env_idx = next(
+            (
+                i
+                for i, tab in enumerate(sample_data)
+                if tab.get("env_package") is not None
+            ),
+            0,
+        )
+        biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
         slots = {
             "id": nmdc_biosample_id,
             "associated_studies": [nmdc_study_id],
             "type": "nmdc:Biosample",
-            "name": sample_data[0].get("samp_name", "").strip(),
-            "env_package": nmdc.TextValue(
-                has_raw_value=default_env_package, type="nmdc:TextValue"
-            ),
+            "name": sample_data[env_idx].get("samp_name", "").strip(),
+            "env_package": sample_data[env_idx].get("env_package"),
         }
         for tab in sample_data:
             transformed_tab = self._transform_dict_for_class(tab, "Biosample")
@@ -613,9 +640,18 @@ class SubmissionPortalTranslator(Translator):
         ]
         sample_data = metadata_submission_data.get("sampleData", {})
-        package_name = metadata_submission_data["packageName"]
+        for key in sample_data.keys():
+            env = key.removesuffix("_data").upper()
+            try:
+                package_name = EnvironmentPackage[env].value
+                for sample in sample_data[key]:
+                    sample["env_package"] = package_name
+            except KeyError:
+                pass
         sample_data_by_id = groupby(
-            BIOSAMPLE_UNIQUE_KEY_SLOT, concat(sample_data.values())
+            BIOSAMPLE_UNIQUE_KEY_SLOT,
+            concat(sample_data.values()),
         )
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
         sample_data_to_nmdc_biosample_ids = dict(
@@ -627,7 +663,6 @@ class SubmissionPortalTranslator(Translator):
                 sample_data,
                 nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
                 nmdc_study_id=nmdc_study_id,
-                default_env_package=package_name,
             )
             for sample_data_id, sample_data in sample_data_by_id.items()
             if sample_data

nmdc_runtime/util.py CHANGED Viewed

@@ -17,6 +17,8 @@ import fastjsonschema
 import requests
 from frozendict import frozendict
 from jsonschema.validators import Draft7Validator
+from linkml_runtime import linkml_model
+from linkml_runtime.utils.schemaview import SchemaView
 from nmdc_schema.nmdc import Database as NMDCDatabase
 from nmdc_schema.get_nmdc_view import ViewGetter
 from pydantic import Field, BaseModel
@@ -29,6 +31,48 @@ from nmdc_runtime.api.models.object import DrsObjectIn
 from typing_extensions import Annotated
+def get_names_of_classes_in_effective_range_of_slot(
+    schema_view: SchemaView, slot_definition: linkml_model.SlotDefinition
+) -> List[str]:
+    r"""
+    Determine the slot's "effective" range, by taking into account its `any_of` constraints (if defined).
+    Note: The `any_of` constraints constrain the slot's "effective" range beyond that described by the
+          induced slot definition's `range` attribute. `SchemaView` does not seem to provide the result
+          of applying those additional constraints, so we do it manually here (if any are defined).
+          Reference: https://github.com/orgs/linkml/discussions/2101#discussion-6625646
+    Reference: https://linkml.io/linkml-model/latest/docs/any_of/
+    """
+    # Accumulator for class names; it may hold duplicates until the de-duplication step at the end.
+    names_of_eligible_target_classes = []
+    # A non-empty `any_of` takes precedence: in that case the slot's own `range` is not consulted at all.
+    if "any_of" in slot_definition and len(slot_definition.any_of) > 0:
+        for slot_expression in slot_definition.any_of:
+            # Use the slot expression's `range` to get the specified eligible class name and the names of
+            # all classes that inherit from it; a `range` that is not among `all_classes()` is skipped.
+            if slot_expression.range in schema_view.all_classes():
+                own_and_descendant_class_names = schema_view.class_descendants(
+                    slot_expression.range
+                )
+                names_of_eligible_target_classes.extend(own_and_descendant_class_names)
+    else:
+        # Use the slot's `range` to get the specified eligible class name and the names of all classes
+        # that inherit from it; a `range` that is not among `all_classes()` leaves the result empty.
+        if slot_definition.range in schema_view.all_classes():
+            own_and_descendant_class_names = schema_view.class_descendants(
+                slot_definition.range
+            )
+            names_of_eligible_target_classes.extend(own_and_descendant_class_names)
+    # Remove duplicate class names (done via a set, so the order of the returned list is not guaranteed).
+    names_of_eligible_target_classes = list(set(names_of_eligible_target_classes))
+    return names_of_eligible_target_classes
 def get_class_names_from_collection_spec(
     spec: dict, prefix: Optional[str] = None
 ) -> List[str]:

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: nmdc_runtime
-Version: 2.1.1
+Version: 2.2.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -145,8 +145,6 @@ http://127.0.0.1:8000/redoc/.
 Tests can be found in `tests` and are run with the following commands:
-On an M1 Mac? May need to `export DOCKER_DEFAULT_PLATFORM=linux/amd64`.
 ```bash
 make up-test
 make test

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ nmdc_runtime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/config.py,sha256=qyV_To6t--DQUpYJ3SrE6sZlxuVXLPmx2dVtZV-3l-c,33
 nmdc_runtime/containers.py,sha256=8m_S1wiFu8VOWvY7tyqzf-02X9gXY83YGc8FgjWzLGA,418
 nmdc_runtime/main.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/util.py,sha256=Wd2GuuskyUqf1eV5mHLZws8BHAOsqnc0Qj7_4WhSvAM,20736
+nmdc_runtime/util.py,sha256=aMzS8eATEjpXOiuyAFYthx92fb_cgIzWWd5ZQU6ZlAY,22931
 nmdc_runtime/client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/core/db/Database.py,sha256=WamgBUbq85A7-fr3p5B9Tk92U__yPdr9pBb4zyQok-4,377
@@ -36,10 +36,10 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=ZHglSPwVHfXzdgR2CGvmbzLLbmsijloU58XvIe9Thjs,13996
-nmdc_runtime/site/ops.py,sha256=6P3kn4BygY8LKD_OpfKX2U0AYQKDlB2jw12Yn-hEmD0,44651
-nmdc_runtime/site/repository.py,sha256=rDtwUjozhyOxlkuF9HvaheOQDQWkgZYqVtsB50BcUp4,39121
-nmdc_runtime/site/resources.py,sha256=ZSH1yvA-li0R7Abc22_v0XLbjBYf5igETr2G01J3hnc,17557
+nmdc_runtime/site/graphs.py,sha256=mu4bE8799TItWXaPBfOeFB2XMyYwPZcj-VJQmadN2MA,14171
+nmdc_runtime/site/ops.py,sha256=T9_WrwDaySGnu6olwOHQizHQfeofMOaqMcq_vYEIzO0,43140
+nmdc_runtime/site/repository.py,sha256=JtHlp6l3UVo0QhV670TGns9bMfht7NOQrNWQtvsYr2g,39183
+nmdc_runtime/site/resources.py,sha256=6bmvplgql3KdEXKI49BibSk0Sug96SFJi8eOs2zeKK0,18252
 nmdc_runtime/site/util.py,sha256=zAY0oIY7GRf63ecqWelmS27N7PVrAXVwEhtnpescBSw,1415
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
@@ -51,21 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
 nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
 nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/export/ncbi_xml.py,sha256=bfGnvFO7jQmlNAdzXpQiNBw7DGvWQ3pTPfgbhczb_kM,22561
-nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=71LSFIYioF61xalKgsQ_Po63322dNbfQfYzqo2hZ720,7575
+nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
+nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
-nmdc_runtime/site/translation/gold_translator.py,sha256=wkl1WwJ45EFwz73l_-t0D9Y3SilctDC1obTieY0eqxM,29600
+nmdc_runtime/site/translation/gold_translator.py,sha256=RfAB68dJ9hDep20wETmCNBc0gugZbEKqVimT8h2t0uM,31470
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
 nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
 nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
-nmdc_runtime/site/translation/submission_portal_translator.py,sha256=FVBqCvk6NAJIA22IhtFOTyvAQIiFN3KsznHc5zmOG40,29676
+nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
 nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
 nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
 nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -73,9 +73,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.1.1.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.1.1.dist-info/METADATA,sha256=jiwY6Bhzhc5sNIqqF-ib1ouWTezhdWBlmW-yD-qR1IA,7329
-nmdc_runtime-2.1.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-nmdc_runtime-2.1.1.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.1.1.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.1.1.dist-info/RECORD,,
+nmdc_runtime-2.2.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.2.0.dist-info/METADATA,sha256=igSdpzN5dxlLV9r_O8btdkVPMTvLDzkn032LUdb-3hY,7256
+nmdc_runtime-2.2.0.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+nmdc_runtime-2.2.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.2.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.2.0.dist-info/RECORD,,

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nmdc_runtime-2.1.1.dist-info → nmdc_runtime-2.2.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nmdc-runtime 2.1.1__py3-none-any.whl → 2.2.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.1.1py3-none-any.whl → 2.2.0py3-none-any.whl