PyPI - nmdc-runtime - Versions diffs - 2.5.0__py3-none-any.whl → 2.7.0__py3-none-any.whl - Mend

nmdc-runtime 2.5.0-py3-none-any.whl → 2.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (14) hide show

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -7,7 +7,6 @@ import xml.dom.minidom
 from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
-    get_instruments,
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
     handle_geolocation_value,

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -24,31 +24,6 @@ def get_classname_from_typecode(doc_id):
     return class_map.get(typecode)
-def get_instruments(instrument_set_collection):
-    # dictionary to capture a list of all instruments
-    # Structure of dict:
-    # {"instrument_id": {"vendor": "vendor_name", "model": "model_name"}}
-    all_instruments = {}
-    try:
-        query = {"type": "nmdc:Instrument"}
-        cursor = instrument_set_collection.find(query)
-        for document in cursor:
-            instrument_id = document.get("id")
-            vendor = document.get("vendor")
-            model = document.get("model")
-            if not instrument_id or not vendor or not model:
-                continue
-            all_instruments[instrument_id] = {"vendor": vendor, "model": model}
-        return all_instruments
-    except Exception as e:
-        raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
 def fetch_data_objects_from_biosamples(
     all_docs_collection: Collection,
     data_object_set: Collection,

nmdc_runtime/site/graphs.py CHANGED Viewed

@@ -61,6 +61,8 @@ from nmdc_runtime.site.ops import (
     get_database_updater_inputs,
     post_submission_portal_biosample_ingest_record_stitching_filename,
     generate_data_generation_set_post_biosample_ingest,
+    get_instrument_ids_by_model,
+    log_database_ids,
 )
 from nmdc_runtime.site.export.study_metadata import get_biosamples_by_study_id
@@ -181,6 +183,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
     biosample_extras_slot_mapping = get_csv_rows_from_url(
         biosample_extras_slot_mapping_file_url
     )
+    instrument_mapping = get_instrument_ids_by_model()
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission,
@@ -188,10 +191,13 @@ def translate_metadata_submission_to_nmdc_schema_database():
         data_object_mapping=data_object_mapping,
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
+        instrument_mapping=instrument_mapping,
     )
     validate_metadata(database)
+    log_database_ids(database)
     database_dict = nmdc_schema_object_to_dict(database)
     filename = nmdc_schema_database_export_filename(metadata_submission)
     outputs = export_json_to_drs(database_dict, filename)
@@ -217,6 +223,7 @@ def ingest_metadata_submission():
     biosample_extras_slot_mapping = get_csv_rows_from_url(
         biosample_extras_slot_mapping_file_url
     )
+    instrument_mapping = get_instrument_ids_by_model()
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission,
@@ -224,7 +231,11 @@ def ingest_metadata_submission():
         data_object_mapping=data_object_mapping,
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
+        instrument_mapping=instrument_mapping,
     )
+    log_database_ids(database)
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)

nmdc_runtime/site/ops.py CHANGED Viewed

@@ -7,6 +7,7 @@ import tempfile
 from collections import defaultdict
 from datetime import datetime, timezone
 from io import BytesIO, StringIO
+from pprint import pformat
 from toolz.dicttoolz import keyfilter
 from typing import Tuple
 from zipfile import ZipFile
@@ -38,7 +39,7 @@ from dagster import (
     Bool,
 )
 from gridfs import GridFS
-from linkml_runtime.dumpers import json_dumper
+from linkml_runtime.utils.dictutils import as_simple_dict
 from linkml_runtime.utils.yamlutils import YAMLRoot
 from nmdc_runtime.api.db.mongo import get_mongo_db
 from nmdc_runtime.api.core.idgen import generate_one_id
@@ -69,7 +70,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_data_objects_from_biosamples,
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
-    get_instruments,
 )
 from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
@@ -96,6 +96,7 @@ from nmdc_runtime.site.util import (
     run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
+    get_instruments_by_id,
 )
 from nmdc_runtime.util import (
     drs_object_in_for,
@@ -720,9 +721,8 @@ def translate_portal_submission_to_nmdc_schema_database(
     metadata_submission: Dict[str, Any],
     nucleotide_sequencing_mapping: List,
     data_object_mapping: List,
+    instrument_mapping: Dict[str, str],
     study_category: Optional[str],
-    study_doi_category: Optional[str],
-    study_doi_provider: Optional[str],
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
@@ -739,11 +739,10 @@ def translate_portal_submission_to_nmdc_schema_database(
         data_object_mapping=data_object_mapping,
         id_minter=id_minter,
         study_category=study_category,
-        study_doi_category=study_doi_category,
-        study_doi_provider=study_doi_provider,
         study_pi_image_url=study_pi_image_url,
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
+        illumina_instrument_mapping=instrument_mapping,
     )
     database = translator.get_database()
     return database
@@ -761,7 +760,7 @@ def nmdc_schema_database_export_filename(study: Dict[str, Any]) -> str:
 @op
 def nmdc_schema_object_to_dict(object: YAMLRoot) -> Dict[str, Any]:
-    return json_dumper.to_dict(object)
+    return as_simple_dict(object)
 @op(required_resource_keys={"mongo"}, config_schema={"username": str})
@@ -1227,11 +1226,26 @@ def get_library_preparation_from_biosamples(
 @op(required_resource_keys={"mongo"})
-def get_all_instruments(context: OpExecutionContext):
+def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
-    instrument_set_collection = mdb["instrument_set"]
-    all_instruments = get_instruments(instrument_set_collection)
-    return all_instruments
+    return get_instruments_by_id(mdb)
+@op(required_resource_keys={"mongo"})
+def get_instrument_ids_by_model(context: OpExecutionContext) -> dict[str, str]:
+    mdb = context.resources.mongo.db
+    instruments_by_id = get_instruments_by_id(mdb)
+    instruments_by_model: dict[str, str] = {}
+    for inst_id, instrument in instruments_by_id.items():
+        model = instrument.get("model")
+        if model is None:
+            context.log.warning(f"Instrument {inst_id} has no model.")
+            continue
+        if model in instruments_by_model:
+            context.log.warning(f"Instrument model {model} is not unique.")
+        instruments_by_model[model] = inst_id
+    context.log.info("Instrument models: %s", pformat(instruments_by_model))
+    return instruments_by_model
 @op
@@ -1345,3 +1359,26 @@ def generate_biosample_set_for_nmdc_study_from_gold(
     database = database_updater.generate_biosample_set_from_gold_api_for_study()
     return database
+@op
+def log_database_ids(
+    context: OpExecutionContext,
+    database: nmdc.Database,
+) -> None:
+    """Log the IDs of the database."""
+    database_dict = as_simple_dict(database)
+    message = ""
+    for collection_name, collection in database_dict.items():
+        if not isinstance(collection, list):
+            continue
+        message += f"{collection_name} ({len(collection)}):\n"
+        if len(collection) < 10:
+            message += "\n".join(f"  {doc['id']}" for doc in collection)
+        else:
+            message += "\n".join(f"  {doc['id']}" for doc in collection[:4])
+            message += f"\n  ... {len(collection) - 8} more\n"
+            message += "\n".join(f"  {doc['id']}" for doc in collection[-4:])
+        message += "\n"
+    if message:
+        context.log.info(message)

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -553,8 +553,6 @@ def biosample_submission_ingest():
                     "translate_portal_submission_to_nmdc_schema_database": {
                         "inputs": {
                             "study_category": "research_study",
-                            "study_doi_category": None,
-                            "study_doi_provider": None,
                             "study_pi_image_url": None,
                         }
                     },
@@ -591,8 +589,6 @@ def biosample_submission_ingest():
                     "translate_portal_submission_to_nmdc_schema_database": {
                         "inputs": {
                             "study_category": None,
-                            "study_doi_category": None,
-                            "study_doi_provider": None,
                             "study_pi_image_url": None,
                         }
                     },

nmdc_runtime/site/translation/gold_translator.py CHANGED Viewed

@@ -639,6 +639,16 @@ class GoldStudyTranslator(Translator):
         :return: nmdc:NucleotideSequencing object
         """
         gold_project_id = gold_project["projectGoldId"]
+        ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
+        insdc_bioproject_identifiers = []
+        if ncbi_bioproject_identifier:
+            insdc_bioproject_identifiers.append(
+                self._ensure_curie(
+                    ncbi_bioproject_identifier,
+                    default_prefix="bioproject",
+                )
+            )
         return nmdc.NucleotideSequencing(
             id=nmdc_nucleotide_sequencing_id,
             name=gold_project.get("projectName"),
@@ -650,6 +660,7 @@ class GoldStudyTranslator(Translator):
             has_input=nmdc_biosample_id,
             add_date=gold_project.get("addDate"),
             mod_date=self._get_mod_date(gold_project),
+            insdc_bioproject_identifiers=insdc_bioproject_identifiers,
             principal_investigator=self._get_pi(gold_project),
             processing_institution=self._get_processing_institution(gold_project),
             instrument_used=self._get_instrument(gold_project),

nmdc_runtime/site/translation/submission_portal_translator.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import logging
 import re
+from collections import namedtuple
 from datetime import datetime
 from enum import Enum
 from functools import lru_cache
 from importlib import resources
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, Tuple
+from urllib.parse import urlparse
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SlotDefinition
@@ -13,8 +15,38 @@ from toolz import concat, dissoc, get_in, groupby, valmap
 from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
+DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
+READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
+READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
+INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
+DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
 BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
+TAB_NAME_KEY = "__tab_name"
+METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
+METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
+    nmdc.NucleotideSequencingEnum.metatranscriptome
+)
+TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
+    "metagenome_sequencing_non_interleaved_data": METAGENOME,
+    "metagenome_sequencing_interleaved_data": METAGENOME,
+    "metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
+    "metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
+}
+DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
+    (READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
+    (READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
+    (INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
+    (READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
+    (READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
+    (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
+}
 class EnvironmentPackage(Enum):
     r"""
@@ -75,6 +107,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
     return grouped
+def split_strip(string: str | None, sep: str) -> list[str] | None:
+    """Split a string by a separator and strip whitespace from each part.
+    :param string: string to split
+    :param sep: separator to split by
+    :return: list of stripped strings
+    """
+    if string is None:
+        return None
+    return [s.strip() for s in string.split(sep)]
 class SubmissionPortalTranslator(Translator):
     """A Translator subclass for handling submission portal entries
@@ -86,17 +130,15 @@ class SubmissionPortalTranslator(Translator):
     def __init__(
         self,
-        metadata_submission: JSON_OBJECT = {},
+        metadata_submission: Optional[JSON_OBJECT] = None,
         *args,
         nucleotide_sequencing_mapping: Optional[list] = None,
         data_object_mapping: Optional[list] = None,
+        illumina_instrument_mapping: Optional[dict[str, str]] = None,
         # Additional study-level metadata not captured by the submission portal currently
         # See: https://github.com/microbiomedata/submission-schema/issues/162
-        study_doi_category: Optional[str] = None,
-        study_doi_provider: Optional[str] = None,
         study_category: Optional[str] = None,
         study_pi_image_url: Optional[str] = None,
-        study_funding_sources: Optional[list[str]] = None,
         # Additional biosample-level metadata with optional column mapping information not captured
         # by the submission portal currently.
         # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -106,23 +148,17 @@ class SubmissionPortalTranslator(Translator):
     ) -> None:
         super().__init__(*args, **kwargs)
-        self.metadata_submission = metadata_submission
+        self.metadata_submission: JSON_OBJECT = metadata_submission or {}
         self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
         self.data_object_mapping = data_object_mapping
-        self.study_doi_category = (
-            nmdc.DoiCategoryEnum(study_doi_category)
-            if study_doi_category
-            else nmdc.DoiCategoryEnum.dataset_doi
-        )
-        self.study_doi_provider = (
-            nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
+        self.illumina_instrument_mapping: dict[str, str] = (
+            illumina_instrument_mapping or {}
         )
         self.study_category = (
             nmdc.StudyCategoryEnum(study_category) if study_category else None
         )
         self.study_pi_image_url = study_pi_image_url
-        self.study_funding_sources = study_funding_sources
         self.biosample_extras = group_dicts_by_key(
             BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -153,28 +189,6 @@ class SubmissionPortalTranslator(Translator):
             type=nmdc.PersonValue.class_class_curie,
         )
-    def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
-        """Get DOI information from the context form data
-        :param metadata_submission: submission portal entry
-        :return: list of strings or None
-        """
-        dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
-        if not dataset_doi:
-            return None
-        if not dataset_doi.startswith("doi:"):
-            dataset_doi = f"doi:{dataset_doi}"
-        return [
-            nmdc.Doi(
-                doi_value=dataset_doi,
-                doi_provider=self.study_doi_provider,
-                doi_category=self.study_doi_category,
-                type="nmdc:Doi",
-            )
-        ]
     def _get_has_credit_associations(
         self, metadata_submission: JSON_OBJECT
     ) -> Union[List[nmdc.CreditAssociation], None]:
@@ -203,21 +217,34 @@ class SubmissionPortalTranslator(Translator):
     def _get_gold_study_identifiers(
         self, metadata_submission: JSON_OBJECT
     ) -> Union[List[str], None]:
-        """Construct a GOLD CURIE from the multiomics from data
+        """Construct a GOLD CURIE from the study form data
         :param metadata_submission: submission portal entry
         :return: GOLD CURIE
         """
-        gold_study_id = get_in(["multiOmicsForm", "GOLDStudyId"], metadata_submission)
+        gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
         if not gold_study_id:
             return None
         return [self._ensure_curie(gold_study_id, default_prefix="gold")]
+    def _get_ncbi_bioproject_identifiers(
+        self, metadata_submission: JSON_OBJECT
+    ) -> Union[List[str], None]:
+        """Construct a NCBI Bioproject CURIE from the study form data"""
+        ncbi_bioproject_id = get_in(
+            ["studyForm", "NCBIBioProjectId"], metadata_submission
+        )
+        if not ncbi_bioproject_id:
+            return None
+        return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
     def _get_jgi_study_identifiers(
         self, metadata_submission: JSON_OBJECT
     ) -> Union[List[str], None]:
-        """Construct a JGI proposal CURIE from the multiomics from data
+        """Construct a JGI proposal CURIE from the multiomics form data
         :param metadata_submission: submission portal entry
         :return: JGI proposal CURIE
@@ -228,6 +255,20 @@ class SubmissionPortalTranslator(Translator):
         return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
+    def _get_emsl_project_identifiers(
+        self, metadata_submission: JSON_OBJECT
+    ) -> Union[List[str], None]:
+        """Construct an EMSL project CURIE from the multiomics form data
+        :param metadata_submission: submission portal entry
+        :return: EMSL project CURIE
+        """
+        emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
+        if not emsl_project_id:
+            return None
+        return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
     def _get_quantity_value(
         self, raw_value: Optional[str], unit: Optional[str] = None
     ) -> Union[nmdc.QuantityValue, None]:
@@ -434,6 +475,75 @@ class SubmissionPortalTranslator(Translator):
         return value
+    def _get_data_objects_from_fields(
+        self,
+        sample_data: JSON_OBJECT,
+        *,
+        url_field_name: str,
+        md5_checksum_field_name: str,
+        nucleotide_sequencing_id: str,
+        data_object_type: nmdc.FileTypeEnum,
+    ) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
+        """Get DataObject instances based on the URLs and MD5 checksums in the given fields.
+        If the field provides multiple URLs, multiple DataObject instances will be created and a
+        Manifest will be created and provided in the second return value.
+        :param sample_data: sample data
+        :param url_field_name: field name for the URL
+        :param md5_checksum_field_name: field name for the MD5 checksum
+        :param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
+        :param data_object_type: FileTypeEnum representing the type of the data object
+        :return: tuple of (list of nmdc.DataObject instances, nmdc.Manifest or None)
+        """
+        data_objects: List[nmdc.DataObject] = []
+        urls = split_strip(sample_data.get(url_field_name), ";")
+        if not urls:
+            return data_objects, None
+        md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
+        if md5_checksums and len(urls) != len(md5_checksums):
+            raise ValueError(
+                f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
+            )
+        data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
+        manifest: nmdc.Manifest | None = None
+        if len(urls) > 1:
+            manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
+            manifest = nmdc.Manifest(
+                id=manifest_id,
+                manifest_category=nmdc.ManifestCategoryEnum(
+                    nmdc.ManifestCategoryEnum.poolable_replicates
+                ),
+                type="nmdc:Manifest",
+            )
+        for i, url in enumerate(urls):
+            data_object_id = data_object_ids[i]
+            parsed_url = urlparse(url)
+            possible_filename = parsed_url.path.rsplit("/", 1)[-1]
+            data_object_slots = {
+                "id": data_object_id,
+                "name": possible_filename,
+                "description": f"{data_object_type} for {nucleotide_sequencing_id}",
+                "type": "nmdc:DataObject",
+                "url": url,
+                "md5_checksum": md5_checksums[i] if md5_checksums else None,
+                "in_manifest": [manifest.id] if manifest else None,
+                "data_category": nmdc.DataCategoryEnum(
+                    nmdc.DataCategoryEnum.instrument_data
+                ),
+                "data_object_type": data_object_type,
+                "was_generated_by": nucleotide_sequencing_id,
+            }
+            data_object_slots.update(
+                self._transform_dict_for_class(sample_data, "DataObject")
+            )
+            data_objects.append(nmdc.DataObject(**data_object_slots))
+        return data_objects, manifest
     def _translate_study(
         self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
     ) -> nmdc.Study:
@@ -448,18 +558,17 @@ class SubmissionPortalTranslator(Translator):
         """
         return nmdc.Study(
             alternative_names=self._get_from(
-                metadata_submission, ["multiOmicsForm", "alternativeNames"]
+                metadata_submission, ["studyForm", "alternativeNames"]
             ),
-            associated_dois=self._get_doi(metadata_submission),
             description=self._get_from(
                 metadata_submission, ["studyForm", "description"]
             ),
             funding_sources=self._get_from(
                 metadata_submission, ["studyForm", "fundingSources"]
             ),
-            # emsl_proposal_identifier=self._get_from(
-            #     metadata_submission, ["multiOmicsForm", "studyNumber"]
-            # ),
+            emsl_project_identifiers=self._get_emsl_project_identifiers(
+                metadata_submission
+            ),
             gold_study_identifiers=self._get_gold_study_identifiers(
                 metadata_submission
             ),
@@ -467,8 +576,8 @@ class SubmissionPortalTranslator(Translator):
                 metadata_submission
             ),
             id=nmdc_study_id,
-            insdc_bioproject_identifiers=self._get_from(
-                metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
+            insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
+                metadata_submission
             ),
             jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
                 metadata_submission
@@ -555,7 +664,7 @@ class SubmissionPortalTranslator(Translator):
             if slot_definition.multivalued:
                 value_list = value
                 if isinstance(value, str):
-                    value_list = [v.strip() for v in value.split("|")]
+                    value_list = split_strip(value, "|")
                 transformed_value = [
                     self._transform_value_for_slot(item, slot_definition, unit)
                     for item in value_list
@@ -629,16 +738,18 @@ class SubmissionPortalTranslator(Translator):
         :return: nmdc:Database object
         """
         database = nmdc.Database()
-        nmdc_study_id = self._id_minter("nmdc:Study")[0]
         metadata_submission_data = self.metadata_submission.get(
             "metadata_submission", {}
         )
+        # Generate one Study instance based on the metadata submission
+        nmdc_study_id = self._id_minter("nmdc:Study")[0]
         database.study_set = [
             self._translate_study(metadata_submission_data, nmdc_study_id)
         ]
+        # Automatically populate the `env_package` field in the sample data based on which
+        # environmental data tab the sample data came from.
         sample_data = metadata_submission_data.get("sampleData", {})
         for key in sample_data.keys():
             env = key.removesuffix("_data").upper()
@@ -647,8 +758,16 @@ class SubmissionPortalTranslator(Translator):
                 for sample in sample_data[key]:
                     sample["env_package"] = package_name
             except KeyError:
+                # This is expected when processing rows from tabs like the JGI/EMSL tabs or external
+                # sequencing data tabs.
                 pass
+        # Before regrouping the data by sample name, record which tab each object came from
+        for tab_name in sample_data.keys():
+            for tab in sample_data[tab_name]:
+                tab[TAB_NAME_KEY] = tab_name
+        # Reorganize the sample data by sample name and generate a unique NMDC ID for each
         sample_data_by_id = groupby(
             BIOSAMPLE_UNIQUE_KEY_SLOT,
             concat(sample_data.values()),
@@ -658,6 +777,7 @@ class SubmissionPortalTranslator(Translator):
             zip(sample_data_by_id.keys(), nmdc_biosample_ids)
         )
+        # Translate the sample data into nmdc:Biosample objects
         database.biosample_set = [
             self._translate_biosample(
                 sample_data,
@@ -668,6 +788,104 @@ class SubmissionPortalTranslator(Translator):
             if sample_data
         ]
+        # This section handles the translation of information in the external sequencing tabs into
+        # various NMDC objects.
+        database.data_generation_set = []
+        database.data_object_set = []
+        database.instrument_set = []
+        database.manifest_set = []
+        today = datetime.now().strftime("%Y-%m-%d")
+        for sample_data_id, sample_data in sample_data_by_id.items():
+            for tab in sample_data:
+                tab_name = tab.get(TAB_NAME_KEY)
+                analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
+                if not analyte_category:
+                    # If the tab name cannot be mapped to an analyte category, that means we're
+                    # not in an external sequencing data tab (e.g. this is an environmental data
+                    # tab or a JGI/EMSL tab). Skip this tab.
+                    continue
+                # Start by generating one NucleotideSequencing instance with a has_input
+                # relationship to the current Biosample instance.
+                nucleotide_sequencing_id = self._id_minter(
+                    "nmdc:NucleotideSequencing", 1
+                )[0]
+                nucleotide_sequencing_slots = {
+                    "id": nucleotide_sequencing_id,
+                    "has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
+                    "has_output": [],
+                    "associated_studies": [nmdc_study_id],
+                    "add_date": today,
+                    "mod_date": today,
+                    "analyte_category": analyte_category,
+                    "type": "nmdc:NucleotideSequencing",
+                }
+                # If the protocol_link column was filled in, expand it into an nmdc:Protocol object
+                if "protocol_link" in tab:
+                    protocol_link = tab.pop("protocol_link")
+                    nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
+                        url=protocol_link,
+                        type="nmdc:Protocol",
+                    )
+                # If model column was filled in, expand it into an nmdc:Instrument object. This is
+                # done by first checking the provided instrument mapping to see if the model is
+                # already present. If it is not, a new instrument object is created and added to the
+                # instrument_set. Currently, we only accept sequencing data in the submission portal
+                # that was generated by Illumina instruments, so the vendor is hardcoded here.
+                if "model" in tab:
+                    model = tab.pop("model")
+                    if model not in self.illumina_instrument_mapping:
+                        # If the model is not already in the mapping, create a new record for it
+                        nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
+                        database.instrument_set.append(
+                            nmdc.Instrument(
+                                id=nmdc_instrument_id,
+                                vendor=nmdc.InstrumentVendorEnum(
+                                    nmdc.InstrumentVendorEnum.illumina
+                                ),
+                                model=nmdc.InstrumentModelEnum(model),
+                                type="nmdc:Instrument",
+                            )
+                        )
+                        self.illumina_instrument_mapping[model] = nmdc_instrument_id
+                    nucleotide_sequencing_slots["instrument_used"] = (
+                        self.illumina_instrument_mapping[model]
+                    )
+                # Process the remaining columns according to the NucleotideSequencing class
+                # definition
+                nucleotide_sequencing_slots.update(
+                    self._transform_dict_for_class(tab, "NucleotideSequencing")
+                )
+                nucleotide_sequencing = nmdc.NucleotideSequencing(
+                    **nucleotide_sequencing_slots
+                )
+                database.data_generation_set.append(nucleotide_sequencing)
+                # Iterate over the columns that contain URLs and MD5 checksums and translate them
+                # into DataObject instances. Each of these DataObject instances will be connected
+                # to the NucleotideSequencing instance via the has_output/was_generated_by
+                # relationships.
+                for data_url in DATA_URL_SETS:
+                    data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
+                        (data_url, str(analyte_category))
+                    ]
+                    data_objects, manifest = self._get_data_objects_from_fields(
+                        tab,
+                        url_field_name=data_url.url,
+                        md5_checksum_field_name=data_url.md5_checksum,
+                        nucleotide_sequencing_id=nucleotide_sequencing_id,
+                        data_object_type=nmdc.FileTypeEnum(data_object_type),
+                    )
+                    if manifest:
+                        database.manifest_set.append(manifest)
+                    for data_object in data_objects:
+                        nucleotide_sequencing.has_output.append(data_object.id)
+                        database.data_object_set.append(data_object)
+        # This is the older way of handling attaching NucleotideSequencing and DataObject instances
+        # to the Biosample instances. This should now mainly be handled by the external sequencing
+        # data tabs in the submission portal. This code is being left in place for now in case it is
+        # needed in the future.
         if self.nucleotide_sequencing_mapping:
             # If there is data from a NucleotideSequencing mapping file, process it now. This part
             # assumes that there is a column in that file with the header __biosample_samp_name

nmdc_runtime/site/util.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import os
-from dagster import op
 from functools import lru_cache
 from pymongo.database import Database as MongoDatabase
 from subprocess import Popen, PIPE, STDOUT, CalledProcessError
+from toolz import groupby
 from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
@@ -52,3 +52,10 @@ def get_basename(filename: str) -> str:
 def nmdc_study_id_to_filename(nmdc_study_id: str) -> str:
     return nmdc_study_id.replace(":", "_").replace("-", "_")
+def get_instruments_by_id(mdb: MongoDatabase) -> dict[str, dict]:
+    """Get all documents from the instrument_set collection in a dict keyed by id."""
+    return {
+        instrument["id"]: instrument for instrument in mdb["instrument_set"].find({})
+    }

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nmdc_runtime
-Version: 2.5.0
+Version: 2.7.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -106,10 +106,10 @@ docker compose version
 docker info
 ```
-Ensure the permissions of `./mongoKeyFile` are such that only the file's owner can read or write the file.
+Ensure the permissions of `./.docker/mongoKeyFile` are such that only the file's owner can read or write the file.
 ```shell
-chmod 600 ./mongoKeyFile
+chmod 600 ./.docker/mongoKeyFile
 ```
 Ensure you have a `.env` file for the Docker services to source from. You may copy `.env.example` to

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/RECORD RENAMED Viewed

@@ -36,11 +36,11 @@ nmdc_runtime/minter/domain/model.py,sha256=WMOuKub3dVzkOt_EZSRDLeTsJPqFbKx01SMQ5
 nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
-nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
-nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
+nmdc_runtime/site/graphs.py,sha256=DoKK6B6xkSwRcY5PVVo6jV_IA4HI5qL8xW9_n94jVfQ,15990
+nmdc_runtime/site/ops.py,sha256=atZNkU5mzRRqTnaW39fvq7gVO2sKSH8ztVOp8_dOLbU,48048
+nmdc_runtime/site/repository.py,sha256=nHu1skayyTjJWwGEf5eToX02cgBNTG_kdSluzJZ6rJc,43695
 nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
-nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
+nmdc_runtime/site/util.py,sha256=h70UJCT9g-I63EJn0drZjv1iaQ8LHJTbG29R9kqJ04c,1821
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/backup/nmdcdb_mongodump.py,sha256=H5uosmEiXwLwklJrYJWrNhb_Nuf_ew8dBpZLl6_dYhs,2699
 nmdc_runtime/site/backup/nmdcdb_mongoexport.py,sha256=XIFI_AI3zl0dFr-ELOEmwvT41MyRKBGFaAT3RcamTNE,4166
@@ -51,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
 nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
 nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
-nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
+nmdc_runtime/site/export/ncbi_xml.py,sha256=4RqaT6qs1LDSiDDfF-JNZL5gOel8m65oCOelfr0blXs,26209
+nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=X35zbkxBxEyCnA9peY9YBAa_0oeoWy3DQEXoAXmc6vg,10100
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -61,13 +61,13 @@ nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
-nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
+nmdc_runtime/site/translation/gold_translator.py,sha256=HGbWeuxppqlVfU8F5oKTYIDoC6qaftugJeWFIALB9XE,32720
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
 nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
 nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
-nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
+nmdc_runtime/site/translation/submission_portal_translator.py,sha256=UEeqlkz_YGqcnx8vomFysetOlXxDu23q0Ryr93SZy78,41684
 nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
 nmdc_runtime/site/translation/util.py,sha256=w_l3SiExGsl6cXRqto0a_ssDmHkP64ITvrOVfPxmNpY,4366
 nmdc_runtime/site/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
-nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
-nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.5.0.dist-info/RECORD,,
+nmdc_runtime-2.7.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.7.0.dist-info/METADATA,sha256=YgD6NKMOIO2FpMKIy7EWaGDTE_XkEM15ZXG2AhgMFFk,8155
+nmdc_runtime-2.7.0.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
+nmdc_runtime-2.7.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.7.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.7.0.dist-info/RECORD,,

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (78.0.2)
+Generator: setuptools (80.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nmdc_runtime-2.5.0.dist-info → nmdc_runtime-2.7.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nmdc-runtime 2.5.0__py3-none-any.whl → 2.7.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.5.0py3-none-any.whl → 2.7.0py3-none-any.whl