PyPI - nmdc-runtime - Versions diffs - 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl - Mend

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (143) hide show

nmdc_runtime/Dockerfile +177 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +212 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +777 -0
nmdc_runtime/api/core/util.py +114 -0
nmdc_runtime/api/db/mongo.py +436 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +206 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +277 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +817 -0
nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +495 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +57 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +207 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/wfe_file_stages.py +122 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +56 -0
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +30 -4
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +89 -0
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +29 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +29 -0
nmdc_runtime/site/export/ncbi_xml.py +1331 -0
nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
nmdc_runtime/site/export/study_metadata.py +27 -4
nmdc_runtime/site/graphs.py +294 -45
nmdc_runtime/site/ops.py +1008 -230
nmdc_runtime/site/repair/database_updater.py +451 -0
nmdc_runtime/site/repository.py +368 -133
nmdc_runtime/site/resources.py +154 -80
nmdc_runtime/site/translation/gold_translator.py +235 -83
nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
nmdc_runtime/site/translation/neon_utils.py +24 -7
nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
nmdc_runtime/site/translation/translator.py +73 -3
nmdc_runtime/site/util.py +26 -7
nmdc_runtime/site/validation/emsl.py +1 -0
nmdc_runtime/site/validation/gold.py +1 -0
nmdc_runtime/site/validation/util.py +16 -12
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +236 -192
nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/terminusdb/generate.py +0 -198
nmdc_runtime/site/terminusdb/ingest.py +0 -44
nmdc_runtime/site/terminusdb/schema.py +0 -1671
nmdc_runtime/site/translation/emsl.py +0 -42
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -31
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -42
nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
/nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0

nmdc_runtime/site/translation/submission_portal_translator.py CHANGED Viewed

@@ -1,18 +1,83 @@
 import logging
 import re
+from collections import namedtuple
 from datetime import datetime
+from decimal import Decimal
+from enum import Enum
 from functools import lru_cache
 from importlib import resources
-from typing import Any, List, Optional, Union
+from typing import Any, List, Optional, Union, Tuple
+from urllib.parse import urlparse
 from linkml_runtime import SchemaView
 from linkml_runtime.linkml_model import SlotDefinition
 from nmdc_schema import nmdc
-from toolz import get_in, groupby, concat, valmap, dissoc
+from toolz import concat, dissoc, get_in, groupby, valmap
 from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
+DataUrlSet = namedtuple("DataUrlSet", ["url", "md5_checksum"])
+READ_1 = DataUrlSet("read_1_url", "read_1_md5_checksum")
+READ_2 = DataUrlSet("read_2_url", "read_2_md5_checksum")
+INTERLEAVED = DataUrlSet("interleaved_url", "interleaved_md5_checksum")
+DATA_URL_SETS: list[DataUrlSet] = [READ_1, READ_2, INTERLEAVED]
+BIOSAMPLE_UNIQUE_KEY_SLOT = "samp_name"
+TAB_NAME_KEY = "__tab_name"
+METAGENOME = nmdc.NucleotideSequencingEnum(nmdc.NucleotideSequencingEnum.metagenome)
+METATRANSCRIPTOME = nmdc.NucleotideSequencingEnum(
+    nmdc.NucleotideSequencingEnum.metatranscriptome
+)
+TAB_NAME_TO_ANALYTE_CATEGORY: dict[str, nmdc.NucleotideSequencingEnum] = {
+    "metagenome_sequencing_non_interleaved_data": METAGENOME,
+    "metagenome_sequencing_interleaved_data": METAGENOME,
+    "metatranscriptome_sequencing_non_interleaved_data": METATRANSCRIPTOME,
+    "metatranscriptome_sequencing_interleaved_data": METATRANSCRIPTOME,
+}
+DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str] = {
+    (READ_1, str(METAGENOME)): "Metagenome Raw Read 1",
+    (READ_2, str(METAGENOME)): "Metagenome Raw Read 2",
+    (INTERLEAVED, str(METAGENOME)): "Metagenome Raw Reads",
+    (READ_1, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 1",
+    (READ_2, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Read 2",
+    (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
+}
+UNIT_OVERRIDES: dict[str, dict[str, str]] = {
+    "Biosample": {
+        "depth": "m",
+    }
+}
+class EnvironmentPackage(Enum):
+    r"""
+    Enumeration of all possible environmental packages.
+    >>> EnvironmentPackage.AIR.value
+    'air'
+    >>> EnvironmentPackage.SEDIMENT.value
+    'sediment'
+    """
+    AIR = "air"
+    BIOFILM = "microbial mat_biofilm"
+    BUILT_ENV = "built environment"
+    HCR_CORES = "hydrocarbon resources-cores"
+    HRC_FLUID_SWABS = "hydrocarbon resources-fluids_swabs"
+    HOST_ASSOCIATED = "host-associated"
+    MISC_ENVS = "miscellaneous natural or artificial environment"
+    PLANT_ASSOCIATED = "plant-associated"
+    SEDIMENT = "sediment"
+    SOIL = "soil"
+    WATER = "water"
 @lru_cache
 def _get_schema_view():
     """Return a SchemaView instance representing the NMDC schema"""
@@ -49,6 +114,18 @@ def group_dicts_by_key(key: str, seq: Optional[list[dict]]) -> Optional[dict]:
     return grouped
+def split_strip(string: str | None, sep: str) -> list[str] | None:
+    """Split a string by a separator and strip whitespace from each part.
+    :param string: string to split
+    :param sep: separator to split by
+    :return: list of stripped strings
+    """
+    if string is None:
+        return None
+    return [s.strip() for s in string.split(sep)]
 class SubmissionPortalTranslator(Translator):
     """A Translator subclass for handling submission portal entries
@@ -60,17 +137,16 @@ class SubmissionPortalTranslator(Translator):
     def __init__(
         self,
-        metadata_submission: JSON_OBJECT = {},
-        omics_processing_mapping: Optional[list] = None,
-        data_object_mapping: Optional[list] = None,
+        metadata_submission: Optional[JSON_OBJECT] = None,
         *args,
+        nucleotide_sequencing_mapping: Optional[list] = None,
+        data_object_mapping: Optional[list] = None,
+        illumina_instrument_mapping: Optional[dict[str, str]] = None,
         # Additional study-level metadata not captured by the submission portal currently
         # See: https://github.com/microbiomedata/submission-schema/issues/162
-        study_doi_category: Optional[str] = None,
-        study_doi_provider: Optional[str] = None,
         study_category: Optional[str] = None,
         study_pi_image_url: Optional[str] = None,
-        study_funding_sources: Optional[list[str]] = None,
+        study_id: Optional[str] = None,
         # Additional biosample-level metadata with optional column mapping information not captured
         # by the submission portal currently.
         # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -80,30 +156,34 @@ class SubmissionPortalTranslator(Translator):
     ) -> None:
         super().__init__(*args, **kwargs)
-        self.metadata_submission = metadata_submission
-        self.omics_processing_mapping = omics_processing_mapping
+        self.metadata_submission: JSON_OBJECT = metadata_submission or {}
+        self.nucleotide_sequencing_mapping = nucleotide_sequencing_mapping
         self.data_object_mapping = data_object_mapping
-        self.study_doi_category = (
-            nmdc.DoiCategoryEnum(study_doi_category)
-            if study_doi_category
-            else nmdc.DoiCategoryEnum.dataset_doi
-        )
-        self.study_doi_provider = (
-            nmdc.DoiProviderEnum(study_doi_provider) if study_doi_provider else None
+        self.illumina_instrument_mapping: dict[str, str] = (
+            illumina_instrument_mapping or {}
         )
         self.study_category = (
             nmdc.StudyCategoryEnum(study_category) if study_category else None
         )
         self.study_pi_image_url = study_pi_image_url
-        self.study_funding_sources = study_funding_sources
+        self.study_id = study_id
-        self.biosample_extras = group_dicts_by_key("source_mat_id", biosample_extras)
+        self.biosample_extras = group_dicts_by_key(
+            BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
+        )
         self.biosample_extras_slot_mapping = group_dicts_by_key(
             "subject_id", biosample_extras_slot_mapping
         )
         self.schema_view: SchemaView = _get_schema_view()
+        self._material_processing_subclass_names = []
+        for class_name in self.schema_view.class_descendants(
+            "MaterialProcessing", reflexive=False
+        ):
+            class_def = self.schema_view.get_class(class_name)
+            if not class_def.abstract:
+                self._material_processing_subclass_names.append(class_name)
     def _get_pi(
         self, metadata_submission: JSON_OBJECT
@@ -122,29 +202,9 @@ class SubmissionPortalTranslator(Translator):
             email=study_form.get("piEmail"),
             orcid=study_form.get("piOrcid"),
             profile_image_url=self.study_pi_image_url,
+            type=nmdc.PersonValue.class_class_curie,
         )
-    def _get_doi(self, metadata_submission: JSON_OBJECT) -> Union[List[nmdc.Doi], None]:
-        """Get DOI information from the context form data
-        :param metadata_submission: submission portal entry
-        :return: list of strings or None
-        """
-        dataset_doi = get_in(["contextForm", "datasetDoi"], metadata_submission)
-        if not dataset_doi:
-            return None
-        if not dataset_doi.startswith("doi:"):
-            dataset_doi = f"doi:{dataset_doi}"
-        return [
-            nmdc.Doi(
-                doi_value=dataset_doi,
-                doi_provider=self.study_doi_provider,
-                doi_category=self.study_doi_category,
-            )
-        ]
     def _get_has_credit_associations(
         self, metadata_submission: JSON_OBJECT
     ) -> Union[List[nmdc.CreditAssociation], None]:
@@ -162,8 +222,10 @@ class SubmissionPortalTranslator(Translator):
                 applies_to_person=nmdc.PersonValue(
                     name=contributor.get("name"),
                     orcid=contributor.get("orcid"),
+                    type="nmdc:PersonValue",
                 ),
                 applied_roles=contributor.get("roles"),
+                type="nmdc:CreditAssociation",
             )
             for contributor in contributors
         ]
@@ -171,72 +233,92 @@ class SubmissionPortalTranslator(Translator):
     def _get_gold_study_identifiers(
         self, metadata_submission: JSON_OBJECT
     ) -> Union[List[str], None]:
-        """Construct a GOLD CURIE from the multiomics from data
+        """Construct a GOLD CURIE from the study form data
         :param metadata_submission: submission portal entry
         :return: GOLD CURIE
         """
-        gold_study_id = get_in(["multiOmicsForm", "GOLDStudyId"], metadata_submission)
+        gold_study_id = get_in(["studyForm", "GOLDStudyId"], metadata_submission)
         if not gold_study_id:
             return None
-        return [self._get_curie("GOLD", gold_study_id)]
+        return [self._ensure_curie(gold_study_id, default_prefix="gold")]
-    def _get_quantity_value(
-        self, raw_value: Optional[str], unit: Optional[str] = None
-    ) -> Union[nmdc.QuantityValue, None]:
-        """Construct a nmdc:QuantityValue from a raw value string
+    def _get_ncbi_bioproject_identifiers(
+        self, metadata_submission: JSON_OBJECT
+    ) -> Union[List[str], None]:
+        """Construct a NCBI Bioproject CURIE from the study form data"""
-        The regex pattern minimally matches on a single numeric value (possibly
-        floating point). The pattern can also identify a range represented by
-        two numeric values separated by a hyphen. It can also identify non-numeric
-        characters at the end of the string which are interpreted as a unit. A unit
-        may also be explicitly provided as an argument to this function. If parsing
-        identifies a unit and a unit argument is provided, the unit argument is used.
-        If the pattern is not matched at all None is returned.
+        ncbi_bioproject_id = get_in(
+            ["studyForm", "NCBIBioProjectId"], metadata_submission
+        )
+        if not ncbi_bioproject_id:
+            return None
-        TODO: currently the parsed unit string is used as-is. In the future we may want
-        to be stricter about what we accept or coerce into a controlled value set
+        return [self._ensure_curie(ncbi_bioproject_id, default_prefix="bioproject")]
-        :param raw_value: string to parse
-        :param unit: optional unit, defaults to None
-        :return: nmdc:QuantityValue
+    def _get_jgi_study_identifiers(
+        self, metadata_submission: JSON_OBJECT
+    ) -> Union[List[str], None]:
+        """Construct a JGI proposal CURIE from the multiomics form data
+        :param metadata_submission: submission portal entry
+        :return: JGI proposal CURIE
         """
-        if raw_value is None:
+        jgi_study_id = get_in(["multiOmicsForm", "JGIStudyId"], metadata_submission)
+        if not jgi_study_id:
             return None
-        match = re.fullmatch(
-            "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
-            raw_value,
-        )
-        if not match:
+        return [self._ensure_curie(jgi_study_id, default_prefix="jgi.proposal")]
+    def _get_emsl_project_identifiers(
+        self, metadata_submission: JSON_OBJECT
+    ) -> Union[List[str], None]:
+        """Construct an EMSL project CURIE from the multiomics form data
+        :param metadata_submission: submission portal entry
+        :return: EMSL project CURIE
+        """
+        emsl_project_id = get_in(["multiOmicsForm", "studyNumber"], metadata_submission)
+        if not emsl_project_id:
             return None
-        qv = nmdc.QuantityValue(has_raw_value=raw_value)
-        if match.group(2):
-            # having group 2 means the value is a range like "0 - 1". Either
-            # group 1 or group 2 might be the minimum especially when handling
-            # negative ranges like "0 - -1"
-            num_1 = float(match.group(1))
-            num_2 = float(match.group(2))
-            qv.has_minimum_numeric_value = min(num_1, num_2)
-            qv.has_maximum_numeric_value = max(num_1, num_2)
-        else:
-            # otherwise we just have a single numeric value
-            qv.has_numeric_value = float(match.group(1))
-        if unit:
-            # a unit was manually specified
-            if match.group(3) and unit != match.group(3):
-                # a unit was also found in the raw string; issue a warning
-                # if they don't agree, but keep the manually specified one
-                logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
-            qv.has_unit = unit
-        elif match.group(3):
-            # a unit was found in the raw string
-            qv.has_unit = match.group(3)
-        return qv
+        return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
+    def _get_quantity_value(
+        self,
+        raw_value: Optional[str | int | float],
+        slot_definition: SlotDefinition,
+        unit: Optional[str] = None,
+    ) -> Union[nmdc.QuantityValue, None]:
+        """Construct a nmdc:QuantityValue from a raw value string"""
+        # If the storage_units annotation is present on the slot and it only contains one unit (i.e.
+        # not a pipe-separated list of units) then use that unit.
+        if "storage_units" in slot_definition.annotations:
+            storage_units = slot_definition.annotations["storage_units"].value
+            if storage_units and "|" not in storage_units:
+                unit = storage_units
+        # If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
+        if isinstance(raw_value, (int, float)):
+            if unit is None:
+                raise ValueError(
+                    f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
+                )
+            # Constructing a Decimal directly from a float will maintain the full precision of the
+            # float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
+            # a string first and then constructing the Decimal from that string will give a more
+            # expected value.
+            value_as_str = str(raw_value)
+            return nmdc.QuantityValue(
+                has_raw_value=value_as_str,
+                has_numeric_value=Decimal(value_as_str),
+                has_unit=unit,
+                type="nmdc:QuantityValue",
+            )
+        return self._parse_quantity_value(raw_value, unit)
     def _get_ontology_class(
         self, raw_value: Optional[str]
@@ -259,6 +341,7 @@ class SubmissionPortalTranslator(Translator):
         return nmdc.OntologyClass(
             name=match.group(1).strip(),
             id=match.group(2).strip(),
+            type="nmdc:OntologyClass",
         )
     def _get_controlled_identified_term_value(
@@ -280,7 +363,9 @@ class SubmissionPortalTranslator(Translator):
             return None
         return nmdc.ControlledIdentifiedTermValue(
-            has_raw_value=raw_value, term=ontology_class
+            has_raw_value=raw_value,
+            term=ontology_class,
+            type="nmdc:ControlledIdentifiedTermValue",
         )
     def _get_controlled_term_value(
@@ -297,7 +382,10 @@ class SubmissionPortalTranslator(Translator):
         if not raw_value:
             return None
-        value = nmdc.ControlledTermValue(has_raw_value=raw_value)
+        value = nmdc.ControlledTermValue(
+            has_raw_value=raw_value,
+            type="nmdc:ControlledTermValue",
+        )
         ontology_class = self._get_ontology_class(raw_value)
         if ontology_class is not None:
             value.term = ontology_class
@@ -327,7 +415,10 @@ class SubmissionPortalTranslator(Translator):
             return None
         return nmdc.GeolocationValue(
-            has_raw_value=raw_value, latitude=match.group(1), longitude=match.group(2)
+            has_raw_value=raw_value,
+            latitude=match.group(1),
+            longitude=match.group(2),
+            type="nmdc:GeolocationValue",
         )
     def _get_float(self, raw_value: Optional[str]) -> Union[float, None]:
@@ -376,6 +467,127 @@ class SubmissionPortalTranslator(Translator):
         return value
+    def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
+        """Collect and format DOIs from submission portal schema in nmdc format DOIs
+        If there were no DOIs, None is returned.
+        :param metadata_submission: submission portal entry
+        :return: list of nmdc.DOI objects
+        """
+        data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
+        award_dois = self._get_from(
+            metadata_submission, ["multiOmicsForm", "awardDois"]
+        )
+        if data_dois and len(data_dois) > 0:
+            updated_data_dois = [
+                nmdc.Doi(
+                    doi_category="dataset_doi",
+                    doi_provider=doi["provider"],
+                    doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
+                    type="nmdc:Doi",
+                )
+                for doi in data_dois
+            ]
+        else:
+            updated_data_dois = []
+        if award_dois and len(award_dois) > 0:
+            updated_award_dois = [
+                nmdc.Doi(
+                    doi_category="award_doi",
+                    doi_provider=doi["provider"],
+                    doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
+                    type="nmdc:Doi",
+                )
+                for doi in award_dois
+            ]
+        else:
+            updated_award_dois = []
+        return_val = updated_data_dois + updated_award_dois
+        if len(return_val) == 0:
+            return_val = None
+        return return_val
+    def _get_data_objects_from_fields(
+        self,
+        sample_data: JSON_OBJECT,
+        *,
+        url_field_name: str,
+        md5_checksum_field_name: str,
+        nucleotide_sequencing_id: str,
+        data_object_type: nmdc.FileTypeEnum,
+    ) -> Tuple[List[nmdc.DataObject], nmdc.Manifest | None]:
+        """Get a DataObject instances based on the URLs and MD5 checksums in the given fields.
+        If the field provides multiple URLs, multiple DataObject instances will be created and a
+        Manifest will be created and provided in the second return value.
+        :param sample_data: sample data
+        :param url_field_name: field name for the URL
+        :param md5_checksum_field_name: field name for the MD5 checksum
+        :param nucleotide_sequencing_id: ID for the nmdc:NucleotideSequencing object that generated the data object(s)
+        :param data_object_type: FileTypeEnum representing the type of the data object
+        :return: nmdc.DataObject or None
+        """
+        data_objects: List[nmdc.DataObject] = []
+        urls = split_strip(sample_data.get(url_field_name), ";")
+        if not urls:
+            return data_objects, None
+        md5_checksums = split_strip(sample_data.get(md5_checksum_field_name), ";")
+        if md5_checksums and len(urls) != len(md5_checksums):
+            raise ValueError(
+                f"{url_field_name} and {md5_checksum_field_name} must have the same number of values"
+            )
+        data_object_ids = self._id_minter("nmdc:DataObject", len(urls))
+        manifest: nmdc.Manifest | None = None
+        if len(urls) > 1:
+            manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
+            manifest = nmdc.Manifest(
+                id=manifest_id,
+                manifest_category=nmdc.ManifestCategoryEnum(
+                    nmdc.ManifestCategoryEnum.poolable_replicates
+                ),
+                type="nmdc:Manifest",
+            )
+        for i, url in enumerate(urls):
+            data_object_id = data_object_ids[i]
+            parsed_url = urlparse(url)
+            possible_filename = parsed_url.path.rsplit("/", 1)[-1]
+            data_object_slots = {
+                "id": data_object_id,
+                "name": possible_filename,
+                "description": f"{data_object_type} for {nucleotide_sequencing_id}",
+                "type": "nmdc:DataObject",
+                "url": url,
+                "md5_checksum": md5_checksums[i] if md5_checksums else None,
+                "in_manifest": [manifest.id] if manifest else None,
+                "data_category": nmdc.DataCategoryEnum(
+                    nmdc.DataCategoryEnum.instrument_data
+                ),
+                "data_object_type": data_object_type,
+                "was_generated_by": nucleotide_sequencing_id,
+            }
+            data_object_slots.update(
+                self._transform_dict_for_class(sample_data, "DataObject")
+            )
+            data_objects.append(nmdc.DataObject(**data_object_slots))
+        return data_objects, manifest
+    def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
+        """Parse a sample link in the form of `ProcessingName:SampleName,..."""
+        pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
+        match = re.match(pattern, sample_link)
+        if not match:
+            return None
+        return match.group(1), split_strip(match.group(2), ",")
     def _translate_study(
         self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
     ) -> nmdc.Study:
@@ -389,20 +601,18 @@ class SubmissionPortalTranslator(Translator):
         :return: nmdc:Study object
         """
         return nmdc.Study(
-            alternative_identifiers=self._get_from(
-                metadata_submission, ["multiOmicsForm", "JGIStudyId"]
-            ),
             alternative_names=self._get_from(
-                metadata_submission, ["multiOmicsForm", "alternativeNames"]
+                metadata_submission, ["studyForm", "alternativeNames"]
             ),
-            associated_dois=self._get_doi(metadata_submission),
             description=self._get_from(
                 metadata_submission, ["studyForm", "description"]
             ),
-            funding_sources=self.study_funding_sources,
-            # emsl_proposal_identifier=self._get_from(
-            #     metadata_submission, ["multiOmicsForm", "studyNumber"]
-            # ),
+            funding_sources=self._get_from(
+                metadata_submission, ["studyForm", "fundingSources"]
+            ),
+            emsl_project_identifiers=self._get_emsl_project_identifiers(
+                metadata_submission
+            ),
             gold_study_identifiers=self._get_gold_study_identifiers(
                 metadata_submission
             ),
@@ -410,17 +620,22 @@ class SubmissionPortalTranslator(Translator):
                 metadata_submission
             ),
             id=nmdc_study_id,
-            insdc_bioproject_identifiers=self._get_from(
-                metadata_submission, ["multiOmicsForm", "NCBIBioProjectId"]
+            insdc_bioproject_identifiers=self._get_ncbi_bioproject_identifiers(
+                metadata_submission
+            ),
+            jgi_portal_study_identifiers=self._get_jgi_study_identifiers(
+                metadata_submission
             ),
             name=self._get_from(metadata_submission, ["studyForm", "studyName"]),
             notes=self._get_from(metadata_submission, ["studyForm", "notes"]),
             principal_investigator=self._get_pi(metadata_submission),
             study_category=self.study_category,
             title=self._get_from(metadata_submission, ["studyForm", "studyName"]),
+            type="nmdc:Study",
             websites=self._get_from(
                 metadata_submission, ["studyForm", "linkOutWebpage"]
             ),
+            associated_dois=self._get_study_dois(metadata_submission),
         )
     def _transform_value_for_slot(
@@ -428,15 +643,25 @@ class SubmissionPortalTranslator(Translator):
     ):
         transformed_value = None
         if slot.range == "TextValue":
-            transformed_value = nmdc.TextValue(has_raw_value=value)
+            transformed_value = nmdc.TextValue(
+                has_raw_value=value,
+                type="nmdc:TextValue",
+            )
         elif slot.range == "QuantityValue":
-            transformed_value = self._get_quantity_value(value, unit=unit)
+            transformed_value = self._get_quantity_value(
+                value,
+                slot,
+                unit=unit,
+            )
         elif slot.range == "ControlledIdentifiedTermValue":
             transformed_value = self._get_controlled_identified_term_value(value)
         elif slot.range == "ControlledTermValue":
             transformed_value = self._get_controlled_term_value(value)
         elif slot.range == "TimestampValue":
-            transformed_value = nmdc.TimestampValue(has_raw_value=value)
+            transformed_value = nmdc.TimestampValue(
+                has_raw_value=value,
+                type="nmdc:TimestampValue",
+            )
         elif slot.range == "GeolocationValue":
             transformed_value = self._get_geolocation_value(value)
         elif slot.range == "float":
@@ -481,11 +706,22 @@ class SubmissionPortalTranslator(Translator):
                 logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
                 continue
+            # This step handles cases where the submission portal/schema instructs a user to
+            # provide a value in a specific unit. The unit cannot be parsed out of the raw value
+            # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
+            # go away once units are encoded in the schema itself.
+            # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
+            if class_name in UNIT_OVERRIDES:
+                # If the class has unit overrides, check if the slot is in the overrides
+                unit_overrides = UNIT_OVERRIDES[class_name]
+                if slot_name in unit_overrides:
+                    unit = unit_overrides[slot_name]
             slot_definition = self.schema_view.induced_slot(slot_name, class_name)
             if slot_definition.multivalued:
                 value_list = value
                 if isinstance(value, str):
-                    value_list = [v.strip() for v in value.split("|")]
+                    value_list = split_strip(value, "|")
                 transformed_value = [
                     self._transform_value_for_slot(item, slot_definition, unit)
                     for item in value_list
@@ -503,7 +739,6 @@ class SubmissionPortalTranslator(Translator):
         sample_data: List[JSON_OBJECT],
         nmdc_biosample_id: str,
         nmdc_study_id: str,
-        default_env_package: str,
     ) -> nmdc.Biosample:
         """Translate sample data from portal submission into an `nmdc:Biosample` object.
@@ -518,22 +753,30 @@ class SubmissionPortalTranslator(Translator):
                             from each applicable submission portal tab
         :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the translated object
         :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
-        :param default_env_package: Default value for `env_package` slot
         :return: nmdc:Biosample
         """
-        source_mat_id = sample_data[0].get("source_mat_id", "").strip()
+        env_idx = next(
+            (
+                i
+                for i, tab in enumerate(sample_data)
+                if tab.get("env_package") is not None
+            ),
+            0,
+        )
+        biosample_key = sample_data[env_idx].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip()
         slots = {
             "id": nmdc_biosample_id,
-            "part_of": nmdc_study_id,
-            "name": sample_data[0].get("samp_name", "").strip(),
-            "env_package": nmdc.TextValue(has_raw_value=default_env_package),
+            "associated_studies": [nmdc_study_id],
+            "type": "nmdc:Biosample",
+            "name": sample_data[env_idx].get("samp_name", "").strip(),
+            "env_package": sample_data[env_idx].get("env_package"),
         }
         for tab in sample_data:
             transformed_tab = self._transform_dict_for_class(tab, "Biosample")
             slots.update(transformed_tab)
         if self.biosample_extras:
-            raw_extras = self.biosample_extras.get(source_mat_id)
+            raw_extras = self.biosample_extras.get(biosample_key)
             if raw_extras:
                 transformed_extras = self._transform_dict_for_class(
                     raw_extras, "Biosample", self.biosample_extras_slot_mapping
@@ -552,47 +795,217 @@ class SubmissionPortalTranslator(Translator):
         :return: nmdc:Database object
         """
         database = nmdc.Database()
-        nmdc_study_id = self._id_minter("nmdc:Study")[0]
         metadata_submission_data = self.metadata_submission.get(
             "metadata_submission", {}
         )
-        database.study_set = [
-            self._translate_study(metadata_submission_data, nmdc_study_id)
-        ]
+        # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
+        if self.study_id:
+            nmdc_study_id = self.study_id
+        else:
+            nmdc_study_id = self._id_minter("nmdc:Study")[0]
+            database.study_set = [
+                self._translate_study(metadata_submission_data, nmdc_study_id)
+            ]
+        # Automatically populate the `env_package` field in the sample data based on which
+        # environmental data tab the sample data came from.
         sample_data = metadata_submission_data.get("sampleData", {})
-        package_name = metadata_submission_data["packageName"]
-        sample_data_by_id = groupby("source_mat_id", concat(sample_data.values()))
+        for key in sample_data.keys():
+            env = key.removesuffix("_data").upper()
+            try:
+                package_name = EnvironmentPackage[env].value
+                for sample in sample_data[key]:
+                    sample["env_package"] = package_name
+            except KeyError:
+                # This is expected when processing rows from tabs like the JGI/EMSL tabs or external
+                # sequencing data tabs.
+                pass
+        # Before regrouping the data by sample name, record which tab each object came from
+        for tab_name in sample_data.keys():
+            for tab in sample_data[tab_name]:
+                tab[TAB_NAME_KEY] = tab_name
+        # Reorganize the sample data by sample name and generate a unique NMDC ID for each
+        sample_data_by_id = groupby(
+            BIOSAMPLE_UNIQUE_KEY_SLOT,
+            concat(sample_data.values()),
+        )
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_data_by_id))
         sample_data_to_nmdc_biosample_ids = dict(
             zip(sample_data_by_id.keys(), nmdc_biosample_ids)
         )
-        database.biosample_set = [
-            self._translate_biosample(
-                sample_data,
-                nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
-                nmdc_study_id=nmdc_study_id,
-                default_env_package=package_name,
-            )
-            for sample_data_id, sample_data in sample_data_by_id.items()
-            if sample_data
-        ]
+        # Translate the sample data into nmdc:Biosample objects
+        database.biosample_set = []
+        for sample_data_id, sample_data in sample_data_by_id.items():
+            # This shouldn't happen, but just in case skip empty sample data
+            if not sample_data:
+                continue
-        if self.omics_processing_mapping:
-            # If there is data from an OmicsProcessing mapping file, process it now. This part
-            # assumes that there is a column in that file with the header __biosample_source_mat_id
+            # Find the first tab that has a sample_link value and attempt to parse it
+            sample_link = ""
+            for tab in sample_data:
+                if tab.get("sample_link"):
+                    sample_link = tab.get("sample_link")
+                    break
+            parsed_sample_link = self._parse_sample_link(sample_link)
+            # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
+            # format, then create a ProcessedSample and MaterialProcessing instance instead of a
+            # Biosample instance. The input samples must be present in the submission for this to
+            # work. An exception is raised if any of the referenced input samples are missing.
+            if parsed_sample_link is not None:
+                processing_type, processing_inputs = parsed_sample_link
+                if not all(
+                    input_id in sample_data_to_nmdc_biosample_ids
+                    for input_id in processing_inputs
+                ):
+                    raise ValueError(
+                        f"Could not find all input samples in sample_link '{sample_link}'"
+                    )
+                processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
+                database.processed_sample_set.append(
+                    nmdc.ProcessedSample(
+                        id=processed_sample_id,
+                        type="nmdc:ProcessedSample",
+                        name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
+                    )
+                )
+                processing_class = getattr(nmdc, processing_type)
+                material_processing = processing_class(
+                    id=self._id_minter(f"nmdc:{processing_type}")[0],
+                    type=f"nmdc:{processing_type}",
+                    has_input=[
+                        sample_data_to_nmdc_biosample_ids[input_id]
+                        for input_id in processing_inputs
+                    ],
+                    has_output=[processed_sample_id],
+                )
+                database.material_processing_set.append(material_processing)
+            # If there was no sample_link or it doesn't follow the expected format, create a
+            # Biosample instance as normal.
+            else:
+                biosample = self._translate_biosample(
+                    sample_data,
+                    nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
+                    nmdc_study_id=nmdc_study_id,
+                )
+                database.biosample_set.append(biosample)
+        # This section handles the translation of information in the external sequencing tabs into
+        # various NMDC objects.
+        database.data_generation_set = []
+        database.data_object_set = []
+        database.instrument_set = []
+        database.manifest_set = []
+        today = datetime.now().strftime("%Y-%m-%d")
+        for sample_data_id, sample_data in sample_data_by_id.items():
+            for tab in sample_data:
+                tab_name = tab.get(TAB_NAME_KEY)
+                analyte_category = TAB_NAME_TO_ANALYTE_CATEGORY.get(tab_name)
+                if not analyte_category:
+                    # If the tab name cannot be mapped to an analyte category, that means we're
+                    # not in an external sequencing data tab (e.g. this is an environmental data
+                    # tab or a JGI/EMSL tab). Skip this tab.
+                    continue
+                # Start by generating one NucleotideSequencing instance with a has_input
+                # relationship to the current Biosample instance.
+                nucleotide_sequencing_id = self._id_minter(
+                    "nmdc:NucleotideSequencing", 1
+                )[0]
+                nucleotide_sequencing_slots = {
+                    "id": nucleotide_sequencing_id,
+                    "has_input": sample_data_to_nmdc_biosample_ids[sample_data_id],
+                    "has_output": [],
+                    "associated_studies": [nmdc_study_id],
+                    "add_date": today,
+                    "mod_date": today,
+                    "analyte_category": analyte_category,
+                    "type": "nmdc:NucleotideSequencing",
+                }
+                # If the protocol_link column was filled in, expand it into an nmdc:Protocol object
+                if "protocol_link" in tab:
+                    protocol_link = tab.pop("protocol_link")
+                    nucleotide_sequencing_slots["protocol_link"] = nmdc.Protocol(
+                        url=protocol_link,
+                        type="nmdc:Protocol",
+                    )
+                # If model column was filled in, expand it into an nmdc:Instrument object. This is
+                # done by first checking the provided instrument mapping to see if the model is
+                # already present. If it is not, a new instrument object is created and added to the
+                # instrument_set. Currently, we only accept sequencing data in the submission portal
+                # that was generated by Illumina instruments, so the vendor is hardcoded here.
+                if "model" in tab:
+                    model = tab.pop("model")
+                    if model not in self.illumina_instrument_mapping:
+                        # If the model is not already in the mapping, create a new record for it
+                        nmdc_instrument_id = self._id_minter("nmdc:Instrument", 1)[0]
+                        database.instrument_set.append(
+                            nmdc.Instrument(
+                                id=nmdc_instrument_id,
+                                vendor=nmdc.InstrumentVendorEnum(
+                                    nmdc.InstrumentVendorEnum.illumina
+                                ),
+                                model=nmdc.InstrumentModelEnum(model),
+                                type="nmdc:Instrument",
+                            )
+                        )
+                        self.illumina_instrument_mapping[model] = nmdc_instrument_id
+                    nucleotide_sequencing_slots["instrument_used"] = (
+                        self.illumina_instrument_mapping[model]
+                    )
+                # Process the remaining columns according to the NucleotideSequencing class
+                # definition
+                nucleotide_sequencing_slots.update(
+                    self._transform_dict_for_class(tab, "NucleotideSequencing")
+                )
+                nucleotide_sequencing = nmdc.NucleotideSequencing(
+                    **nucleotide_sequencing_slots
+                )
+                database.data_generation_set.append(nucleotide_sequencing)
+                # Iterate over the columns that contain URLs and MD5 checksums and translate them
+                # into DataObject instances. Each of these DataObject instances will be connected
+                # to the NucleotideSequencing instance via the has_output/was_generated_by
+                # relationships.
+                for data_url in DATA_URL_SETS:
+                    data_object_type = DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE[
+                        (data_url, str(analyte_category))
+                    ]
+                    data_objects, manifest = self._get_data_objects_from_fields(
+                        tab,
+                        url_field_name=data_url.url,
+                        md5_checksum_field_name=data_url.md5_checksum,
+                        nucleotide_sequencing_id=nucleotide_sequencing_id,
+                        data_object_type=nmdc.FileTypeEnum(data_object_type),
+                    )
+                    if manifest:
+                        database.manifest_set.append(manifest)
+                    for data_object in data_objects:
+                        nucleotide_sequencing.has_output.append(data_object.id)
+                        database.data_object_set.append(data_object)
+        # This is the older way of handling attaching NucleotideSequencing and DataObject instances
+        # to the Biosample instances. This should now mainly be handled by the external sequencing
+        # data tabs in the submission portal. This code is being left in place for now in case it is
+        # needed in the future.
+        if self.nucleotide_sequencing_mapping:
+            # If there is data from a NucleotideSequencing mapping file, process it now. This part
+            # assumes that there is a column in that file with the header __biosample_samp_name
             # that can be used to join with the sample data from the submission portal. The
-            # biosample identified by that `source_mat_id` will be referenced in the `has_input`
-            # slot of the OmicsProcessing object. If a DataObject mapping file was also provided,
-            # those objects will also be generated and referenced in the `has_output` slot of the
-            # OmicsProcessing object. By keying off of the `source_mat_id` slot of the submission's
-            # sample data there is an implicit 1:1 relationship between Biosample objects and
-            # OmicsProcessing objects generated here.
-            join_key = "__biosample_source_mat_id"
-            database.omics_processing_set = []
+            # biosample identified by that `samp_name` will be referenced in the `has_input`
+            # slot of the NucleotideSequencing object. If a DataObject mapping file was also
+            # provided, those objects will also be generated and referenced in the `has_output` slot
+            # of the NucleotideSequencing object. By keying off of the `samp_name` slot of the
+            # submission's sample data there is an implicit 1:1 relationship between Biosample
+            # objects and NucleotideSequencing objects generated here.
+            join_key = f"__biosample_{BIOSAMPLE_UNIQUE_KEY_SLOT}"
+            database.data_generation_set = []
             database.data_object_set = []
             data_objects_by_sample_data_id = {}
             today = datetime.now().strftime("%Y-%m-%d")
@@ -608,45 +1021,47 @@ class SubmissionPortalTranslator(Translator):
                     grouped,
                 )
-            for omics_processing_row in self.omics_processing_mapping:
-                # For each row in the OmicsProcessing mapping file, first grab the minted Biosample
-                # id that corresponds to the sample ID from the submission
-                sample_data_id = omics_processing_row.pop(join_key)
+            for nucleotide_sequencing_row in self.nucleotide_sequencing_mapping:
+                # For each row in the NucleotideSequencing mapping file, first grab the minted
+                # Biosample id that corresponds to the sample ID from the submission
+                sample_data_id = nucleotide_sequencing_row.pop(join_key)
                 if (
                     not sample_data_id
                     or sample_data_id not in sample_data_to_nmdc_biosample_ids
                 ):
                     logging.warning(
-                        f"Unrecognized biosample source_mat_id: {sample_data_id}"
+                        f"Unrecognized biosample {BIOSAMPLE_UNIQUE_KEY_SLOT}: {sample_data_id}"
                     )
                     continue
                 nmdc_biosample_id = sample_data_to_nmdc_biosample_ids[sample_data_id]
-                # Transform the raw row data according to the OmicsProcessing class's slots, and
-                # generate an instance. A few key slots do not come from the mapping file, but
+                # Transform the raw row data according to the NucleotideSequencing class's slots,
+                # and generate an instance. A few key slots do not come from the mapping file, but
                 # instead are defined here.
-                omics_processing_slots = {
-                    "id": self._id_minter("nmdc:OmicsProcessing", 1)[0],
+                nucleotide_sequencing_slots = {
+                    "id": self._id_minter("nmdc:NucleotideSequencing", 1)[0],
                     "has_input": [nmdc_biosample_id],
                     "has_output": [],
-                    "part_of": nmdc_study_id,
+                    "associated_studies": [nmdc_study_id],
                     "add_date": today,
                     "mod_date": today,
-                    "type": "nmdc:OmicsProcessing",
+                    "type": "nmdc:NucleotideSequencing",
                 }
-                omics_processing_slots.update(
+                nucleotide_sequencing_slots.update(
                     self._transform_dict_for_class(
-                        omics_processing_row, "OmicsProcessing"
+                        nucleotide_sequencing_row, "NucleotideSequencing"
                     )
                 )
-                omics_processing = nmdc.OmicsProcessing(**omics_processing_slots)
+                nucleotide_sequencing = nmdc.NucleotideSequencing(
+                    **nucleotide_sequencing_slots
+                )
                 for data_object_row in data_objects_by_sample_data_id.get(
                     sample_data_id, []
                 ):
                     # For each row in the DataObject mapping file that corresponds to the sample ID,
                     # transform the raw row data according to the DataObject class's slots, generate
-                    # an instance, and connect that instance's minted ID to the OmicsProcessing
+                    # an instance, and connect that instance's minted ID to the NucleotideSequencing
                     # instance
                     data_object_id = self._id_minter("nmdc:DataObject", 1)[0]
                     data_object_slots = {
@@ -658,10 +1073,49 @@ class SubmissionPortalTranslator(Translator):
                     )
                     data_object = nmdc.DataObject(**data_object_slots)
-                    omics_processing.has_output.append(data_object_id)
+                    nucleotide_sequencing.has_output.append(data_object_id)
                     database.data_object_set.append(data_object)
-                database.omics_processing_set.append(omics_processing)
+                database.data_generation_set.append(nucleotide_sequencing)
         return database
+    @staticmethod
+    def set_study_images(
+        nmdc_study: nmdc.Study,
+        pi_image_url: Optional[str],
+        primary_study_image_url: Optional[str],
+        study_images_url: Optional[list[str]],
+    ) -> None:
+        """Set images for a study based on provided URLs."""
+        if pi_image_url:
+            if not nmdc_study.principal_investigator:
+                nmdc_study.principal_investigator = nmdc.PersonValue(
+                    type="nmdc:PersonValue"
+                )
+            nmdc_study.principal_investigator.profile_image_url = pi_image_url
+        if primary_study_image_url:
+            if not nmdc_study.study_image:
+                nmdc_study.study_image = []
+            nmdc_study.study_image.append(
+                nmdc.ImageValue(
+                    type="nmdc:ImageValue",
+                    url=primary_study_image_url,
+                    display_order=0,
+                )
+            )
+        if study_images_url:
+            if not nmdc_study.study_image:
+                nmdc_study.study_image = []
+            for idx, image_url in enumerate(study_images_url, start=1):
+                nmdc_study.study_image.append(
+                    nmdc.ImageValue(
+                        type="nmdc:ImageValue",
+                        url=image_url,
+                        display_order=idx,
+                    )
+                )

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

nmdc-runtime 1.3.1py3-none-any.whl → 2.12.0py3-none-any.whl