nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of nmdc-runtime might be problematic; consult the package registry's advisory page for more details.

@@ -1,5 +1,5 @@
1
1
  from functools import lru_cache
2
- from typing import Any, Dict, List
2
+ from typing import Any, Dict, List, Union
3
3
  import pandas as pd
4
4
  from nmdc_runtime.site.resources import (
5
5
  RuntimeApiUserClient,
@@ -18,6 +18,8 @@ class DatabaseUpdater:
18
18
  gold_api_client: GoldApiClient,
19
19
  study_id: str,
20
20
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
21
+ include_field_site_info: bool = False,
22
+ enable_biosample_filtering: bool = True,
21
23
  ):
22
24
  """This class serves as an API for repairing connections in the database by
23
25
  adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
39
41
  self.gold_api_client = gold_api_client
40
42
  self.study_id = study_id
41
43
  self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
44
+ self.include_field_site_info = include_field_site_info
45
+ self.enable_biosample_filtering = enable_biosample_filtering
42
46
 
43
47
  @lru_cache
44
48
  def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
95
99
  biosamples=all_gold_biosamples,
96
100
  projects=all_gold_projects,
97
101
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
102
+ include_field_site_info=self.include_field_site_info,
103
+ enable_biosample_filtering=self.enable_biosample_filtering,
98
104
  )
99
105
 
100
106
  # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
214
220
  projects=gold_sequencing_projects_for_study,
215
221
  analysis_projects=gold_analysis_projects_for_study,
216
222
  gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
223
+ include_field_site_info=self.include_field_site_info,
224
+ enable_biosample_filtering=self.enable_biosample_filtering,
217
225
  )
218
226
 
219
227
  translated_biosamples = gold_study_translator.biosamples
@@ -240,3 +248,204 @@ class DatabaseUpdater:
240
248
  ]
241
249
 
242
250
  return database
251
+
252
+ def queries_run_script_to_update_insdc_identifiers(
253
+ self,
254
+ ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
255
+ """This method creates a `/queries:run` API endpoint compatible update script that can be run
256
+ using that API endpoint to update/add information on the `insdc_biosample_identifiers` field
257
+ of biosample_set records and the `insdc_bioproject_identifiers` field on data_generation_set records.
258
+
259
+ The information to be asserted is retrieved from the `ncbiBioSampleAccession` and
260
+ `ncbiBioProjectAccession` fields on the GOLD `/projects` API endpoint.
261
+
262
+ :return: A `/queries:run` update query compatible script serialized as a dictionary/JSON.
263
+ """
264
+ # Fetch all biosamples associated with the study
265
+ biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
266
+ self.study_id
267
+ )
268
+
269
+ # Fetch all data_generation records associated with the study
270
+ data_generation_set = (
271
+ self.runtime_api_user_client.get_data_generation_records_for_study(
272
+ self.study_id
273
+ )
274
+ )
275
+
276
+ biosample_updates = []
277
+ data_generation_updates = []
278
+
279
+ # Dictionary to store gold_project_id -> ncbi_bioproject_accession mapping
280
+ gold_project_to_bioproject = {}
281
+
282
+ # Dictionary to store all project data we gather during biosample processing
283
+ all_processed_projects = {}
284
+
285
+ # Process biosamples for insdc_biosample_identifiers
286
+ for biosample in biosample_set:
287
+ # get the list (usually one) of GOLD biosample identifiers on the gold_biosample_identifiers slot
288
+ gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
289
+ if not gold_biosample_identifiers:
290
+ continue
291
+
292
+ biosample_id = biosample.get("id")
293
+ if not biosample_id:
294
+ continue
295
+
296
+ insdc_biosample_identifiers = []
297
+
298
+ for gold_biosample_id in gold_biosample_identifiers:
299
+ normalized_id = gold_biosample_id.replace("gold:", "")
300
+
301
+ # fetch projects associated with a GOLD biosample from the GOLD `/projects` API endpoint
302
+ gold_projects = self.gold_api_client.fetch_projects_by_biosample(
303
+ normalized_id
304
+ )
305
+
306
+ for project in gold_projects:
307
+ # Store each project for later use
308
+ project_gold_id = project.get("projectGoldId")
309
+ if project_gold_id:
310
+ all_processed_projects[project_gold_id] = project
311
+
312
+ # Collect ncbi_biosample_accession for biosample updates
313
+ ncbi_biosample_accession = project.get("ncbiBioSampleAccession")
314
+ if ncbi_biosample_accession and ncbi_biosample_accession.strip():
315
+ insdc_biosample_identifiers.append(ncbi_biosample_accession)
316
+
317
+ # Collect ncbi_bioproject_accession for data_generation records
318
+ ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
319
+ if (
320
+ project_gold_id
321
+ and ncbi_bioproject_accession
322
+ and ncbi_bioproject_accession.strip()
323
+ ):
324
+ gold_project_to_bioproject[project_gold_id] = (
325
+ ncbi_bioproject_accession
326
+ )
327
+
328
+ if insdc_biosample_identifiers:
329
+ existing_insdc_biosample_identifiers = biosample.get(
330
+ "insdc_biosample_identifiers", []
331
+ )
332
+ new_insdc_biosample_identifiers = list(
333
+ set(insdc_biosample_identifiers)
334
+ - set(existing_insdc_biosample_identifiers)
335
+ )
336
+
337
+ if new_insdc_biosample_identifiers:
338
+ prefixed_new_biosample_identifiers = [
339
+ f"biosample:{id}" for id in new_insdc_biosample_identifiers
340
+ ]
341
+
342
+ if existing_insdc_biosample_identifiers:
343
+ all_biosample_identifiers = list(
344
+ set(
345
+ existing_insdc_biosample_identifiers
346
+ + prefixed_new_biosample_identifiers
347
+ )
348
+ )
349
+ biosample_updates.append(
350
+ {
351
+ "q": {"id": biosample_id},
352
+ "u": {
353
+ "$set": {
354
+ "insdc_biosample_identifiers": all_biosample_identifiers
355
+ }
356
+ },
357
+ }
358
+ )
359
+ else:
360
+ biosample_updates.append(
361
+ {
362
+ "q": {"id": biosample_id},
363
+ "u": {
364
+ "$set": {
365
+ "insdc_biosample_identifiers": prefixed_new_biosample_identifiers
366
+ }
367
+ },
368
+ }
369
+ )
370
+
371
+ # Process data_generation records for insdc_bioproject_identifiers
372
+ for data_generation in data_generation_set:
373
+ data_generation_id = data_generation.get("id")
374
+ if not data_generation_id:
375
+ continue
376
+
377
+ # Extract existing insdc_bioproject_identifiers
378
+ existing_insdc_bioproject_identifiers = data_generation.get(
379
+ "insdc_bioproject_identifiers", []
380
+ )
381
+
382
+ collected_insdc_bioproject_identifiers = set()
383
+
384
+ # Add any project identifiers already on the record
385
+ if "insdc_bioproject_identifiers" in data_generation:
386
+ for identifier in data_generation["insdc_bioproject_identifiers"]:
387
+ collected_insdc_bioproject_identifiers.add(identifier)
388
+
389
+ # If there are gold_sequencing_project_identifiers, use our pre-collected mapping
390
+ gold_project_identifiers = data_generation.get(
391
+ "gold_sequencing_project_identifiers", []
392
+ )
393
+ for gold_project_id in gold_project_identifiers:
394
+ normalized_id = gold_project_id.replace("gold:", "")
395
+
396
+ # Check if we have a bioproject ID for this GOLD project ID
397
+ if normalized_id in gold_project_to_bioproject:
398
+ ncbi_bioproject_accession = gold_project_to_bioproject[
399
+ normalized_id
400
+ ]
401
+ collected_insdc_bioproject_identifiers.add(
402
+ f"bioproject:{ncbi_bioproject_accession}"
403
+ )
404
+ else:
405
+ # Only if we don't have it in our mapping, try to fetch it
406
+ # Instead of making a direct API request, check if we've already seen this project
407
+ if normalized_id in all_processed_projects:
408
+ project_data = all_processed_projects[normalized_id]
409
+ ncbi_bioproject_accession = project_data.get(
410
+ "ncbiBioProjectAccession"
411
+ )
412
+ if (
413
+ ncbi_bioproject_accession
414
+ and ncbi_bioproject_accession.strip()
415
+ ):
416
+ collected_insdc_bioproject_identifiers.add(
417
+ f"bioproject:{ncbi_bioproject_accession}"
418
+ )
419
+ # Add to our mapping for future reference
420
+ gold_project_to_bioproject[normalized_id] = (
421
+ ncbi_bioproject_accession
422
+ )
423
+
424
+ # Create a list from the set of collected identifiers
425
+ collected_insdc_bioproject_identifiers = list(
426
+ collected_insdc_bioproject_identifiers
427
+ )
428
+
429
+ # Only update if there are identifiers to add
430
+ if collected_insdc_bioproject_identifiers and set(
431
+ collected_insdc_bioproject_identifiers
432
+ ) != set(existing_insdc_bioproject_identifiers):
433
+ data_generation_updates.append(
434
+ {
435
+ "q": {"id": data_generation_id},
436
+ "u": {
437
+ "$set": {
438
+ "insdc_bioproject_identifiers": collected_insdc_bioproject_identifiers
439
+ }
440
+ },
441
+ }
442
+ )
443
+
444
+ # Return updates for both collections
445
+ if data_generation_updates:
446
+ return [
447
+ {"update": "biosample_set", "updates": biosample_updates},
448
+ {"update": "data_generation_set", "updates": data_generation_updates},
449
+ ]
450
+ else:
451
+ return {"update": "biosample_set", "updates": biosample_updates}
@@ -14,6 +14,7 @@ from dagster import (
14
14
  DagsterRunStatus,
15
15
  RunStatusSensorContext,
16
16
  DefaultSensorStatus,
17
+ in_process_executor,
17
18
  )
18
19
  from starlette import status
19
20
  from toolz import merge, get_in
@@ -44,8 +45,10 @@ from nmdc_runtime.site.graphs import (
44
45
  ingest_neon_benthic_metadata,
45
46
  ingest_neon_surface_water_metadata,
46
47
  ensure_alldocs,
48
+ run_ontology_load,
47
49
  nmdc_study_to_ncbi_submission_export,
48
50
  generate_data_generation_set_for_biosamples_in_nmdc_study,
51
+ generate_update_script_for_insdc_biosample_identifiers,
49
52
  )
50
53
  from nmdc_runtime.site.resources import (
51
54
  get_mongo,
@@ -123,6 +126,55 @@ ensure_alldocs_daily = ScheduleDefinition(
123
126
  )
124
127
 
125
128
 
129
+ load_envo_ontology_weekly = ScheduleDefinition(
130
+ name="weekly_load_envo_ontology",
131
+ cron_schedule="0 7 * * 1",
132
+ execution_timezone="America/New_York",
133
+ job=run_ontology_load.to_job(
134
+ name="scheduled_envo_ontology_load",
135
+ config=unfreeze(
136
+ merge(
137
+ run_config_frozen__normal_env,
138
+ {"ops": {"load_ontology": {"config": {"source_ontology": "envo"}}}},
139
+ )
140
+ ),
141
+ resource_defs=resource_defs,
142
+ ),
143
+ )
144
+
145
+ load_uberon_ontology_weekly = ScheduleDefinition(
146
+ name="weekly_load_uberon_ontology",
147
+ cron_schedule="0 8 * * 1",
148
+ execution_timezone="America/New_York",
149
+ job=run_ontology_load.to_job(
150
+ name="scheduled_uberon_ontology_load",
151
+ config=unfreeze(
152
+ merge(
153
+ run_config_frozen__normal_env,
154
+ {"ops": {"load_ontology": {"config": {"source_ontology": "uberon"}}}},
155
+ )
156
+ ),
157
+ resource_defs=resource_defs,
158
+ ),
159
+ )
160
+
161
+ load_po_ontology_weekly = ScheduleDefinition(
162
+ name="weekly_load_po_ontology",
163
+ cron_schedule="0 9 * * 1",
164
+ execution_timezone="America/New_York",
165
+ job=run_ontology_load.to_job(
166
+ name="scheduled_po_ontology_load",
167
+ config=unfreeze(
168
+ merge(
169
+ run_config_frozen__normal_env,
170
+ {"ops": {"load_ontology": {"config": {"source_ontology": "po"}}}},
171
+ )
172
+ ),
173
+ resource_defs=resource_defs,
174
+ ),
175
+ )
176
+
177
+
126
178
  def asset_materialization_metadata(asset_event, key):
127
179
  """Get metadata from an asset materialization event.
128
180
 
@@ -411,11 +463,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
411
463
  yield SkipReason("; ".join(skip_notes))
412
464
 
413
465
 
414
- # TODO ensure data_object_type values from file_type_enum
415
- # see /metadata-translation/notebooks/202106_curation_updates.ipynb
416
- # for details ("Create file_type_enum collection" section).
417
-
418
-
419
466
  @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
420
467
  def done_object_put_ops(_context):
421
468
  client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -462,7 +509,13 @@ def repo():
462
509
  export_study_biosamples_metadata.to_job(**preset_normal),
463
510
  ensure_alldocs.to_job(**preset_normal),
464
511
  ]
465
- schedules = [housekeeping_weekly, ensure_alldocs_daily]
512
+ schedules = [
513
+ housekeeping_weekly,
514
+ ensure_alldocs_daily,
515
+ load_envo_ontology_weekly,
516
+ load_uberon_ontology_weekly,
517
+ load_po_ontology_weekly,
518
+ ]
466
519
  sensors = [
467
520
  done_object_put_ops,
468
521
  ensure_gold_translation_job,
@@ -516,6 +569,7 @@ def biosample_submission_ingest():
516
569
  "study_type": "research_study",
517
570
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
518
571
  "include_field_site_info": False,
572
+ "enable_biosample_filtering": True,
519
573
  },
520
574
  },
521
575
  "export_json_to_drs": {"config": {"username": ""}},
@@ -960,6 +1014,8 @@ def database_records_stitching():
960
1014
  "config": {
961
1015
  "nmdc_study_id": "",
962
1016
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1017
+ "include_field_site_info": False,
1018
+ "enable_biosample_filtering": True,
963
1019
  }
964
1020
  },
965
1021
  "export_json_to_drs": {"config": {"username": ""}},
@@ -1002,12 +1058,57 @@ def database_records_stitching():
1002
1058
  "config": {
1003
1059
  "nmdc_study_id": "",
1004
1060
  "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1061
+ "include_field_site_info": False,
1062
+ "enable_biosample_filtering": True,
1005
1063
  }
1006
1064
  },
1007
1065
  "export_json_to_drs": {"config": {"username": ""}},
1008
1066
  },
1009
1067
  },
1010
1068
  ),
1069
+ generate_update_script_for_insdc_biosample_identifiers.to_job(
1070
+ description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
1071
+ resource_defs=resource_defs,
1072
+ config={
1073
+ "resources": merge(
1074
+ unfreeze(normal_resources),
1075
+ {
1076
+ "runtime_api_user_client": {
1077
+ "config": {
1078
+ "base_url": {"env": "API_HOST"},
1079
+ "username": {"env": "API_ADMIN_USER"},
1080
+ "password": {"env": "API_ADMIN_PASS"},
1081
+ },
1082
+ },
1083
+ "runtime_api_site_client": {
1084
+ "config": {
1085
+ "base_url": {"env": "API_HOST"},
1086
+ "client_id": {"env": "API_SITE_CLIENT_ID"},
1087
+ "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
1088
+ "site_id": {"env": "API_SITE_ID"},
1089
+ },
1090
+ },
1091
+ "gold_api_client": {
1092
+ "config": {
1093
+ "base_url": {"env": "GOLD_API_BASE_URL"},
1094
+ "username": {"env": "GOLD_API_USERNAME"},
1095
+ "password": {"env": "GOLD_API_PASSWORD"},
1096
+ },
1097
+ },
1098
+ },
1099
+ ),
1100
+ "ops": {
1101
+ "get_database_updater_inputs": {
1102
+ "config": {
1103
+ "nmdc_study_id": "",
1104
+ "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
1105
+ "include_field_site_info": False,
1106
+ "enable_biosample_filtering": True,
1107
+ }
1108
+ },
1109
+ },
1110
+ },
1111
+ ),
1011
1112
  ]
1012
1113
 
1013
1114
 
@@ -109,7 +109,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
109
109
  },
110
110
  )
111
111
  response.raise_for_status()
112
- return response.json()["cursor"]["firstBatch"]
112
+ return response.json()["cursor"]["batch"]
113
113
 
114
114
  def get_omics_processing_records_by_gold_project_id(self, gold_project_id: str):
115
115
  gold_project_id = normalize_gold_id(gold_project_id)
@@ -126,7 +126,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
126
126
  },
127
127
  )
128
128
  response.raise_for_status()
129
- return response.json()["cursor"]["firstBatch"]
129
+ return response.json()["cursor"]["batch"]
130
130
 
131
131
  def get_biosamples_for_study(self, study_id: str):
132
132
  # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
@@ -147,6 +147,19 @@ class RuntimeApiUserClient(RuntimeApiClient):
147
147
  response.raise_for_status()
148
148
  return response.json()["resources"]
149
149
 
150
+ def get_data_generation_records_for_study(self, study_id: str):
151
+ # TODO: same as above, we are using a large max_page_size to avoid pagination.
152
+ response = self.request(
153
+ "GET",
154
+ f"/nmdcschema/data_generation_set",
155
+ {
156
+ "filter": json.dumps({"associated_studies": study_id}),
157
+ "max_page_size": 10000,
158
+ },
159
+ )
160
+ response.raise_for_status()
161
+ return response.json()["resources"]
162
+
150
163
  def get_omics_processing_by_name(self, name: str):
151
164
  response = self.request(
152
165
  "POST",
@@ -157,7 +170,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
157
170
  },
158
171
  )
159
172
  response.raise_for_status()
160
- return response.json()["cursor"]["firstBatch"]
173
+ return response.json()["cursor"]["batch"]
161
174
 
162
175
  def get_study(self, study_id: str):
163
176
  response = self.request(
@@ -169,7 +182,7 @@ class RuntimeApiUserClient(RuntimeApiClient):
169
182
  },
170
183
  )
171
184
  response.raise_for_status()
172
- return response.json()["cursor"]["firstBatch"]
185
+ return response.json()["cursor"]["batch"]
173
186
 
174
187
 
175
188
  class RuntimeApiSiteClient(RuntimeApiClient):
@@ -45,6 +45,7 @@ class GoldStudyTranslator(Translator):
45
45
  analysis_projects: List[JSON_OBJECT] = [],
46
46
  gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
47
47
  include_field_site_info: bool = False,
48
+ enable_biosample_filtering: bool = True,
48
49
  *args,
49
50
  **kwargs,
50
51
  ) -> None:
@@ -53,15 +54,20 @@ class GoldStudyTranslator(Translator):
53
54
  self.study = study
54
55
  self.study_type = nmdc.StudyCategoryEnum(study_type)
55
56
  self.include_field_site_info = include_field_site_info
57
+ self.enable_biosample_filtering = enable_biosample_filtering
56
58
  # Filter biosamples to only those with `sequencingStrategy` of
57
- # "Metagenome" or "Metatranscriptome"
58
- self.biosamples = [
59
- biosample
60
- for biosample in biosamples
61
- if any(
62
- _is_valid_project(project) for project in biosample.get("projects", [])
63
- )
64
- ]
59
+ # "Metagenome" or "Metatranscriptome" if filtering is enabled
60
+ if enable_biosample_filtering:
61
+ self.biosamples = [
62
+ biosample
63
+ for biosample in biosamples
64
+ if any(
65
+ _is_valid_project(project)
66
+ for project in biosample.get("projects", [])
67
+ )
68
+ ]
69
+ else:
70
+ self.biosamples = biosamples
65
71
  # Fetch the valid projectGoldIds that are associated with filtered
66
72
  # biosamples on their `projects` field
67
73
  valid_project_ids = {
@@ -116,6 +122,9 @@ class GoldStudyTranslator(Translator):
116
122
  :param gold_entity: GOLD entity object
117
123
  :return: PersonValue corresponding to the first PI in the `contacts` field
118
124
  """
125
+ if "contacts" not in gold_entity:
126
+ return None
127
+
119
128
  pi_dict = next(
120
129
  (
121
130
  contact
@@ -169,7 +178,7 @@ class GoldStudyTranslator(Translator):
169
178
  project["ncbiBioSampleAccession"], default_prefix="biosample"
170
179
  )
171
180
  for project in biosample_projects
172
- if project["ncbiBioSampleAccession"]
181
+ if project.get("ncbiBioSampleAccession")
173
182
  ]
174
183
 
175
184
  def _get_samp_taxon_id(
@@ -349,6 +349,7 @@ class NeonBenthicDataTranslator(Translator):
349
349
  description=f"sequencing results for {basename}",
350
350
  type="nmdc:DataObject",
351
351
  data_object_type=do_type,
352
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
352
353
  in_manifest=manifest_id,
353
354
  )
354
355
 
@@ -264,6 +264,7 @@ class NeonSoilDataTranslator(Translator):
264
264
  description=f"sequencing results for {basename}",
265
265
  type="nmdc:DataObject",
266
266
  md5_checksum=checksum,
267
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
267
268
  data_object_type=do_type,
268
269
  )
269
270
 
@@ -397,6 +397,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
397
397
  description=f"sequencing results for {basename}",
398
398
  type="nmdc:DataObject",
399
399
  data_object_type=do_type,
400
+ data_category=nmdc.DataCategoryEnum.instrument_data.text,
400
401
  in_manifest=manifest_id,
401
402
  )
402
403
 
@@ -47,6 +47,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
47
47
  (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
48
48
  }
49
49
 
50
+ UNIT_OVERRIDES: dict[str, dict[str, str]] = {
51
+ "Biosample": {
52
+ "depth": "m",
53
+ }
54
+ }
55
+
50
56
 
51
57
  class EnvironmentPackage(Enum):
52
58
  r"""
@@ -475,6 +481,50 @@ class SubmissionPortalTranslator(Translator):
475
481
 
476
482
  return value
477
483
 
484
+ def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
485
+ """Collect and format DOIs from submission portal schema in nmdc format DOIs
486
+
487
+ If there were no DOIs, None is returned.
488
+
489
+ :param metadata_submission: submission portal entry
490
+ :return: list of nmdc.DOI objects
491
+ """
492
+ data_dois = self._get_from(metadata_submission, ["studyForm", "dataDois"])
493
+ award_dois = self._get_from(
494
+ metadata_submission, ["multiOmicsForm", "awardDois"]
495
+ )
496
+ if data_dois and len(data_dois) > 0:
497
+ updated_data_dois = [
498
+ nmdc.Doi(
499
+ doi_category="dataset_doi",
500
+ doi_provider=doi["provider"],
501
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
502
+ type="nmdc:Doi",
503
+ )
504
+ for doi in data_dois
505
+ ]
506
+ else:
507
+ updated_data_dois = []
508
+
509
+ if award_dois and len(award_dois) > 0:
510
+ updated_award_dois = [
511
+ nmdc.Doi(
512
+ doi_category="award_doi",
513
+ doi_provider=doi["provider"],
514
+ doi_value=self._ensure_curie(doi["value"], default_prefix="doi"),
515
+ type="nmdc:Doi",
516
+ )
517
+ for doi in award_dois
518
+ ]
519
+ else:
520
+ updated_award_dois = []
521
+
522
+ return_val = updated_data_dois + updated_award_dois
523
+ if len(return_val) == 0:
524
+ return_val = None
525
+
526
+ return return_val
527
+
478
528
  def _get_data_objects_from_fields(
479
529
  self,
480
530
  sample_data: JSON_OBJECT,
@@ -591,6 +641,7 @@ class SubmissionPortalTranslator(Translator):
591
641
  websites=self._get_from(
592
642
  metadata_submission, ["studyForm", "linkOutWebpage"]
593
643
  ),
644
+ associated_dois=self._get_study_dois(metadata_submission),
594
645
  )
595
646
 
596
647
  def _transform_value_for_slot(
@@ -660,6 +711,17 @@ class SubmissionPortalTranslator(Translator):
660
711
  logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
661
712
  continue
662
713
 
714
+ # This step handles cases where the submission portal/schema instructs a user to
715
+ # provide a value in a specific unit. The unit cannot be parsed out of the raw value
716
+ # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
717
+ # go away once units are encoded in the schema itself.
718
+ # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
719
+ if class_name in UNIT_OVERRIDES:
720
+ # If the class has unit overrides, check if the slot is in the overrides
721
+ unit_overrides = UNIT_OVERRIDES[class_name]
722
+ if slot_name in unit_overrides:
723
+ unit = unit_overrides[slot_name]
724
+
663
725
  slot_definition = self.schema_view.induced_slot(slot_name, class_name)
664
726
  if slot_definition.multivalued:
665
727
  value_list = value