PyPI - nmdc-runtime - Versions diffs - 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl - Mend

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (143) hide show

nmdc_runtime/Dockerfile +177 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +212 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +777 -0
nmdc_runtime/api/core/util.py +114 -0
nmdc_runtime/api/db/mongo.py +436 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +206 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +277 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +817 -0
nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +495 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +57 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +207 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/wfe_file_stages.py +122 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +56 -0
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +30 -4
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +89 -0
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +29 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +29 -0
nmdc_runtime/site/export/ncbi_xml.py +1331 -0
nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
nmdc_runtime/site/export/study_metadata.py +27 -4
nmdc_runtime/site/graphs.py +294 -45
nmdc_runtime/site/ops.py +1008 -230
nmdc_runtime/site/repair/database_updater.py +451 -0
nmdc_runtime/site/repository.py +368 -133
nmdc_runtime/site/resources.py +154 -80
nmdc_runtime/site/translation/gold_translator.py +235 -83
nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
nmdc_runtime/site/translation/neon_utils.py +24 -7
nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
nmdc_runtime/site/translation/translator.py +73 -3
nmdc_runtime/site/util.py +26 -7
nmdc_runtime/site/validation/emsl.py +1 -0
nmdc_runtime/site/validation/gold.py +1 -0
nmdc_runtime/site/validation/util.py +16 -12
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +236 -192
nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/terminusdb/generate.py +0 -198
nmdc_runtime/site/terminusdb/ingest.py +0 -44
nmdc_runtime/site/terminusdb/schema.py +0 -1671
nmdc_runtime/site/translation/emsl.py +0 -42
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -31
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -42
nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
/nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
{nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0

nmdc_runtime/site/translation/gold_translator.py CHANGED Viewed

@@ -2,26 +2,94 @@ import collections
 import re
 from typing import List, Tuple, Union
 from nmdc_schema import nmdc
+import pandas as pd
 from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
+# Set of GOLD sequencing strategies that we filter on, based on the
+# kinds of samples that are required for NMDC
+SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
+def _is_valid_project(project: dict) -> bool:
+    """A project is considered valid if:
+    1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
+    2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
+        `projectStatus` must be in ("Permanent Draft", "Complete and Published")
+    3. otherwise, no `projectStatus` filter is applied
+    :param project: GOLD project object (structurally similar to response
+                    from `/projects` endpoint)
+    :return: True if the project is valid, False otherwise
+    """
+    if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
+        return False
+    if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
+        return project.get("projectStatus") in (
+            "Permanent Draft",
+            "Complete and Published",
+        )
+    return True
 class GoldStudyTranslator(Translator):
     def __init__(
         self,
         study: JSON_OBJECT = {},
+        study_type: str = "research_study",
         biosamples: List[JSON_OBJECT] = [],
         projects: List[JSON_OBJECT] = [],
         analysis_projects: List[JSON_OBJECT] = [],
+        gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
+        enable_biosample_filtering: bool = True,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(*args, **kwargs)
         self.study = study
-        self.biosamples = biosamples
-        self.projects = projects
-        self.analysis_projects = analysis_projects
+        self.study_type = nmdc.StudyCategoryEnum(study_type)
+        self.include_field_site_info = include_field_site_info
+        self.enable_biosample_filtering = enable_biosample_filtering
+        # Filter biosamples to only those with `sequencingStrategy` of
+        # "Metagenome" or "Metatranscriptome" if filtering is enabled
+        if enable_biosample_filtering:
+            self.biosamples = [
+                biosample
+                for biosample in biosamples
+                if any(
+                    _is_valid_project(project)
+                    for project in biosample.get("projects", [])
+                )
+            ]
+        else:
+            self.biosamples = biosamples
+        # Collect the `projectGoldId` of every project in `projects` that
+        # passes `_is_valid_project`
+        valid_project_ids = {
+            project.get("projectGoldId")
+            for project in projects
+            if _is_valid_project(project)
+        }
+        # Filter projects to only those with `projectGoldId` in valid_project_ids
+        self.projects = [
+            project
+            for project in projects
+            if project.get("projectGoldId") in valid_project_ids
+        ]
+        # Filter analysis_projects to only those with all `projects` in valid_project_ids
+        self.analysis_projects = [
+            analysis_project
+            for analysis_project in analysis_projects
+            if all(
+                project_id in valid_project_ids
+                for project_id in analysis_project.get("projects", [])
+            )
+        ]
+        self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
         self._projects_by_id = self._index_by_id(self.projects, "projectGoldId")
         self._analysis_projects_by_id = self._index_by_id(
@@ -53,6 +121,9 @@ class GoldStudyTranslator(Translator):
         :param gold_entity: GOLD entity object
         :return: PersonValue corresponding to the first PI in the `contacts` field
         """
+        if "contacts" not in gold_entity:
+            return None
         pi_dict = next(
             (
                 contact
@@ -69,6 +140,7 @@ class GoldStudyTranslator(Translator):
             has_raw_value=pi_dict.get("name"),
             name=pi_dict.get("name"),
             email=pi_dict.get("email"),
+            type="nmdc:PersonValue",
         )
     def _get_mod_date(self, gold_entity: JSON_OBJECT) -> Union[str, None]:
@@ -101,29 +173,67 @@ class GoldStudyTranslator(Translator):
             for id in self._project_ids_by_biosample_id[gold_biosample_id]
         )
         return [
-            self._get_curie("biosample", project["ncbiBioSampleAccession"])
+            self._ensure_curie(
+                project["ncbiBioSampleAccession"], default_prefix="biosample"
+            )
             for project in biosample_projects
-            if project["ncbiBioSampleAccession"]
+            if project.get("ncbiBioSampleAccession")
         ]
     def _get_samp_taxon_id(
         self, gold_biosample: JSON_OBJECT
-    ) -> Union[nmdc.TextValue, None]:
-        """Get a TextValue representing the NCBI taxon for a GOLD biosample
+    ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
+        """Get a ControlledIdentifiedTermValue representing the NCBI taxon
+        for a GOLD biosample
         This method gets the `ncbiTaxName` and `ncbiTaxId` from a GOLD biosample object.
-        If both are not `None`, it constructs a TextValue of the format
+        If both are not `None`, it constructs a ControlledIdentifiedTermValue of the format
         `{ncbiTaxName} [NCBITaxon:{ncbiTaxId}]`. Otherwise, it returns `None`
         :param gold_biosample: GOLD biosample object
-        :return: TextValue object
+        :return: ControlledIdentifiedTermValue object
         """
         ncbi_tax_name = gold_biosample.get("ncbiTaxName")
         ncbi_tax_id = gold_biosample.get("ncbiTaxId")
         if ncbi_tax_name is None or ncbi_tax_id is None:
             return None
-        return nmdc.TextValue(f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]")
+        raw_value = f"{ncbi_tax_name} [NCBITaxon:{ncbi_tax_id}]"
+        return nmdc.ControlledIdentifiedTermValue(
+            has_raw_value=raw_value,
+            term=nmdc.OntologyClass(
+                id=f"NCBITaxon:{ncbi_tax_id}",
+                name=ncbi_tax_name,
+                type="nmdc:OntologyClass",
+            ),
+            type="nmdc:ControlledIdentifiedTermValue",
+        )
+    def _get_host_taxid(
+        self, gold_biosample: JSON_OBJECT
+    ) -> Union[nmdc.ControlledIdentifiedTermValue, None]:
+        """Get a ControlledIdentifiedTermValue representing the NCBI host taxon id
+        for a GOLD biosample
+        This method gets the `hostNcbiTaxid` from a GOLD biosample object.
+        It constructs a ControlledIdentifiedTermValue of the format
+        `[NCBITaxon:{hostNcbiTaxid}]`. Otherwise, it returns `None`
+        :param gold_biosample: GOLD biosample object
+        :return: ControlledIdentifiedTermValue object
+        """
+        host_taxid = gold_biosample.get("hostNcbiTaxid")
+        if host_taxid is None:
+            return None
+        return nmdc.ControlledIdentifiedTermValue(
+            has_raw_value=f"NCBITaxon:{host_taxid}",
+            term=nmdc.OntologyClass(
+                id=f"NCBITaxon:{host_taxid}",
+                type="nmdc:OntologyClass",
+            ),
+            type="nmdc:ControlledIdentifiedTermValue",
+        )
     def _get_samp_name(self, gold_biosample: JSON_OBJECT) -> Union[str, None]:
         """Get a sample name for a GOLD biosample object
@@ -183,7 +293,9 @@ class GoldStudyTranslator(Translator):
         date_collected = gold_biosample.get("dateCollected")
         if date_collected is None:
             return None
-        return nmdc.TimestampValue(has_raw_value=date_collected)
+        return nmdc.TimestampValue(
+            has_raw_value=date_collected, type="nmdc:TimestampValue"
+        )
     def _get_quantity_value(
         self,
@@ -215,23 +327,21 @@ class GoldStudyTranslator(Translator):
                     has_raw_value=minimum_numeric_value,
                     has_numeric_value=nmdc.Double(minimum_numeric_value),
                     has_unit=unit,
+                    type="nmdc:QuantityValue",
                 )
             else:
                 return nmdc.QuantityValue(
                     has_minimum_numeric_value=nmdc.Double(minimum_numeric_value),
                     has_maximum_numeric_value=nmdc.Double(maximum_numeric_value),
                     has_unit=unit,
+                    type="nmdc:QuantityValue",
                 )
         field_value = gold_entity.get(gold_field)
         if field_value is None:
             return None
-        return nmdc.QuantityValue(
-            has_raw_value=field_value,
-            has_numeric_value=nmdc.Double(field_value),
-            has_unit=unit,
-        )
+        return self._parse_quantity_value(str(field_value), unit)
     def _get_text_value(
         self, gold_entity: JSON_OBJECT, gold_field: str
@@ -249,7 +359,7 @@ class GoldStudyTranslator(Translator):
         field_value = gold_entity.get(gold_field)
         if field_value is None:
             return None
-        return nmdc.TextValue(has_raw_value=field_value)
+        return nmdc.TextValue(has_raw_value=field_value, type="nmdc:TextValue")
     def _get_controlled_term_value(
         self, gold_entity: JSON_OBJECT, gold_field: str
@@ -267,7 +377,9 @@ class GoldStudyTranslator(Translator):
         field_value = gold_entity.get(gold_field)
         if field_value is None:
             return None
-        return nmdc.ControlledTermValue(has_raw_value=field_value)
+        return nmdc.ControlledTermValue(
+            has_raw_value=field_value, type="nmdc:ControlledTermValue"
+        )
     def _get_env_term_value(
         self, gold_biosample: JSON_OBJECT, gold_field: str
@@ -277,8 +389,8 @@ class GoldStudyTranslator(Translator):
         In GOLD entities ENVO terms are represented as a nested object with `id` and `label`
         fields. This method extracts this type of nested object by the given field name, and
         returns it as an `nmdc:ControlledIdentifiedTermValue` object. The `id` in the original
-        GOLD object be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
-        `ENVO:00005801`). If the value of the given field is `None` or if does not contain
+        GOLD object should be reformatted by replacing `_` with `:` (e.g. `ENVO_00005801` to
+        `ENVO:00005801`). If the value of the given field is `None` or if it does not contain
         a nested object with an `id` field, `None` is returned.
         :param gold_biosample: GOLD biosample object
@@ -292,8 +404,10 @@ class GoldStudyTranslator(Translator):
             term=nmdc.OntologyClass(
                 id=env_field["id"].replace("_", ":"),
                 name=env_field.get("label"),
+                type="nmdc:OntologyClass",
             ),
             has_raw_value=env_field["id"],
+            type="nmdc:ControlledIdentifiedTermValue",
         )
     def _get_lat_lon(
@@ -316,22 +430,40 @@ class GoldStudyTranslator(Translator):
             has_raw_value=f"{latitude} {longitude}",
             latitude=nmdc.DecimalDegree(latitude),
             longitude=nmdc.DecimalDegree(longitude),
+            type="nmdc:GeolocationValue",
         )
-    def _get_instrument_name(self, gold_project: JSON_OBJECT) -> Union[str, None]:
-        """Get instrument name used in a GOLD project
+    def _get_instrument(self, gold_project: JSON_OBJECT) -> Union[str, None]:
+        """Get instrument id referenced in instrument_set collection in Mongo.
+        Note: The instrument id is not retrieved by making a call to the database,
+        but rather parsed out from a TSV file in the nmdc-schema repo stored at
+        self.gold_instrument_set_mapping_file_path.
-        This method gets the `seqMethod` field from a GOLD project object. If
-        that value is not `None` it should be a list and the first element of that
-        list is returned. If the value of the field is `None`, `None` is returned.
+        This method gets the seqMethod field from a GOLD project object. If
+        that value is not None and is in the self.gold_instrument_set_mapping_file_path
+        file's GOLD SeqMethod column, the corresponding instrument id from
+        NMDC instrument_set id column is returned. If the value of the field
+        is None, None is returned.
         :param gold_project: GOLD project object
-        :return: Instrument name
+        :return: id corresponding to an Instrument from instrument_set collection
         """
         seq_method = gold_project.get("seqMethod")
         if not seq_method:
             return None
-        return seq_method[0]
+        seq_method = seq_method[0].strip()
+        df = self.gold_nmdc_instrument_map_df
+        matching_row = df[df["GOLD SeqMethod"] == seq_method]
+        if not matching_row.empty:
+            instrument_id = matching_row["NMDC instrument_set id"].values[0]
+            return instrument_id
+        raise ValueError(
+            f"seqMethod '{seq_method}' could not be found in the GOLD-NMDC instrument mapping TSV file."
+        )
     def _get_processing_institution(
         self, gold_project: JSON_OBJECT
@@ -401,12 +533,15 @@ class GoldStudyTranslator(Translator):
         """
         return nmdc.Study(
             description=gold_study.get("description"),
-            gold_study_identifiers=self._get_curie("gold", gold_study["studyGoldId"]),
+            gold_study_identifiers=self._ensure_curie(
+                gold_study["studyGoldId"], default_prefix="gold"
+            ),
             id=nmdc_study_id,
             name=gold_study.get("studyName"),
             principal_investigator=self._get_pi(gold_study),
             title=gold_study.get("studyName"),
             type="nmdc:Study",
+            study_category=self.study_type,
         )
     def _translate_biosample(
@@ -432,13 +567,11 @@ class GoldStudyTranslator(Translator):
         gold_biosample_id = gold_biosample["biosampleGoldId"]
         return nmdc.Biosample(
             add_date=gold_biosample.get("addDate"),
-            alt=self._get_quantity_value(
-                gold_biosample, "altitudeInMeters", unit="meters"
-            ),
+            alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
             collected_from=nmdc_field_site_id,
             collection_date=self._get_collection_date(gold_biosample),
             depth=self._get_quantity_value(
-                gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
+                gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
             ),
             description=gold_biosample.get("description"),
             diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -451,10 +584,12 @@ class GoldStudyTranslator(Translator):
             env_local_scale=self._get_env_term_value(gold_biosample, "envoLocalScale"),
             env_medium=self._get_env_term_value(gold_biosample, "envoMedium"),
             geo_loc_name=self._get_text_value(gold_biosample, "geoLocation"),
-            gold_biosample_identifiers=self._get_curie("gold", gold_biosample_id),
+            gold_biosample_identifiers=self._ensure_curie(
+                gold_biosample_id, default_prefix="gold"
+            ),
             habitat=gold_biosample.get("habitat"),
             host_name=gold_biosample.get("hostName"),
-            host_taxid=self._get_text_value(gold_biosample, "hostNcbiTaxid"),
+            host_taxid=self._get_host_taxid(gold_biosample),
             id=nmdc_biosample_id,
             img_identifiers=self._get_img_identifiers(gold_biosample_id),
             insdc_biosample_identifiers=self._get_insdc_biosample_identifiers(
@@ -466,7 +601,6 @@ class GoldStudyTranslator(Translator):
             name=gold_biosample.get("biosampleName"),
             ncbi_taxonomy_name=gold_biosample.get("ncbiTaxName"),
             nitrite=self._get_quantity_value(gold_biosample, "nitrateConcentration"),
-            part_of=nmdc_study_id,
             ph=gold_biosample.get("ph"),
             pressure=self._get_quantity_value(gold_biosample, "pressure"),
             samp_name=self._get_samp_name(gold_biosample),
@@ -476,53 +610,67 @@ class GoldStudyTranslator(Translator):
             ),
             specific_ecosystem=gold_biosample.get("specificEcosystem"),
             subsurface_depth=self._get_quantity_value(
-                gold_biosample, "subsurfaceDepthInMeters", unit="meters"
+                gold_biosample, "subsurfaceDepthInMeters", unit="m"
             ),
             temp=self._get_quantity_value(
                 gold_biosample, "sampleCollectionTemperature"
             ),
             type="nmdc:Biosample",
+            associated_studies=[nmdc_study_id],
         )
-    def _translate_omics_processing(
+    def _translate_nucleotide_sequencing(
         self,
         gold_project: JSON_OBJECT,
-        nmdc_omics_processing_id: str,
+        nmdc_nucleotide_sequencing_id: str,
         nmdc_biosample_id: str,
         nmdc_study_id: str,
-    ) -> nmdc.OmicsProcessing:
-        """Translate a GOLD project object into an `nmdc:OmicsProcessing` object.
+    ):
+        """Translate a GOLD project object into an `nmdc:NucleotideSequencing` object.
-        This method translates a GOLD project object into an equivalent `nmdc:OmicsProcessing`
+        This method translates a GOLD project object into an equivalent `nmdc:NucleotideSequencing`
         object. Any minted NMDC IDs must be passed to this method. Internally, each
-        slot of the `nmdc:OmicsProcessing` is either directly pulled from the GOLD object or
+        slot of the `nmdc:NucleotideSequencing` is either directly pulled from the GOLD object or
         one of the `_get_*` methods is used.
         :param gold_project: GOLD project object
-        :param nmdc_omics_processing_id: Minted nmdc:OmicsProcessing identifier for the translated object
+        :param nmdc_nucleotide_sequencing_id: Minted nmdc:NucleotideSequencing identifier for the translated object
         :param nmdc_biosample_id: Minted nmdc:Biosample identifier for the related Biosample
         :param nmdc_study_id: Minted nmdc:Study identifier for the related Study
-        :return: nmdc:OmicsProcessing object
+        :return: nmdc:NucleotideSequencing object
         """
         gold_project_id = gold_project["projectGoldId"]
-        return nmdc.OmicsProcessing(
-            id=nmdc_omics_processing_id,
+        ncbi_bioproject_identifier = gold_project.get("ncbiBioProjectAccession")
+        insdc_bioproject_identifiers = []
+        if ncbi_bioproject_identifier:
+            insdc_bioproject_identifiers.append(
+                self._ensure_curie(
+                    ncbi_bioproject_identifier,
+                    default_prefix="bioproject",
+                )
+            )
+        return nmdc.NucleotideSequencing(
+            id=nmdc_nucleotide_sequencing_id,
             name=gold_project.get("projectName"),
-            gold_sequencing_project_identifiers=self._get_curie(
-                "gold", gold_project_id
+            gold_sequencing_project_identifiers=self._ensure_curie(
+                gold_project_id, default_prefix="gold"
             ),
             ncbi_project_name=gold_project.get("projectName"),
-            type="nmdc:OmicsProcessing",
+            type="nmdc:NucleotideSequencing",
             has_input=nmdc_biosample_id,
-            part_of=nmdc_study_id,
             add_date=gold_project.get("addDate"),
             mod_date=self._get_mod_date(gold_project),
+            insdc_bioproject_identifiers=insdc_bioproject_identifiers,
             principal_investigator=self._get_pi(gold_project),
-            omics_type=self._get_controlled_term_value(
-                gold_project, "sequencingStrategy"
-            ),
-            instrument_name=self._get_instrument_name(gold_project),
             processing_institution=self._get_processing_institution(gold_project),
+            instrument_used=self._get_instrument(gold_project),
+            analyte_category=(
+                gold_project.get("sequencingStrategy").lower()
+                if gold_project.get("sequencingStrategy")
+                else None
+            ),
+            associated_studies=[nmdc_study_id],
         )
     def get_database(self) -> nmdc.Database:
@@ -546,28 +694,31 @@ class GoldStudyTranslator(Translator):
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(self.biosamples))
         gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
-        gold_field_site_names = sorted(
-            {self._get_field_site_name(biosample) for biosample in self.biosamples}
-        )
-        nmdc_field_site_ids = self._id_minter(
-            "nmdc:FieldResearchSite", len(gold_field_site_names)
-        )
-        gold_name_to_nmdc_field_site_ids = dict(
-            zip(gold_field_site_names, nmdc_field_site_ids)
-        )
-        gold_biosample_to_nmdc_field_site_ids = {
-            biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
-                self._get_field_site_name(biosample)
-            ]
-            for biosample in self.biosamples
-        }
+        if self.include_field_site_info:
+            gold_field_site_names = sorted(
+                {self._get_field_site_name(biosample) for biosample in self.biosamples}
+            )
+            nmdc_field_site_ids = self._id_minter(
+                "nmdc:FieldResearchSite", len(gold_field_site_names)
+            )
+            gold_name_to_nmdc_field_site_ids = dict(
+                zip(gold_field_site_names, nmdc_field_site_ids)
+            )
+            gold_biosample_to_nmdc_field_site_ids = {
+                biosample["biosampleGoldId"]: gold_name_to_nmdc_field_site_ids[
+                    self._get_field_site_name(biosample)
+                ]
+                for biosample in self.biosamples
+            }
+        else:
+            gold_biosample_to_nmdc_field_site_ids = {}
         gold_project_ids = [project["projectGoldId"] for project in self.projects]
-        nmdc_omics_processing_ids = self._id_minter(
-            "nmdc:OmicsProcessing", len(gold_project_ids)
+        nmdc_nucleotide_sequencing_ids = self._id_minter(
+            "nmdc:NucleotideSequencing", len(gold_project_ids)
         )
-        gold_project_to_nmdc_omics_processing_ids = dict(
-            zip(gold_project_ids, nmdc_omics_processing_ids)
+        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
+            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
         )
         database.study_set = [self._translate_study(self.study, nmdc_study_id)]
@@ -578,20 +729,21 @@ class GoldStudyTranslator(Translator):
                     biosample["biosampleGoldId"]
                 ],
                 nmdc_study_id=nmdc_study_id,
-                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids[
-                    biosample["biosampleGoldId"]
-                ],
+                nmdc_field_site_id=gold_biosample_to_nmdc_field_site_ids.get(
+                    biosample["biosampleGoldId"], None
+                ),
             )
             for biosample in self.biosamples
         ]
-        database.field_research_site_set = [
-            nmdc.FieldResearchSite(id=id, name=name)
-            for name, id in gold_name_to_nmdc_field_site_ids.items()
-        ]
-        database.omics_processing_set = [
-            self._translate_omics_processing(
+        if self.include_field_site_info:
+            database.field_research_site_set = [
+                nmdc.FieldResearchSite(id=id, name=name, type="nmdc:FieldResearchSite")
+                for name, id in gold_name_to_nmdc_field_site_ids.items()
+            ]
+        database.data_generation_set = [
+            self._translate_nucleotide_sequencing(
                 project,
-                nmdc_omics_processing_id=gold_project_to_nmdc_omics_processing_ids[
+                nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
                     project["projectGoldId"]
                 ],
                 nmdc_biosample_id=gold_to_nmdc_biosample_ids[

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl

nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl