PyPI - nmdc-runtime - Versions diffs - 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl - Mend

nmdc-runtime 2.6.0py3-none-any.whl → 2.12.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (135)

nmdc_runtime/Dockerfile +177 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +212 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +777 -0
nmdc_runtime/api/core/util.py +114 -0
nmdc_runtime/api/db/mongo.py +436 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +206 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +277 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +817 -0
nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +495 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +57 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +207 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/wfe_file_stages.py +122 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +56 -1
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +2 -0
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +89 -0
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +29 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +29 -0
nmdc_runtime/site/export/ncbi_xml.py +731 -40
nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
nmdc_runtime/site/graphs.py +80 -29
nmdc_runtime/site/ops.py +522 -183
nmdc_runtime/site/repair/database_updater.py +210 -1
nmdc_runtime/site/repository.py +108 -117
nmdc_runtime/site/resources.py +72 -36
nmdc_runtime/site/translation/gold_translator.py +22 -21
nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
nmdc_runtime/site/translation/translator.py +64 -1
nmdc_runtime/site/util.py +8 -3
nmdc_runtime/site/validation/util.py +16 -12
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +175 -348
nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
{nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/translation/emsl.py +0 -43
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -32
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -43
nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
{nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
{nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/site/translation/submission_portal_translator.py CHANGED Viewed

@@ -2,6 +2,7 @@ import logging
 import re
 from collections import namedtuple
 from datetime import datetime
+from decimal import Decimal
 from enum import Enum
 from functools import lru_cache
 from importlib import resources
@@ -47,6 +48,12 @@ DATA_URL_SET_AND_ANALYTE_TO_DATA_OBJECT_TYPE: dict[tuple[DataUrlSet, str], str]
     (INTERLEAVED, str(METATRANSCRIPTOME)): "Metatranscriptome Raw Reads",
 }
+# Units to apply to specific slots, keyed first by class name and then by slot name.
+# Each inner value is the unit string used for that slot's values.
+UNIT_OVERRIDES: dict[str, dict[str, str]] = {
+    "Biosample": {
+        "depth": "m",
+    }
+}
 class EnvironmentPackage(Enum):
     r"""
@@ -139,6 +146,7 @@ class SubmissionPortalTranslator(Translator):
         # See: https://github.com/microbiomedata/submission-schema/issues/162
         study_category: Optional[str] = None,
         study_pi_image_url: Optional[str] = None,
+        study_id: Optional[str] = None,
         # Additional biosample-level metadata with optional column mapping information not captured
         # by the submission portal currently.
         # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -159,6 +167,7 @@ class SubmissionPortalTranslator(Translator):
             nmdc.StudyCategoryEnum(study_category) if study_category else None
         )
         self.study_pi_image_url = study_pi_image_url
+        self.study_id = study_id
         self.biosample_extras = group_dicts_by_key(
             BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -168,6 +177,13 @@ class SubmissionPortalTranslator(Translator):
         )
         self.schema_view: SchemaView = _get_schema_view()
+        self._material_processing_subclass_names = []
+        for class_name in self.schema_view.class_descendants(
+            "MaterialProcessing", reflexive=False
+        ):
+            class_def = self.schema_view.get_class(class_name)
+            if not class_def.abstract:
+                self._material_processing_subclass_names.append(class_name)
     def _get_pi(
         self, metadata_submission: JSON_OBJECT
@@ -270,63 +286,39 @@ class SubmissionPortalTranslator(Translator):
         return [self._ensure_curie(emsl_project_id, default_prefix="emsl.project")]
     def _get_quantity_value(
-        self, raw_value: Optional[str], unit: Optional[str] = None
+        self,
+        raw_value: Optional[str | int | float],
+        slot_definition: SlotDefinition,
+        unit: Optional[str] = None,
     ) -> Union[nmdc.QuantityValue, None]:
-        """Construct a nmdc:QuantityValue from a raw value string
-        The regex pattern minimally matches on a single numeric value (possibly
-        floating point). The pattern can also identify a range represented by
-        two numeric values separated by a hyphen. It can also identify non-numeric
-        characters at the end of the string which are interpreted as a unit. A unit
-        may also be explicitly provided as an argument to this function. If parsing
-        identifies a unit and a unit argument is provided, the unit argument is used.
-        If the pattern is not matched at all None is returned.
-        TODO: currently the parsed unit string is used as-is. In the future we may want
-        to be stricter about what we accept or coerce into a controlled value set
-        :param raw_value: string to parse
-        :param unit: optional unit, defaults to None
-        :return: nmdc:QuantityValue
-        """
-        if raw_value is None:
-            return None
-        match = re.fullmatch(
-            "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
-            raw_value,
-        )
-        if not match:
-            return None
+        """Construct a nmdc:QuantityValue from a raw string or numeric value
+        The unit is resolved first: if the slot carries a `storage_units` annotation
+        naming exactly one unit (no pipe-separated alternatives), that unit replaces
+        the `unit` argument. Numeric (int/float) raw values are then converted directly,
+        using their string form for both `has_raw_value` and the Decimal
+        `has_numeric_value`. Any other raw value is handed to `_parse_quantity_value`
+        together with the resolved unit.
+        :param raw_value: raw value to convert
+        :param slot_definition: definition of the slot the value belongs to; its
+            `storage_units` annotation, when present, is consulted for the unit
+        :param unit: optional unit, defaults to None
+        :return: nmdc:QuantityValue, or whatever `_parse_quantity_value` returns for
+            non-numeric input
+        :raises ValueError: if raw_value is numeric and no unit could be determined
+        """
+        # If the storage_units annotation is present on the slot and it only contains one unit (i.e.
+        # not a pipe-separated list of units) then use that unit.
+        if "storage_units" in slot_definition.annotations:
+            storage_units = slot_definition.annotations["storage_units"].value
+            if storage_units and "|" not in storage_units:
+                unit = storage_units
+        # If the raw_value is numeric, directly construct a QuantityValue with the inferred unit.
+        if isinstance(raw_value, (int, float)):
+            if unit is None:
+                raise ValueError(
+                    f"While processing value for slot {slot_definition.name}, a numeric value was provided but no unit could be inferred."
+                )
+            # Constructing a Decimal directly from a float will maintain the full precision of the
+            # float (i.e. numbers like 0.5 cannot be represented exactly). Converting the float to
+            # a string first and then constructing the Decimal from that string will give a more
+            # expected value.
+            value_as_str = str(raw_value)
+            return nmdc.QuantityValue(
+                has_raw_value=value_as_str,
+                has_numeric_value=Decimal(value_as_str),
+                has_unit=unit,
+                type="nmdc:QuantityValue",
+            )
-        qv = nmdc.QuantityValue(
-            has_raw_value=raw_value,
-            type="nmdc:QuantityValue",
-        )
-        if match.group(2):
-            # having group 2 means the value is a range like "0 - 1". Either
-            # group 1 or group 2 might be the minimum especially when handling
-            # negative ranges like "0 - -1"
-            num_1 = float(match.group(1))
-            num_2 = float(match.group(2))
-            qv.has_minimum_numeric_value = min(num_1, num_2)
-            qv.has_maximum_numeric_value = max(num_1, num_2)
-        else:
-            # otherwise we just have a single numeric value
-            qv.has_numeric_value = float(match.group(1))
-        if unit:
-            # a unit was manually specified
-            if match.group(3) and unit != match.group(3):
-                # a unit was also found in the raw string; issue a warning
-                # if they don't agree, but keep the manually specified one
-                logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
-            qv.has_unit = unit
-        elif match.group(3):
-            # a unit was found in the raw string
-            qv.has_unit = match.group(3)
-        return qv
+        return self._parse_quantity_value(raw_value, unit)
     def _get_ontology_class(
         self, raw_value: Optional[str]
@@ -475,6 +467,50 @@ class SubmissionPortalTranslator(Translator):
         return value
+    def _get_study_dois(self, metadata_submission) -> Union[List[nmdc.Doi], None]:
+        """Build the study's nmdc.Doi list from the DOIs entered in the submission
+        Dataset DOIs (studyForm.dataDois) come first, followed by award DOIs
+        (multiOmicsForm.awardDois). None is returned when neither field yields a DOI.
+        :param metadata_submission: submission portal entry
+        :return: list of nmdc.Doi objects, or None if there are no DOIs
+        """
+        # Both fields are looked up before any Doi object is constructed.
+        doi_entries_by_category = (
+            (
+                "dataset_doi",
+                self._get_from(metadata_submission, ["studyForm", "dataDois"]),
+            ),
+            (
+                "award_doi",
+                self._get_from(metadata_submission, ["multiOmicsForm", "awardDois"]),
+            ),
+        )
+        study_dois = []
+        for category, entries in doi_entries_by_category:
+            if not entries:
+                # Field missing or empty: contributes nothing.
+                continue
+            for entry in entries:
+                study_dois.append(
+                    nmdc.Doi(
+                        doi_category=category,
+                        doi_provider=entry["provider"],
+                        doi_value=self._ensure_curie(
+                            entry["value"], default_prefix="doi"
+                        ),
+                        type="nmdc:Doi",
+                    )
+                )
+        return study_dois if study_dois else None
     def _get_data_objects_from_fields(
         self,
         sample_data: JSON_OBJECT,
@@ -544,6 +580,14 @@ class SubmissionPortalTranslator(Translator):
         return data_objects, manifest
+    def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
+        """Parse a sample link in the form of `ProcessingName:SampleName,...`
+        The processing name must be one of the names in
+        `self._material_processing_subclass_names`; everything after the first colon is
+        split on commas into the list of sample names.
+        :param sample_link: raw sample link text; may be empty
+        :return: (processing name, sample names), or None if the link is empty, no
+            processing names are known, or the link does not follow the expected form
+        """
+        processing_names = self._material_processing_subclass_names
+        # Without this guard an empty name list would produce the pattern "():(.+)", which
+        # matches any ":..." string and reports an empty processing name.
+        if not sample_link or not processing_names:
+            return None
+        # Escape the names so they are always matched literally inside the alternation.
+        alternatives = "|".join(re.escape(name) for name in processing_names)
+        pattern = r"(" + alternatives + r"):(.+)"
+        match = re.match(pattern, sample_link)
+        if not match:
+            return None
+        return match.group(1), split_strip(match.group(2), ",")
     def _translate_study(
         self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
     ) -> nmdc.Study:
@@ -591,6 +635,7 @@ class SubmissionPortalTranslator(Translator):
             websites=self._get_from(
                 metadata_submission, ["studyForm", "linkOutWebpage"]
             ),
+            associated_dois=self._get_study_dois(metadata_submission),
         )
     def _transform_value_for_slot(
@@ -605,6 +650,7 @@ class SubmissionPortalTranslator(Translator):
         elif slot.range == "QuantityValue":
             transformed_value = self._get_quantity_value(
                 value,
+                slot,
                 unit=unit,
             )
         elif slot.range == "ControlledIdentifiedTermValue":
@@ -660,6 +706,17 @@ class SubmissionPortalTranslator(Translator):
                 logging.warning(f"No slot '{slot_name}' on class '{class_name}'")
                 continue
+            # This step handles cases where the submission portal/schema instructs a user to
+            # provide a value in a specific unit. The unit cannot be parsed out of the raw value
+            # in these cases, so we have to manually set it via UNIT_OVERRIDES. This part can
+            # go away once units are encoded in the schema itself.
+            # See: https://github.com/microbiomedata/nmdc-schema/issues/2517
+            if class_name in UNIT_OVERRIDES:
+                # If the class has unit overrides, check if the slot is in the overrides
+                unit_overrides = UNIT_OVERRIDES[class_name]
+                if slot_name in unit_overrides:
+                    unit = unit_overrides[slot_name]
             slot_definition = self.schema_view.induced_slot(slot_name, class_name)
             if slot_definition.multivalued:
                 value_list = value
@@ -742,11 +799,14 @@ class SubmissionPortalTranslator(Translator):
             "metadata_submission", {}
         )
-        # Generate one Study instance based on the metadata submission
-        nmdc_study_id = self._id_minter("nmdc:Study")[0]
-        database.study_set = [
-            self._translate_study(metadata_submission_data, nmdc_study_id)
-        ]
+        # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
+        if self.study_id:
+            nmdc_study_id = self.study_id
+        else:
+            nmdc_study_id = self._id_minter("nmdc:Study")[0]
+            database.study_set = [
+                self._translate_study(metadata_submission_data, nmdc_study_id)
+            ]
         # Automatically populate the `env_package` field in the sample data based on which
         # environmental data tab the sample data came from.
@@ -778,15 +838,63 @@ class SubmissionPortalTranslator(Translator):
         )
         # Translate the sample data into nmdc:Biosample objects
-        database.biosample_set = [
-            self._translate_biosample(
-                sample_data,
-                nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
-                nmdc_study_id=nmdc_study_id,
-            )
-            for sample_data_id, sample_data in sample_data_by_id.items()
-            if sample_data
-        ]
+        database.biosample_set = []
+        for sample_data_id, sample_data in sample_data_by_id.items():
+            # This shouldn't happen, but just in case skip empty sample data
+            if not sample_data:
+                continue
+            # Find the first tab that has a sample_link value and attempt to parse it
+            sample_link = ""
+            for tab in sample_data:
+                if tab.get("sample_link"):
+                    sample_link = tab.get("sample_link")
+                    break
+            parsed_sample_link = self._parse_sample_link(sample_link)
+            # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
+            # format, then create a ProcessedSample and MaterialProcessing instance instead of a
+            # Biosample instance. The input samples must be present in the submission for this to
+            # work. An exception is raised if any of the referenced input samples are missing.
+            if parsed_sample_link is not None:
+                processing_type, processing_inputs = parsed_sample_link
+                if not all(
+                    input_id in sample_data_to_nmdc_biosample_ids
+                    for input_id in processing_inputs
+                ):
+                    raise ValueError(
+                        f"Could not find all input samples in sample_link '{sample_link}'"
+                    )
+                processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
+                database.processed_sample_set.append(
+                    nmdc.ProcessedSample(
+                        id=processed_sample_id,
+                        type="nmdc:ProcessedSample",
+                        name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
+                    )
+                )
+                processing_class = getattr(nmdc, processing_type)
+                material_processing = processing_class(
+                    id=self._id_minter(f"nmdc:{processing_type}")[0],
+                    type=f"nmdc:{processing_type}",
+                    has_input=[
+                        sample_data_to_nmdc_biosample_ids[input_id]
+                        for input_id in processing_inputs
+                    ],
+                    has_output=[processed_sample_id],
+                )
+                database.material_processing_set.append(material_processing)
+            # If there was no sample_link or it doesn't follow the expected format, create a
+            # Biosample instance as normal.
+            else:
+                biosample = self._translate_biosample(
+                    sample_data,
+                    nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
+                    nmdc_study_id=nmdc_study_id,
+                )
+                database.biosample_set.append(biosample)
         # This section handles the translation of information in the external sequencing tabs into
         # various NMDC objects.
@@ -972,3 +1080,42 @@ class SubmissionPortalTranslator(Translator):
                 database.data_generation_set.append(nucleotide_sequencing)
         return database
+    @staticmethod
+    def set_study_images(
+        nmdc_study: nmdc.Study,
+        pi_image_url: Optional[str],
+        primary_study_image_url: Optional[str],
+        study_images_url: Optional[list[str]],
+    ) -> None:
+        """Attach the PI profile image and the study images to a study, in place
+        Arguments that are None or empty are skipped.
+        :param nmdc_study: study to modify
+        :param pi_image_url: profile image URL for the principal investigator; a bare
+            PersonValue is created first if the study has no principal investigator
+        :param primary_study_image_url: URL appended to study_image with display_order 0
+        :param study_images_url: URLs appended to study_image with display_order 1, 2, ...
+        """
+        def as_image(url: str, order: int) -> nmdc.ImageValue:
+            # Wrap a URL and its display position in an ImageValue.
+            return nmdc.ImageValue(
+                type="nmdc:ImageValue",
+                url=url,
+                display_order=order,
+            )
+        if pi_image_url:
+            nmdc_study.principal_investigator = (
+                nmdc_study.principal_investigator
+                or nmdc.PersonValue(type="nmdc:PersonValue")
+            )
+            nmdc_study.principal_investigator.profile_image_url = pi_image_url
+        if primary_study_image_url:
+            nmdc_study.study_image = nmdc_study.study_image or []
+            nmdc_study.study_image.append(as_image(primary_study_image_url, 0))
+        if study_images_url:
+            nmdc_study.study_image = nmdc_study.study_image or []
+            for position, url in enumerate(study_images_url, start=1):
+                nmdc_study.study_image.append(as_image(url, position))

nmdc_runtime/site/translation/translator.py CHANGED Viewed

@@ -1,9 +1,14 @@
+import logging
+import re
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Optional
+from decimal import Decimal
+from typing import Any, Callable, Dict, List, Optional, Union
 from nmdc_schema import nmdc
 JSON_OBJECT = Dict[str, Any]
+logger = logging.getLogger(__name__)
 class Translator(ABC):
     def __init__(
@@ -27,3 +32,61 @@ class Translator(ABC):
     @abstractmethod
     def get_database(self) -> nmdc.Database:
         pass
+    def _parse_quantity_value(
+        self, raw_value: Optional[str], unit: Optional[str] = None
+    ) -> Union[nmdc.QuantityValue, None]:
+        """Construct a nmdc:QuantityValue from a raw value string
+        The regex pattern minimally matches on a single numeric value (possibly
+        floating point). The pattern can also identify a range represented by
+        two numeric values separated by a hyphen. It can also identify non-numeric
+        characters at the end of the string which are interpreted as a unit. A unit
+        may also be explicitly provided as an argument to this function. If parsing
+        identifies a unit and a unit argument is provided, the unit argument is used
+        and a warning is logged when the two disagree.
+        If the pattern is not matched at all None is returned.
+        :param raw_value: string to parse
+        :param unit: optional unit, defaults to None. If None, the unit is extracted from the
+            raw_value. If a unit is provided, it will override the unit extracted from the
+            raw_value.
+        :return: nmdc:QuantityValue, or None if raw_value is None or cannot be parsed
+        """
+        if raw_value is None:
+            return None
+        # Raw string literals keep the regex escapes intact; in a plain string literal they are
+        # invalid string escapes, which newer Python versions warn about at compile time.
+        # A single number: optional sign, digits and/or a decimal part, optional exponent.
+        number = r"[+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?"
+        # Group 1: first number. Group 2: optional second number of a range. Group 3: optional unit.
+        match = re.fullmatch(
+            rf"({number})(?: *- *({number}))?(?: *(\S+))?",
+            raw_value,
+        )
+        if not match:
+            return None
+        quantity_value_kwargs = {
+            "has_raw_value": raw_value,
+            "type": "nmdc:QuantityValue",
+        }
+        if match.group(2):
+            # having group 2 means the value is a range like "0 - 1". Either
+            # group 1 or group 2 might be the minimum especially when handling
+            # negative ranges like "0 - -1"
+            num_1 = Decimal(match.group(1))
+            num_2 = Decimal(match.group(2))
+            quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
+            quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
+        else:
+            # otherwise we just have a single numeric value
+            quantity_value_kwargs["has_numeric_value"] = Decimal(match.group(1))
+        if unit:
+            # a unit was manually specified
+            if match.group(3) and unit != match.group(3):
+                # a unit was also found in the raw string; issue a warning
+                # if they don't agree, but keep the manually specified one
+                logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
+            quantity_value_kwargs["has_unit"] = unit
+        elif match.group(3):
+            # a unit was found in the raw string
+            quantity_value_kwargs["has_unit"] = match.group(3)
+        return nmdc.QuantityValue(**quantity_value_kwargs)

nmdc_runtime/site/util.py CHANGED Viewed

@@ -3,10 +3,11 @@ import os
 from functools import lru_cache
 from pymongo.database import Database as MongoDatabase
 from subprocess import Popen, PIPE, STDOUT, CalledProcessError
-from toolz import groupby
+from refscan.lib.helpers import get_collection_names_from_schema
-from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
+from nmdc_runtime.util import nmdc_schema_view
 mode_test = {
     "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
 @lru_cache
 def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
+    """
+    Report, for each collection name obtained from the schema view, whether that collection
+    exists in `mdb` and has an index named "id_1".
+    The result maps each collection name to a boolean. Because of `lru_cache`, the result is
+    computed once per `mdb` argument and reused on later calls.
+    NOTE(review): "id_1" is presumably the default name of an ascending index on the `id`
+    field; an index on `id` created under a different name would not be detected. Confirm.
+    """
+    schema_view = nmdc_schema_view()
     present_collection_names = set(mdb.list_collection_names())
     return {
         name: (
             name in present_collection_names and "id_1" in mdb[name].index_information()
         )
-        for name in get_collection_names_from_schema()
+        for name in get_collection_names_from_schema(schema_view)
     }

nmdc_runtime/site/validation/util.py CHANGED Viewed

@@ -1,6 +1,5 @@
-from dagster import op, AssetMaterialization, AssetKey, EventMetadata
-from jsonschema import Draft7Validator
-from nmdc_runtime.util import get_nmdc_jsonschema_dict
+from dagster import op, AssetMaterialization, AssetKey, MetadataValue
+from nmdc_runtime.util import get_nmdc_schema_validator
 from toolz import dissoc
 from nmdc_runtime.site.resources import mongo_resource
@@ -61,19 +60,19 @@ def validate_mongo_collection(context, collection_name: str):
     collection = mongo_db[collection_name]  # get mongo collection
     db_set = collection_name.split(".")[0]
-    validator = Draft7Validator(get_nmdc_jsonschema_dict())
+    validator = get_nmdc_schema_validator()
     validation_errors = []
     for count, doc in enumerate(collection.find()):
         # add logging for progress?
         # e.g.: if count % 1000 == 0: context.log.info(“done X of Y")
         doc = dissoc(doc, "_id")  # dissoc _id
-        errors = list(validator.iter_errors({f"{db_set}": [doc]}))
-        if len(errors) > 0:
+        report = validator.validate({f"{db_set}": [doc]}, target_class="Database")
+        if len(report.results) > 0:
             if "id" in doc.keys():
-                errors = {doc["id"]: [e.message for e in errors]}
+                errors = {doc["id"]: [r.message for r in report.results]}
             else:
-                errors = {f"missing id ({count})": [e.message for e in errors]}
+                errors = {f"missing id ({count})": [r.message for r in report.results]}
             validation_errors.append(errors)
     return {"collection_name": collection_name, "errors": validation_errors}
@@ -92,10 +91,15 @@ def announce_validation_report(context, report, api_object):
         asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
         description=f"{collection_name} translation validation",
         metadata={
-            # https://docs.dagster.io/_apidocs/solids#event-metadata
-            # also .json, .md, .path, .url, .python_artifact, ...
-            "n_errors": EventMetadata.int(len(report["errors"])),
-            "object_id": EventMetadata.text(api_object["id"]),
+            # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
+            #       which has since been replaced by Dagster's `MetadataValue` class.
+            #
+            #       Reference:
+            #       - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
+            #       - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
+            #
+            "n_errors": MetadataValue.int(len(report["errors"])),
+            "object_id": MetadataValue.text(api_object["id"]),
         },
     )

nmdc_runtime/site/workspace.yaml ADDED Viewed

@@ -0,0 +1,13 @@
+load_from:
+  - python_package:
+      package_name: nmdc_runtime.site.repository
+      attribute: repo
+  - python_package:
+      package_name: nmdc_runtime.site.repository
+      attribute: biosample_submission_ingest
+  - python_package:
+      package_name: nmdc_runtime.site.repository
+      attribute: biosample_export
+  - python_package:
+      package_name: nmdc_runtime.site.repository
+      attribute: database_records_stitching

nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl

nmdc-runtime 2.6.0py3-none-any.whl → 2.12.0py3-none-any.whl