PyPI - nmdc-runtime - Versions diffs - 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl - Mend

nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131) hide show

nmdc_runtime/Dockerfile +167 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +208 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +788 -0
nmdc_runtime/api/core/util.py +109 -0
nmdc_runtime/api/db/mongo.py +435 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +143 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +270 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +796 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +425 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +37 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +140 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +7 -8
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +2 -0
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +1 -2
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +26 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +26 -0
nmdc_runtime/site/export/ncbi_xml.py +633 -13
nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
nmdc_runtime/site/graphs.py +8 -22
nmdc_runtime/site/ops.py +147 -181
nmdc_runtime/site/repository.py +2 -112
nmdc_runtime/site/resources.py +16 -3
nmdc_runtime/site/translation/gold_translator.py +4 -12
nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
nmdc_runtime/site/translation/translator.py +63 -1
nmdc_runtime/site/util.py +8 -3
nmdc_runtime/site/validation/util.py +10 -5
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +90 -48
nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/translation/emsl.py +0 -43
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -32
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -43
nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import json
-from typing import Any
 from dagster import (
     repository,
@@ -14,7 +13,6 @@ from dagster import (
     DagsterRunStatus,
     RunStatusSensorContext,
     DefaultSensorStatus,
-    in_process_executor,
 )
 from starlette import status
 from toolz import merge, get_in
@@ -30,8 +28,6 @@ from nmdc_runtime.site.graphs import (
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
-    gold_translation,
-    gold_translation_curation,
     create_objects_from_site_object_puts,
     housekeeping,
     ensure_jobs,
@@ -62,9 +58,6 @@ from nmdc_runtime.site.resources import (
 from nmdc_runtime.site.resources import (
     get_runtime_api_site_client,
 )
-from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
-from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
-from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
 from nmdc_runtime.util import freeze
 from nmdc_runtime.util import unfreeze
@@ -249,82 +242,6 @@ def process_workflow_job_triggers(_context):
         yield SkipReason("No new jobs required")
-@asset_sensor(
-    asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
-    job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
-)
-def ensure_gold_translation_job(_context, asset_event):
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    gold_etl_latest = mdb.objects.find_one(
-        {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
-    )
-    sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
-    if gold_etl_latest is None:
-        yield SkipReason("can't find sensed asset object_id in database")
-        return
-    elif gold_etl_latest["id"] != sensed_object_id:
-        yield SkipReason("later object than sensed materialization")
-        return
-    run_config = merge(
-        run_config_frozen__normal_env,
-        {
-            "solids": {
-                "construct_jobs": {
-                    "config": {
-                        "base_jobs": [
-                            {
-                                "workflow": {"id": "gold-translation-1.0.0"},
-                                "config": {"object_id": gold_etl_latest["id"]},
-                            }
-                        ]
-                    }
-                }
-            }
-        },
-    )
-    yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
-@asset_sensor(
-    asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
-    job=gold_translation_curation.to_job(**preset_normal),
-)
-def claim_and_run_gold_translation_curation(_context, asset_event):
-    client = get_runtime_api_site_client(run_config_frozen__normal_env)
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    object_id_latest = asset_materialization_metadata(
-        asset_event, "object_id_latest"
-    ).text
-    job = mdb.jobs.find_one(
-        {
-            "workflow.id": "gold-translation-1.0.0",
-            "config.object_id_latest": object_id_latest,
-        }
-    )
-    if job is not None:
-        rv = client.claim_job(job["id"])
-        if rv.status_code == status.HTTP_200_OK:
-            operation = rv.json()
-            run_config = merge(
-                run_config_frozen__normal_env,
-                {
-                    "ops": {
-                        "get_operation": {
-                            "config": {
-                                "operation_id": operation["id"],
-                            }
-                        }
-                    }
-                },
-            )
-            yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
-        else:
-            yield SkipReason("Job found, but already claimed by this site")
-    else:
-        yield SkipReason("No job found")
 @sensor(
     job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
     default_status=DefaultSensorStatus.RUNNING,
@@ -502,7 +419,6 @@ def on_run_fail(context: RunStatusSensorContext):
 @repository
 def repo():
     graph_jobs = [
-        gold_translation.to_job(**preset_normal),
         hello_graph.to_job(name="hello_job"),
         ensure_jobs.to_job(**preset_normal),
         apply_metadata_in.to_job(**preset_normal),
@@ -518,8 +434,6 @@ def repo():
     ]
     sensors = [
         done_object_put_ops,
-        ensure_gold_translation_job,
-        claim_and_run_gold_translation_curation,
         process_workflow_job_triggers,
         claim_and_run_apply_changesheet_jobs,
         claim_and_run_metadata_in_jobs,
@@ -529,20 +443,6 @@ def repo():
     return graph_jobs + schedules + sensors
-@repository
-def translation():
-    graph_jobs = [jgi_job, gold_job, emsl_job]
-    return graph_jobs
-@repository
-def test_translation():
-    graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
-    return graph_jobs
 @repository
 def biosample_submission_ingest():
     normal_resources = run_config_frozen__normal_env["resources"]
@@ -602,6 +502,7 @@ def biosample_submission_ingest():
                             "data_object_mapping_file_url": None,
                             "biosample_extras_file_url": None,
                             "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                         }
                     },
                     "translate_portal_submission_to_nmdc_schema_database": {
@@ -638,6 +539,7 @@ def biosample_submission_ingest():
                             "data_object_mapping_file_url": None,
                             "biosample_extras_file_url": None,
                             "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                         }
                     },
                     "translate_portal_submission_to_nmdc_schema_database": {
@@ -1110,15 +1012,3 @@ def database_records_stitching():
             },
         ),
     ]
-# @repository
-# def validation():
-#     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
-#     return graph_jobs
-#
-#
-# @repository
-# def test_validation():
-#     graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
-#     return graph_jobs

nmdc_runtime/site/resources.py CHANGED Viewed

@@ -520,11 +520,24 @@ class MongoDB:
         self.db = self.client[dbname]
     def add_docs(self, docs, validate=True, replace=True):
+        """
+        TODO: Document this function.
+        """
         try:
             if validate:
                 nmdc_jsonschema_validator_noidpatterns(docs)
             rv = {}
-            for collection_name, docs in docs.items():
+            for collection_name, collection_docs in docs.items():
+                # If `collection_docs` is empty, abort this iteration.
+                #
+                # Note: We do this because the `bulk_write` method called below will raise
+                #       an `InvalidOperation` exception if it is passed 0 operations.
+                #
+                # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
+                #
+                if len(collection_docs) == 0:
+                    continue
                 rv[collection_name] = self.db[collection_name].bulk_write(
                     [
                         (
@@ -532,7 +545,7 @@ class MongoDB:
                             if replace
                             else InsertOne(d)
                         )
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
                 now = datetime.now(timezone.utc)
@@ -544,7 +557,7 @@ class MongoDB:
                             "ts": now,
                             # "dtl": {},
                         }
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
             return rv

nmdc_runtime/site/translation/gold_translator.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import collections
-import csv
 import re
 from typing import List, Tuple, Union
 from nmdc_schema import nmdc
@@ -342,12 +341,7 @@ class GoldStudyTranslator(Translator):
         if field_value is None:
             return None
-        return nmdc.QuantityValue(
-            has_raw_value=field_value,
-            has_numeric_value=nmdc.Double(field_value),
-            has_unit=unit,
-            type="nmdc:QuantityValue",
-        )
+        return self._parse_quantity_value(str(field_value), unit)
     def _get_text_value(
         self, gold_entity: JSON_OBJECT, gold_field: str
@@ -573,13 +567,11 @@ class GoldStudyTranslator(Translator):
         gold_biosample_id = gold_biosample["biosampleGoldId"]
         return nmdc.Biosample(
             add_date=gold_biosample.get("addDate"),
-            alt=self._get_quantity_value(
-                gold_biosample, "altitudeInMeters", unit="meters"
-            ),
+            alt=self._get_quantity_value(gold_biosample, "altitudeInMeters", unit="m"),
             collected_from=nmdc_field_site_id,
             collection_date=self._get_collection_date(gold_biosample),
             depth=self._get_quantity_value(
-                gold_biosample, ("depthInMeters", "depthInMeters2"), unit="meters"
+                gold_biosample, ("depthInMeters", "depthInMeters2"), unit="m"
             ),
             description=gold_biosample.get("description"),
             diss_oxygen=self._get_quantity_value(gold_biosample, "oxygenConcentration"),
@@ -618,7 +610,7 @@ class GoldStudyTranslator(Translator):
             ),
             specific_ecosystem=gold_biosample.get("specificEcosystem"),
             subsurface_depth=self._get_quantity_value(
-                gold_biosample, "subsurfaceDepthInMeters", unit="meters"
+                gold_biosample, "subsurfaceDepthInMeters", unit="m"
             ),
             temp=self._get_quantity_value(
                 gold_biosample, "sampleCollectionTemperature"

nmdc_runtime/site/translation/neon_benthic_translator.py CHANGED Viewed

@@ -11,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
 from nmdc_runtime.site.translation.neon_utils import (
     _get_value_or_none,
     _create_controlled_identified_term_value,
-    _create_controlled_term_value,
     _create_geolocation_value,
     _create_quantity_value,
     _create_timestamp_value,

nmdc_runtime/site/translation/neon_soil_translator.py CHANGED Viewed

@@ -10,7 +10,6 @@ from nmdc_runtime.site.util import get_basename
 from nmdc_runtime.site.translation.neon_utils import (
     _get_value_or_none,
     _create_controlled_identified_term_value,
-    _create_controlled_term_value,
     _create_geolocation_value,
     _create_quantity_value,
     _create_timestamp_value,
@@ -153,7 +152,7 @@ class NeonSoilDataTranslator(Translator):
             collection_date=_create_timestamp_value(
                 biosample_row["collectDate"].values[0]
             ),
-            temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Celsius"),
+            temp=_create_quantity_value(biosample_row["soilTemp"].values[0], "Cel"),
             depth=nmdc.QuantityValue(
                 has_minimum_numeric_value=_get_value_or_none(
                     biosample_row, "sampleTopDepth"
@@ -169,13 +168,13 @@ class NeonSoilDataTranslator(Translator):
             analysis_type=_get_value_or_none(biosample_row, "sequenceAnalysisType"),
             env_package=_create_text_value(biosample_row["sampleType"].values[0]),
             nitro=_create_quantity_value(
-                biosample_row["nitrogenPercent"].values[0], "percent"
+                biosample_row["nitrogenPercent"].values[0], "%"
             ),
             org_carb=_create_quantity_value(
-                biosample_row["organicCPercent"].values[0], "percent"
+                biosample_row["organicCPercent"].values[0], "%"
             ),
             carb_nitro_ratio=_create_quantity_value(
-                biosample_row["CNratio"].values[0], None
+                biosample_row["CNratio"].values[0], "ratio"
             ),
             ph=_create_double_value(biosample_row["soilInWaterpH"].values[0]),
             water_content=(

nmdc_runtime/site/translation/neon_surface_water_translator.py CHANGED Viewed

@@ -3,7 +3,6 @@ import sqlite3
 from typing import Dict, Optional, Union
 import pandas as pd
-import requests
 import requests_cache
 from nmdc_schema import nmdc
@@ -12,7 +11,6 @@ from nmdc_runtime.site.util import get_basename
 from nmdc_runtime.site.translation.neon_utils import (
     _get_value_or_none,
     _create_controlled_identified_term_value,
-    _create_controlled_term_value,
     _create_geolocation_value,
     _create_quantity_value,
     _create_timestamp_value,

nmdc_runtime/site/translation/submission_portal_translator.py CHANGED Viewed

@@ -145,6 +145,7 @@ class SubmissionPortalTranslator(Translator):
         # See: https://github.com/microbiomedata/submission-schema/issues/162
         study_category: Optional[str] = None,
         study_pi_image_url: Optional[str] = None,
+        study_id: Optional[str] = None,
         # Additional biosample-level metadata with optional column mapping information not captured
         # by the submission portal currently.
         # See: https://github.com/microbiomedata/submission-schema/issues/162
@@ -165,6 +166,7 @@ class SubmissionPortalTranslator(Translator):
             nmdc.StudyCategoryEnum(study_category) if study_category else None
         )
         self.study_pi_image_url = study_pi_image_url
+        self.study_id = study_id
         self.biosample_extras = group_dicts_by_key(
             BIOSAMPLE_UNIQUE_KEY_SLOT, biosample_extras
@@ -174,6 +176,13 @@ class SubmissionPortalTranslator(Translator):
         )
         self.schema_view: SchemaView = _get_schema_view()
+        self._material_processing_subclass_names = []
+        for class_name in self.schema_view.class_descendants(
+            "MaterialProcessing", reflexive=False
+        ):
+            class_def = self.schema_view.get_class(class_name)
+            if not class_def.abstract:
+                self._material_processing_subclass_names.append(class_name)
     def _get_pi(
         self, metadata_submission: JSON_OBJECT
@@ -278,61 +287,9 @@ class SubmissionPortalTranslator(Translator):
     def _get_quantity_value(
         self, raw_value: Optional[str], unit: Optional[str] = None
     ) -> Union[nmdc.QuantityValue, None]:
-        """Construct a nmdc:QuantityValue from a raw value string
+        """Construct a nmdc:QuantityValue from a raw value string"""
-        The regex pattern minimally matches on a single numeric value (possibly
-        floating point). The pattern can also identify a range represented by
-        two numeric values separated by a hyphen. It can also identify non-numeric
-        characters at the end of the string which are interpreted as a unit. A unit
-        may also be explicitly provided as an argument to this function. If parsing
-        identifies a unit and a unit argument is provided, the unit argument is used.
-        If the pattern is not matched at all None is returned.
-        TODO: currently the parsed unit string is used as-is. In the future we may want
-        to be stricter about what we accept or coerce into a controlled value set
-        :param raw_value: string to parse
-        :param unit: optional unit, defaults to None
-        :return: nmdc:QuantityValue
-        """
-        if raw_value is None:
-            return None
-        match = re.fullmatch(
-            "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
-            raw_value,
-        )
-        if not match:
-            return None
-        qv = nmdc.QuantityValue(
-            has_raw_value=raw_value,
-            type="nmdc:QuantityValue",
-        )
-        if match.group(2):
-            # having group 2 means the value is a range like "0 - 1". Either
-            # group 1 or group 2 might be the minimum especially when handling
-            # negative ranges like "0 - -1"
-            num_1 = float(match.group(1))
-            num_2 = float(match.group(2))
-            qv.has_minimum_numeric_value = min(num_1, num_2)
-            qv.has_maximum_numeric_value = max(num_1, num_2)
-        else:
-            # otherwise we just have a single numeric value
-            qv.has_numeric_value = float(match.group(1))
-        if unit:
-            # a unit was manually specified
-            if match.group(3) and unit != match.group(3):
-                # a unit was also found in the raw string; issue a warning
-                # if they don't agree, but keep the manually specified one
-                logging.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
-            qv.has_unit = unit
-        elif match.group(3):
-            # a unit was found in the raw string
-            qv.has_unit = match.group(3)
-        return qv
+        return self._parse_quantity_value(raw_value, unit)
     def _get_ontology_class(
         self, raw_value: Optional[str]
@@ -594,6 +551,14 @@ class SubmissionPortalTranslator(Translator):
         return data_objects, manifest
+    def _parse_sample_link(self, sample_link: str) -> tuple[str, list[str]] | None:
+        """Parse a sample link in the form of `ProcessingName:SampleName,..."""
+        pattern = r"(" + "|".join(self._material_processing_subclass_names) + r"):(.+)"
+        match = re.match(pattern, sample_link)
+        if not match:
+            return None
+        return match.group(1), split_strip(match.group(2), ",")
     def _translate_study(
         self, metadata_submission: JSON_OBJECT, nmdc_study_id: str
     ) -> nmdc.Study:
@@ -804,11 +769,14 @@ class SubmissionPortalTranslator(Translator):
             "metadata_submission", {}
         )
-        # Generate one Study instance based on the metadata submission
-        nmdc_study_id = self._id_minter("nmdc:Study")[0]
-        database.study_set = [
-            self._translate_study(metadata_submission_data, nmdc_study_id)
-        ]
+        # Generate one Study instance based on the metadata submission, if a study_id wasn't provided
+        if self.study_id:
+            nmdc_study_id = self.study_id
+        else:
+            nmdc_study_id = self._id_minter("nmdc:Study")[0]
+            database.study_set = [
+                self._translate_study(metadata_submission_data, nmdc_study_id)
+            ]
         # Automatically populate the `env_package` field in the sample data based on which
         # environmental data tab the sample data came from.
@@ -840,15 +808,63 @@ class SubmissionPortalTranslator(Translator):
         )
         # Translate the sample data into nmdc:Biosample objects
-        database.biosample_set = [
-            self._translate_biosample(
-                sample_data,
-                nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
-                nmdc_study_id=nmdc_study_id,
-            )
-            for sample_data_id, sample_data in sample_data_by_id.items()
-            if sample_data
-        ]
+        database.biosample_set = []
+        for sample_data_id, sample_data in sample_data_by_id.items():
+            # This shouldn't happen, but just in case skip empty sample data
+            if not sample_data:
+                continue
+            # Find the first tab that has a sample_link value and attempt to parse it
+            sample_link = ""
+            for tab in sample_data:
+                if tab.get("sample_link"):
+                    sample_link = tab.get("sample_link")
+                    break
+            parsed_sample_link = self._parse_sample_link(sample_link)
+            # If the sample_link could be parsed according to the [ProcessName]:[InputSample,...]
+            # format, then create a ProcessedSample and MaterialProcessing instance instead of a
+            # Biosample instance. The input samples must be present in the submission for this to
+            # work. An exception is raised if any of the referenced input samples are missing.
+            if parsed_sample_link is not None:
+                processing_type, processing_inputs = parsed_sample_link
+                if not all(
+                    input_id in sample_data_to_nmdc_biosample_ids
+                    for input_id in processing_inputs
+                ):
+                    raise ValueError(
+                        f"Could not find all input samples in sample_link '{sample_link}'"
+                    )
+                processed_sample_id = self._id_minter("nmdc:ProcessedSample")[0]
+                database.processed_sample_set.append(
+                    nmdc.ProcessedSample(
+                        id=processed_sample_id,
+                        type="nmdc:ProcessedSample",
+                        name=sample_data[0].get(BIOSAMPLE_UNIQUE_KEY_SLOT, "").strip(),
+                    )
+                )
+                processing_class = getattr(nmdc, processing_type)
+                material_processing = processing_class(
+                    id=self._id_minter(f"nmdc:{processing_type}")[0],
+                    type=f"nmdc:{processing_type}",
+                    has_input=[
+                        sample_data_to_nmdc_biosample_ids[input_id]
+                        for input_id in processing_inputs
+                    ],
+                    has_output=[processed_sample_id],
+                )
+                database.material_processing_set.append(material_processing)
+            # If there was no sample_link or it doesn't follow the expected format, create a
+            # Biosample instance as normal.
+            else:
+                biosample = self._translate_biosample(
+                    sample_data,
+                    nmdc_biosample_id=sample_data_to_nmdc_biosample_ids[sample_data_id],
+                    nmdc_study_id=nmdc_study_id,
+                )
+                database.biosample_set.append(biosample)
         # This section handles the translation of information in the external sequencing tabs into
         # various NMDC objects.

nmdc_runtime/site/translation/translator.py CHANGED Viewed

@@ -1,9 +1,13 @@
+import logging
+import re
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 from nmdc_schema import nmdc
 JSON_OBJECT = Dict[str, Any]
+logger = logging.getLogger(__name__)
 class Translator(ABC):
     def __init__(
@@ -27,3 +31,61 @@ class Translator(ABC):
     @abstractmethod
     def get_database(self) -> nmdc.Database:
         pass
+    def _parse_quantity_value(
+        self, raw_value: Optional[str], unit: Optional[str] = None
+    ) -> Union[nmdc.QuantityValue, None]:
+        """Construct a nmdc:QuantityValue from a raw value string
+        The regex pattern minimally matches on a single numeric value (possibly
+        floating point). The pattern can also identify a range represented by
+        two numeric values separated by a hyphen. It can also identify non-numeric
+        characters at the end of the string which are interpreted as a unit. A unit
+        may also be explicitly provided as an argument to this function. If parsing
+        identifies a unit and a unit argument is provided, the unit argument is used.
+        If the pattern is not matched at all None is returned.
+        :param raw_value: string to parse
+        :param unit: optional unit, defaults to None. If None, the unit is extracted from the
+            raw_value. If a unit is provided, it will override the unit extracted from the
+            raw_value.
+        :return: nmdc:QuantityValue
+        """
+        if raw_value is None:
+            return None
+        match = re.fullmatch(
+            "([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?)(?: *- *([+-]?(?=\.\d|\d)(?:\d+)?(?:\.?\d*)(?:[eE][+-]?\d+)?))?(?: *(\S+))?",
+            raw_value,
+        )
+        if not match:
+            return None
+        quantity_value_kwargs = {
+            "has_raw_value": raw_value,
+            "type": "nmdc:QuantityValue",
+        }
+        if match.group(2):
+            # having group 2 means the value is a range like "0 - 1". Either
+            # group 1 or group 2 might be the minimum especially when handling
+            # negative ranges like "0 - -1"
+            num_1 = float(match.group(1))
+            num_2 = float(match.group(2))
+            quantity_value_kwargs["has_minimum_numeric_value"] = min(num_1, num_2)
+            quantity_value_kwargs["has_maximum_numeric_value"] = max(num_1, num_2)
+        else:
+            # otherwise we just have a single numeric value
+            quantity_value_kwargs["has_numeric_value"] = float(match.group(1))
+        if unit:
+            # a unit was manually specified
+            if match.group(3) and unit != match.group(3):
+                # a unit was also found in the raw string; issue a warning
+                # if they don't agree, but keep the manually specified one
+                logger.warning(f'Unit mismatch: "{unit}" and "{match.group(3)}"')
+            quantity_value_kwargs["has_unit"] = unit
+        elif match.group(3):
+            # a unit was found in the raw string
+            quantity_value_kwargs["has_unit"] = match.group(3)
+        return nmdc.QuantityValue(**quantity_value_kwargs)

nmdc_runtime/site/util.py CHANGED Viewed

@@ -3,10 +3,11 @@ import os
 from functools import lru_cache
 from pymongo.database import Database as MongoDatabase
 from subprocess import Popen, PIPE, STDOUT, CalledProcessError
-from toolz import groupby
+from refscan.lib.helpers import get_collection_names_from_schema
-from nmdc_runtime.api.db.mongo import get_collection_names_from_schema
 from nmdc_runtime.site.resources import mongo_resource
+from nmdc_runtime.util import nmdc_schema_view
 mode_test = {
     "resource_defs": {"mongo": mongo_resource}
@@ -37,12 +38,16 @@ def run_and_log(shell_cmd, context):
 @lru_cache
 def schema_collection_has_index_on_id(mdb: MongoDatabase) -> dict:
+    """
+    TODO: Document this function.
+    """
+    schema_view = nmdc_schema_view()
     present_collection_names = set(mdb.list_collection_names())
     return {
         name: (
             name in present_collection_names and "id_1" in mdb[name].index_information()
         )
-        for name in get_collection_names_from_schema()
+        for name in get_collection_names_from_schema(schema_view)
     }

nmdc_runtime/site/validation/util.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from dagster import op, AssetMaterialization, AssetKey, EventMetadata
+from dagster import op, AssetMaterialization, AssetKey, MetadataValue
 from jsonschema import Draft7Validator
 from nmdc_runtime.util import get_nmdc_jsonschema_dict
 from toolz import dissoc
@@ -92,10 +92,15 @@ def announce_validation_report(context, report, api_object):
         asset_key=AssetKey(["validation", f"{collection_name}_validation"]),
         description=f"{collection_name} translation validation",
         metadata={
-            # https://docs.dagster.io/_apidocs/solids#event-metadata
-            # also .json, .md, .path, .url, .python_artifact, ...
-            "n_errors": EventMetadata.int(len(report["errors"])),
-            "object_id": EventMetadata.text(api_object["id"]),
+            # Note: When this code was originally written, it used Dagster's `EventMetadata` class,
+            #       which has since been replaced by Dagster's `MetadataValue` class.
+            #
+            #       Reference:
+            #       - https://docs.dagster.io/api/dagster/ops#dagster.MetadataValue
+            #       - https://docs.dagster.io/api/dagster/metadata#dagster.MetadataValue
+            #
+            "n_errors": MetadataValue.int(len(report["errors"])),
+            "object_id": MetadataValue.text(api_object["id"]),
         },
     )

nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl