nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/repair/database_updater.py
CHANGED

@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 import pandas as pd
 from nmdc_runtime.site.resources import (
     RuntimeApiUserClient,
@@ -18,6 +18,8 @@ class DatabaseUpdater:
         gold_api_client: GoldApiClient,
         study_id: str,
         gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+        include_field_site_info: bool = False,
+        enable_biosample_filtering: bool = True,
     ):
         """This class serves as an API for repairing connections in the database by
         adding records that are essentially missing "links"/"connections". As we identify
@@ -39,6 +41,8 @@ class DatabaseUpdater:
         self.gold_api_client = gold_api_client
         self.study_id = study_id
         self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+        self.include_field_site_info = include_field_site_info
+        self.enable_biosample_filtering = enable_biosample_filtering

     @lru_cache
     def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
@@ -95,6 +99,8 @@ class DatabaseUpdater:
             biosamples=all_gold_biosamples,
             projects=all_gold_projects,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )

         # The GoldStudyTranslator class has some pre-processing logic which filters out
@@ -214,6 +220,8 @@ class DatabaseUpdater:
             projects=gold_sequencing_projects_for_study,
             analysis_projects=gold_analysis_projects_for_study,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+            include_field_site_info=self.include_field_site_info,
+            enable_biosample_filtering=self.enable_biosample_filtering,
         )

         translated_biosamples = gold_study_translator.biosamples
@@ -240,3 +248,204 @@ class DatabaseUpdater:
         ]

         return database
+
+    def queries_run_script_to_update_insdc_identifiers(
+        self,
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """This method creates a `/queries:run` API endpoint compatible update script that can be run
+        using that API endpoint to update/add information on the `insdc_biosample_identifiers` field
+        of biosample_set records and the `insdc_bioproject_identifiers` field on data_generation_set records.
+
+        The information to be asserted is retrieved from the `ncbiBioSampleAccession` and
+        `ncbiBioProjectAccession` fields on the GOLD `/projects` API endpoint.
+
+        :return: A `/queries:run` update query compatible script serialized as a dictionary/JSON.
+        """
+        # Fetch all biosamples associated with the study
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+
+        # Fetch all data_generation records associated with the study
+        data_generation_set = (
+            self.runtime_api_user_client.get_data_generation_records_for_study(
+                self.study_id
+            )
+        )
+
+        biosample_updates = []
+        data_generation_updates = []
+
+        # Dictionary to store gold_project_id -> ncbi_bioproject_accession mapping
+        gold_project_to_bioproject = {}
+
+        # Dictionary to store all project data we gather during biosample processing
+        all_processed_projects = {}
+
+        # Process biosamples for insdc_biosample_identifiers
+        for biosample in biosample_set:
+            # get the list (usually one) of GOLD biosample identifiers on the gold_biosample_identifiers slot
+            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers", [])
+            if not gold_biosample_identifiers:
+                continue
+
+            biosample_id = biosample.get("id")
+            if not biosample_id:
+                continue
+
+            insdc_biosample_identifiers = []
+
+            for gold_biosample_id in gold_biosample_identifiers:
+                normalized_id = gold_biosample_id.replace("gold:", "")
+
+                # fetch projects associated with a GOLD biosample from the GOLD `/projects` API endpoint
+                gold_projects = self.gold_api_client.fetch_projects_by_biosample(
+                    normalized_id
+                )
+
+                for project in gold_projects:
+                    # Store each project for later use
+                    project_gold_id = project.get("projectGoldId")
+                    if project_gold_id:
+                        all_processed_projects[project_gold_id] = project
+
+                    # Collect ncbi_biosample_accession for biosample updates
+                    ncbi_biosample_accession = project.get("ncbiBioSampleAccession")
+                    if ncbi_biosample_accession and ncbi_biosample_accession.strip():
+                        insdc_biosample_identifiers.append(ncbi_biosample_accession)
+
+                    # Collect ncbi_bioproject_accession for data_generation records
+                    ncbi_bioproject_accession = project.get("ncbiBioProjectAccession")
+                    if (
+                        project_gold_id
+                        and ncbi_bioproject_accession
+                        and ncbi_bioproject_accession.strip()
+                    ):
+                        gold_project_to_bioproject[project_gold_id] = (
+                            ncbi_bioproject_accession
+                        )
+
+            if insdc_biosample_identifiers:
+                existing_insdc_biosample_identifiers = biosample.get(
+                    "insdc_biosample_identifiers", []
+                )
+                new_insdc_biosample_identifiers = list(
+                    set(insdc_biosample_identifiers)
+                    - set(existing_insdc_biosample_identifiers)
+                )
+
+                if new_insdc_biosample_identifiers:
+                    prefixed_new_biosample_identifiers = [
+                        f"biosample:{id}" for id in new_insdc_biosample_identifiers
+                    ]
+
+                    if existing_insdc_biosample_identifiers:
+                        all_biosample_identifiers = list(
+                            set(
+                                existing_insdc_biosample_identifiers
+                                + prefixed_new_biosample_identifiers
+                            )
+                        )
+                        biosample_updates.append(
+                            {
+                                "q": {"id": biosample_id},
+                                "u": {
+                                    "$set": {
+                                        "insdc_biosample_identifiers": all_biosample_identifiers
+                                    }
+                                },
+                            }
+                        )
+                    else:
+                        biosample_updates.append(
+                            {
+                                "q": {"id": biosample_id},
+                                "u": {
+                                    "$set": {
+                                        "insdc_biosample_identifiers": prefixed_new_biosample_identifiers
+                                    }
+                                },
+                            }
+                        )
+
+        # Process data_generation records for insdc_bioproject_identifiers
+        for data_generation in data_generation_set:
+            data_generation_id = data_generation.get("id")
+            if not data_generation_id:
+                continue
+
+            # Extract existing insdc_bioproject_identifiers
+            existing_insdc_bioproject_identifiers = data_generation.get(
+                "insdc_bioproject_identifiers", []
+            )
+
+            collected_insdc_bioproject_identifiers = set()
+
+            # Add any project identifiers already on the record
+            if "insdc_bioproject_identifiers" in data_generation:
+                for identifier in data_generation["insdc_bioproject_identifiers"]:
+                    collected_insdc_bioproject_identifiers.add(identifier)
+
+            # If there are gold_sequencing_project_identifiers, use our pre-collected mapping
+            gold_project_identifiers = data_generation.get(
+                "gold_sequencing_project_identifiers", []
+            )
+            for gold_project_id in gold_project_identifiers:
+                normalized_id = gold_project_id.replace("gold:", "")
+
+                # Check if we have a bioproject ID for this GOLD project ID
+                if normalized_id in gold_project_to_bioproject:
+                    ncbi_bioproject_accession = gold_project_to_bioproject[
+                        normalized_id
+                    ]
+                    collected_insdc_bioproject_identifiers.add(
+                        f"bioproject:{ncbi_bioproject_accession}"
+                    )
+                else:
+                    # Only if we don't have it in our mapping, try to fetch it
+                    # Instead of making a direct API request, check if we've already seen this project
+                    if normalized_id in all_processed_projects:
+                        project_data = all_processed_projects[normalized_id]
+                        ncbi_bioproject_accession = project_data.get(
+                            "ncbiBioProjectAccession"
+                        )
+                        if (
+                            ncbi_bioproject_accession
+                            and ncbi_bioproject_accession.strip()
+                        ):
+                            collected_insdc_bioproject_identifiers.add(
+                                f"bioproject:{ncbi_bioproject_accession}"
+                            )
+                            # Add to our mapping for future reference
+                            gold_project_to_bioproject[normalized_id] = (
+                                ncbi_bioproject_accession
+                            )
+
+            # Create a list from the set of collected identifiers
+            collected_insdc_bioproject_identifiers = list(
+                collected_insdc_bioproject_identifiers
+            )
+
+            # Only update if there are identifiers to add
+            if collected_insdc_bioproject_identifiers and set(
+                collected_insdc_bioproject_identifiers
+            ) != set(existing_insdc_bioproject_identifiers):
+                data_generation_updates.append(
+                    {
+                        "q": {"id": data_generation_id},
+                        "u": {
+                            "$set": {
+                                "insdc_bioproject_identifiers": collected_insdc_bioproject_identifiers
+                            }
+                        },
+                    }
+                )

+        # Return updates for both collections
+        if data_generation_updates:
+            return [
+                {"update": "biosample_set", "updates": biosample_updates},
+                {"update": "data_generation_set", "updates": data_generation_updates},
+            ]
+        else:
+            return {"update": "biosample_set", "updates": biosample_updates}
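Example (not part of the diff): the docstring above says the generated script is meant to be run via the Runtime's `/queries:run` API endpoint. The sketch below shows one plausible way to submit it; the base URL, bearer-token auth, the helper name `submit_update_script`, and the assumption that the endpoint accepts one MongoDB-style update command per POST are illustrative and not confirmed by this diff.

# Minimal sketch, assuming a Runtime API host and bearer-token auth (both hypothetical here).
import requests

API_BASE = "https://api.example.org"  # hypothetical Runtime API base URL
TOKEN = "..."                         # hypothetical access token

def submit_update_script(script):
    """POST each update command produced by
    DatabaseUpdater.queries_run_script_to_update_insdc_identifiers()."""
    # The method returns either a single command dict or a list of them.
    commands = script if isinstance(script, list) else [script]
    for command in commands:
        # e.g. command == {"update": "biosample_set", "updates": [...]}
        resp = requests.post(
            f"{API_BASE}/queries:run",
            json=command,
            headers={"Authorization": f"Bearer {TOKEN}"},
        )
        resp.raise_for_status()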
nmdc_runtime/site/repository.py
CHANGED
@@ -1,6 +1,5 @@
 import json

-from typing import Any

 from dagster import (
     repository,
@@ -29,8 +28,6 @@ from nmdc_runtime.site.graphs import (
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
-    gold_translation,
-    gold_translation_curation,
     create_objects_from_site_object_puts,
     housekeeping,
     ensure_jobs,
@@ -44,8 +41,10 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_benthic_metadata,
     ingest_neon_surface_water_metadata,
     ensure_alldocs,
+    run_ontology_load,
     nmdc_study_to_ncbi_submission_export,
     generate_data_generation_set_for_biosamples_in_nmdc_study,
+    generate_update_script_for_insdc_biosample_identifiers,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -59,9 +58,6 @@ from nmdc_runtime.site.resources import (
 from nmdc_runtime.site.resources import (
     get_runtime_api_site_client,
 )
-from nmdc_runtime.site.translation.emsl import emsl_job, test_emsl_job
-from nmdc_runtime.site.translation.gold import gold_job, test_gold_job
-from nmdc_runtime.site.translation.jgi import jgi_job, test_jgi_job
 from nmdc_runtime.util import freeze
 from nmdc_runtime.util import unfreeze

@@ -123,6 +119,55 @@ ensure_alldocs_daily = ScheduleDefinition(
 )


+load_envo_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_envo_ontology",
+    cron_schedule="0 7 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_envo_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "envo"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+load_uberon_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_uberon_ontology",
+    cron_schedule="0 8 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_uberon_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "uberon"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+load_po_ontology_weekly = ScheduleDefinition(
+    name="weekly_load_po_ontology",
+    cron_schedule="0 9 * * 1",
+    execution_timezone="America/New_York",
+    job=run_ontology_load.to_job(
+        name="scheduled_po_ontology_load",
+        config=unfreeze(
+            merge(
+                run_config_frozen__normal_env,
+                {"ops": {"load_ontology": {"config": {"source_ontology": "po"}}}},
+            )
+        ),
+        resource_defs=resource_defs,
+    ),
+)
+
+
 def asset_materialization_metadata(asset_event, key):
     """Get metadata from an asset materialization event.

@@ -197,82 +242,6 @@ def process_workflow_job_triggers(_context):
     yield SkipReason("No new jobs required")


-@asset_sensor(
-    asset_key=AssetKey(["object", "nmdc_database.json.zip"]),
-    job=ensure_jobs.to_job(name="ensure_gold_translation", **preset_normal),
-)
-def ensure_gold_translation_job(_context, asset_event):
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    gold_etl_latest = mdb.objects.find_one(
-        {"name": "nmdc_database.json.zip"}, sort=[("created_time", -1)]
-    )
-    sensed_object_id = asset_materialization_metadata(asset_event, "object_id").text
-    if gold_etl_latest is None:
-        yield SkipReason("can't find sensed asset object_id in database")
-        return
-    elif gold_etl_latest["id"] != sensed_object_id:
-        yield SkipReason("later object than sensed materialization")
-        return
-
-    run_config = merge(
-        run_config_frozen__normal_env,
-        {
-            "solids": {
-                "construct_jobs": {
-                    "config": {
-                        "base_jobs": [
-                            {
-                                "workflow": {"id": "gold-translation-1.0.0"},
-                                "config": {"object_id": gold_etl_latest["id"]},
-                            }
-                        ]
-                    }
-                }
-            }
-        },
-    )
-    yield RunRequest(run_key=sensed_object_id, run_config=unfreeze(run_config))
-
-
-@asset_sensor(
-    asset_key=AssetKey(["job", "gold-translation-1.0.0"]),
-    job=gold_translation_curation.to_job(**preset_normal),
-)
-def claim_and_run_gold_translation_curation(_context, asset_event):
-    client = get_runtime_api_site_client(run_config_frozen__normal_env)
-    mdb = get_mongo(run_config_frozen__normal_env).db
-    object_id_latest = asset_materialization_metadata(
-        asset_event, "object_id_latest"
-    ).text
-    job = mdb.jobs.find_one(
-        {
-            "workflow.id": "gold-translation-1.0.0",
-            "config.object_id_latest": object_id_latest,
-        }
-    )
-    if job is not None:
-        rv = client.claim_job(job["id"])
-        if rv.status_code == status.HTTP_200_OK:
-            operation = rv.json()
-            run_config = merge(
-                run_config_frozen__normal_env,
-                {
-                    "ops": {
-                        "get_operation": {
-                            "config": {
-                                "operation_id": operation["id"],
-                            }
-                        }
-                    }
-                },
-            )
-            yield RunRequest(run_key=operation["id"], run_config=unfreeze(run_config))
-        else:
-            yield SkipReason("Job found, but already claimed by this site")
-    else:
-        yield SkipReason("No job found")
-
-
 @sensor(
     job=apply_metadata_in.to_job(name="apply_metadata_in_sensed", **preset_normal),
     default_status=DefaultSensorStatus.RUNNING,
@@ -411,11 +380,6 @@ def claim_and_run_apply_changesheet_jobs(_context):
         yield SkipReason("; ".join(skip_notes))


-# TODO ensure data_object_type values from file_type_enum
-# see /metadata-translation/notebooks/202106_curation_updates.ipynb
-# for details ("Create file_type_enum collection" section).
-
-
 @sensor(job=create_objects_from_site_object_puts.to_job(**preset_normal))
 def done_object_put_ops(_context):
     client = get_runtime_api_site_client(run_config_frozen__normal_env)
@@ -455,18 +419,21 @@ def on_run_fail(context: RunStatusSensorContext):
 @repository
 def repo():
     graph_jobs = [
-        gold_translation.to_job(**preset_normal),
         hello_graph.to_job(name="hello_job"),
         ensure_jobs.to_job(**preset_normal),
         apply_metadata_in.to_job(**preset_normal),
         export_study_biosamples_metadata.to_job(**preset_normal),
         ensure_alldocs.to_job(**preset_normal),
     ]
-    schedules = [
+    schedules = [
+        housekeeping_weekly,
+        ensure_alldocs_daily,
+        load_envo_ontology_weekly,
+        load_uberon_ontology_weekly,
+        load_po_ontology_weekly,
+    ]
     sensors = [
         done_object_put_ops,
-        ensure_gold_translation_job,
-        claim_and_run_gold_translation_curation,
         process_workflow_job_triggers,
         claim_and_run_apply_changesheet_jobs,
         claim_and_run_metadata_in_jobs,
@@ -476,20 +443,6 @@ def repo():
     return graph_jobs + schedules + sensors


-@repository
-def translation():
-    graph_jobs = [jgi_job, gold_job, emsl_job]
-
-    return graph_jobs
-
-
-@repository
-def test_translation():
-    graph_jobs = [test_jgi_job, test_gold_job, test_emsl_job]
-
-    return graph_jobs
-
-
 @repository
 def biosample_submission_ingest():
     normal_resources = run_config_frozen__normal_env["resources"]
@@ -516,6 +469,7 @@ def biosample_submission_ingest():
                             "study_type": "research_study",
                             "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
                             "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                         },
                     },
                    "export_json_to_drs": {"config": {"username": ""}},
@@ -548,6 +502,7 @@ def biosample_submission_ingest():
                            "data_object_mapping_file_url": None,
                            "biosample_extras_file_url": None,
                            "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                        }
                    },
                    "translate_portal_submission_to_nmdc_schema_database": {
@@ -584,6 +539,7 @@ def biosample_submission_ingest():
                            "data_object_mapping_file_url": None,
                            "biosample_extras_file_url": None,
                            "biosample_extras_slot_mapping_file_url": None,
+                            "study_id": None,
                        }
                    },
                    "translate_portal_submission_to_nmdc_schema_database": {
@@ -960,6 +916,8 @@ def database_records_stitching():
                        "config": {
                            "nmdc_study_id": "",
                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                        }
                    },
                    "export_json_to_drs": {"config": {"username": ""}},
@@ -1002,22 +960,55 @@ def database_records_stitching():
                        "config": {
                            "nmdc_study_id": "",
                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
                        }
                    },
                    "export_json_to_drs": {"config": {"username": ""}},
                },
            },
        ),
+        generate_update_script_for_insdc_biosample_identifiers.to_job(
+            description="This job generates a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                            "include_field_site_info": False,
+                            "enable_biosample_filtering": True,
+                        }
+                    },
+                },
+            },
+        ),
     ]
-
-
-# @repository
-# def validation():
-#     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]
-#     return graph_jobs
-#
-#
-# @repository
-# def test_validation():
-#     graph_jobs = [test_validate_jgi_job, test_validate_gold_job, test_validate_emsl_job]
-#     return graph_jobs
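Example (not part of the diff): the three weekly ontology-load schedules added above differ only in the ontology name and the hour of the staggered Monday cron expression, so the same pattern could be expressed with one helper. The sketch below assumes it lives in repository.py next to the definitions shown in this diff (run_ontology_load, resource_defs, run_config_frozen__normal_env, unfreeze, merge); the helper name make_ontology_schedule is hypothetical.

# Sketch only: factoring the repeated ScheduleDefinition blocks into a helper.
# ScheduleDefinition and to_job usage mirror the diff; merge is toolz.merge as used there.
from dagster import ScheduleDefinition
from toolz import merge

def make_ontology_schedule(source_ontology: str, hour: int) -> ScheduleDefinition:
    """Build a Monday-morning schedule that runs run_ontology_load for one ontology."""
    return ScheduleDefinition(
        name=f"weekly_load_{source_ontology}_ontology",
        cron_schedule=f"0 {hour} * * 1",  # Mondays at <hour>:00
        execution_timezone="America/New_York",
        job=run_ontology_load.to_job(
            name=f"scheduled_{source_ontology}_ontology_load",
            config=unfreeze(
                merge(
                    run_config_frozen__normal_env,
                    {"ops": {"load_ontology": {"config": {"source_ontology": source_ontology}}}},
                )
            ),
            resource_defs=resource_defs,
        ),
    )

# Equivalent to the three ScheduleDefinitions added in the diff:
ontology_schedules = [
    make_ontology_schedule("envo", 7),
    make_ontology_schedule("uberon", 8),
    make_ontology_schedule("po", 9),
]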