PyPI - nmdc-runtime - Versions diffs - 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl - Mend

nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (19) hide show

nmdc_runtime/minter/config.py +18 -50
nmdc_runtime/site/export/ncbi_xml.py +23 -2
nmdc_runtime/site/export/ncbi_xml_utils.py +81 -30
nmdc_runtime/site/graphs.py +39 -0
nmdc_runtime/site/ops.py +131 -31
nmdc_runtime/site/repair/__init__.py +0 -0
nmdc_runtime/site/repair/database_updater.py +230 -0
nmdc_runtime/site/repository.py +109 -9
nmdc_runtime/site/resources.py +36 -5
nmdc_runtime/site/translation/gold_translator.py +26 -4
nmdc_runtime/site/translation/neon_surface_water_translator.py +128 -78
nmdc_runtime/site/util.py +7 -2
nmdc_runtime/util.py +143 -1
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA +11 -3
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD +19 -17
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL +1 -1
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE +0 -0
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt +0 -0
{nmdc_runtime-2.2.1.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt +0 -0

nmdc_runtime/site/repair/database_updater.py ADDED Viewed

@@ -0,0 +1,230 @@
+from functools import lru_cache
+from typing import Any, Dict, List
+import pandas as pd
+from nmdc_runtime.site.resources import (
+    RuntimeApiUserClient,
+    RuntimeApiSiteClient,
+    GoldApiClient,
+)
+from nmdc_runtime.site.translation.gold_translator import GoldStudyTranslator
+from nmdc_schema import nmdc
+class DatabaseUpdater:
+    def __init__(
+        self,
+        runtime_api_user_client: RuntimeApiUserClient,
+        runtime_api_site_client: RuntimeApiSiteClient,
+        gold_api_client: GoldApiClient,
+        study_id: str,
+        gold_nmdc_instrument_map_df: pd.DataFrame = pd.DataFrame(),
+    ):
+        """This class serves as an API for repairing connections in the database by
+        adding records that are essentially missing "links"/"connections". As we identify
+        common use cases for adding missing records to the database, we can
+        add helper methods to this class.
+        :param runtime_api_user_client: An object of RuntimeApiUserClient which can be
+        used to retrieve instance records from the NMDC database.
+        :param runtime_api_site_client: An object of RuntimeApiSiteClient which can be
+        used to mint new IDs for the repaired records that need to be added into the NMDC database.
+        :param gold_api_client: An object of GoldApiClient which can be used to retrieve
+        records from GOLD via the GOLD API.
+        :param study_id: NMDC study ID for which the missing records need to be added.
+        :param gold_nmdc_instrument_map_df: A dataframe originally stored as a TSV mapping file in the
+        NMDC schema repo, which maps GOLD instrument IDs to IDs of NMDC instrument_set records.
+        """
+        self.runtime_api_user_client = runtime_api_user_client
+        self.runtime_api_site_client = runtime_api_site_client
+        self.gold_api_client = gold_api_client
+        self.study_id = study_id
+        self.gold_nmdc_instrument_map_df = gold_nmdc_instrument_map_df
+    @lru_cache
+    def _fetch_gold_biosample(self, gold_biosample_id: str) -> List[Dict[str, Any]]:
+        """Fetch response from GOLD /biosamples API for a given biosample id.
+        :param gold_biosample_id: GOLD biosample ID.
+        :return: Dictionary containing the response from the GOLD /biosamples API.
+        """
+        return self.gold_api_client.fetch_biosample_by_biosample_id(gold_biosample_id)
+    @lru_cache
+    def _fetch_gold_projects(self, gold_biosample_id: str):
+        """Fetch response from GOLD /projects API for a given biosample id.
+        :param gold_biosample_id: GOLD biosample ID
+        :return: Dictionary containing the response from the GOLD /projects API.
+        """
+        return self.gold_api_client.fetch_projects_by_biosample(gold_biosample_id)
+    def generate_data_generation_set_records_from_gold_api_for_study(
+        self,
+    ) -> nmdc.Database:
+        """This method creates missing data generation records for a given study in the NMDC database using
+        metadata from GOLD. The way the logic works is, it first fetches all the biosamples associated
+        with the study from the NMDC database. Then, it fetches all the biosample and project data data
+        associated with the individual biosamples from the GOLD API using the NMDC-GOLD biosample id
+        mappings on the "gold_biosample_identifiers" key/slot. We use the GoldStudyTranslator class
+        to mint the required number of `nmdc:DataGeneration` (`nmdc:NucleotideSequencing`) records based
+        on the number of GOLD sequencing projects, and then reimplement only the part of logic from that
+        class which is responsible for making data_generation_set records.
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+        all_gold_biosamples = []
+        all_gold_projects = []
+        for biosample in biosample_set:
+            gold_biosample_identifiers = biosample.get("gold_biosample_identifiers")
+            if gold_biosample_identifiers:
+                for gold_biosample_id in gold_biosample_identifiers:
+                    gold_biosample = self._fetch_gold_biosample(gold_biosample_id)[0]
+                    gold_projects = self._fetch_gold_projects(gold_biosample_id)
+                    gold_biosample["projects"] = gold_projects
+                    all_gold_biosamples.append(gold_biosample)
+                    all_gold_projects.extend(gold_projects)
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=all_gold_biosamples,
+            projects=all_gold_projects,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+        # The GoldStudyTranslator class has some pre-processing logic which filters out
+        # invalid biosamples and projects (based on `sequencingStrategy`, `projectStatus`, etc.)
+        filtered_biosamples = gold_study_translator.biosamples
+        filtered_projects = gold_study_translator.projects
+        gold_project_ids = [project["projectGoldId"] for project in filtered_projects]
+        nmdc_nucleotide_sequencing_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:NucleotideSequencing", len(gold_project_ids)
+        ).json()
+        gold_project_to_nmdc_nucleotide_sequencing_ids = dict(
+            zip(gold_project_ids, nmdc_nucleotide_sequencing_ids)
+        )
+        gold_to_nmdc_biosample_ids = {}
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                gold_id_stripped = gold_id.replace("gold:", "")
+                gold_to_nmdc_biosample_ids[gold_id_stripped] = biosample["id"]
+        database.data_generation_set = []
+        # Similar to the logic in GoldStudyTranslator, the number of nmdc:NucleotideSequencing records
+        # created is based on the number of GOLD sequencing projects
+        for project in filtered_projects:
+            # map the projectGoldId to the NMDC biosample ID
+            biosample_gold_id = next(
+                (
+                    biosample["biosampleGoldId"]
+                    for biosample in filtered_biosamples
+                    if any(
+                        p["projectGoldId"] == project["projectGoldId"]
+                        for p in biosample.get("projects", [])
+                    )
+                ),
+                None,
+            )
+            if biosample_gold_id:
+                nmdc_biosample_id = gold_to_nmdc_biosample_ids.get(biosample_gold_id)
+                if nmdc_biosample_id:
+                    database.data_generation_set.append(
+                        gold_study_translator._translate_nucleotide_sequencing(
+                            project,
+                            nmdc_nucleotide_sequencing_id=gold_project_to_nmdc_nucleotide_sequencing_ids[
+                                project["projectGoldId"]
+                            ],
+                            nmdc_biosample_id=nmdc_biosample_id,
+                            nmdc_study_id=self.study_id,
+                        )
+                    )
+        return database
+    def generate_biosample_set_from_gold_api_for_study(self) -> nmdc.Database:
+        """This method creates biosample_set records for a given study in the NMDC database using
+        metadata from GOLD. The logic works by first fetching the biosampleGoldId values of all
+        biosamples associated with the study. Then, it fetches the list of all biosamples associated
+        with the GOLD study using the GOLD API. There's pre-processing logic in the GoldStudyTranslator
+        to filter out biosamples based on `sequencingStrategy` and `projectStatus`. On this list of
+        filtered biosamples, we compute a "set difference" (conceptually) between the list of
+        filtered samples and ones that are already in the NMDC database, i.e., we ignore biosamples
+        that are already present in the database, and continue on to create biosample_set records for
+        those that do not have records in the database already.
+        :return: An instance of `nmdc:Database` object which is JSON-ified and rendered on the frontend.
+        """
+        database = nmdc.Database()
+        # get a list of all biosamples associated with a given NMDC study id
+        biosample_set = self.runtime_api_user_client.get_biosamples_for_study(
+            self.study_id
+        )
+        # get a list of GOLD biosample ids (`biosampleGoldId` values) by iterating
+        # over all the biosample_set records retrieved using the above logic
+        nmdc_gold_ids = set()
+        for biosample in biosample_set:
+            gold_ids = biosample.get("gold_biosample_identifiers", [])
+            for gold_id in gold_ids:
+                nmdc_gold_ids.add(gold_id.replace("gold:", ""))
+        # retrieve GOLD study id by looking at the `gold_study_identifiers` key/slot
+        # on the NMDC study record
+        nmdc_study = self.runtime_api_user_client.get_study(self.study_id)[0]
+        gold_study_id = nmdc_study.get("gold_study_identifiers", [])[0].replace(
+            "gold:", ""
+        )
+        # use the GOLD study id to fetch all biosample records associated with the study
+        gold_biosamples_for_study = self.gold_api_client.fetch_biosamples_by_study(
+            gold_study_id
+        )
+        # part of the code where we are (conceptually) computing a set difference between
+        # the list of filtered samples and ones that are already in the NMDC database
+        missing_gold_biosamples = [
+            gbs
+            for gbs in gold_biosamples_for_study
+            if gbs.get("biosampleGoldId") not in nmdc_gold_ids
+        ]
+        gold_study_translator = GoldStudyTranslator(
+            biosamples=missing_gold_biosamples,
+            gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
+        )
+        translated_biosamples = gold_study_translator.biosamples
+        # mint new NMDC biosample IDs for the "missing" biosamples
+        gold_biosample_ids = [
+            biosample["biosampleGoldId"] for biosample in translated_biosamples
+        ]
+        nmdc_biosample_ids = self.runtime_api_site_client.mint_id(
+            "nmdc:Biosample", len(translated_biosamples)
+        ).json()
+        gold_to_nmdc_biosample_ids = dict(zip(gold_biosample_ids, nmdc_biosample_ids))
+        database.biosample_set = [
+            gold_study_translator._translate_biosample(
+                biosample,
+                nmdc_biosample_id=gold_to_nmdc_biosample_ids[
+                    biosample["biosampleGoldId"]
+                ],
+                nmdc_study_id=self.study_id,
+                nmdc_field_site_id=None,
+            )
+            for biosample in translated_biosamples
+        ]
+        return database

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -25,6 +25,7 @@ from nmdc_runtime.api.models.run import _add_run_fail_event
 from nmdc_runtime.api.models.trigger import Trigger
 from nmdc_runtime.site.export.study_metadata import export_study_biosamples_metadata
 from nmdc_runtime.site.graphs import (
+    generate_biosample_set_from_samples_in_gold,
     translate_metadata_submission_to_nmdc_schema_database,
     ingest_metadata_submission,
     gold_study_to_database,
@@ -44,6 +45,7 @@ from nmdc_runtime.site.graphs import (
     ingest_neon_surface_water_metadata,
     ensure_alldocs,
     nmdc_study_to_ncbi_submission_export,
+    generate_data_generation_set_for_biosamples_in_nmdc_study,
 )
 from nmdc_runtime.site.resources import (
     get_mongo,
@@ -113,6 +115,13 @@ housekeeping_weekly = ScheduleDefinition(
     job=housekeeping.to_job(**preset_normal),
 )
+# Schedule the `ensure_alldocs` job to run every day at 03:00 (America/New_York).
+ensure_alldocs_daily = ScheduleDefinition(
+    name="daily_ensure_alldocs",
+    cron_schedule="0 3 * * *",
+    execution_timezone="America/New_York",
+    job=ensure_alldocs.to_job(**preset_normal),
+)
 def asset_materialization_metadata(asset_event, key):
     """Get metadata from an asset materialization event.
@@ -453,7 +462,7 @@ def repo():
         export_study_biosamples_metadata.to_job(**preset_normal),
         ensure_alldocs.to_job(**preset_normal),
     ]
-    schedules = [housekeeping_weekly]
+    schedules = [housekeeping_weekly, ensure_alldocs_daily]
     sensors = [
         done_object_put_ops,
         ensure_gold_translation_job,
@@ -643,7 +652,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -685,7 +694,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -728,7 +737,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_benthic_data_product": {
@@ -770,7 +779,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -813,14 +822,14 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_surface_water_data_product": {
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -847,7 +856,7 @@ def biosample_submission_ingest():
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -855,7 +864,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -915,6 +924,97 @@ def biosample_export():
     ]
+@repository
+def database_records_stitching():
+    normal_resources = run_config_frozen__normal_env["resources"]
+    return [
+        generate_data_generation_set_for_biosamples_in_nmdc_study.to_job(
+            description="This job can be used to create a data_generation_set JSON for biosamples that are already present in the NMDC database.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+        generate_biosample_set_from_samples_in_gold.to_job(
+            description="This job can be used to create a biosample_set JSON from samples in GOLD for a given study in NMDC.",
+            resource_defs=resource_defs,
+            config={
+                "resources": merge(
+                    unfreeze(normal_resources),
+                    {
+                        "runtime_api_user_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "username": {"env": "API_ADMIN_USER"},
+                                "password": {"env": "API_ADMIN_PASS"},
+                            },
+                        },
+                        "runtime_api_site_client": {
+                            "config": {
+                                "base_url": {"env": "API_HOST"},
+                                "client_id": {"env": "API_SITE_CLIENT_ID"},
+                                "client_secret": {"env": "API_SITE_CLIENT_SECRET"},
+                                "site_id": {"env": "API_SITE_ID"},
+                            },
+                        },
+                        "gold_api_client": {
+                            "config": {
+                                "base_url": {"env": "GOLD_API_BASE_URL"},
+                                "username": {"env": "GOLD_API_USERNAME"},
+                                "password": {"env": "GOLD_API_PASSWORD"},
+                            },
+                        },
+                    },
+                ),
+                "ops": {
+                    "get_database_updater_inputs": {
+                        "config": {
+                            "nmdc_study_id": "",
+                            "gold_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/gold_seqMethod_to_nmdc_instrument_set.tsv",
+                        }
+                    },
+                    "export_json_to_drs": {"config": {"username": ""}},
+                },
+            },
+        ),
+    ]
 # @repository
 # def validation():
 #     graph_jobs = [validate_jgi_job, validate_gold_job, validate_emsl_job]

nmdc_runtime/site/resources.py CHANGED Viewed

@@ -129,16 +129,23 @@ class RuntimeApiUserClient(RuntimeApiClient):
         return response.json()["cursor"]["firstBatch"]
     def get_biosamples_for_study(self, study_id: str):
+        # TODO: 10000 is an arbitrarily large number that has been chosen for the max_page_size param.
+        # The /nmdcschema/{collection-name} endpoint implements pagination via the page_token mechanism,
+        # but the tradeoff there is that we would need to make multiple requests to step through
+        # each of the pages. By picking a large number for max_page_size, we can get all the results
+        # in a single request.
+        # This method previously used the /queries:run endpoint but the problem with that was that
+        # it used to truncate the number of results returned to 100.
+        # NOTE(review): a study with more than 10000 biosamples would presumably still be
+        # truncated to a single page here; confirm against the endpoint's behavior.
         response = self.request(
-            "POST",
-            f"/queries:run",
+            "GET",
+            f"/nmdcschema/biosample_set",
             {
-                "find": "biosample_set",
-                "filter": {"part_of": {"$elemMatch": {"$eq": study_id}}},
+                "filter": json.dumps({"associated_studies": study_id}),
+                "max_page_size": 10000,
             },
         )
         response.raise_for_status()
-        return response.json()["cursor"]["firstBatch"]
+        return response.json()["resources"]
     def get_omics_processing_by_name(self, name: str):
         response = self.request(
@@ -152,6 +159,18 @@ class RuntimeApiUserClient(RuntimeApiClient):
         response.raise_for_status()
         return response.json()["cursor"]["firstBatch"]
+    def get_study(self, study_id: str):
+        response = self.request(
+            "POST",
+            f"/queries:run",
+            {
+                "find": "study_set",
+                "filter": {"id": study_id},
+            },
+        )
+        response.raise_for_status()
+        return response.json()["cursor"]["firstBatch"]
 class RuntimeApiSiteClient(RuntimeApiClient):
     def __init__(
@@ -370,6 +389,18 @@ class GoldApiClient(BasicAuthClient):
             return None
         return results[0]
+    def fetch_projects_by_biosample(self, biosample_id: str) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/projects", params={"biosampleGoldId": id})
+        return results
+    def fetch_biosample_by_biosample_id(
+        self, biosample_id: str
+    ) -> List[Dict[str, Any]]:
+        id = self._normalize_id(biosample_id)
+        results = self.request("/biosamples", params={"biosampleGoldId": id})
+        return results
 @resource(
     config_schema={

nmdc_runtime/site/translation/gold_translator.py CHANGED Viewed

@@ -12,6 +12,29 @@ from nmdc_runtime.site.translation.translator import JSON_OBJECT, Translator
 SEQUENCING_STRATEGIES = {"Metagenome", "Metatranscriptome"}
+def _is_valid_project(project: dict) -> bool:
+    """A project is considered valid if:
+    1. `sequencingStrategy` is in {"Metagenome", "Metatranscriptome"}
+    2. if `sequencingCenters` == 'DOE Joint Genome Institute (JGI)' then
+        `projectStatus` must be in ("Permanent Draft", "Complete and Published")
+    3. otherwise, no `projectStatus` filter is applied
+    :param project: GOLD project object (structurally similar to response
+                    from `/projects` endpoint)
+    :return: True if the project is valid, False otherwise
+    """
+    if project.get("sequencingStrategy") not in SEQUENCING_STRATEGIES:
+        return False
+    if project.get("sequencingCenters") == "DOE Joint Genome Institute (JGI)":
+        return project.get("projectStatus") in (
+            "Permanent Draft",
+            "Complete and Published",
+        )
+    return True
 class GoldStudyTranslator(Translator):
     def __init__(
         self,
@@ -36,16 +59,15 @@ class GoldStudyTranslator(Translator):
             biosample
             for biosample in biosamples
             if any(
-                project.get("sequencingStrategy") in SEQUENCING_STRATEGIES
-                for project in biosample.get("projects", [])
+                _is_valid_project(project) for project in biosample.get("projects", [])
             )
         ]
         # Fetch the valid projectGoldIds that are associated with filtered
         # biosamples on their `projects` field
         valid_project_ids = {
             project.get("projectGoldId")
-            for biosample in self.biosamples
-            for project in biosample.get("projects", [])
+            for project in projects
+            if _is_valid_project(project)
         }
         # Filter projects to only those with `projectGoldId` in valid_project_ids
         self.projects = [

nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.2.1__py3-none-any.whl → 2.4.0__py3-none-any.whl