PyPI - nmdc-runtime - Versions diffs - 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl - Mend

nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (21)

nmdc_runtime/config.py +57 -1
nmdc_runtime/mongo_util.py +90 -0
nmdc_runtime/site/export/ncbi_xml.py +98 -27
nmdc_runtime/site/export/ncbi_xml_utils.py +27 -25
nmdc_runtime/site/graphs.py +72 -9
nmdc_runtime/site/ops.py +408 -65
nmdc_runtime/site/repair/database_updater.py +210 -1
nmdc_runtime/site/repository.py +107 -6
nmdc_runtime/site/resources.py +17 -4
nmdc_runtime/site/translation/gold_translator.py +18 -9
nmdc_runtime/site/translation/neon_benthic_translator.py +1 -0
nmdc_runtime/site/translation/neon_soil_translator.py +1 -0
nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -0
nmdc_runtime/site/translation/submission_portal_translator.py +62 -0
nmdc_runtime/util.py +53 -267
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/METADATA +18 -3
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/RECORD +21 -20
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/WHEEL +1 -1
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/entry_points.txt +0 -0
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/licenses/LICENSE +0 -0
{nmdc_runtime-2.7.0.dist-info → nmdc_runtime-2.9.0.dist-info}/top_level.txt +0 -0

nmdc_runtime/config.py CHANGED Viewed

@@ -1 +1,57 @@
-DATABASE_CLASS_NAME = "Database"
+"""
+This module acts as a unified interface between the codebase and the environment.
+We will eventually move all of the Runtime's environment variables reads into this
+module, instead of leaving them sprinkled throughout the codebase.
+TODO: Move all environment variable reads into this module and update references accordingly.
+"""
+from typing import Set
+import os
+def is_env_var_true(name: str, default: str = "false") -> bool:
+    r"""
+    Checks whether the value of the specified environment variable
+    meets our criteria for true-ness.
+    Reference: https://docs.python.org/3/library/os.html#os.environ
+    Run doctests via: $ python -m doctest nmdc_runtime/config.py
+    >>> import os
+    >>> name = "EXAMPLE_ENV_VAR"
+    >>> os.unsetenv(name)  # Undefined
+    >>> is_env_var_true(name)
+    False
+    >>> is_env_var_true(name, "true")  # Undefined, overridden default
+    True
+    >>> os.environ[name] = "false"  # Defined as false
+    >>> is_env_var_true(name)
+    False
+    >>> os.environ[name] = "true"  # Defined as true
+    >>> is_env_var_true(name)
+    True
+    >>> os.environ[name] = "TRUE"  # Case-insensitive
+    >>> is_env_var_true(name)
+    True
+    >>> os.environ[name] = "potato"  # Non-boolean string
+    >>> is_env_var_true(name)
+    False
+    """
+    lowercase_true_strings: Set[str] = {"true"}
+    return os.environ.get(name, default).lower() in lowercase_true_strings
+# The name of the schema class representing the database. We don't bother to
+# make this customizable via the environment, as we expect it to never change.
+DATABASE_CLASS_NAME: str = "Database"
+# Feature flag that can be used to enable/disable the `/nmdcschema/related_ids`
+# endpoint and the tests that target it.
+IS_RELATED_IDS_ENDPOINT_ENABLED: bool = is_env_var_true(
+    "IS_RELATED_IDS_ENDPOINT_ENABLED", default="true"
+)
+# Feature flag that can be used to enable/disable the `/scalar` endpoint.
+IS_SCALAR_ENABLED: bool = is_env_var_true("IS_SCALAR_ENABLED", default="true")

nmdc_runtime/mongo_util.py ADDED Viewed

@@ -0,0 +1,90 @@
+from pymongo import MongoClient
+from pymongo.database import Database
+from pymongo.collection import Collection
+from typing import Any, Mapping, Optional, Type, Callable
+from pymongo.client_session import ClientSession
+import inspect
+def _wrap_with_session(obj: Any, name: str, session: Optional[ClientSession]) -> Any:
+    """
+    Wraps a callable attribute of an object to automatically include a session
+    if the callable accepts a 'session' keyword argument.
+    """
+    attr = getattr(obj, name)
+    if callable(attr):
+        signature = inspect.signature(attr)
+        parameters = signature.parameters
+        accepts_session = any(
+            param.name == "session"
+            for param in parameters.values()
+            if param.kind
+            in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
+        )
+        def wrapper(*args, **kwargs):
+            if session is not None and accepts_session and "session" not in kwargs:
+                kwargs["session"] = session
+            return attr(*args, **kwargs)
+        return wrapper
+    return attr
+class SessionBoundCollection:
+    """
+    A wrapper around pymongo.collection.Collection that automatically passes a session
+    to methods that accept it.
+    """
+    def __init__(self, collection: Collection, session: Optional[ClientSession] = None):
+        self._collection = collection
+        self._session = session
+    def __getattr__(self, name: str):
+        return _wrap_with_session(self._collection, name, self._session)
+    def __getitem__(self, name: str) -> "SessionBoundCollection":
+        return SessionBoundCollection(self._collection[name], self._session)
+class SessionBoundDatabase(Database):
+    """
+    A wrapper around pymongo.database.Database that automatically passes a session
+    to methods that accept it.
+    """
+    def __init__(self, database: Database, session: Optional[ClientSession] = None):
+        super().__init__(
+            database.client,
+            database.name,
+            database.codec_options,
+            database.read_preference,
+            database.write_concern,
+            database.read_concern,
+        )
+        self._database = database
+        self._session = session
+    def __getattr__(self, name: str):
+        return _wrap_with_session(self._database, name, self._session)
+    def __getitem__(self, name: str) -> SessionBoundCollection:
+        return SessionBoundCollection(self._database[name], self._session)
+    def get_collection(self, name: str, **kwargs) -> SessionBoundCollection:
+        """Get a :class:`~pymongo.collection.Collection` with the given name and options."""
+        collection = super().get_collection(name, **kwargs)
+        return SessionBoundCollection(collection, self._session)
+    @property
+    def client(self):
+        return self._database.client
+    @property
+    def unbounded(self):
+        return self._database
+    @property
+    def name(self):
+        return self._database.name

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -27,7 +27,11 @@ class NCBISubmissionXML:
         self.nmdc_study_id = nmdc_study.get("id")
         self.nmdc_study_title = nmdc_study.get("title")
         self.nmdc_study_description = nmdc_study.get("description")
-        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")
+        # get the first INSDC BioProject ID from the NMDC study
+        self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
+        # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
+        # everything after the prefix and delimiter (":")
+        self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
         self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
         nmdc_study_pi_name = (
             nmdc_study.get("principal_investigator", {}).get("name").split()
@@ -251,7 +255,11 @@ class NCBISubmissionXML:
                     children=[
                         self.set_element(
                             "Title",
-                            f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
+                            attributes.get(
+                                "name",
+                                # fallback title if "name" is not present
+                                f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
+                            ),
                         ),
                     ],
                 ),
@@ -577,18 +585,45 @@ class NCBISubmissionXML:
         biosample_library_preparation_list: list,
         instruments_dict: dict,
     ):
-        data_type = None
-        ncbi_project_id = None
+        # data_type = None
+        biosamples_to_exclude = set()
         for bsm_ntseq in biosample_nucleotide_sequencing_list:
-            for _, ntseq_list in bsm_ntseq.items():
+            for bsm_id, ntseq_list in bsm_ntseq.items():
+                # Check if any processing_institution is "JGI"
                 for ntseq in ntseq_list:
-                    if "analyte_category" in ntseq:
-                        data_type = handle_string_value(
-                            ntseq["analyte_category"]
-                        ).capitalize()
-                    if "ncbi_project_name" in ntseq:
-                        ncbi_project_id = ntseq["ncbi_project_name"]
+                    if (
+                        "processing_institution" in ntseq
+                        and ntseq["processing_institution"] == "JGI"
+                    ):
+                        biosamples_to_exclude.add(bsm_id)
+                        break
+        # Filter biosample_nucleotide_sequencing_list to exclude JGI records
+        filtered_nucleotide_sequencing_list = []
+        for bsm_ntseq in biosample_nucleotide_sequencing_list:
+            filtered_dict = {}
+            for bsm_id, ntseq_list in bsm_ntseq.items():
+                if bsm_id not in biosamples_to_exclude:
+                    filtered_dict[bsm_id] = ntseq_list
+            if filtered_dict:  # Only add non-empty dictionaries
+                filtered_nucleotide_sequencing_list.append(filtered_dict)
+        # Filter biosamples_list to exclude JGI-processed biosamples
+        filtered_biosamples_list = [
+            biosample
+            for biosample in biosamples_list
+            if biosample.get("id") not in biosamples_to_exclude
+        ]
+        # Get data_type from filtered list
+        # for bsm_ntseq in filtered_nucleotide_sequencing_list:
+        #     for _, ntseq_list in bsm_ntseq.items():
+        #         for ntseq in ntseq_list:
+        #             if "analyte_category" in ntseq:
+        #                 data_type = handle_string_value(
+        #                     ntseq["analyte_category"]
+        #                 ).capitalize()
         self.set_description(
             email=self.nmdc_pi_email,
@@ -597,29 +632,65 @@ class NCBISubmissionXML:
             org=self.ncbi_submission_metadata.get("organization", ""),
         )
-        if not ncbi_project_id:
-            self.set_bioproject(
-                title=self.nmdc_study_title,
-                project_id=ncbi_project_id,
-                description=self.nmdc_study_description,
-                data_type=data_type,
-                org=self.ncbi_submission_metadata.get("organization", ""),
-            )
+        # if not self.ncbi_bioproject_id:
+        #     self.set_bioproject(
+        #         title=self.nmdc_study_title,
+        #         project_id=self.ncbi_bioproject_id,
+        #         description=self.nmdc_study_description,
+        #         data_type=data_type,
+        #         org=self.ncbi_submission_metadata.get("organization", ""),
+        #     )
         self.set_biosample(
             organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
             org=self.ncbi_submission_metadata.get("organization", ""),
-            bioproject_id=ncbi_project_id,
-            nmdc_biosamples=biosamples_list,
+            bioproject_id=self.ncbi_bioproject_id,
+            nmdc_biosamples=filtered_biosamples_list,
         )
+        # Also filter biosample_data_objects_list
+        filtered_data_objects_list = []
+        acceptable_extensions = [".fastq.gz", ".fastq"]
+        for entry in biosample_data_objects_list:
+            filtered_entry = {}
+            for biosample_id, data_objects in entry.items():
+                if biosample_id not in biosamples_to_exclude:
+                    # filter data_objects based on acceptable/allowed extensions
+                    # for "url" key in data_object
+                    filtered_objects = []
+                    for data_object in data_objects:
+                        if "url" in data_object:
+                            url = urlparse(data_object["url"])
+                            file_path = os.path.basename(url.path)
+                            if any(
+                                file_path.endswith(ext) for ext in acceptable_extensions
+                            ):
+                                filtered_objects.append(data_object)
+                    if filtered_objects:
+                        filtered_entry[biosample_id] = filtered_objects
+            if filtered_entry:  # Only add non-empty entries
+                filtered_data_objects_list.append(filtered_entry)
+        # Filter library preparation list as well
+        filtered_library_preparation_list = []
+        for lib_prep_dict in biosample_library_preparation_list:
+            filtered_lib_prep = {}
+            for biosample_id, lib_prep in lib_prep_dict.items():
+                if biosample_id not in biosamples_to_exclude:
+                    filtered_lib_prep[biosample_id] = lib_prep
+            if filtered_lib_prep:  # Only add non-empty entries
+                filtered_library_preparation_list.append(filtered_lib_prep)
         self.set_fastq(
-            biosample_data_objects=biosample_data_objects_list,
-            bioproject_id=ncbi_project_id,
+            biosample_data_objects=filtered_data_objects_list,
+            bioproject_id=self.ncbi_bioproject_id,
             org=self.ncbi_submission_metadata.get("organization", ""),
-            nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
-            nmdc_biosamples=biosamples_list,
-            nmdc_library_preparation=biosample_library_preparation_list,
+            nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
+            nmdc_biosamples=filtered_biosamples_list,
+            nmdc_library_preparation=filtered_library_preparation_list,
             all_instruments=instruments_dict,
         )

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -99,31 +99,36 @@ def fetch_nucleotide_sequencing_from_biosamples(
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_ntseq_objects = []
+        processed_ids = set()  # Track already processed nucleotide sequencing IDs
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                query = {"has_input": current_id}
-                document = all_docs_collection.find_one(query)
-                if not document:
-                    continue
-                has_output = document.get("has_output")
-                if not has_output:
-                    continue
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = data_generation_set.find_one(
-                            {"id": document["id"]}
-                        )
-                        if nucleotide_sequencing_doc:
-                            collected_ntseq_objects.append(
-                                strip_oid(nucleotide_sequencing_doc)
-                            )
-                    else:
-                        new_current_ids.append(output_id)
+                # Find all documents with current_id as input instead of just one
+                for document in all_docs_collection.find({"has_input": current_id}):
+                    has_output = document.get("has_output")
+                    if not has_output:
+                        continue
+                    for output_id in has_output:
+                        if get_classname_from_typecode(output_id) == "DataObject":
+                            # Only process if we haven't seen this document ID before
+                            if document["id"] not in processed_ids:
+                                nucleotide_sequencing_doc = (
+                                    data_generation_set.find_one(
+                                        {
+                                            "id": document["id"],
+                                            "type": "nmdc:NucleotideSequencing",
+                                        }
+                                    )
+                                )
+                                if nucleotide_sequencing_doc:
+                                    collected_ntseq_objects.append(
+                                        strip_oid(nucleotide_sequencing_doc)
+                                    )
+                                    processed_ids.add(document["id"])
+                        else:
+                            new_current_ids.append(output_id)
             current_ids = new_current_ids
@@ -187,10 +192,7 @@ def handle_quantity_value(slot_value):
         and "has_minimum_numeric_value" in slot_value
         and "has_unit" in slot_value
     ):
-        range_value = (
-            slot_value["has_maximum_numeric_value"]
-            - slot_value["has_minimum_numeric_value"]
-        )
+        range_value = f"{slot_value['has_minimum_numeric_value']} - {slot_value['has_maximum_numeric_value']}"
         return f"{range_value} {slot_value['has_unit']}"
     elif "has_raw_value" in slot_value:
         return slot_value["has_raw_value"]

nmdc_runtime/site/graphs.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from dagster import graph
+from dagster import graph, GraphIn
 from nmdc_runtime.site.ops import (
     build_merged_db,
@@ -22,6 +22,7 @@ from nmdc_runtime.site.ops import (
     filter_ops_done_object_puts,
     hello,
     mongo_stats,
+    run_script_to_update_insdc_biosample_identifiers,
     submit_metadata_to_db,
     filter_ops_undone_expired,
     construct_jobs,
@@ -50,6 +51,7 @@ from nmdc_runtime.site.ops import (
     get_df_from_url,
     site_code_mapping,
     materialize_alldocs,
+    load_ontology,
     get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
@@ -58,6 +60,7 @@ from nmdc_runtime.site.ops import (
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
     ncbi_submission_xml_asset,
+    render_text,
     get_database_updater_inputs,
     post_submission_portal_biosample_ingest_record_stitching_filename,
     generate_data_generation_set_post_biosample_ingest,
@@ -112,6 +115,16 @@ def ensure_alldocs():
     materialize_alldocs()
+@graph
+def run_ontology_load():
+    """
+    A graph for loading ontologies.
+    The source_ontology parameter is provided by the job configuration
+    and passed to the load_ontology op.
+    """
+    load_ontology()
 @graph
 def ensure_jobs():
     jobs = construct_jobs()
@@ -120,17 +133,24 @@ def ensure_jobs():
 @graph
 def apply_changesheet():
+    # Note: We use `_` as a "placeholder" variable.
+    #       It's a variable to whose value we assign no significance. In this case, we use it to
+    #       tell Dagster that one op depends upon the output of the other (so Dagster runs them
+    #       in that order), without implying to maintainers that its value is significant to us.
+    #       Reference (this strategy): https://docs.dagster.io/api/dagster/types#dagster.Nothing
+    #       Reference (`_` variables): https://stackoverflow.com/a/47599668
     sheet_in = get_changesheet_in()
     outputs = perform_changesheet_updates(sheet_in)
-    add_output_run_event(outputs)
-    materialize_alldocs()
+    _ = add_output_run_event(outputs)
+    materialize_alldocs(waits_for=_)
 @graph
 def apply_metadata_in():
+    # Note: We use `_` as a "placeholder" variable.
     outputs = perform_mongo_updates(get_json_in())
-    add_output_run_event(outputs)
-    materialize_alldocs()
+    _ = add_output_run_event(outputs)
+    materialize_alldocs(waits_for=_)
 @graph
@@ -140,6 +160,7 @@ def gold_study_to_database():
         study_type,
         gold_nmdc_instrument_mapping_file_url,
         include_field_site_info,
+        enable_biosample_filtering,
     ) = get_gold_study_pipeline_inputs()
     projects = gold_projects_by_study(study_id)
@@ -156,6 +177,7 @@ def gold_study_to_database():
         analysis_projects,
         gold_nmdc_instrument_map_df,
         include_field_site_info,
+        enable_biosample_filtering,
     )
     database_dict = nmdc_schema_object_to_dict(database)
     filename = nmdc_schema_database_export_filename(study)
@@ -486,11 +508,19 @@ def nmdc_study_to_ncbi_submission_export():
 @graph
 def generate_data_generation_set_for_biosamples_in_nmdc_study():
-    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    (
+        study_id,
+        gold_nmdc_instrument_mapping_file_url,
+        include_field_site_info,
+        enable_biosample_filtering,
+    ) = get_database_updater_inputs()
     gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
     database = generate_data_generation_set_post_biosample_ingest(
-        study_id, gold_nmdc_instrument_map_df
+        study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database_dict = nmdc_schema_object_to_dict(database)
@@ -503,11 +533,19 @@ def generate_data_generation_set_for_biosamples_in_nmdc_study():
 @graph
 def generate_biosample_set_from_samples_in_gold():
-    (study_id, gold_nmdc_instrument_mapping_file_url) = get_database_updater_inputs()
+    (
+        study_id,
+        gold_nmdc_instrument_mapping_file_url,
+        include_field_site_info,
+        enable_biosample_filtering,
+    ) = get_database_updater_inputs()
     gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
     database = generate_biosample_set_for_nmdc_study_from_gold(
-        study_id, gold_nmdc_instrument_map_df
+        study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
     )
     database_dict = nmdc_schema_object_to_dict(database)
     filename = post_submission_portal_biosample_ingest_record_stitching_filename(
@@ -515,3 +553,28 @@ def generate_biosample_set_from_samples_in_gold():
     )
     outputs = export_json_to_drs(database_dict, filename)
     add_output_run_event(outputs)
+@graph
+def generate_update_script_for_insdc_biosample_identifiers():
+    """Generate a MongoDB update script to add INSDC biosample identifiers to biosamples based on GOLD data.
+    This graph fetches the necessary inputs, then calls the run_script_to_update_insdc_biosample_identifiers op
+    to generate a script for updating biosample records with INSDC identifiers obtained from GOLD.
+    The script is returned as a dictionary that can be executed against MongoDB.
+    """
+    (
+        study_id,
+        gold_nmdc_instrument_mapping_file_url,
+        include_field_site_info,
+        enable_biosample_filtering,
+    ) = get_database_updater_inputs()
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
+    update_script = run_script_to_update_insdc_biosample_identifiers(
+        study_id,
+        gold_nmdc_instrument_map_df,
+        include_field_site_info,
+        enable_biosample_filtering,
+    )
+    render_text(update_script)

nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.7.0__py3-none-any.whl → 2.9.0__py3-none-any.whl