PyPI - nmdc-runtime - Versions diffs - 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl - Mend

nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (19) hide show

nmdc_runtime/minter/config.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 from functools import lru_cache
+from typing import List
 from nmdc_runtime.util import get_nmdc_jsonschema_dict
@@ -11,18 +12,73 @@ def minting_service_id() -> str | None:
     return os.getenv("MINTING_SERVICE_ID")
+def extract_typecode_from_pattern(pattern: str) -> str:
+    r"""
+    Returns the typecode portion of the specified string.
+    >>> extract_typecode_from_pattern("foo-123-456$")  # original behavior
+    'foo'
+    >>> extract_typecode_from_pattern("(foo)-123-456$")  # returns first and only typecode
+    'foo'
+    >>> extract_typecode_from_pattern("(foo|bar)-123-456$")  # returns first of 2 typecodes
+    'foo'
+    >>> extract_typecode_from_pattern("(foo|bar|baz)-123-456$")  # returns first of > 2 typecodes
+    'foo'
+    """
+    # Get the portion of the pattern preceding the first hyphen.
+    # e.g. "foo-bar-baz" → ["foo", "bar-baz"] → "foo"
+    typecode_sub_pattern = pattern.split("-", maxsplit=1)[0]
+    # If that portion of the pattern is enclosed in parentheses, get the portion between the parentheses.
+    # e.g. "(apple|banana|carrot)" → "apple|banana|carrot"
+    if typecode_sub_pattern.startswith("(") and typecode_sub_pattern.endswith(")"):
+        inner_pattern = typecode_sub_pattern[1:-1]
+        # Finally, get everything before the first `|`, if any.
+        # e.g. "apple|banana|carrot" → "apple"
+        # e.g. "apple" → "apple"
+        typecode = inner_pattern.split("|", maxsplit=1)[0]
+    else:
+        # Note: This is the original behavior, before we added support for multi-typecode patterns.
+        # e.g. "apple" → "apple"
+        typecode = typecode_sub_pattern
+    return typecode
 @lru_cache()
-def typecodes():
+def typecodes() -> List[dict]:
+    r"""
+    Returns a list of dictionaries containing typecodes and associated information derived from the schema.
+    Preconditions about the schema:
+    - The typecode portion of the pattern is between the pattern prefix and the first subsequent hyphen.
+    - The typecode portion of the pattern either consists of a single typecode verbatim (e.g. "foo");
+      or consists of multiple typecodes in a pipe-delimited list enclosed in parentheses (e.g. "(foo|bar|baz)").
+    - The typecode portion of the pattern does not, itself, contain any hyphens.
+    TODO: Get the typecodes in a different way than by extracting them from a larger string, which seems brittle to me.
+          Getting them a different way may require schema authors to _define_ them a different way (e.g. defining them
+          in a dedicated property of a class; for example, one named `typecode`).
+    """
+    id_pattern_prefix = r"^(nmdc):"
     rv = []
     schema_dict = get_nmdc_jsonschema_dict()
     for cls_name, defn in schema_dict["$defs"].items():
         match defn.get("properties"):
-            case {"id": {"pattern": p}} if p.startswith("^(nmdc):"):
+            case {"id": {"pattern": p}} if p.startswith(id_pattern_prefix):
+                # Get the portion of the pattern following the prefix.
+                # e.g. "^(nmdc):foo-bar-baz" → "foo-bar-baz"
+                index_of_first_character_following_prefix = len(id_pattern_prefix)
+                pattern_without_prefix = p[index_of_first_character_following_prefix:]
                 rv.append(
                     {
                         "id": "nmdc:" + cls_name + "_" + "typecode",
                         "schema_class": "nmdc:" + cls_name,
-                        "name": p.split(":", maxsplit=1)[-1].split("-", maxsplit=1)[0],
+                        "name": extract_typecode_from_pattern(pattern_without_prefix),
                     }
                 )
             case _:

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -283,7 +283,7 @@ class NCBISubmissionXML:
         biosample_data_objects: list,
         bioproject_id: str,
         org: str,
-        nmdc_omics_processing: list,
+        nmdc_nucleotide_sequencing: list,
         nmdc_biosamples: list,
         nmdc_library_preparation: list,
     ):
@@ -294,10 +294,10 @@ class NCBISubmissionXML:
         for entry in biosample_data_objects:
             fastq_files = []
             biosample_ids = []
-            omics_processing_ids = {}
+            nucleotide_sequencing_ids = {}
             lib_prep_protocol_names = {}
             instrument_name = ""
-            omics_type = ""
+            analyte_category = ""
             library_name = ""
             for biosample_id, data_objects in entry.items():
@@ -308,16 +308,16 @@ class NCBISubmissionXML:
                         file_path = os.path.basename(url.path)
                         fastq_files.append(file_path)
-                for omprc_dict in nmdc_omics_processing:
-                    if biosample_id in omprc_dict:
-                        for omprc in omprc_dict[biosample_id]:
-                            omics_processing_ids[biosample_id] = omprc.get("id", "")
-                            instrument_name = omprc.get("instrument_name", "")
-                            omics_type = (
-                                omprc.get("omics_type", {})
-                                .get("has_raw_value", "")
-                                .lower()
+                for ntseq_dict in nmdc_nucleotide_sequencing:
+                    if biosample_id in ntseq_dict:
+                        for ntseq in ntseq_dict[biosample_id]:
+                            nucleotide_sequencing_ids[biosample_id] = ntseq.get(
+                                "id", ""
                             )
+                            # Currently, we are making the assumption that only one instrument
+                            # is used to sequence a Biosample
+                            instrument_name = ntseq.get("instrument_used", "")[0]
+                            analyte_category = ntseq.get("analyte_category", "")
                             library_name = bsm_id_name_dict.get(biosample_id, "")
                 for lib_prep_dict in nmdc_library_preparation:
@@ -395,7 +395,7 @@ class NCBISubmissionXML:
                             )
                         )
-                if omics_type == "metagenome":
+                if analyte_category == "metagenome":
                     sra_attributes.append(
                         self.set_element(
                             "Attribute", "WGS", {"name": "library_strategy"}
@@ -411,8 +411,7 @@ class NCBISubmissionXML:
                             "Attribute", "RANDOM", {"name": "library_selection"}
                         )
                     )
-                if omics_type == "metatranscriptome":
+                elif analyte_category == "metatranscriptome":
                     sra_attributes.append(
                         self.set_element(
                             "Attribute",
@@ -467,7 +466,10 @@ class NCBISubmissionXML:
                         )
                     )
-                for biosample_id, omics_processing_id in omics_processing_ids.items():
+                for (
+                    biosample_id,
+                    omics_processing_id,
+                ) in nucleotide_sequencing_ids.items():
                     identifier_element = self.set_element(
                         "Identifier",
                         children=[
@@ -496,20 +498,22 @@ class NCBISubmissionXML:
     def get_submission_xml(
         self,
         biosamples_list: list,
-        biosample_omics_processing_list: list,
+        biosample_nucleotide_sequencing_list: list,
         biosample_data_objects_list: list,
         biosample_library_preparation_list: list,
     ):
         data_type = None
         ncbi_project_id = None
-        for bsm_omprc in biosample_omics_processing_list:
-            for _, omprc_list in bsm_omprc.items():
-                for omprc in omprc_list:
-                    if "omics_type" in omprc:
-                        data_type = handle_text_value(omprc["omics_type"]).capitalize()
+        for bsm_ntseq in biosample_nucleotide_sequencing_list:
+            for _, ntseq_list in bsm_ntseq.items():
+                for ntseq in ntseq_list:
+                    if "analyte_category" in ntseq:
+                        data_type = handle_string_value(
+                            ntseq["analyte_category"]
+                        ).capitalize()
-                    if "ncbi_project_name" in omprc:
-                        ncbi_project_id = omprc["ncbi_project_name"]
+                    if "ncbi_project_name" in ntseq:
+                        ncbi_project_id = ntseq["ncbi_project_name"]
         self.set_description(
             email=self.nmdc_pi_email,
@@ -538,7 +542,7 @@ class NCBISubmissionXML:
             biosample_data_objects=biosample_data_objects_list,
             bioproject_id=ncbi_project_id,
             org=self.ncbi_submission_metadata.get("organization", ""),
-            nmdc_omics_processing=biosample_omics_processing_list,
+            nmdc_nucleotide_sequencing=biosample_nucleotide_sequencing_list,
             nmdc_biosamples=biosamples_list,
             nmdc_library_preparation=biosample_library_preparation_list,
         )

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -58,7 +58,7 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
-def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
     biosample_data_objects = []
     for biosample in biosamples_list:
@@ -80,11 +80,11 @@ def fetch_omics_processing_from_biosamples(all_docs_collection, biosamples_list)
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        omics_processing_doc = all_docs_collection.find_one(
+                        nucleotide_sequencing_doc = all_docs_collection.find_one(
                             {"id": document["id"]}
                         )
-                        if omics_processing_doc:
-                            collected_data_objects.append(omics_processing_doc)
+                        if nucleotide_sequencing_doc:
+                            collected_data_objects.append(nucleotide_sequencing_doc)
                     else:
                         new_current_ids.append(output_id)
@@ -117,7 +117,7 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
         for output_id in initial_output:
             lib_prep_query = {
                 "has_input": output_id,
-                "designated_class": "nmdc:LibraryPreparation",
+                "type": {"$in": ["LibraryPreparation"]},
             }
             lib_prep_doc = all_docs_collection.find_one(lib_prep_query)

nmdc_runtime/site/export/study_metadata.py CHANGED Viewed

@@ -133,5 +133,7 @@ def export_study_biosamples_metadata():
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_biosamples_by_study_id(context: OpExecutionContext, nmdc_study: dict):
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
-    biosamples = get_all_docs(client, "biosamples", f"part_of:{nmdc_study['id']}")
+    biosamples = get_all_docs(
+        client, "biosamples", f"associated_studies:{nmdc_study['id']}"
+    )
     return biosamples

nmdc_runtime/site/graphs.py CHANGED Viewed

@@ -51,7 +51,7 @@ from nmdc_runtime.site.ops import (
     materialize_alldocs,
     get_ncbi_export_pipeline_study,
     get_data_objects_from_biosamples,
-    get_omics_processing_from_biosamples,
+    get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
@@ -126,15 +126,23 @@ def apply_metadata_in():
 @graph
 def gold_study_to_database():
-    study_id = get_gold_study_pipeline_inputs()
+    (study_id, study_type, gold_nmdc_instrument_mapping_file_url) = (
+        get_gold_study_pipeline_inputs()
+    )
     projects = gold_projects_by_study(study_id)
     biosamples = gold_biosamples_by_study(study_id)
     analysis_projects = gold_analysis_projects_by_study(study_id)
     study = gold_study(study_id)
+    gold_nmdc_instrument_map_df = get_df_from_url(gold_nmdc_instrument_mapping_file_url)
     database = nmdc_schema_database_from_gold_study(
-        study, projects, biosamples, analysis_projects
+        study,
+        study_type,
+        projects,
+        biosamples,
+        analysis_projects,
+        gold_nmdc_instrument_map_df,
     )
     database_dict = nmdc_schema_object_to_dict(database)
     filename = nmdc_schema_database_export_filename(study)
@@ -147,14 +155,16 @@ def gold_study_to_database():
 def translate_metadata_submission_to_nmdc_schema_database():
     (
         submission_id,
-        omics_processing_mapping_file_url,
+        nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
     ) = get_submission_portal_pipeline_inputs()
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    nucleotide_sequencing_mapping = get_csv_rows_from_url(
+        nucleotide_sequencing_mapping_file_url
+    )
     data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
     biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
     biosample_extras_slot_mapping = get_csv_rows_from_url(
@@ -163,8 +173,8 @@ def translate_metadata_submission_to_nmdc_schema_database():
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission,
-        omics_processing_mapping,
-        data_object_mapping,
+        nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
+        data_object_mapping=data_object_mapping,
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
     )
@@ -181,14 +191,16 @@ def translate_metadata_submission_to_nmdc_schema_database():
 def ingest_metadata_submission():
     (
         submission_id,
-        omics_processing_mapping_file_url,
+        nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
     ) = get_submission_portal_pipeline_inputs()
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
-    omics_processing_mapping = get_csv_rows_from_url(omics_processing_mapping_file_url)
+    nucleotide_sequencing_mapping = get_csv_rows_from_url(
+        nucleotide_sequencing_mapping_file_url
+    )
     data_object_mapping = get_csv_rows_from_url(data_object_mapping_file_url)
     biosample_extras = get_csv_rows_from_url(biosample_extras_file_url)
     biosample_extras_slot_mapping = get_csv_rows_from_url(
@@ -197,8 +209,8 @@ def ingest_metadata_submission():
     database = translate_portal_submission_to_nmdc_schema_database(
         metadata_submission,
-        omics_processing_mapping,
-        data_object_mapping,
+        nucleotide_sequencing_mapping=nucleotide_sequencing_mapping,
+        data_object_mapping=data_object_mapping,
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
     )
@@ -217,6 +229,7 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -225,8 +238,16 @@ def translate_neon_api_soil_metadata_to_nmdc_schema_database():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_soil_data(
-        mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
+        mms_data,
+        sls_data,
+        neon_envo_mappings_file,
+        neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     database_dict = nmdc_schema_object_to_dict(database)
@@ -247,6 +268,7 @@ def ingest_neon_soil_metadata():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -255,8 +277,16 @@ def ingest_neon_soil_metadata():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_soil_data(
-        mms_data, sls_data, neon_envo_mappings_file, neon_raw_data_file_mappings_file
+        mms_data,
+        sls_data,
+        neon_envo_mappings_file,
+        neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
@@ -267,6 +297,7 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     mms_benthic_data_product = get_neon_pipeline_benthic_data_product()
@@ -280,11 +311,16 @@ def translate_neon_api_benthic_metadata_to_nmdc_schema_database():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_benthic_data(
         mms_benthic,
         sites_mapping_dict,
         neon_envo_mappings_file,
         neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     database_dict = nmdc_schema_object_to_dict(database)
@@ -305,6 +341,7 @@ def ingest_neon_benthic_metadata():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -313,11 +350,16 @@ def ingest_neon_benthic_metadata():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_benthic_data(
         mms_benthic,
         sites_mapping_dict,
         neon_envo_mappings_file,
         neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
@@ -334,6 +376,7 @@ def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -342,11 +385,16 @@ def translate_neon_api_surface_water_metadata_to_nmdc_schema_database():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_surface_water_data(
         mms_surface_water,
         sites_mapping_dict,
         neon_envo_mappings_file,
         neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     database_dict = nmdc_schema_object_to_dict(database)
@@ -367,6 +415,7 @@ def ingest_neon_surface_water_metadata():
     (
         neon_envo_mappings_file_url,
         neon_raw_data_file_mappings_file_url,
+        neon_nmdc_instrument_mapping_file_url,
     ) = get_neon_pipeline_inputs()
     neon_envo_mappings_file = get_df_from_url(neon_envo_mappings_file_url)
@@ -375,11 +424,16 @@ def ingest_neon_surface_water_metadata():
         neon_raw_data_file_mappings_file_url
     )
+    neon_nmdc_instrument_mapping_file = get_df_from_url(
+        neon_nmdc_instrument_mapping_file_url
+    )
     database = nmdc_schema_database_from_neon_benthic_data(
         mms_surface_water,
         sites_mapping_dict,
         neon_envo_mappings_file,
         neon_raw_data_file_mappings_file,
+        neon_nmdc_instrument_mapping_file,
     )
     run_id = submit_metadata_to_db(database)
     poll_for_run_completion(run_id)
@@ -390,14 +444,16 @@ def nmdc_study_to_ncbi_submission_export():
     nmdc_study = get_ncbi_export_pipeline_study()
     ncbi_submission_metadata = get_ncbi_export_pipeline_inputs()
     biosamples = get_biosamples_by_study_id(nmdc_study)
-    omics_processing_records = get_omics_processing_from_biosamples(biosamples)
+    nucleotide_sequencing_records = get_nucleotide_sequencing_from_biosamples(
+        biosamples
+    )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
         ncbi_submission_metadata,
         biosamples,
-        omics_processing_records,
+        nucleotide_sequencing_records,
         data_object_records,
         library_preparation_records,
     )

nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 1.9.0__py3-none-any.whl → 2.0.0__py3-none-any.whl