PyPI - nmdc-runtime - Versions diffs - 2.3.0__py3-none-any.whl → 2.5.0__py3-none-any.whl - Mend

nmdc-runtime 2.3.0 (py3-none-any.whl) → 2.5.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. More details are available via the link on the original Mend page.

Files changed (13)

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
-from typing import Any
+from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument_used: List[str] = ntseq.get(
+                                "instrument_used", []
+                            )
+                            if not instrument_used:
+                                instrument_id = None
+                            else:
+                                instrument_id = instrument_used[0]
                             instrument = all_instruments.get(instrument_id, {})
                             instrument_vendor = instrument.get("vendor", "")
                             instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
                             )
                         )
+                    elif instrument_model == "novaseq_6000":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute",
+                                "NovaSeq 6000",
+                                {"name": "instrument_model"},
+                            )
+                        )
+                    elif instrument_model == "hiseq":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute", "HiSeq", {"name": "instrument_model"}
+                            )
+                        )
                 if analyte_category == "metagenome":
                     sra_attributes.append(

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -1,6 +1,10 @@
 from io import BytesIO, StringIO
+from typing import Any, Dict, List, Union
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
 from lxml import etree
+from pymongo.collection import Collection
 import csv
 import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
         raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
-def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" (derived from/products of)
+    with their respective biosamples by iterating over the alldocs collection recursively.
+    The methods returns a dictionary with biosample ids as keys and the associated list of
+    data objects as values.
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
     biosample_data_objects = []
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                query = {"has_input": current_id}
-                document = all_docs_collection.find_one(query)
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
-                if not document:
-                    continue
-                has_output = document.get("has_output")
-                if not has_output:
-                    continue
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                    else:
-                        new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
             current_ids = new_current_ids
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
-def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
-    biosample_data_objects = []
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+    process objects as values
+    """
+    biosample_ntseq_objects = []
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
-        collected_data_objects = []
+        collected_ntseq_objects = []
         while current_ids:
             new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = all_docs_collection.find_one(
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                         if nucleotide_sequencing_doc:
-                            collected_data_objects.append(nucleotide_sequencing_doc)
+                            collected_ntseq_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                     else:
                         new_current_ids.append(output_id)
             current_ids = new_current_ids
-        if collected_data_objects:
-            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+    return biosample_ntseq_objects
-    return biosample_data_objects
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
+    for biosamples by iterating over the alldocs collection recursively.
-def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation process
+    objects as values
+    """
     biosample_lib_prep = []
     for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                 "has_input": output_id,
                 "type": {"$in": ["LibraryPreparation"]},
             }
-            lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
+            lib_prep_doc = material_processing_set.find_one(lib_prep_query)
             if lib_prep_doc:
-                biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                 break  # Stop at the first document that meets the criteria
     return biosample_lib_prep

nmdc_runtime/site/ops.py CHANGED Viewed

@@ -1100,7 +1100,12 @@ def materialize_alldocs(context) -> int:
         write_operations = []
         documents_processed_counter = 0
         for doc in mdb[coll_name].find():
-            doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
+            try:
+                doc_type = doc["type"][5:]  # lop off "nmdc:" prefix
+            except KeyError:
+                raise Exception(
+                    f"doc {doc['id']} in collection {coll_name} has no 'type'!"
+                )
             slots_to_include = ["id", "type"] + document_reference_ranged_slots[
                 doc_type
             ]
@@ -1188,8 +1193,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
@@ -1200,8 +1206,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
@@ -1212,8 +1219,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep

nmdc_runtime/site/repair/database_updater.py CHANGED Viewed

@@ -199,8 +199,20 @@ class DatabaseUpdater:
             if gbs.get("biosampleGoldId") not in nmdc_gold_ids
         ]
+        # use the GOLD study id to fetch all sequencing project records associated with the study
+        gold_sequencing_projects_for_study = (
+            self.gold_api_client.fetch_projects_by_study(gold_study_id)
+        )
+        # use the GOLD study id to fetch all analysis project records associated with the study
+        gold_analysis_projects_for_study = (
+            self.gold_api_client.fetch_analysis_projects_by_study(gold_study_id)
+        )
         gold_study_translator = GoldStudyTranslator(
             biosamples=missing_gold_biosamples,
+            projects=gold_sequencing_projects_for_study,
+            analysis_projects=gold_analysis_projects_for_study,
             gold_nmdc_instrument_map_df=self.gold_nmdc_instrument_map_df,
         )

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -652,7 +652,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -737,14 +737,14 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_benthic_data_product": {
                         "config": {
                             "benthic_data_product": {
                                 "product_id": "DP1.20279.001",
-                                "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
+                                "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
                             }
                         }
                     },
@@ -771,7 +771,7 @@ def biosample_submission_ingest():
                         "config": {
                             "benthic_data_product": {
                                 "product_id": "DP1.20279.001",
-                                "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent",
+                                "product_tables": "mms_benthicMetagenomeSequencing, mms_benthicMetagenomeDnaExtraction, mms_benthicRawDataFiles, amb_fieldParent, mms_mms_benthicRawDataFiles",
                             }
                         }
                     },
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_surface_water_data_product": {
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },

nmdc_runtime/site/translation/neon_benthic_translator.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import re
 import sqlite3
-from typing import Union
+from typing import Optional, Union
 import pandas as pd
 import requests_cache
@@ -61,6 +61,7 @@ class NeonBenthicDataTranslator(Translator):
             "mms_benthicMetagenomeSequencing",
             "mms_benthicMetagenomeDnaExtraction",
             "amb_fieldParent",
+            "mms_benthicRawDataFiles",  # <--- ensure this is present
         )
         if all(k in benthic_data for k in neon_amb_data_tables):
@@ -79,6 +80,12 @@ class NeonBenthicDataTranslator(Translator):
             benthic_data["amb_fieldParent"].to_sql(
                 "amb_fieldParent", self.conn, if_exists="replace", index=False
             )
+            benthic_data["mms_benthicRawDataFiles"].to_sql(
+                "mms_benthicRawDataFiles",
+                self.conn,
+                if_exists="replace",
+                index=False,
+            )
         else:
             raise ValueError(
                 f"You are missing one of the aquatic benthic microbiome tables: {neon_amb_data_tables}"
@@ -88,14 +95,19 @@ class NeonBenthicDataTranslator(Translator):
             "neonEnvoTerms", self.conn, if_exists="replace", index=False
         )
-        self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
-        self.neon_raw_data_file_mappings_df.to_sql(
-            "neonRawDataFile", self.conn, if_exists="replace", index=False
-        )
+        self.neon_raw_data_file_mappings_df = benthic_data["mms_benthicRawDataFiles"]
         self.site_code_mapping = site_code_mapping
         self.neon_nmdc_instrument_map_df = neon_nmdc_instrument_map_df
+    def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
+        return nmdc.Manifest(
+            id=manifest_id,
+            manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
+            type="nmdc:Manifest",
+        )
     def _translate_biosample(
         self, neon_id: str, nmdc_id: str, biosample_row: pd.DataFrame
     ) -> nmdc.Biosample:
@@ -313,7 +325,7 @@ class NeonBenthicDataTranslator(Translator):
         )
     def _translate_data_object(
-        self, do_id: str, url: str, do_type: str, checksum: str
+        self, do_id: str, url: str, do_type: str, manifest_id: str
     ) -> nmdc.DataObject:
         """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
         object mainly contains information about the sequencing file that was generated as
@@ -324,7 +336,6 @@ class NeonBenthicDataTranslator(Translator):
         :param url: URL of zipped FASTQ file on NEON file server. Retrieved from file provided
         by Hugh Cross at NEON.
         :param do_type: Indicate whether it is FASTQ for Read 1 or Read 2 (paired end sequencing).
-        :param checksum: Checksum value for FASTQ in zip file, once again provided by Hugh Cross
         at NEON.
         :return: DataObject with all the sequencing file metadata.
         """
@@ -337,14 +348,14 @@ class NeonBenthicDataTranslator(Translator):
             url=url,
             description=f"sequencing results for {basename}",
             type="nmdc:DataObject",
-            md5_checksum=checksum,
             data_object_type=do_type,
+            in_manifest=manifest_id,
         )
-    def get_database(self):
+    def get_database(self) -> nmdc.Database:
         database = nmdc.Database()
-        query = """
+        join_query = """
             SELECT
                 merged.laboratoryName,
                 merged.sequencingFacilityID,
@@ -372,202 +383,190 @@ class NeonBenthicDataTranslator(Translator):
                 afp.siteID,
                 afp.sampleID,
                 afp.collectDate
-            FROM
-                (
-                    SELECT
-                        bs.collectDate,
-                        bs.laboratoryName,
-                        bs.sequencingFacilityID,
-                        bs.processedDate,
-                        bs.dnaSampleID,
-                        bs.dnaSampleCode,
-                        bs.internalLabID,
-                        bs.instrument_model,
-                        bs.sequencingMethod,
-                        bs.investigation_type,
-                        bs.qaqcStatus,
-                        bs.ncbiProjectID,
-                        bd.genomicsSampleID,
-                        bd.sequenceAnalysisType,
-                        bd.sampleMass,
-                        bd.nucleicAcidConcentration
-                    FROM
-                        mms_benthicMetagenomeSequencing AS bs
-                    JOIN
-                        mms_benthicMetagenomeDnaExtraction AS bd
-                    ON
-                        bs.dnaSampleID = bd.dnaSampleID
-                ) AS merged
+            FROM (
+                SELECT
+                    bs.collectDate,
+                    bs.laboratoryName,
+                    bs.sequencingFacilityID,
+                    bs.processedDate,
+                    bs.dnaSampleID,
+                    bs.dnaSampleCode,
+                    bs.internalLabID,
+                    bs.instrument_model,
+                    bs.sequencingMethod,
+                    bs.investigation_type,
+                    bs.qaqcStatus,
+                    bs.ncbiProjectID,
+                    bd.genomicsSampleID,
+                    bd.sequenceAnalysisType,
+                    bd.sampleMass,
+                    bd.nucleicAcidConcentration
+                FROM mms_benthicMetagenomeSequencing AS bs
+                JOIN mms_benthicMetagenomeDnaExtraction AS bd
+                ON bs.dnaSampleID = bd.dnaSampleID
+            ) AS merged
             LEFT JOIN amb_fieldParent AS afp
-            ON
-                merged.genomicsSampleID = afp.geneticSampleID
+            ON merged.genomicsSampleID = afp.geneticSampleID
         """
-        benthic_samples = pd.read_sql_query(query, self.conn)
+        benthic_samples = pd.read_sql_query(join_query, self.conn)
         benthic_samples.to_sql(
             "benthicSamples", self.conn, if_exists="replace", index=False
         )
-        neon_biosample_ids = benthic_samples["sampleID"]
-        nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
-        neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
+        sample_ids = benthic_samples["sampleID"]
+        nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(sample_ids))
+        neon_to_nmdc_biosample_ids = dict(zip(sample_ids, nmdc_biosample_ids))
-        neon_extraction_ids = benthic_samples["sampleID"]
-        nmdc_extraction_ids = self._id_minter(
-            "nmdc:Extraction", len(neon_extraction_ids)
-        )
-        neon_to_nmdc_extraction_ids = dict(
-            zip(neon_extraction_ids, nmdc_extraction_ids)
-        )
+        nmdc_extraction_ids = self._id_minter("nmdc:Extraction", len(sample_ids))
+        neon_to_nmdc_extraction_ids = dict(zip(sample_ids, nmdc_extraction_ids))
-        neon_extraction_processed_ids = benthic_samples["sampleID"]
         nmdc_extraction_processed_ids = self._id_minter(
-            "nmdc:ProcessedSample", len(neon_extraction_processed_ids)
+            "nmdc:ProcessedSample", len(sample_ids)
         )
         neon_to_nmdc_extraction_processed_ids = dict(
-            zip(neon_extraction_processed_ids, nmdc_extraction_processed_ids)
+            zip(sample_ids, nmdc_extraction_processed_ids)
         )
-        neon_lib_prep_ids = benthic_samples["sampleID"]
-        nmdc_lib_prep_ids = self._id_minter(
-            "nmdc:LibraryPreparation", len(neon_lib_prep_ids)
-        )
-        neon_to_nmdc_lib_prep_ids = dict(zip(neon_lib_prep_ids, nmdc_lib_prep_ids))
+        nmdc_libprep_ids = self._id_minter("nmdc:LibraryPreparation", len(sample_ids))
+        neon_to_nmdc_libprep_ids = dict(zip(sample_ids, nmdc_libprep_ids))
-        neon_lib_prep_processed_ids = benthic_samples["sampleID"]
-        nmdc_lib_prep_processed_ids = self._id_minter(
-            "nmdc:ProcessedSample", len(neon_lib_prep_processed_ids)
+        nmdc_libprep_processed_ids = self._id_minter(
+            "nmdc:ProcessedSample", len(sample_ids)
         )
-        neon_to_nmdc_lib_prep_processed_ids = dict(
-            zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
+        neon_to_nmdc_libprep_processed_ids = dict(
+            zip(sample_ids, nmdc_libprep_processed_ids)
         )
-        neon_omprc_ids = benthic_samples["sampleID"]
-        nmdc_omprc_ids = self._id_minter(
-            "nmdc:NucleotideSequencing", len(neon_omprc_ids)
-        )
-        neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
+        nmdc_ntseq_ids = self._id_minter("nmdc:NucleotideSequencing", len(sample_ids))
+        neon_to_nmdc_ntseq_ids = dict(zip(sample_ids, nmdc_ntseq_ids))
-        neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
-        neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
-        nmdc_data_object_ids = self._id_minter(
-            "nmdc:DataObject", len(neon_raw_file_paths)
-        )
-        neon_to_nmdc_data_object_ids = dict(
-            zip(neon_raw_file_paths, nmdc_data_object_ids)
-        )
+        raw_df = self.neon_raw_data_file_mappings_df
+        raw_file_paths = raw_df["rawDataFilePath"]
+        dataobject_ids = self._id_minter("nmdc:DataObject", len(raw_file_paths))
+        neon_to_nmdc_dataobject_ids = dict(zip(raw_file_paths, dataobject_ids))
-        for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
-            biosample_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+        for neon_id, biosample_id in neon_to_nmdc_biosample_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
+            # Example of how you might call _translate_biosample:
             database.biosample_set.append(
-                self._translate_biosample(neon_id, nmdc_id, biosample_row)
+                self._translate_biosample(neon_id, biosample_id, row)
             )
-        for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
-            extraction_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+        for neon_id, extraction_id in neon_to_nmdc_extraction_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
-            extraction_input = neon_to_nmdc_biosample_ids.get(neon_id)
-            processed_sample_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
+            biosample_id = neon_to_nmdc_biosample_ids.get(neon_id)
+            extraction_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
-            if extraction_input is not None and processed_sample_id is not None:
+            if biosample_id and extraction_ps_id:
                 database.material_processing_set.append(
                     self._translate_extraction_process(
-                        nmdc_id,
-                        extraction_input,
-                        processed_sample_id,
-                        extraction_row,
+                        extraction_id, biosample_id, extraction_ps_id, row
                     )
                 )
-                genomics_sample_id = _get_value_or_none(
-                    extraction_row, "genomicsSampleID"
-                )
+                genomics_sample_id = _get_value_or_none(row, "genomicsSampleID")
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
-                        processed_sample_id,
+                        extraction_ps_id,
                         f"Extracted DNA from {genomics_sample_id}",
                     )
                 )
-        query = """
+        query2 = """
             SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
-            FROM neonRawDataFile
+            FROM mms_benthicRawDataFiles
             GROUP BY dnaSampleID
         """
-        neon_raw_data_files = pd.read_sql_query(query, self.conn)
-        neon_raw_data_files_dict = (
-            neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
+        raw_data_files_df = pd.read_sql_query(query2, self.conn)
+        dna_files_dict = (
+            raw_data_files_df.set_index("dnaSampleID")["rawDataFilePaths"]
             .str.split("|")
             .to_dict()
         )
-        filtered_neon_raw_data_files_dict = {
-            key: value
-            for key, value in neon_raw_data_files_dict.items()
-            if len(value) <= 2
-        }
-        for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
-            lib_prep_row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+        dna_sample_to_manifest_id: dict[str, str] = {}
-            lib_prep_input = neon_to_nmdc_extraction_processed_ids.get(neon_id)
-            processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(neon_id)
+        for neon_id, libprep_id in neon_to_nmdc_libprep_ids.items():
+            row = benthic_samples[benthic_samples["sampleID"] == neon_id]
+            if row.empty:
+                continue
-            if lib_prep_input is not None and processed_sample_id is not None:
-                database.material_processing_set.append(
-                    self._translate_library_preparation(
-                        nmdc_id,
-                        lib_prep_input,
-                        processed_sample_id,
-                        lib_prep_row,
-                    )
+            extr_ps_id = neon_to_nmdc_extraction_processed_ids.get(neon_id)
+            libprep_ps_id = neon_to_nmdc_libprep_processed_ids.get(neon_id)
+            if not extr_ps_id or not libprep_ps_id:
+                continue
+            database.material_processing_set.append(
+                self._translate_library_preparation(
+                    libprep_id, extr_ps_id, libprep_ps_id, row
                 )
+            )
-                dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+            dna_sample_id = _get_value_or_none(row, "dnaSampleID")
+            database.processed_sample_set.append(
+                self._translate_processed_sample(
+                    libprep_ps_id,
+                    f"Library preparation for {dna_sample_id}",
+                )
+            )
-                database.processed_sample_set.append(
-                    self._translate_processed_sample(
-                        processed_sample_id,
-                        f"Library preparation for {dna_sample_id}",
+            filepaths_for_dna: list[str] = dna_files_dict.get(dna_sample_id, [])
+            if not filepaths_for_dna:
+                # no raw files => skip
+                ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
+                if ntseq_id:
+                    continue
+                continue
+            # If there are more than two raw files => we create a Manifest
+            manifest_id: Optional[str] = None
+            if len(filepaths_for_dna) > 2:
+                if dna_sample_id not in dna_sample_to_manifest_id:
+                    new_man_id = self._id_minter("nmdc:Manifest", 1)[0]
+                    dna_sample_to_manifest_id[dna_sample_id] = new_man_id
+                    database.manifest_set.append(self._translate_manifest(new_man_id))
+                manifest_id = dna_sample_to_manifest_id[dna_sample_id]
+            has_input_value = self.samp_procsm_dict.get(neon_id)
+            if not has_input_value:
+                continue
+            dataobject_ids_for_run: list[str] = []
+            for fp in filepaths_for_dna:
+                if fp not in neon_to_nmdc_dataobject_ids:
+                    continue
+                do_id = neon_to_nmdc_dataobject_ids[fp]
+                do_type = None
+                if "_R1.fastq.gz" in fp:
+                    do_type = "Metagenome Raw Read 1"
+                elif "_R2.fastq.gz" in fp:
+                    do_type = "Metagenome Raw Read 2"
+                database.data_object_set.append(
+                    self._translate_data_object(
+                        do_id=do_id,
+                        url=fp,
+                        do_type=do_type,
+                        manifest_id=manifest_id,
                     )
                 )
-                has_output = None
-                has_output_do_ids = []
-                if dna_sample_id in filtered_neon_raw_data_files_dict:
-                    has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
-                    for item in has_output:
-                        if item in neon_to_nmdc_data_object_ids:
-                            has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
-                        checksum = None
-                        do_type = None
-                        checksum = neon_raw_data_file_mappings_df[
-                            neon_raw_data_file_mappings_df["rawDataFilePath"] == item
-                        ]["checkSum"].values[0]
-                        if "_R1.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 1"
-                        elif "_R2.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 2"
-                        database.data_object_set.append(
-                            self._translate_data_object(
-                                neon_to_nmdc_data_object_ids.get(item),
-                                item,
-                                do_type,
-                                checksum,
-                            )
-                        )
-                    database.data_generation_set.append(
-                        self._translate_nucleotide_sequencing(
-                            neon_to_nmdc_omprc_ids.get(neon_id),
-                            processed_sample_id,
-                            has_output_do_ids,
-                            lib_prep_row,
-                        )
+                dataobject_ids_for_run.append(do_id)
+            ntseq_id = neon_to_nmdc_ntseq_ids.get(neon_id)
+            if ntseq_id:
+                database.data_generation_set.append(
+                    self._translate_nucleotide_sequencing(
+                        ntseq_id,
+                        has_input_value,  # <--- from self.samp_procsm_dict
+                        dataobject_ids_for_run,
+                        row,
                     )
+                )
         return database

nmdc_runtime/site/translation/neon_surface_water_translator.py CHANGED Viewed

@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
         neon_amb_data_tables = (
             "mms_swMetagenomeSequencing",
             "mms_swMetagenomeDnaExtraction",
+            "mms_swRawDataFiles",
             "amc_fieldGenetic",
             "amc_fieldSuperParent",
         )
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
                 if_exists="replace",
                 index=False,
             )
+            surface_water_data["mms_swRawDataFiles"].to_sql(
+                "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
+            )
             surface_water_data["amc_fieldGenetic"].to_sql(
                 "amc_fieldGenetic", self.conn, if_exists="replace", index=False
             )
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
             "neonEnvoTerms", self.conn, if_exists="replace", index=False
         )
-        self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
-        self.neon_raw_data_file_mappings_df.to_sql(
-            "neonRawDataFile", self.conn, if_exists="replace", index=False
-        )
+        self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
         self.site_code_mapping = site_code_mapping
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
         )
     def _translate_data_object(
-        self, do_id: str, url: str, do_type: str, checksum: str
+        self, do_id: str, url: str, do_type: str, manifest_id: str
     ) -> nmdc.DataObject:
         """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
         object mainly contains information about the sequencing file that was generated as
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
             url=url,
             description=f"sequencing results for {basename}",
             type="nmdc:DataObject",
-            md5_checksum=checksum,
             data_object_type=do_type,
+            in_manifest=manifest_id,
+        )
+    def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
+        return nmdc.Manifest(
+            id=manifest_id,
+            manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
+            type="nmdc:Manifest",
         )
     def get_database(self):
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
         """
         surface_water_samples = pd.read_sql_query(query, self.conn)
+        # --------------------------------------------------
+        # Create mappings for minted NMDC IDs
+        # --------------------------------------------------
         neon_biosample_ids = surface_water_samples["parentSampleID"]
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
         neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
             zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
         )
-        neon_omprc_ids = surface_water_samples["parentSampleID"]
-        nmdc_omprc_ids = self._id_minter(
-            "nmdc:NucleotideSequencing", len(neon_omprc_ids)
-        )
-        neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
-        neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
-        neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
-        nmdc_data_object_ids = self._id_minter(
-            "nmdc:DataObject", len(neon_raw_file_paths)
-        )
-        neon_to_nmdc_data_object_ids = dict(
-            zip(neon_raw_file_paths, nmdc_data_object_ids)
-        )
+        # --------------------------------------------------
+        # STEP 1: Insert Biosamples
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
             biosample_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
             ]
+            # database.biosample_set.append(
+            #     self._translate_biosample(neon_id, nmdc_id, biosample_row)
+            # )
-            database.biosample_set.append(
-                self._translate_biosample(neon_id, nmdc_id, biosample_row)
-            )
+        # --------------------------------------------------
+        # STEP 2: Insert Extraction Processes
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
             extraction_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     extraction_row, "genomicsSampleID"
                 )
+                # Each Extraction process output => ProcessedSample
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
                         processed_sample_id,
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     )
                 )
-        query = """
-            SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
-            FROM neonRawDataFile
-            GROUP BY dnaSampleID
-        """
-        neon_raw_data_files = pd.read_sql_query(query, self.conn)
-        neon_raw_data_files_dict = (
-            neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
-            .str.split("|")
-            .to_dict()
-        )
-        filtered_neon_raw_data_files_dict = {
-            key: value
-            for key, value in neon_raw_data_files_dict.items()
-            if len(value) <= 2
-        }
+        # --------------------------------------------------
+        # STEP 3: Insert LibraryPreparation Processes
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
             lib_prep_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
                 dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+                # Each LibraryPreparation process output => ProcessedSample
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
                         processed_sample_id,
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     )
                 )
-                has_output = None
-                has_output_do_ids = []
-                if dna_sample_id in filtered_neon_raw_data_files_dict:
-                    has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
-                    for item in has_output:
-                        if item in neon_to_nmdc_data_object_ids:
-                            has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
-                        checksum = None
-                        do_type = None
-                        checksum = neon_raw_data_file_mappings_df[
-                            neon_raw_data_file_mappings_df["rawDataFilePath"] == item
-                        ]["checkSum"].values[0]
-                        if "_R1.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 1"
-                        elif "_R2.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 2"
-                        database.data_object_set.append(
-                            self._translate_data_object(
-                                neon_to_nmdc_data_object_ids.get(item),
-                                item,
-                                do_type,
-                                checksum,
-                            )
-                        )
-                    database.data_generation_set.append(
-                        self._translate_nucleotide_sequencing(
-                            neon_to_nmdc_omprc_ids.get(neon_id),
-                            processed_sample_id,
-                            has_output_do_ids,
-                            lib_prep_row,
-                        )
+        # --------------------------------------------------
+        # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
+        #         and insert DataObjects + DataGeneration processes
+        # --------------------------------------------------
+        raw_query = """
+            SELECT dnaSampleID, sequencerRunID, rawDataFilePath
+            FROM mms_swRawDataFiles
+        """
+        neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
+        for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
+            # 1) Pull out the row that corresponds to this parentSampleID
+            lib_prep_row = surface_water_samples[
+                surface_water_samples["parentSampleID"] == neon_id
+            ]
+            # 2) Grab the dnaSampleID from that row
+            dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+            if not dna_sample_id:
+                # No dnaSampleID => skip
+                continue
+            # 3) Find all raw files for that dnaSampleID
+            dna_files = neon_raw_data_files_df[
+                neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
+            ]
+            if dna_files.empty:
+                # No raw files => skip
+                continue
+            # -----------------------------------------
+            # LOOKUP DICT: get "has_input" for this neon_id
+            # -----------------------------------------
+            has_input_value = self.samp_procsm_dict.get(neon_id)
+            # If some neon_id isn't in the dictionary, handle it as needed
+            if not has_input_value:
+                # Could skip, or raise an error, or set a default
+                continue
+            # -------------------------------------------
+            # 4) CREATE A MANIFEST IF MORE THAN TWO RAW FILES
+            #    for this row's dnaSampleID
+            # -------------------------------------------
+            manifest_id = None
+            if len(dna_files) > 2:
+                # For each row that references a dnaSampleID with multiple raw files,
+                # mint exactly one new manifest record
+                manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
+                new_manifest = self._translate_manifest(manifest_id)
+                # Add to the database
+                database.manifest_set.append(new_manifest)
+            # -------------------------------------------
+            # 5) NOW GROUP FILES BY sequencerRunID
+            #    => one data_generation record per run
+            # -------------------------------------------
+            lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
+                neon_id
+            )
+            if not lib_prep_processed_sample_id:
+                # If we don't have a ProcessedSample for some reason, skip
+                continue
+            for run_id, group_df in dna_files.groupby("sequencerRunID"):
+                # a) Mint new data_generation (NucleotideSequencing) ID for this run
+                data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
+                # b) Create DataObjects for each raw file in this run
+                data_object_ids = []
+                for raw_fp in group_df["rawDataFilePath"]:
+                    do_id = self._id_minter("nmdc:DataObject", 1)[0]
+                    # Distinguish read type
+                    do_type = None
+                    if "_R1.fastq.gz" in raw_fp:
+                        do_type = "Metagenome Raw Read 1"
+                    elif "_R2.fastq.gz" in raw_fp:
+                        do_type = "Metagenome Raw Read 2"
+                    # Create the DataObject
+                    data_obj = self._translate_data_object(
+                        do_id=do_id,
+                        url=raw_fp,
+                        do_type=do_type,
+                        manifest_id=manifest_id,  # link to the new Manifest if it exists
+                    )
+                    database.data_object_set.append(data_obj)
+                    data_object_ids.append(do_id)
+                # c) Finally, create the data generation record for this run
+                database.data_generation_set.append(
+                    self._translate_nucleotide_sequencing(
+                        nucleotide_sequencing_id=data_generation_id,
+                        processed_sample_id=has_input_value,
+                        raw_data_file_data=data_object_ids,
+                        nucleotide_sequencing_row=lib_prep_row,
                     )
+                )
         return database

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: nmdc_runtime
-Version: 2.3.0
+Version: 2.5.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -17,6 +17,7 @@ Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: home-page
+Dynamic: license-file
 Dynamic: requires-python
 Dynamic: summary
@@ -37,8 +38,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
 * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
 houses code specific to the data portal -- its database, back-end API, and front-end application.
-* [workflow_documentation](https://nmdc-workflow-documentation.readthedocs.io/en/latest/index.html)
-references workflow code spread across several repositories, that take source data and produce computed data.
+* Workflows — documented in the [workflows](https://docs.microbiomedata.org/workflows/) section of the NMDC documentation website — take source data and produce computed data.
 * This repo (nmdc-runtime)
    * houses code that takes source data and computed data, and transforms it
@@ -156,6 +156,9 @@ Tests can be found in `tests` and are run with the following commands:
 ```bash
 make up-test
 make test
+# Run a specific test file, e.g. tests/test_api/test_endpoints.py
+make test ARGS="tests/test_api/test_endpoints.py"
 ```
 As you create Dagster solids and pipelines, add tests in `tests/` to check that your code behaves as
@@ -164,6 +167,16 @@ desired and does not break over time.
 [For hints on how to write tests for solids and pipelines in Dagster, see their documentation
 tutorial on Testing](https://docs.dagster.io/tutorial/testable).
+### RAM usage
+The `dagster-daemon` and `dagster-dagit` containers can consume a lot of RAM. If tests are failing and the console of
+the `test` container shows "Error 137," here is something you can try as a workaround: In Docker Desktop, go to
+"Settings > Resources > Advanced," and increase the memory limit. One of our team members has
+found **12 GB** to be sufficient for running the tests.
+> Dedicating 12 GB of RAM to Docker may be prohibitive for some prospective developers.
+> There is an open [issue](https://github.com/microbiomedata/nmdc-runtime/issues/928) about the memory requirement.
 ## Publish to PyPI
 This repository contains a GitHub Actions workflow that publishes a Python package to [PyPI](https://pypi.org/project/nmdc-runtime/).

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/RECORD RENAMED Viewed

@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
-nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
-nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
+nmdc_runtime/site/ops.py,sha256=tg-zRlVSUSJ7B0cJbBsUwmMRmpIUmK5tsL8ABnY0wnY,46626
+nmdc_runtime/site/repository.py,sha256=kVCoIMF2rgAMUAf9a6jk0WbejFpmWgxh6nN4U37Mgc8,43919
 nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
 nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -51,21 +51,21 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
 nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
 nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
-nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
+nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
+nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
 nmdc_runtime/site/repair/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/repair/database_updater.py,sha256=EMuY8MfwQEfdejJHp0Y-Gb1eb1zOgKgfJxbtm6wM3YU,10943
+nmdc_runtime/site/repair/database_updater.py,sha256=eTNAPtgAc_xQodADBfgomwow9-14j5rBqQWF8R7BheY,11525
 nmdc_runtime/site/translation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/translation/emsl.py,sha256=-aCTJTSCNaK-Koh8BE_4fTf5nyxP1KkquR6lloLEJl0,1245
 nmdc_runtime/site/translation/gold.py,sha256=R3W99sdQb7Pgu_esN7ruIC-tyREQD_idJ4xCzkqWuGw,1622
 nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-Nfmr42-Na1gw-Xn4tg6I,32272
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
-nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
+nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=VxN7yCziQE-ZP9mtrzqI-yaS9taEgTy0EnIEattYeKo,23727
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
-nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
+nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
 nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
 nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
 nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
-nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.3.0.dist-info/RECORD,,
+nmdc_runtime-2.5.0.dist-info/licenses/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.5.0.dist-info/METADATA,sha256=tli66QKJC-48TzLXbI9iHMzTLyugbRBKj9CJEeKHXLY,8139
+nmdc_runtime-2.5.0.dist-info/WHEEL,sha256=DK49LOLCYiurdXXOXwGJm6U4DkHkg4lcxjhqwRa0CP4,91
+nmdc_runtime-2.5.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.5.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.5.0.dist-info/RECORD,,

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (78.0.2)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info/licenses}/LICENSE RENAMED Viewed

File without changes

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.5.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nmdc-runtime 2.3.0-py3-none-any.whl → 2.5.0-py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.3.0-py3-none-any.whl → 2.5.0-py3-none-any.whl