PyPI - nmdc-runtime - Versions diffs - 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl - Mend

nmdc-runtime: 2.3.0 (py3-none-any.whl) → 2.4.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (11)

nmdc_runtime/site/export/ncbi_xml.py CHANGED Viewed

@@ -4,7 +4,7 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
-from typing import Any
+from typing import Any, List, Union
 from urllib.parse import urlparse
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     get_instruments,
@@ -366,7 +366,14 @@ class NCBISubmissionXML:
                             )
                             # Currently, we are making the assumption that only one instrument
                             # is used to sequence a Biosample
-                            instrument_id = ntseq.get("instrument_used", "")[0]
+                            instrument_used: List[str] = ntseq.get(
+                                "instrument_used", []
+                            )
+                            if not instrument_used:
+                                instrument_id = None
+                            else:
+                                instrument_id = instrument_used[0]
                             instrument = all_instruments.get(instrument_id, {})
                             instrument_vendor = instrument.get("vendor", "")
                             instrument_model = instrument.get("model", "")
@@ -448,6 +455,20 @@ class NCBISubmissionXML:
                                 "Attribute", "NextSeq 550", {"name": "instrument_model"}
                             )
                         )
+                    elif instrument_model == "novaseq_6000":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute",
+                                "NovaSeq 6000",
+                                {"name": "instrument_model"},
+                            )
+                        )
+                    elif instrument_model == "hiseq":
+                        sra_attributes.append(
+                            self.set_element(
+                                "Attribute", "HiSeq", {"name": "instrument_model"}
+                            )
+                        )
                 if analyte_category == "metagenome":
                     sra_attributes.append(

nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED Viewed

@@ -1,6 +1,10 @@
 from io import BytesIO, StringIO
+from typing import Any, Dict, List, Union
+from nmdc_runtime.api.endpoints.util import strip_oid
 from nmdc_runtime.minter.config import typecodes
 from lxml import etree
+from pymongo.collection import Collection
 import csv
 import requests
@@ -45,35 +49,53 @@ def get_instruments(instrument_set_collection):
         raise RuntimeError(f"An error occurred while fetching instrument data: {e}")
-def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
+def fetch_data_objects_from_biosamples(
+    all_docs_collection: Collection,
+    data_object_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the data objects that are "associated" (derived from/products of)
+    with their respective biosamples by iterating over the alldocs collection recursively.
+    The methods returns a dictionary with biosample ids as keys and the associated list of
+    data objects as values.
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_object_set: reference to the data_object_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated data objects as values
+    """
+    biosample_data_objects = []
+    def collect_data_objects(doc_ids, collected_objects, unique_ids):
+        for doc_id in doc_ids:
+            if (
+                get_classname_from_typecode(doc_id) == "DataObject"
+                and doc_id not in unique_ids
+            ):
+                data_obj = data_object_set.find_one({"id": doc_id})
+                if data_obj:
+                    collected_objects.append(strip_oid(data_obj))
+                    unique_ids.add(doc_id)
     biosample_data_objects = []
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
         collected_data_objects = []
+        unique_ids = set()
         while current_ids:
             new_current_ids = []
             for current_id in current_ids:
-                query = {"has_input": current_id}
-                document = all_docs_collection.find_one(query)
+                for doc in all_docs_collection.find({"has_input": current_id}):
+                    has_output = doc.get("has_output", [])
-                if not document:
-                    continue
-                has_output = document.get("has_output")
-                if not has_output:
-                    continue
-                for output_id in has_output:
-                    if get_classname_from_typecode(output_id) == "DataObject":
-                        data_object_doc = all_docs_collection.find_one(
-                            {"id": output_id}
-                        )
-                        if data_object_doc:
-                            collected_data_objects.append(data_object_doc)
-                    else:
-                        new_current_ids.append(output_id)
+                    collect_data_objects(has_output, collected_data_objects, unique_ids)
+                    new_current_ids.extend(
+                        op
+                        for op in has_output
+                        if get_classname_from_typecode(op) != "DataObject"
+                    )
             current_ids = new_current_ids
@@ -83,12 +105,25 @@ def fetch_data_objects_from_biosamples(all_docs_collection, biosamples_list):
     return biosample_data_objects
-def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_list):
-    biosample_data_objects = []
+def fetch_nucleotide_sequencing_from_biosamples(
+    all_docs_collection: Collection,
+    data_generation_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the nucleotide sequencing process records that create data objects
+    for biosamples by iterating over the alldocs collection recursively.
+    :param all_docs_collection: reference to the alldocs collection
+    :param data_generation_set: reference to the data_generation_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated nucleotide sequencing
+    process objects as values
+    """
+    biosample_ntseq_objects = []
     for biosample in biosamples_list:
         current_ids = [biosample["id"]]
-        collected_data_objects = []
+        collected_ntseq_objects = []
         while current_ids:
             new_current_ids = []
@@ -105,23 +140,39 @@ def fetch_nucleotide_sequencing_from_biosamples(all_docs_collection, biosamples_
                 for output_id in has_output:
                     if get_classname_from_typecode(output_id) == "DataObject":
-                        nucleotide_sequencing_doc = all_docs_collection.find_one(
+                        nucleotide_sequencing_doc = data_generation_set.find_one(
                             {"id": document["id"]}
                         )
                         if nucleotide_sequencing_doc:
-                            collected_data_objects.append(nucleotide_sequencing_doc)
+                            collected_ntseq_objects.append(
+                                strip_oid(nucleotide_sequencing_doc)
+                            )
                     else:
                         new_current_ids.append(output_id)
             current_ids = new_current_ids
-        if collected_data_objects:
-            biosample_data_objects.append({biosample["id"]: collected_data_objects})
+        if collected_ntseq_objects:
+            biosample_ntseq_objects.append({biosample["id"]: collected_ntseq_objects})
+    return biosample_ntseq_objects
-    return biosample_data_objects
+def fetch_library_preparation_from_biosamples(
+    all_docs_collection: Collection,
+    material_processing_set: Collection,
+    biosamples_list: List[Dict[str, Any]],
+) -> List[Dict[str, Dict[str, Any]]]:
+    """This method fetches the library preparation process records that create processed samples,
+    which are further fed/inputted into (by `has_input` slot) a nucleotide sequencing process
+    for biosamples by iterating over the alldocs collection recursively.
-def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_list):
+    :param all_docs_collection: reference to the alldocs collection
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of biosamples as JSON documents
+    :return: list of dictionaries with biosample ids as keys and associated library preparation process
+    objects as values
+    """
     biosample_lib_prep = []
     for biosample in biosamples_list:
@@ -144,10 +195,10 @@ def fetch_library_preparation_from_biosamples(all_docs_collection, biosamples_li
                 "has_input": output_id,
                 "type": {"$in": ["LibraryPreparation"]},
             }
-            lib_prep_doc = all_docs_collection.find_one(lib_prep_query)
+            lib_prep_doc = material_processing_set.find_one(lib_prep_query)
             if lib_prep_doc:
-                biosample_lib_prep.append({biosample_id: lib_prep_doc})
+                biosample_lib_prep.append({biosample_id: strip_oid(lib_prep_doc)})
                 break  # Stop at the first document that meets the criteria
     return biosample_lib_prep

nmdc_runtime/site/ops.py CHANGED Viewed

@@ -1188,8 +1188,9 @@ def get_ncbi_export_pipeline_inputs(context: OpExecutionContext) -> str:
 def get_data_objects_from_biosamples(context: OpExecutionContext, biosamples: list):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_object_set = mdb["data_object_set"]
     biosample_data_objects = fetch_data_objects_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_object_set, biosamples
     )
     return biosample_data_objects
@@ -1200,8 +1201,9 @@ def get_nucleotide_sequencing_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    data_generation_set = mdb["data_generation_set"]
     biosample_omics_processing = fetch_nucleotide_sequencing_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, data_generation_set, biosamples
     )
     return biosample_omics_processing
@@ -1212,8 +1214,9 @@ def get_library_preparation_from_biosamples(
 ):
     mdb = context.resources.mongo.db
     alldocs_collection = mdb["alldocs"]
+    material_processing_set = mdb["material_processing_set"]
     biosample_lib_prep = fetch_library_preparation_from_biosamples(
-        alldocs_collection, biosamples
+        alldocs_collection, material_processing_set, biosamples
     )
     return biosample_lib_prep

nmdc_runtime/site/repository.py CHANGED Viewed

@@ -652,7 +652,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -694,7 +694,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -737,7 +737,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_benthic_data_product": {
@@ -779,7 +779,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },
@@ -822,14 +822,14 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                     "get_neon_pipeline_surface_water_data_product": {
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -856,7 +856,7 @@ def biosample_submission_ingest():
                         "config": {
                             "surface_water_data_product": {
                                 "product_id": "DP1.20281.001",
-                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent",
+                                "product_tables": "mms_swMetagenomeSequencing, mms_swMetagenomeDnaExtraction, amc_fieldGenetic, amc_fieldSuperParent, mms_swRawDataFiles",
                             }
                         }
                     },
@@ -864,7 +864,7 @@ def biosample_submission_ingest():
                         "inputs": {
                             "neon_envo_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/neon_mixs_env_triad_mappings/neon-nlcd-local-broad-mappings.tsv",
                             "neon_raw_data_file_mappings_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/main/assets/misc/neon_raw_data_file_mappings.tsv",
-                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/berkeley-schema-fy24/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
+                            "neon_nmdc_instrument_mapping_file_url": "https://raw.githubusercontent.com/microbiomedata/nmdc-schema/refs/heads/main/assets/misc/neon_sequencingMethod_to_nmdc_instrument_set.tsv",
                         }
                     },
                 },

nmdc_runtime/site/translation/neon_surface_water_translator.py CHANGED Viewed

@@ -71,6 +71,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
         neon_amb_data_tables = (
             "mms_swMetagenomeSequencing",
             "mms_swMetagenomeDnaExtraction",
+            "mms_swRawDataFiles",
             "amc_fieldGenetic",
             "amc_fieldSuperParent",
         )
@@ -88,6 +89,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
                 if_exists="replace",
                 index=False,
             )
+            surface_water_data["mms_swRawDataFiles"].to_sql(
+                "mms_swRawDataFiles", self.conn, if_exists="replace", index=False
+            )
             surface_water_data["amc_fieldGenetic"].to_sql(
                 "amc_fieldGenetic", self.conn, if_exists="replace", index=False
             )
@@ -103,10 +107,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
             "neonEnvoTerms", self.conn, if_exists="replace", index=False
         )
-        self.neon_raw_data_file_mappings_df = neon_raw_data_file_mappings_file
-        self.neon_raw_data_file_mappings_df.to_sql(
-            "neonRawDataFile", self.conn, if_exists="replace", index=False
-        )
+        self.neon_raw_data_file_mappings_df = surface_water_data["mms_swRawDataFiles"]
         self.site_code_mapping = site_code_mapping
@@ -371,7 +372,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
         )
     def _translate_data_object(
-        self, do_id: str, url: str, do_type: str, checksum: str
+        self, do_id: str, url: str, do_type: str, manifest_id: str
     ) -> nmdc.DataObject:
         """Create nmdc DataObject which is the output of a NucleotideSequencing process. This
         object mainly contains information about the sequencing file that was generated as
@@ -395,8 +396,15 @@ class NeonSurfaceWaterDataTranslator(Translator):
             url=url,
             description=f"sequencing results for {basename}",
             type="nmdc:DataObject",
-            md5_checksum=checksum,
             data_object_type=do_type,
+            in_manifest=manifest_id,
+        )
+    def _translate_manifest(self, manifest_id: str) -> nmdc.Manifest:
+        return nmdc.Manifest(
+            id=manifest_id,
+            manifest_category=nmdc.ManifestCategoryEnum.poolable_replicates,
+            type="nmdc:Manifest",
         )
     def get_database(self):
@@ -477,6 +485,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
         """
         surface_water_samples = pd.read_sql_query(query, self.conn)
+        # --------------------------------------------------
+        # Create mappings for minted NMDC IDs
+        # --------------------------------------------------
         neon_biosample_ids = surface_water_samples["parentSampleID"]
         nmdc_biosample_ids = self._id_minter("nmdc:Biosample", len(neon_biosample_ids))
         neon_to_nmdc_biosample_ids = dict(zip(neon_biosample_ids, nmdc_biosample_ids))
@@ -511,30 +522,20 @@ class NeonSurfaceWaterDataTranslator(Translator):
             zip(neon_lib_prep_processed_ids, nmdc_lib_prep_processed_ids)
         )
-        neon_omprc_ids = surface_water_samples["parentSampleID"]
-        nmdc_omprc_ids = self._id_minter(
-            "nmdc:NucleotideSequencing", len(neon_omprc_ids)
-        )
-        neon_to_nmdc_omprc_ids = dict(zip(neon_omprc_ids, nmdc_omprc_ids))
-        neon_raw_data_file_mappings_df = self.neon_raw_data_file_mappings_df
-        neon_raw_file_paths = neon_raw_data_file_mappings_df["rawDataFilePath"]
-        nmdc_data_object_ids = self._id_minter(
-            "nmdc:DataObject", len(neon_raw_file_paths)
-        )
-        neon_to_nmdc_data_object_ids = dict(
-            zip(neon_raw_file_paths, nmdc_data_object_ids)
-        )
+        # --------------------------------------------------
+        # STEP 1: Insert Biosamples
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_biosample_ids.items():
             biosample_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
             ]
+            # database.biosample_set.append(
+            #     self._translate_biosample(neon_id, nmdc_id, biosample_row)
+            # )
-            database.biosample_set.append(
-                self._translate_biosample(neon_id, nmdc_id, biosample_row)
-            )
+        # --------------------------------------------------
+        # STEP 2: Insert Extraction Processes
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_extraction_ids.items():
             extraction_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
@@ -557,6 +558,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     extraction_row, "genomicsSampleID"
                 )
+                # Each Extraction process output => ProcessedSample
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
                         processed_sample_id,
@@ -564,23 +566,9 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     )
                 )
-        query = """
-            SELECT dnaSampleID, GROUP_CONCAT(rawDataFilePath, '|') AS rawDataFilePaths
-            FROM neonRawDataFile
-            GROUP BY dnaSampleID
-        """
-        neon_raw_data_files = pd.read_sql_query(query, self.conn)
-        neon_raw_data_files_dict = (
-            neon_raw_data_files.set_index("dnaSampleID")["rawDataFilePaths"]
-            .str.split("|")
-            .to_dict()
-        )
-        filtered_neon_raw_data_files_dict = {
-            key: value
-            for key, value in neon_raw_data_files_dict.items()
-            if len(value) <= 2
-        }
+        # --------------------------------------------------
+        # STEP 3: Insert LibraryPreparation Processes
+        # --------------------------------------------------
         for neon_id, nmdc_id in neon_to_nmdc_lib_prep_ids.items():
             lib_prep_row = surface_water_samples[
                 surface_water_samples["parentSampleID"] == neon_id
@@ -601,6 +589,7 @@ class NeonSurfaceWaterDataTranslator(Translator):
                 dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+                # Each LibraryPreparation process output => ProcessedSample
                 database.processed_sample_set.append(
                     self._translate_processed_sample(
                         processed_sample_id,
@@ -608,42 +597,103 @@ class NeonSurfaceWaterDataTranslator(Translator):
                     )
                 )
-                has_output = None
-                has_output_do_ids = []
-                if dna_sample_id in filtered_neon_raw_data_files_dict:
-                    has_output = filtered_neon_raw_data_files_dict[dna_sample_id]
-                    for item in has_output:
-                        if item in neon_to_nmdc_data_object_ids:
-                            has_output_do_ids.append(neon_to_nmdc_data_object_ids[item])
-                        checksum = None
-                        do_type = None
-                        checksum = neon_raw_data_file_mappings_df[
-                            neon_raw_data_file_mappings_df["rawDataFilePath"] == item
-                        ]["checkSum"].values[0]
-                        if "_R1.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 1"
-                        elif "_R2.fastq.gz" in item:
-                            do_type = "Metagenome Raw Read 2"
-                        database.data_object_set.append(
-                            self._translate_data_object(
-                                neon_to_nmdc_data_object_ids.get(item),
-                                item,
-                                do_type,
-                                checksum,
-                            )
-                        )
-                    database.data_generation_set.append(
-                        self._translate_nucleotide_sequencing(
-                            neon_to_nmdc_omprc_ids.get(neon_id),
-                            processed_sample_id,
-                            has_output_do_ids,
-                            lib_prep_row,
-                        )
+        # --------------------------------------------------
+        # STEP 4: Group raw files by (dnaSampleID, sequencerRunID)
+        #         and insert DataObjects + DataGeneration processes
+        # --------------------------------------------------
+        raw_query = """
+            SELECT dnaSampleID, sequencerRunID, rawDataFilePath
+            FROM mms_swRawDataFiles
+        """
+        neon_raw_data_files_df = pd.read_sql_query(raw_query, self.conn)
+        for neon_id, nmdc_libprep_id in neon_to_nmdc_lib_prep_ids.items():
+            # 1) Pull out the row that corresponds to this parentSampleID
+            lib_prep_row = surface_water_samples[
+                surface_water_samples["parentSampleID"] == neon_id
+            ]
+            # 2) Grab the dnaSampleID from that row
+            dna_sample_id = _get_value_or_none(lib_prep_row, "dnaSampleID")
+            if not dna_sample_id:
+                # No dnaSampleID => skip
+                continue
+            # 3) Find all raw files for that dnaSampleID
+            dna_files = neon_raw_data_files_df[
+                neon_raw_data_files_df["dnaSampleID"] == dna_sample_id
+            ]
+            if dna_files.empty:
+                # No raw files => skip
+                continue
+            # -----------------------------------------
+            # LOOKUP DICT: get "has_input" for this neon_id
+            # -----------------------------------------
+            has_input_value = self.samp_procsm_dict.get(neon_id)
+            # If some neon_id isn't in the dictionary, handle it as needed
+            if not has_input_value:
+                # Could skip, or raise an error, or set a default
+                continue
+            # -------------------------------------------
+            # 4) CREATE A MANIFEST IF MULTIPLE RAW FILES
+            #    for this row's dnaSampleID
+            # -------------------------------------------
+            manifest_id = None
+            if len(dna_files) > 2:
+                # For each row that references a dnaSampleID with multiple raw files,
+                # mint exactly one new manifest record
+                manifest_id = self._id_minter("nmdc:Manifest", 1)[0]
+                new_manifest = self._translate_manifest(manifest_id)
+                # Add to the database
+                database.manifest_set.append(new_manifest)
+            # -------------------------------------------
+            # 5) NOW GROUP FILES BY sequencerRunID
+            #    => one data_generation record per run
+            # -------------------------------------------
+            lib_prep_processed_sample_id = neon_to_nmdc_lib_prep_processed_ids.get(
+                neon_id
+            )
+            if not lib_prep_processed_sample_id:
+                # If we don't have a ProcessedSample for some reason, skip
+                continue
+            for run_id, group_df in dna_files.groupby("sequencerRunID"):
+                # a) Mint new data_generation (NucleotideSequencing) ID for this run
+                data_generation_id = self._id_minter("nmdc:NucleotideSequencing", 1)[0]
+                # b) Create DataObjects for each raw file in this run
+                data_object_ids = []
+                for raw_fp in group_df["rawDataFilePath"]:
+                    do_id = self._id_minter("nmdc:DataObject", 1)[0]
+                    # Distinguish read type
+                    do_type = None
+                    if "_R1.fastq.gz" in raw_fp:
+                        do_type = "Metagenome Raw Read 1"
+                    elif "_R2.fastq.gz" in raw_fp:
+                        do_type = "Metagenome Raw Read 2"
+                    # Create the DataObject
+                    data_obj = self._translate_data_object(
+                        do_id=do_id,
+                        url=raw_fp,
+                        do_type=do_type,
+                        manifest_id=manifest_id,  # link to the new Manifest if it exists
+                    )
+                    database.data_object_set.append(data_obj)
+                    data_object_ids.append(do_id)
+                # c) Finally, create the data generation record for this run
+                database.data_generation_set.append(
+                    self._translate_nucleotide_sequencing(
+                        nucleotide_sequencing_id=data_generation_id,
+                        processed_sample_id=has_input_value,
+                        raw_data_file_data=data_object_ids,
+                        nucleotide_sequencing_row=lib_prep_row,
                     )
+                )
         return database

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: nmdc_runtime
-Version: 2.3.0
+Version: 2.4.0
 Summary: A runtime system for NMDC data management and orchestration
 Home-page: https://github.com/microbiomedata/nmdc-runtime
 Author: Donny Winston
@@ -37,7 +37,7 @@ houses the LinkML schema specification, as well as generated artifacts (e.g. JSO
 * [nmdc-server](https://github.com/microbiomedata/nmdc-server)
 houses code specific to the data portal -- its database, back-end API, and front-end application.
-* [workflow_documentation](https://nmdc-workflow-documentation.readthedocs.io/en/latest/index.html)
+* [workflow_documentation](https://docs.microbiomedata.org/workflows/)
 references workflow code spread across several repositories, that take source data and produce computed data.
 * This repo (nmdc-runtime)

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/RECORD RENAMED Viewed

@@ -37,8 +37,8 @@ nmdc_runtime/minter/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 nmdc_runtime/minter/entrypoints/fastapi_app.py,sha256=JC4thvzfFwRc1mhWQ-kHy3yvs0SYxF6ktE7LXNCwqlI,4031
 nmdc_runtime/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/graphs.py,sha256=SA4Ibb1TenVgiYD4wNrMiOCwkMKrv6BTcQ8mbG4VXPs,15666
-nmdc_runtime/site/ops.py,sha256=OrTQLSBToih-kI754RtVqjUMRojgYJQmb7B_VRjZWtg,46223
-nmdc_runtime/site/repository.py,sha256=b3UVQznelU8wDOfuc9_vE_eqFGOoFRiHtQJJH7or73E,43875
+nmdc_runtime/site/ops.py,sha256=p4F5SrDbFdKOrAHu1TUhWQA33QB7hdoQmCCuU-00Eqo,46445
+nmdc_runtime/site/repository.py,sha256=pfx7WAVgdNaPhtfF2pak-tllqPMf4-yUeOXSpr4hu30,43861
 nmdc_runtime/site/resources.py,sha256=sqtRWb4ewU61U-JZTphsC4wBvYT5B0wj33WU70vjq_k,19677
 nmdc_runtime/site/util.py,sha256=-DnNv15r6XmFZrDG3d3Z-bFn2pFo0xpyyudhKlYPJKc,1559
 nmdc_runtime/site/backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -51,8 +51,8 @@ nmdc_runtime/site/drsobjects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 nmdc_runtime/site/drsobjects/ingest.py,sha256=pcMP69WSzFHFqHB9JIL55ePFhilnCLRc2XHCQ97w1Ik,3107
 nmdc_runtime/site/drsobjects/registration.py,sha256=D1T3QUuxEOxqKZIvB5rkb_6ZxFZiA-U9SMPajyeWC2Y,3572
 nmdc_runtime/site/export/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nmdc_runtime/site/export/ncbi_xml.py,sha256=Vb4rNP3uhnGlHqrwUGgA2DzpOotCf3S8G4sIJml7gl4,25287
-nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Jd-d8GGkB3e71TPpl_lPukQ54TioQZynO1yPSLX_aHs,8390
+nmdc_runtime/site/export/ncbi_xml.py,sha256=Jny1SqaVlscjiMHD73aiO8dxjbdPPJF5ksvNApHh5Ck,26230
+nmdc_runtime/site/export/ncbi_xml_utils.py,sha256=Qh96kk3TxwXxzuaWi6fsqe4Smarc2NILFP8pWRD3gFA,10915
 nmdc_runtime/site/export/study_metadata.py,sha256=yR5pXL6JG8d7cAtqcF-60Hp7bLD3dJ0Rut4AtYc0tXA,4844
 nmdc_runtime/site/normalization/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nmdc_runtime/site/normalization/gold.py,sha256=iISDD4qs4d6uLhv631WYNeQVOzY5DO201ZpPtxHdkVk,1311
@@ -65,7 +65,7 @@ nmdc_runtime/site/translation/gold_translator.py,sha256=Zw4IjxMWJ1wdLyxX44-faq-N
 nmdc_runtime/site/translation/jgi.py,sha256=qk878KhIw674TkrVfbl2x1QJrKi3zlvE0vesIpe9slM,876
 nmdc_runtime/site/translation/neon_benthic_translator.py,sha256=QIDqYLuf-NlGY9_88gy_5qTswkei3OfgJ5AOFpEXzJo,23985
 nmdc_runtime/site/translation/neon_soil_translator.py,sha256=Rol0g67nVBGSBySUzpfdW4Fwes7bKtvnlv2g5cB0aTI,38550
-nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=MQgjIfWPgoRe-bhzyfqHSe2mZwFsjcwjdT8tNqpIhlc,27729
+nmdc_runtime/site/translation/neon_surface_water_translator.py,sha256=k06eULMTYx0sQ00UlyeNJvCJMcX-neClnES1G6zpPKg,30517
 nmdc_runtime/site/translation/neon_utils.py,sha256=d00o7duKKugpLHmsEifNbp4WjeC4GOqcgw0b5qlCg4I,5549
 nmdc_runtime/site/translation/submission_portal_translator.py,sha256=9KhFn2jlRlGEAhsZWRPkpmInpxuVmnbCyR6jhlD7ooA,30587
 nmdc_runtime/site/translation/translator.py,sha256=V6Aq0y03LoQ4LTL2iHDHxGTh_eMjOmDJJSwNHSrp2wo,837
@@ -75,9 +75,9 @@ nmdc_runtime/site/validation/emsl.py,sha256=OG20mv_3E2rkQqTQtYO0_SVRqFb-Z_zKCiAV
 nmdc_runtime/site/validation/gold.py,sha256=Z5ZzYdjERbrJ2Tu06d0TDTBSfwaFdL1Z23Rl-YkZ2Ow,803
 nmdc_runtime/site/validation/jgi.py,sha256=LdJfhqBVHWCDp0Kzyk8eJZMwEI5NQ-zuTda31BcGwOA,1299
 nmdc_runtime/site/validation/util.py,sha256=GGbMDSwR090sr_E_fHffCN418gpYESaiot6XghS7OYk,3349
-nmdc_runtime-2.3.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
-nmdc_runtime-2.3.0.dist-info/METADATA,sha256=BEManThNKOEkfS9woYYiwm1ya6BENBC6vXE6b7L_z2E,7430
-nmdc_runtime-2.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-nmdc_runtime-2.3.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
-nmdc_runtime-2.3.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
-nmdc_runtime-2.3.0.dist-info/RECORD,,
+nmdc_runtime-2.4.0.dist-info/LICENSE,sha256=VWiv65r7gHGjgtr3jMJYVmQny5GRpQ6H-W9sScb1x70,2408
+nmdc_runtime-2.4.0.dist-info/METADATA,sha256=CeZZbucd3jrD0ZqGdreH2x7ALrM9pt4ksGV2olkkpPI,7401
+nmdc_runtime-2.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+nmdc_runtime-2.4.0.dist-info/entry_points.txt,sha256=JxdvOnvxHK_8046cwlvE30s_fV0-k-eTpQtkKYA69nQ,224
+nmdc_runtime-2.4.0.dist-info/top_level.txt,sha256=b0K1s09L_iHH49ueBKaLrB5-lh6cyrSv9vL6x4Qvyz8,13
+nmdc_runtime-2.4.0.dist-info/RECORD,,

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nmdc_runtime-2.3.0.dist-info → nmdc_runtime-2.4.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

nmdc-runtime 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.3.0__py3-none-any.whl → 2.4.0__py3-none-any.whl