PyPI - nmdc-runtime - Versions diffs - 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl - Mend

nmdc-runtime 2.9.0 (py3-none-any.whl) → 2.11.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nmdc-runtime might be problematic. Click here for more details.

Files changed (131)

nmdc_runtime/Dockerfile +167 -0
nmdc_runtime/api/analytics.py +90 -0
nmdc_runtime/api/boot/capabilities.py +9 -0
nmdc_runtime/api/boot/object_types.py +126 -0
nmdc_runtime/api/boot/triggers.py +84 -0
nmdc_runtime/api/boot/workflows.py +116 -0
nmdc_runtime/api/core/auth.py +208 -0
nmdc_runtime/api/core/idgen.py +200 -0
nmdc_runtime/api/core/metadata.py +788 -0
nmdc_runtime/api/core/util.py +109 -0
nmdc_runtime/api/db/mongo.py +435 -0
nmdc_runtime/api/db/s3.py +37 -0
nmdc_runtime/api/endpoints/capabilities.py +25 -0
nmdc_runtime/api/endpoints/find.py +634 -0
nmdc_runtime/api/endpoints/jobs.py +143 -0
nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
nmdc_runtime/api/endpoints/metadata.py +260 -0
nmdc_runtime/api/endpoints/nmdcschema.py +502 -0
nmdc_runtime/api/endpoints/object_types.py +38 -0
nmdc_runtime/api/endpoints/objects.py +270 -0
nmdc_runtime/api/endpoints/operations.py +78 -0
nmdc_runtime/api/endpoints/queries.py +701 -0
nmdc_runtime/api/endpoints/runs.py +98 -0
nmdc_runtime/api/endpoints/search.py +38 -0
nmdc_runtime/api/endpoints/sites.py +205 -0
nmdc_runtime/api/endpoints/triggers.py +25 -0
nmdc_runtime/api/endpoints/users.py +214 -0
nmdc_runtime/api/endpoints/util.py +796 -0
nmdc_runtime/api/endpoints/workflows.py +353 -0
nmdc_runtime/api/entrypoint.sh +7 -0
nmdc_runtime/api/main.py +425 -0
nmdc_runtime/api/middleware.py +43 -0
nmdc_runtime/api/models/capability.py +14 -0
nmdc_runtime/api/models/id.py +92 -0
nmdc_runtime/api/models/job.py +37 -0
nmdc_runtime/api/models/lib/helpers.py +78 -0
nmdc_runtime/api/models/metadata.py +11 -0
nmdc_runtime/api/models/nmdc_schema.py +146 -0
nmdc_runtime/api/models/object.py +180 -0
nmdc_runtime/api/models/object_type.py +20 -0
nmdc_runtime/api/models/operation.py +66 -0
nmdc_runtime/api/models/query.py +246 -0
nmdc_runtime/api/models/query_continuation.py +111 -0
nmdc_runtime/api/models/run.py +161 -0
nmdc_runtime/api/models/site.py +87 -0
nmdc_runtime/api/models/trigger.py +13 -0
nmdc_runtime/api/models/user.py +140 -0
nmdc_runtime/api/models/util.py +260 -0
nmdc_runtime/api/models/workflow.py +15 -0
nmdc_runtime/api/openapi.py +178 -0
nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
nmdc_runtime/config.py +7 -8
nmdc_runtime/minter/adapters/repository.py +22 -2
nmdc_runtime/minter/config.py +2 -0
nmdc_runtime/minter/domain/model.py +55 -1
nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
nmdc_runtime/mongo_util.py +1 -2
nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
nmdc_runtime/site/dagster.yaml +53 -0
nmdc_runtime/site/entrypoint-daemon.sh +26 -0
nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
nmdc_runtime/site/entrypoint-dagit.sh +26 -0
nmdc_runtime/site/export/ncbi_xml.py +633 -13
nmdc_runtime/site/export/ncbi_xml_utils.py +115 -1
nmdc_runtime/site/graphs.py +8 -22
nmdc_runtime/site/ops.py +147 -181
nmdc_runtime/site/repository.py +2 -112
nmdc_runtime/site/resources.py +16 -3
nmdc_runtime/site/translation/gold_translator.py +4 -12
nmdc_runtime/site/translation/neon_benthic_translator.py +0 -1
nmdc_runtime/site/translation/neon_soil_translator.py +4 -5
nmdc_runtime/site/translation/neon_surface_water_translator.py +0 -2
nmdc_runtime/site/translation/submission_portal_translator.py +84 -68
nmdc_runtime/site/translation/translator.py +63 -1
nmdc_runtime/site/util.py +8 -3
nmdc_runtime/site/validation/util.py +10 -5
nmdc_runtime/site/workspace.yaml +13 -0
nmdc_runtime/static/NMDC_logo.svg +1073 -0
nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
nmdc_runtime/static/README.md +5 -0
nmdc_runtime/static/favicon.ico +0 -0
nmdc_runtime/util.py +90 -48
nmdc_runtime-2.11.0.dist-info/METADATA +46 -0
nmdc_runtime-2.11.0.dist-info/RECORD +128 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/WHEEL +1 -2
nmdc_runtime/containers.py +0 -14
nmdc_runtime/core/db/Database.py +0 -15
nmdc_runtime/core/exceptions/__init__.py +0 -23
nmdc_runtime/core/exceptions/base.py +0 -47
nmdc_runtime/core/exceptions/token.py +0 -13
nmdc_runtime/domain/users/queriesInterface.py +0 -18
nmdc_runtime/domain/users/userSchema.py +0 -37
nmdc_runtime/domain/users/userService.py +0 -14
nmdc_runtime/infrastructure/database/db.py +0 -3
nmdc_runtime/infrastructure/database/models/user.py +0 -10
nmdc_runtime/lib/__init__.py +0 -1
nmdc_runtime/lib/extract_nmdc_data.py +0 -41
nmdc_runtime/lib/load_nmdc_data.py +0 -121
nmdc_runtime/lib/nmdc_dataframes.py +0 -829
nmdc_runtime/lib/nmdc_etl_class.py +0 -402
nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
nmdc_runtime/site/drsobjects/ingest.py +0 -93
nmdc_runtime/site/drsobjects/registration.py +0 -131
nmdc_runtime/site/translation/emsl.py +0 -43
nmdc_runtime/site/translation/gold.py +0 -53
nmdc_runtime/site/translation/jgi.py +0 -32
nmdc_runtime/site/translation/util.py +0 -132
nmdc_runtime/site/validation/jgi.py +0 -43
nmdc_runtime-2.9.0.dist-info/METADATA +0 -214
nmdc_runtime-2.9.0.dist-info/RECORD +0 -84
nmdc_runtime-2.9.0.dist-info/top_level.txt +0 -1
/nmdc_runtime/{client → api}/__init__.py +0 -0
/nmdc_runtime/{core → api/boot}/__init__.py +0 -0
/nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
/nmdc_runtime/{domain → api/db}/__init__.py +0 -0
/nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
/nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
/nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
/nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/entry_points.txt +0 -0
{nmdc_runtime-2.9.0.dist-info → nmdc_runtime-2.11.0.dist-info}/licenses/LICENSE +0 -0

nmdc_runtime/site/export/ncbi_xml.py CHANGED

@@ -4,8 +4,9 @@ import datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
-from typing import Any, List, Union
+from typing import Any, List
 from urllib.parse import urlparse
+from unidecode import unidecode
 from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_controlled_identified_term_value,
     handle_controlled_term_value,
@@ -16,7 +17,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     handle_float_value,
     handle_string_value,
     load_mappings,
-    validate_xml,
 )
@@ -163,16 +163,53 @@ class NCBISubmissionXML:
         org,
         bioproject_id,
         nmdc_biosamples,
+        pooled_biosamples_data=None,
     ):
         attribute_mappings, slot_range_mappings = load_mappings(
             self.nmdc_ncbi_attribute_mapping_file_url
         )
+        # Use provided pooling data or empty dict
+        pooling_data = pooled_biosamples_data or {}
+        # Group biosamples by pooling process
+        pooling_groups = {}
+        individual_biosamples = []
         for biosample in nmdc_biosamples:
+            pooling_info = pooling_data.get(biosample["id"], {})
+            if pooling_info and pooling_info.get("pooling_process_id"):
+                pooling_process_id = pooling_info["pooling_process_id"]
+                if pooling_process_id not in pooling_groups:
+                    pooling_groups[pooling_process_id] = {
+                        "biosamples": [],
+                        "pooling_info": pooling_info,
+                    }
+                pooling_groups[pooling_process_id]["biosamples"].append(biosample)
+            else:
+                individual_biosamples.append(biosample)
+        # Process pooled sample groups - create one <Action> block per pooling process
+        for pooling_process_id, group_data in pooling_groups.items():
+            self._create_pooled_biosample_action(
+                group_data["biosamples"],
+                group_data["pooling_info"],
+                organism_name,
+                org,
+                bioproject_id,
+                attribute_mappings,
+                slot_range_mappings,
+            )
+        # Process individual biosamples
+        for biosample in individual_biosamples:
             attributes = {}
             sample_id_value = None
             env_package = None
+            # Get pooling info for this specific biosample
+            pooling_info = pooling_data.get(biosample["id"], {})
             for json_key, value in biosample.items():
                 if isinstance(value, list):
                     for item in value:
@@ -191,15 +228,6 @@ class NCBISubmissionXML:
                             attributes[xml_key] = value
                             continue  # Skip applying the handler to this key
-                        # Special handling for "host_taxid"
-                        if json_key == "host_taxid" and isinstance(value, dict):
-                            if "term" in value and "id" in value["term"]:
-                                value = re.findall(
-                                    r"\d+", value["term"]["id"].split(":")[1]
-                                )[0]
-                            attributes[xml_key] = value
-                            continue  # Skip applying the handler to this key
                         formatted_value = handler(item)
                         # Combine multiple values with a separator for list elements
@@ -214,7 +242,11 @@ class NCBISubmissionXML:
                 # Special handling for NMDC Biosample "id"
                 if json_key == "id":
-                    sample_id_value = value
+                    # Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
+                    if pooling_info and pooling_info.get("processed_sample_id"):
+                        sample_id_value = pooling_info["processed_sample_id"]
+                    else:
+                        sample_id_value = value
                     continue
                 if json_key not in attribute_mappings:
@@ -237,10 +269,39 @@ class NCBISubmissionXML:
                     attributes[xml_key] = value
                     continue  # Skip applying the handler to this key
+                # Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
+                if json_key == "geo_loc_name":
+                    formatted_value = handler(value)
+                    formatted_value_ascii = unidecode(formatted_value)
+                    attributes[xml_key] = formatted_value_ascii
+                    continue  # Skip applying the handler to this key
                 # Default processing for other keys
                 formatted_value = handler(value)
                 attributes[xml_key] = formatted_value
+            # Override with aggregated values for pooled samples
+            if pooling_info:
+                if pooling_info.get("aggregated_collection_date"):
+                    # Find the mapping for collection_date
+                    collection_date_key = attribute_mappings.get(
+                        "collection_date", "collection_date"
+                    )
+                    attributes[collection_date_key] = pooling_info[
+                        "aggregated_collection_date"
+                    ]
+                if pooling_info.get("aggregated_depth"):
+                    # Find the mapping for depth
+                    depth_key = attribute_mappings.get("depth", "depth")
+                    attributes[depth_key] = pooling_info["aggregated_depth"]
+                # Add samp_pooling attribute with semicolon-delimited biosample IDs
+                if pooling_info.get("pooled_biosample_ids"):
+                    attributes["samp_pooling"] = ";".join(
+                        pooling_info["pooled_biosample_ids"]
+                    )
             biosample_elements = [
                 self.set_element(
                     "SampleId",
@@ -261,7 +322,48 @@ class NCBISubmissionXML:
                                 f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
                             ),
                         ),
-                    ],
+                    ]
+                    + (
+                        # Add external links for pooled samples
+                        [
+                            self.set_element(
+                                "ExternalLink",
+                                attrib={"label": "NMDC Processed Sample"},
+                                children=[
+                                    self.set_element(
+                                        "URL",
+                                        f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
+                                    )
+                                ],
+                            ),
+                            self.set_element(
+                                "ExternalLink",
+                                attrib={"label": "NMDC Pooling Process"},
+                                children=[
+                                    self.set_element(
+                                        "URL",
+                                        f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
+                                    )
+                                ],
+                            ),
+                        ]
+                        if pooling_info
+                        and pooling_info.get("processed_sample_id")
+                        and pooling_info.get("pooling_process_id")
+                        else [
+                            # Add external link for individual biosamples
+                            self.set_element(
+                                "ExternalLink",
+                                attrib={"label": sample_id_value},
+                                children=[
+                                    self.set_element(
+                                        "URL",
+                                        f"https://bioregistry.io/{sample_id_value}",
+                                    )
+                                ],
+                            ),
+                        ]
+                    ),
                 ),
                 self.set_element(
                     "Organism",
@@ -333,6 +435,248 @@ class NCBISubmissionXML:
             )
             self.root.append(action)
+    def _create_pooled_biosample_action(
+        self,
+        biosamples,
+        pooling_info,
+        organism_name,
+        org,
+        bioproject_id,
+        attribute_mappings,
+        slot_range_mappings,
+    ):
+        # Use the processed sample ID as the primary identifier
+        sample_id_value = pooling_info.get("processed_sample_id")
+        if not sample_id_value:
+            return
+        # Aggregate attributes from all biosamples in the pool
+        aggregated_attributes = {}
+        env_package = None
+        # Get title from the first biosample or use processed sample name
+        title = pooling_info.get(
+            "processed_sample_name", f"Pooled sample {sample_id_value}"
+        )
+        # Process each biosample to collect and aggregate attributes
+        for biosample in biosamples:
+            for json_key, value in biosample.items():
+                if json_key == "id":
+                    continue
+                if json_key == "env_package":
+                    env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
+                    continue
+                if isinstance(value, list):
+                    for item in value:
+                        if json_key not in attribute_mappings:
+                            continue
+                        xml_key = attribute_mappings[json_key]
+                        value_type = slot_range_mappings.get(json_key, "string")
+                        handler = self.type_handlers.get(
+                            value_type, handle_string_value
+                        )
+                        # Special handling for "elev" key
+                        if json_key == "elev":
+                            value = f"{float(value)} m"
+                            aggregated_attributes[xml_key] = value
+                            continue
+                        # Special handling for "host_taxid"
+                        if json_key == "host_taxid" and isinstance(value, dict):
+                            if "term" in value and "id" in value["term"]:
+                                value = re.findall(
+                                    r"\d+", value["term"]["id"].split(":")[1]
+                                )[0]
+                            aggregated_attributes[xml_key] = value
+                            continue
+                        formatted_value = handler(item)
+                        # For pooled samples, we typically want the first value or aggregate appropriately
+                        if xml_key not in aggregated_attributes:
+                            aggregated_attributes[xml_key] = formatted_value
+                    continue
+                if json_key not in attribute_mappings:
+                    continue
+                xml_key = attribute_mappings[json_key]
+                value_type = slot_range_mappings.get(json_key, "string")
+                handler = self.type_handlers.get(value_type, handle_string_value)
+                # Special handling for "elev" key
+                if json_key == "elev":
+                    value = f"{float(value)} m"
+                    aggregated_attributes[xml_key] = value
+                    continue
+                # Special handling for "host_taxid"
+                if json_key == "host_taxid" and isinstance(value, dict):
+                    if "term" in value and "id" in value["term"]:
+                        value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
+                    aggregated_attributes[xml_key] = value
+                    continue
+                formatted_value = handler(value)
+                # For pooled samples, we typically want the first value or aggregate appropriately
+                if xml_key not in aggregated_attributes:
+                    aggregated_attributes[xml_key] = formatted_value
+        # Override with aggregated values for pooled samples
+        if pooling_info.get("aggregated_collection_date"):
+            collection_date_key = attribute_mappings.get(
+                "collection_date", "collection_date"
+            )
+            aggregated_attributes[collection_date_key] = pooling_info[
+                "aggregated_collection_date"
+            ]
+        if pooling_info.get("aggregated_depth"):
+            depth_key = attribute_mappings.get("depth", "depth")
+            aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]
+        # Add samp_pooling attribute with semicolon-delimited biosample IDs
+        if pooling_info.get("pooled_biosample_ids"):
+            aggregated_attributes["samp_pooling"] = ";".join(
+                pooling_info["pooled_biosample_ids"]
+            )
+        # Filter attributes to only include the ones from neon_soil_example.xml for pooled samples
+        allowed_attributes = {
+            "collection_date",
+            "depth",
+            "elev",
+            "geo_loc_name",
+            "lat_lon",
+            "env_broad_scale",
+            "env_local_scale",
+            "env_medium",
+            "samp_pooling",
+        }
+        filtered_attributes = {
+            k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
+        }
+        biosample_elements = [
+            self.set_element(
+                "SampleId",
+                children=[
+                    self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
+                ],
+            ),
+            self.set_element(
+                "Descriptor",
+                children=[
+                    self.set_element("Title", title),
+                    self.set_element(
+                        "ExternalLink",
+                        attrib={"label": sample_id_value},
+                        children=[
+                            self.set_element(
+                                "URL",
+                                f"https://bioregistry.io/{sample_id_value}",
+                            )
+                        ],
+                    ),
+                    self.set_element(
+                        "ExternalLink",
+                        attrib={"label": pooling_info["pooling_process_id"]},
+                        children=[
+                            self.set_element(
+                                "URL",
+                                f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
+                            )
+                        ],
+                    ),
+                ]
+                + [
+                    self.set_element(
+                        "ExternalLink",
+                        attrib={"label": biosample_id},
+                        children=[
+                            self.set_element(
+                                "URL",
+                                f"https://bioregistry.io/{biosample_id}",
+                            )
+                        ],
+                    )
+                    for biosample_id in pooling_info.get("pooled_biosample_ids", [])
+                ],
+            ),
+            self.set_element(
+                "Organism",
+                children=[self.set_element("OrganismName", organism_name)],
+            ),
+            self.set_element(
+                "BioProject",
+                children=[
+                    self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
+                ],
+            ),
+            self.set_element("Package", env_package),
+            self.set_element(
+                "Attributes",
+                children=[
+                    self.set_element(
+                        "Attribute", filtered_attributes[key], {"attribute_name": key}
+                    )
+                    for key in sorted(filtered_attributes)
+                ]
+                + [
+                    self.set_element(
+                        "Attribute",
+                        "National Microbiome Data Collaborative",
+                        {"attribute_name": "broker name"},
+                    )
+                ],
+            ),
+        ]
+        action = self.set_element(
+            "Action",
+            children=[
+                self.set_element(
+                    "AddData",
+                    attrib={"target_db": "BioSample"},
+                    children=[
+                        self.set_element(
+                            "Data",
+                            attrib={"content_type": "XML"},
+                            children=[
+                                self.set_element(
+                                    "XmlContent",
+                                    children=[
+                                        self.set_element(
+                                            "BioSample",
+                                            attrib={"schema_version": "2.0"},
+                                            children=biosample_elements,
+                                        ),
+                                    ],
+                                ),
+                            ],
+                        ),
+                        self.set_element(
+                            "Identifier",
+                            children=[
+                                self.set_element(
+                                    "SPUID",
+                                    sample_id_value,
+                                    {"spuid_namespace": org},
+                                ),
+                            ],
+                        ),
+                    ],
+                ),
+            ],
+        )
+        self.root.append(action)
     def set_fastq(
         self,
         biosample_data_objects: list,
@@ -342,12 +686,57 @@ class NCBISubmissionXML:
         nmdc_biosamples: list,
         nmdc_library_preparation: list,
         all_instruments: dict,
+        pooled_biosamples_data=None,
     ):
         bsm_id_name_dict = {
             biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
         }
+        # Use provided pooling data or empty dict
+        pooling_data = pooled_biosamples_data or {}
+        # Group data objects by pooling process
+        pooling_groups = {}
+        individual_entries = []
         for entry in biosample_data_objects:
+            pooling_process_id = None
+            # Check if any biosample in this entry belongs to a pooling process
+            for biosample_id in entry.keys():
+                pooling_info = pooling_data.get(biosample_id, {})
+                if pooling_info and pooling_info.get("pooling_process_id"):
+                    pooling_process_id = pooling_info["pooling_process_id"]
+                    break
+            if pooling_process_id:
+                if pooling_process_id not in pooling_groups:
+                    pooling_groups[pooling_process_id] = {
+                        "entries": [],
+                        "processed_sample_id": pooling_info.get("processed_sample_id"),
+                        "processed_sample_name": pooling_info.get(
+                            "processed_sample_name", ""
+                        ),
+                    }
+                pooling_groups[pooling_process_id]["entries"].append(entry)
+            else:
+                individual_entries.append(entry)
+        # Process pooled entries - create one SRA <Action> block per pooling process
+        for pooling_process_id, group_data in pooling_groups.items():
+            self._create_pooled_sra_action(
+                group_data["entries"],
+                group_data["processed_sample_id"],
+                group_data["processed_sample_name"],
+                bioproject_id,
+                org,
+                nmdc_nucleotide_sequencing,
+                nmdc_library_preparation,
+                all_instruments,
+                bsm_id_name_dict,
+            )
+        # Process individual entries
+        for entry in individual_entries:
             fastq_files = []
             biosample_ids = []
             nucleotide_sequencing_ids = {}
@@ -532,6 +921,7 @@ class NCBISubmissionXML:
                         )
                     )
+                # Add library_name attribute
                 if library_name:
                     sra_attributes.append(
                         self.set_element(
@@ -577,6 +967,233 @@ class NCBISubmissionXML:
                     self.root.append(action)
+    def _create_pooled_sra_action(  # Append one SRA "AddFiles" <Action> for a pooled group of entries to self.root.
+        self,
+        entries,  # iterable of mappings: biosample id -> iterable of data objects (read via "url")
+        processed_sample_id,  # SPUID text used for the BioSample reference; falsy -> method returns early
+        processed_sample_name,  # used as the library_name attribute when truthy
+        bioproject_id,  # PrimaryId text of the BioProject reference
+        org,  # value of every spuid_namespace attribute written here
+        nmdc_nucleotide_sequencing,  # iterable of mappings: biosample id -> iterable of records read with .get()
+        nmdc_library_preparation,  # iterable of mappings: biosample id -> record read with .get("protocol_link")
+        all_instruments,  # mapping: instrument id -> record with "vendor" / "model"
+        bsm_id_name_dict,  # not referenced anywhere in this method body
+    ):  # No explicit return value; the only visible effect is self.root.append(action) at the end.
+        if not processed_sample_id:  # nothing to reference as the BioSample -> build nothing
+            return
+        # Collect all fastq files from all entries (sets, so a value repeated across entries is kept once)
+        all_fastq_files = set()
+        all_biosample_ids = set()  # filled in the loop below but not read again in this method
+        nucleotide_sequencing_ids = {}  # biosample id -> "id" of the last sequencing record visited for it
+        lib_prep_protocol_names = {}  # biosample id -> protocol_link name ("" when missing)
+        analyte_category = ""  # overwritten by every sequencing record visited; the last one wins
+        instrument_vendor = ""  # overwritten only by records with a non-empty "instrument_used"
+        instrument_model = ""  # set together with instrument_vendor
+        for entry in entries:
+            for biosample_id, data_objects in entry.items():
+                all_biosample_ids.add(biosample_id)
+                for data_object in data_objects:
+                    if "url" in data_object:  # data objects without a "url" contribute no file
+                        url = urlparse(data_object["url"])
+                        file_path = os.path.basename(url.path)  # keep only the last path component of the URL
+                        all_fastq_files.add(file_path)
+                # Get nucleotide sequencing info
+                for ntseq_dict in nmdc_nucleotide_sequencing:
+                    if biosample_id in ntseq_dict:
+                        for ntseq in ntseq_dict[biosample_id]:
+                            nucleotide_sequencing_ids[biosample_id] = ntseq.get(
+                                "id", ""
+                            )
+                            instrument_used = ntseq.get("instrument_used", [])
+                            if instrument_used:
+                                instrument_id = instrument_used[0]  # only the first listed instrument is consulted
+                                instrument = all_instruments.get(instrument_id, {})
+                                instrument_vendor = instrument.get("vendor", "")
+                                instrument_model = instrument.get("model", "")
+                            analyte_category = ntseq.get("analyte_category", "")
+                # Get library preparation info
+                for lib_prep_dict in nmdc_library_preparation:
+                    if biosample_id in lib_prep_dict:
+                        lib_prep_protocol_names[biosample_id] = (
+                            lib_prep_dict[biosample_id]
+                            .get("protocol_link", {})
+                            .get("name", "")
+                        )
+        if all_fastq_files:  # with no files collected, nothing below runs and nothing is appended
+            files_elements = [
+                self.set_element(
+                    "File",
+                    "",
+                    {"file_path": f},
+                    [
+                        self.set_element(
+                            "DataType",
+                            "sra-run-fastq" if ".fastq" in f else "generic-data",  # substring test on the file name
+                        )
+                    ],
+                )
+                for f in sorted(all_fastq_files)  # sorted because the set itself has no defined order
+            ]
+            attribute_elements = [
+                self.set_element(
+                    "AttributeRefId",
+                    attrib={"name": "BioProject"},
+                    children=[
+                        self.set_element(
+                            "RefId",
+                            children=[
+                                self.set_element(
+                                    "PrimaryId",
+                                    bioproject_id,
+                                    {"db": "BioProject"},
+                                )
+                            ],
+                        )
+                    ],
+                ),
+                # Reference the processed sample, not individual biosamples
+                self.set_element(
+                    "AttributeRefId",
+                    attrib={"name": "BioSample"},
+                    children=[
+                        self.set_element(
+                            "RefId",
+                            children=[
+                                self.set_element(
+                                    "SPUID",
+                                    processed_sample_id,
+                                    {"spuid_namespace": org},
+                                )
+                            ],
+                        )
+                    ],
+                ),
+            ]
+            sra_attributes = []
+            if instrument_vendor == "illumina":  # any other vendor adds no platform / instrument_model attribute
+                sra_attributes.append(
+                    self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
+                )
+                if instrument_model == "nextseq_550":
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "NextSeq 550", {"name": "instrument_model"}
+                        )
+                    )
+                elif instrument_model == "novaseq_6000":
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute",
+                            "NovaSeq 6000",
+                            {"name": "instrument_model"},
+                        )
+                    )
+                elif instrument_model == "hiseq":  # last recognised model; other values add no instrument_model
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute", "HiSeq", {"name": "instrument_model"}
+                        )
+                    )
+            if analyte_category == "metagenome":  # adds library_strategy, library_source and library_selection
+                sra_attributes.append(
+                    self.set_element("Attribute", "WGS", {"name": "library_strategy"})
+                )
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute", "METAGENOMIC", {"name": "library_source"}
+                    )
+                )
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute", "RANDOM", {"name": "library_selection"}
+                    )
+                )
+            elif analyte_category == "metatranscriptome":  # adds library_source only
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute",
+                        "METATRANSCRIPTOMIC",
+                        {"name": "library_source"},
+                    )
+                )
+            # Determine library layout based on file patterns ("paired" when some "_R1" name has its "_R2" twin in the set)
+            has_paired_reads = any(
+                "_R1" in f and "_R2" in f.replace("_R1", "_R2") in all_fastq_files  # chained `in`: ("_R2" in x) and (x in all_fastq_files), x = f.replace(...)
+                for f in all_fastq_files
+                if "_R1" in f
+            )
+            if has_paired_reads:
+                sra_attributes.append(
+                    self.set_element("Attribute", "paired", {"name": "library_layout"})
+                )
+            else:
+                sra_attributes.append(
+                    self.set_element("Attribute", "single", {"name": "library_layout"})
+                )
+            # Add library_name attribute using ProcessedSample name
+            if processed_sample_name:
+                sra_attributes.append(
+                    self.set_element(
+                        "Attribute", processed_sample_name, {"name": "library_name"}
+                    )
+                )
+            # Add library construction protocol from any of the biosamples (first non-empty name in dict order)
+            for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
+                if lib_prep_name:
+                    sra_attributes.append(
+                        self.set_element(
+                            "Attribute",
+                            lib_prep_name,
+                            {"name": "library_construction_protocol"},
+                        )
+                    )
+                    break  # Only add one protocol name
+            # Use the first nucleotide sequencing ID as the identifier (first non-empty value in dict insertion order)
+            omics_processing_id = None
+            for biosample_id, seq_id in nucleotide_sequencing_ids.items():
+                if seq_id:
+                    omics_processing_id = seq_id
+                    break
+            if omics_processing_id:  # without a sequencing id no <Action> is appended
+                identifier_element = self.set_element(
+                    "Identifier",
+                    children=[
+                        self.set_element(
+                            "SPUID", omics_processing_id, {"spuid_namespace": org}
+                        )
+                    ],
+                )
+                action = self.set_element(
+                    "Action",
+                    children=[
+                        self.set_element(
+                            "AddFiles",
+                            attrib={"target_db": "SRA"},
+                            children=files_elements  # child order: files, references, attributes, identifier
+                            + attribute_elements
+                            + sra_attributes
+                            + [identifier_element],
+                        ),
+                    ],
+                )
+                self.root.append(action)
     def get_submission_xml(
         self,
         biosamples_list: list,
@@ -584,6 +1201,7 @@ class NCBISubmissionXML:
         biosample_data_objects_list: list,
         biosample_library_preparation_list: list,
         instruments_dict: dict,
+        pooled_biosamples_data=None,
     ):
         # data_type = None
@@ -646,6 +1264,7 @@ class NCBISubmissionXML:
             org=self.ncbi_submission_metadata.get("organization", ""),
             bioproject_id=self.ncbi_bioproject_id,
             nmdc_biosamples=filtered_biosamples_list,
+            pooled_biosamples_data=pooled_biosamples_data,
         )
         # Also filter biosample_data_objects_list
@@ -692,6 +1311,7 @@ class NCBISubmissionXML:
             nmdc_biosamples=filtered_biosamples_list,
             nmdc_library_preparation=filtered_library_preparation_list,
             all_instruments=instruments_dict,
+            pooled_biosamples_data=pooled_biosamples_data,
         )
         rough_string = ET.tostring(self.root, "unicode")

nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

Potentially problematic release.

nmdc-runtime 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl