PyPI - masster - Versions diffs - 0.5.13__tar.gz → 0.5.14__tar.gz - Mend

masster 0.5.13__tar.gz → 0.5.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (97) hide show

{masster-0.5.13 → masster-0.5.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.13
+Version: 0.5.14
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster

{masster-0.5.13 → masster-0.5.14}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [project]
 name = "masster"
-version = "0.5.13"
+version = "0.5.14"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }

{masster-0.5.13 → masster-0.5.14}/src/masster/_version.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from __future__ import annotations
-__version__ = "0.5.13"
+__version__ = "0.5.14"
 def get_version():

{masster-0.5.13 → masster-0.5.14}/src/masster/lib/lib.py RENAMED Viewed

@@ -46,11 +46,9 @@ annotations = lib.annotate_features(sample.features_df)
 """
 import os
-import uuid
 from typing import Optional, Union, List, Dict, Any, TYPE_CHECKING
 import warnings
-import numpy as np
 import polars as pl
 import pyopenms as oms
@@ -58,6 +56,148 @@ if TYPE_CHECKING:
     import pandas as pd
+def _calculate_formula_mass_shift(formula: str) -> float:
+    """
+    Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.
+    Parameters
+    ----------
+    formula : str
+        Formula string (e.g., "+H", "-H2O", "+Na-H")
+    Returns
+    -------
+    float
+        Mass shift in Daltons
+    """
+    # Standard atomic masses
+    atomic_masses = {
+        "H": 1.007825,
+        "C": 12.0,
+        "N": 14.003074,
+        "O": 15.994915,
+        "Na": 22.989769,
+        "K": 38.963707,
+        "Li": 7.016003,
+        "Ca": 39.962591,
+        "Mg": 23.985042,
+        "Fe": 55.934938,
+        "Cl": 34.968853,
+        "Br": 78.918336,
+        "I": 126.904473,
+        "P": 30.973762,
+        "S": 31.972071,
+    }
+    total_mass = 0.0
+    # Parse formula by splitting on + and - while preserving the operators
+    parts = []
+    current_part = ""
+    current_sign = 1
+    for char in formula:
+        if char == "+":
+            if current_part:
+                parts.append((current_sign, current_part))
+            current_part = ""
+            current_sign = 1
+        elif char == "-":
+            if current_part:
+                parts.append((current_sign, current_part))
+            current_part = ""
+            current_sign = -1
+        else:
+            current_part += char
+    if current_part:
+        parts.append((current_sign, current_part))
+    # Process each part
+    for sign, part in parts:
+        if not part:
+            continue
+        # Parse element and count (e.g., "H2O" -> H:2, O:1)
+        elements = _parse_element_counts(part)
+        for element, count in elements.items():
+            if element in atomic_masses:
+                total_mass += sign * atomic_masses[element] * count
+    return total_mass
+def _parse_element_counts(formula_part: str) -> Dict[str, int]:
+    """Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}"""
+    elements = {}
+    i = 0
+    while i < len(formula_part):
+        # Get element (uppercase letter, possibly followed by lowercase)
+        element = formula_part[i]
+        i += 1
+        while i < len(formula_part) and formula_part[i].islower():
+            element += formula_part[i]
+            i += 1
+        # Get count (digits following element)
+        count_str = ""
+        while i < len(formula_part) and formula_part[i].isdigit():
+            count_str += formula_part[i]
+            i += 1
+        count = int(count_str) if count_str else 1
+        elements[element] = elements.get(element, 0) + count
+    return elements
+def _format_adduct_name(components: List[Dict]) -> str:
+    """Format adduct name from components like [M+H]1+ or [M+2H]2+"""
+    if not components:
+        return "[M]"
+    # Count occurrences of each formula
+    from collections import Counter
+    formula_counts = Counter(comp["formula"] for comp in components)
+    total_charge = sum(comp["charge"] for comp in components)
+    # Build formula part with proper multipliers
+    formula_parts = []
+    for formula, count in sorted(
+        formula_counts.items(),
+    ):  # Sort for consistent ordering
+        if count == 1:
+            formula_parts.append(formula)
+        else:
+            # For multiple occurrences, use count prefix (e.g., 2H, 3Na)
+            # Handle special case where formula might already start with + or -
+            if formula.startswith(("+", "-")):
+                sign = formula[0]
+                base_formula = formula[1:]
+                formula_parts.append(f"{sign}{count}{base_formula}")
+            else:
+                formula_parts.append(f"{count}{formula}")
+    # Combine formula parts
+    formula = "".join(formula_parts)
+    # Format charge
+    if total_charge == 0:
+        charge_str = ""
+    elif abs(total_charge) == 1:
+        charge_str = "1+" if total_charge > 0 else "1-"
+    else:
+        charge_str = (
+            f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
+        )
+    return f"[M{formula}]{charge_str}"
 class Lib:
     """
     A class for managing compound libraries and feature annotation in mass spectrometry data.
@@ -89,22 +229,21 @@ class Lib:
         >>> print(f"Loaded {len(lib.lib_df)} library entries")
     """
-    # Define supported adducts and their properties
-    ADDUCT_DEFINITIONS = {
-        # Positive mode adducts
-        "[M+H]1+": {"delta_m": 1.007276, "delta_z": 1, "polarity": "positive"},
-        "[M+Na]1+": {"delta_m": 22.989218, "delta_z": 1, "polarity": "positive"},
-        "[M+K]1+": {"delta_m": 38.962383, "delta_z": 1, "polarity": "positive"},
-        "[M+NH4]1+": {"delta_m": 18.033823, "delta_z": 1, "polarity": "positive"},
-        "[M+H-H2O]1+": {"delta_m": -17.00329, "delta_z": 1, "polarity": "positive"},
-        "[M+2H]2+": {"delta_m": 2.014552, "delta_z": 2, "polarity": "positive"},
-        # Negative mode adducts
-        "[M-H]1-": {"delta_m": -1.007276, "delta_z": -1, "polarity": "negative"},
-        "[M+CH3COO]1-": {"delta_m": 59.013852, "delta_z": -1, "polarity": "negative"},
-        "[M+HCOO]1-": {"delta_m": 44.998203, "delta_z": -1, "polarity": "negative"},
-        "[M+Cl]1-": {"delta_m": 34.968853, "delta_z": -1, "polarity": "negative"},
-        "[M-2H]2-": {"delta_m": -2.014552, "delta_z": -2, "polarity": "negative"},
+    # Default adduct definitions using OpenMS format
+    DEFAULT_ADDUCTS = {
+        "positive": [
+            "+H:1:0.65",
+            "+Na:1:0.15",
+            "+K:1:0.05",
+            "+NH4:1:0.15",
+            "-H2O:0:0.15",
+        ],
+        "negative": [
+            "-H:-1:0.9",
+            "+Cl:-1:0.1",
+            "+CH2O2:0:0.15",
+            "-H2O:0:0.15",
+        ]
     }
     def __init__(self):
@@ -119,12 +258,15 @@ class Lib:
             "cmpd_uid": pl.Series([], dtype=pl.Int64),
             "source_id": pl.Series([], dtype=pl.Utf8),
             "name": pl.Series([], dtype=pl.Utf8),
+            "shortname": pl.Series([], dtype=pl.Utf8),
+            "class": pl.Series([], dtype=pl.Utf8),
             "smiles": pl.Series([], dtype=pl.Utf8),
             "inchi": pl.Series([], dtype=pl.Utf8),
             "inchikey": pl.Series([], dtype=pl.Utf8),
             "formula": pl.Series([], dtype=pl.Utf8),
             "iso": pl.Series([], dtype=pl.Int64),
             "adduct": pl.Series([], dtype=pl.Utf8),
+            "probability": pl.Series([], dtype=pl.Float64),
             "m": pl.Series([], dtype=pl.Float64),
             "z": pl.Series([], dtype=pl.Int8),
             "mz": pl.Series([], dtype=pl.Float64),
@@ -134,6 +276,177 @@ class Lib:
             "db": pl.Series([], dtype=pl.Utf8),
         })
+    def _get_adducts(self,
+                    adducts_list: Optional[List[str]] = None,
+                    polarity: Optional[str] = None,
+                    min_probability: float = 0.03,
+                    **kwargs) -> pl.DataFrame:
+        """
+        Generate comprehensive adduct specifications for the library.
+        This method creates a DataFrame of adduct combinations following the same
+        syntax as Study() and Sample() classes.
+        Args:
+            adducts_list: List of adduct specifications in OpenMS format (e.g., "+H:1:0.65")
+            polarity: "positive", "negative", or None for both
+            min_probability: Minimum probability threshold to filter adducts
+            **kwargs: Additional parameters for adduct generation
+        Returns:
+            DataFrame with columns:
+            - name: Formatted adduct name like "[M+H]1+"
+            - charge: Total charge of the adduct
+            - mass_shift: Total mass shift in Da
+            - probability: Combined probability score
+            - complexity: Number of adduct components
+        """
+        # Get adduct specifications
+        if adducts_list is None:
+            if polarity is None:
+                # Use positive by default
+                adducts_list = self.DEFAULT_ADDUCTS["positive"]
+            elif polarity.lower() in ["positive", "pos"]:
+                adducts_list = self.DEFAULT_ADDUCTS["positive"]
+            elif polarity.lower() in ["negative", "neg"]:
+                adducts_list = self.DEFAULT_ADDUCTS["negative"]
+            else:
+                raise ValueError(f"Unknown polarity: {polarity}")
+        # Parameters
+        charge_min = kwargs.get("charge_min", -2)
+        charge_max = kwargs.get("charge_max", 2)
+        max_combinations = kwargs.get("max_combinations", 2)
+        # Parse base adduct specifications
+        base_specs = []
+        for adduct_str in adducts_list:
+            if not isinstance(adduct_str, str) or ":" not in adduct_str:
+                continue
+            try:
+                parts = adduct_str.split(":")
+                if len(parts) != 3:
+                    continue
+                formula_part = parts[0]
+                charge = int(parts[1])
+                probability = float(parts[2])
+                # Calculate mass shift from formula
+                mass_shift = _calculate_formula_mass_shift(formula_part)
+                base_specs.append({
+                    "formula": formula_part,
+                    "charge": charge,
+                    "mass_shift": mass_shift,
+                    "probability": probability,
+                    "raw_string": adduct_str,
+                })
+            except (ValueError, IndexError):
+                continue
+        if not base_specs:
+            # Return empty DataFrame with correct schema
+            return pl.DataFrame({
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            })
+        # Generate all valid combinations
+        combinations_list = []
+        # Separate specs by charge type
+        positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
+        negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
+        neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]
+        # 1. Single adducts
+        for spec in base_specs:
+            if charge_min <= spec["charge"] <= charge_max:
+                formatted_name = _format_adduct_name([spec])
+                combinations_list.append({
+                    "components": [spec],
+                    "formatted_name": formatted_name,
+                    "total_mass_shift": spec["mass_shift"],
+                    "total_charge": spec["charge"],
+                    "combined_probability": spec["probability"],
+                    "complexity": 1,
+                })
+        # 2. Generate multiply charged versions (2H+, 3H+, etc.)
+        if max_combinations >= 2:
+            for spec in positive_specs + negative_specs:
+                base_charge = spec["charge"]
+                for multiplier in range(2, min(max_combinations + 1, 4)):
+                    total_charge = base_charge * multiplier
+                    if charge_min <= total_charge <= charge_max:
+                        components = [spec] * multiplier
+                        formatted_name = _format_adduct_name(components)
+                        combinations_list.append({
+                            "components": components,
+                            "formatted_name": formatted_name,
+                            "total_mass_shift": spec["mass_shift"] * multiplier,
+                            "total_charge": total_charge,
+                            "combined_probability": spec["probability"] ** multiplier,
+                            "complexity": multiplier,
+                        })
+        # 3. Mixed combinations (positive + neutral)
+        if max_combinations >= 2:
+            for pos_spec in positive_specs:
+                for neut_spec in neutral_specs:
+                    total_charge = pos_spec["charge"] + neut_spec["charge"]
+                    if charge_min <= total_charge <= charge_max:
+                        components = [pos_spec, neut_spec]
+                        formatted_name = _format_adduct_name(components)
+                        combinations_list.append({
+                            "components": components,
+                            "formatted_name": formatted_name,
+                            "total_mass_shift": pos_spec["mass_shift"] + neut_spec["mass_shift"],
+                            "total_charge": total_charge,
+                            "combined_probability": pos_spec["probability"] * neut_spec["probability"],
+                            "complexity": 2,
+                        })
+        # Convert to polars DataFrame
+        if combinations_list:
+            combinations_list.sort(
+                key=lambda x: (-x["combined_probability"], x["complexity"])
+            )
+            adducts_df = pl.DataFrame([
+                {
+                    "name": combo["formatted_name"],
+                    "charge": combo["total_charge"],
+                    "mass_shift": combo["total_mass_shift"],
+                    "probability": combo["combined_probability"],
+                    "complexity": combo["complexity"],
+                }
+                for combo in combinations_list
+            ])
+        else:
+            # Return empty DataFrame with correct schema
+            adducts_df = pl.DataFrame({
+                "name": [],
+                "charge": [],
+                "mass_shift": [],
+                "probability": [],
+                "complexity": [],
+            })
+        # Filter by minimum probability
+        if min_probability > 0.0 and len(adducts_df) > 0:
+            adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)
+        return adducts_df
     def _calculate_accurate_mass(self, formula: str) -> Optional[float]:
         """
         Calculate the accurate mass for a molecular formula using PyOpenMS.
@@ -185,15 +498,17 @@ class Lib:
                                 compound_data: Dict[str, Any],
                                 adducts: Optional[List[str]] = None,
                                 polarity: Optional[str] = None,
-                                lib_id_counter: Optional[int] = None) -> tuple[List[Dict[str, Any]], int]:
+                                lib_id_counter: Optional[int] = None,
+                                min_probability: float = 0.03) -> tuple[List[Dict[str, Any]], int]:
         """
-        Generate adduct variants for a given compound.
+        Generate adduct variants for a given compound using the new adduct system.
         Args:
             compound_data: Dictionary containing compound information
-            adducts: List of specific adducts to generate. If None, uses all adducts for polarity
-            polarity: Ionization polarity ("positive", "negative", or None for both)
+            adducts: List of specific adducts to generate. If None, uses defaults for polarity
+            polarity: Ionization polarity ("positive", "negative", or None for positive)
             lib_id_counter: Counter for generating unique lib_uid values
+            min_probability: Minimum probability threshold for adduct filtering
         Returns:
             Tuple of (list of dictionaries representing adduct variants, updated counter)
@@ -206,35 +521,25 @@ class Lib:
         if accurate_mass is None:
             return variants, counter
-        # Determine which adducts to use
-        if adducts is None:
-            if polarity is None:
-                # Use all adducts
-                selected_adducts = list(self.ADDUCT_DEFINITIONS.keys())
-            else:
-                # Filter by polarity
-                selected_adducts = [
-                    adduct for adduct, props in self.ADDUCT_DEFINITIONS.items()
-                    if props["polarity"] == polarity.lower()
-                ]
-        else:
-            selected_adducts = adducts
+        # Get adduct specifications using _get_adducts
+        adducts_df = self._get_adducts(
+            adducts_list=adducts,
+            polarity=polarity,
+            min_probability=min_probability
+        )
+        if len(adducts_df) == 0:
+            return variants, counter
         # Generate variants for each adduct
-        for adduct in selected_adducts:
-            if adduct not in self.ADDUCT_DEFINITIONS:
-                warnings.warn(f"Unknown adduct: {adduct}")
-                continue
-            adduct_props = self.ADDUCT_DEFINITIONS[adduct]
-            # Skip if polarity doesn't match
-            if polarity is not None and adduct_props["polarity"] != polarity.lower():
-                continue
+        for adduct_row in adducts_df.iter_rows(named=True):
+            adduct_name = adduct_row["name"]
+            charge = adduct_row["charge"]
+            mass_shift = adduct_row["mass_shift"]
+            probability = adduct_row["probability"]
             # Calculate adducted mass and m/z
-            adducted_mass = accurate_mass + adduct_props["delta_m"]
-            charge = adduct_props["delta_z"]
+            adducted_mass = accurate_mass + mass_shift
             mz = abs(adducted_mass / charge) if charge != 0 else adducted_mass
             # Create variant entry
@@ -243,12 +548,15 @@ class Lib:
                 "cmpd_uid": compound_data.get("cmpd_uid", None),
                 "source_id": compound_data.get("source_id", None),
                 "name": compound_data.get("name", ""),
+                "shortname": compound_data.get("shortname", ""),
+                "class": compound_data.get("class", ""),
                 "smiles": compound_data.get("smiles", ""),
                 "inchi": compound_data.get("inchi", ""),
                 "inchikey": compound_data.get("inchikey", ""),
                 "formula": compound_data["formula"],
                 "iso": 0,  # Default to zero
-                "adduct": adduct,
+                "adduct": adduct_name,
+                "probability": probability,
                 "m": adducted_mass,
                 "z": charge,
                 "mz": mz,
@@ -265,7 +573,8 @@ class Lib:
     def import_csv(self,
                   csvfile: str,
                   polarity: Optional[str] = None,
-                  adducts: Optional[List[str]] = None) -> None:
+                  adducts: Optional[List[str]] = None,
+                  min_probability: float = 0.03) -> None:
         """
         Import compound library from a CSV file.
@@ -274,8 +583,9 @@ class Lib:
         Args:
             csvfile: Path to the CSV file
-            polarity: Ionization polarity ("positive", "negative", or None for both)
-            adducts: Specific adducts to generate. If None, generates all for the polarity
+            polarity: Ionization polarity ("positive", "negative", or None for positive)
+            adducts: Specific adducts to generate. If None, generates defaults for the polarity
+            min_probability: Minimum probability threshold for adduct filtering
         Expected CSV columns (case-insensitive):
             - Required: Formula (or formula)
@@ -319,6 +629,8 @@ class Lib:
             compound_data = {
                 "name": row.get(column_mapping.get("name", ""), ""),
+                "shortname": row.get(column_mapping.get("shortname", ""), ""),
+                "class": row.get(column_mapping.get("class", ""), ""),
                 "smiles": row.get(column_mapping.get("smiles", ""), ""),
                 "inchi": row.get(column_mapping.get("inchi", ""), ""),
                 "inchikey": row.get(column_mapping.get("inchikey", ""), ""),
@@ -331,7 +643,8 @@ class Lib:
             # Generate adduct variants
             variants, lib_id_counter = self._generate_adduct_variants(
-                compound_data, adducts=adducts, polarity=polarity, lib_id_counter=lib_id_counter
+                compound_data, adducts=adducts, polarity=polarity,
+                lib_id_counter=lib_id_counter, min_probability=min_probability
             )
             all_variants.extend(variants)
@@ -349,7 +662,8 @@ class Lib:
                     compound_data_rt2["name"] = compound_data["name"] + " II"
                     variants_rt2, lib_id_counter = self._generate_adduct_variants(
-                        compound_data_rt2, adducts=adducts, polarity=polarity, lib_id_counter=lib_id_counter
+                        compound_data_rt2, adducts=adducts, polarity=polarity,
+                        lib_id_counter=lib_id_counter, min_probability=min_probability
                     )
                     all_variants.extend(variants_rt2)
@@ -529,6 +843,8 @@ class Lib:
                     "cmpd_uid": match_row.get("cmpd_uid"),
                     "source_id": match_row.get("source_id"),
                     "name": match_row["name"],
+                    "shortname": match_row["shortname"],
+                    "class": match_row["class"],
                     "formula": match_row["formula"],
                     "iso": match_row.get("iso", 0),
                     "adduct": match_row["adduct"],
@@ -555,10 +871,8 @@ class Lib:
         Returns:
             List of adduct names
         """
-        return [
-            adduct for adduct, props in self.ADDUCT_DEFINITIONS.items()
-            if props["polarity"] == polarity.lower()
-        ]
+        adducts_df = self._get_adducts(polarity=polarity, min_probability=0.0)
+        return adducts_df.select("name").to_series().to_list()
     def __len__(self) -> int:
         """Return number of library entries."""

{masster-0.5.13 → masster-0.5.14}/src/masster/study/helpers.py RENAMED Viewed

@@ -490,6 +490,7 @@ def align_reset(self):
     # Ensure column order is maintained after with_columns operation
     from masster.study.helpers import _ensure_features_df_schema_order
     _ensure_features_df_schema_order(self)
+    self.logger.info("Alignment reset: all feature RTs set to original_RT.")
 # =====================================================================================

masster 0.5.13__tar.gz → 0.5.14__tar.gz

Potentially problematic release.

masster 0.5.13__tar.gz → 0.5.14__tar.gz