masster 0.5.27__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +316 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +18 -3
- masster/sample/sample5_schema.json +99 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/id.py +59 -12
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +3 -1
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +11 -12
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/METADATA +99 -60
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/RECORD +27 -26
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/WHEEL +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/entry_points.txt +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/licenses/LICENSE +0 -0
masster/sample/sample.py
CHANGED

@@ -129,6 +129,12 @@ from masster.sample.helpers import get_eic
 from masster.sample.helpers import set_source
 from masster.sample.helpers import _recreate_feature_map
 from masster.sample.helpers import _get_feature_map
+from masster.sample.id import lib_load
+from masster.sample.id import identify
+from masster.sample.id import get_id
+from masster.sample.id import id_reset
+from masster.sample.id import lib_reset
+from masster.sample.importers import import_oracle
 from masster.sample.load import chrom_extract
 from masster.sample.load import _index_file
 from masster.sample.load import load

@@ -259,9 +265,10 @@ class Sample:
         # the polars data frame with MS1 level data
         self.ms1_df = pl.DataFrame()

-        #
-        self.
-        self.
+        # identification DataFrames (lib_df and id_df)
+        self.lib_df = None  # library DataFrame (from masster.lib or CSV/JSON)
+        self.id_df = None  # identification results DataFrame
+        self._lib = None  # reference to Lib object if loaded
         self.chrom_df = None

         if params.filename is not None:

@@ -292,6 +299,14 @@ class Sample:
     update_parameters = update_parameters
     get_parameters_property = get_parameters_property
     set_parameters_property = set_parameters_property
+    # Identification methods from id.py
+    lib_load = lib_load
+    identify = identify
+    get_id = get_id
+    id_reset = id_reset
+    lib_reset = lib_reset
+    # Importers from importers.py
+    import_oracle = import_oracle
     export_features = export_features
     export_xlsx = export_xlsx
     export_mgf = export_mgf
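
Note: the hunks above wire the new identification API onto the Sample class. A minimal usage sketch follows; the method names come from this diff, but the constructor argument, the library file path, and the call signatures are assumptions, not taken from the package.

    # Hypothetical walk-through of the new Sample-level identification methods.
    from masster.sample.sample import Sample

    sample = Sample(filename="run01.mzML")               # assumed constructor argument
    sample.lib_load("masster/data/libs/ccm_nort.json")   # load a compound library into lib_df
    sample.identify()                                    # match features against the loaded library
    results = sample.get_id()                            # fetch identification results (id_df)
    sample.id_reset()                                    # clear identifications
    sample.lib_reset()                                   # drop the loaded library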
masster/sample/sample5_schema.json
CHANGED

@@ -93,10 +93,108 @@
       },
       "ms1_spec": {
         "dtype": "pl.Object"
+      },
+      "id_top_name": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_class": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_adduct": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_score": {
+        "dtype": "pl.Float64"
+      },
+      "id_source": {
+        "dtype": "pl.Utf8"
+      }
+    }
+  },
+  "lib_df": {
+    "columns": {
+      "lib_uid": {
+        "dtype": "pl.Int64"
+      },
+      "cmpd_uid": {
+        "dtype": "pl.Int64"
+      },
+      "name": {
+        "dtype": "pl.Utf8"
+      },
+      "shortname": {
+        "dtype": "pl.Utf8"
+      },
+      "class": {
+        "dtype": "pl.Utf8"
+      },
+      "formula": {
+        "dtype": "pl.Utf8"
+      },
+      "iso": {
+        "dtype": "pl.Int64"
+      },
+      "smiles": {
+        "dtype": "pl.Utf8"
+      },
+      "inchi": {
+        "dtype": "pl.Utf8"
+      },
+      "inchikey": {
+        "dtype": "pl.Utf8"
+      },
+      "adduct": {
+        "dtype": "pl.Utf8"
+      },
+      "z": {
+        "dtype": "pl.Int64"
+      },
+      "m": {
+        "dtype": "pl.Float64"
+      },
+      "mz": {
+        "dtype": "pl.Float64"
+      },
+      "rt": {
+        "dtype": "pl.Float64"
+      },
+      "quant_group": {
+        "dtype": "pl.Int64"
+      },
+      "probability": {
+        "dtype": "pl.Float64"
+      },
+      "source_id": {
+        "dtype": "pl.Utf8"
+      }
+    }
+  },
+  "id_df": {
+    "columns": {
+      "feature_uid": {
+        "dtype": "pl.Int64"
+      },
+      "lib_uid": {
+        "dtype": "pl.Int64"
+      },
+      "mz_delta": {
+        "dtype": "pl.Float64"
+      },
+      "rt_delta": {
+        "dtype": "pl.Float64"
+      },
+      "matcher": {
+        "dtype": "pl.Utf8"
+      },
+      "score": {
+        "dtype": "pl.Float64"
+      },
+      "iso": {
+        "dtype": "pl.Int64"
       }
     }
   },
-  "generated_date": "2025-
+  "generated_date": "2025-10-30",
   "ms1_df": {
     "columns": {
       "cycle": {
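
Note: the schema addition above declares the Polars dtypes for the new lib_df and id_df tables. A minimal sketch (not package code) of empty frames matching those declarations:

    import polars as pl

    lib_df = pl.DataFrame(schema={
        "lib_uid": pl.Int64, "cmpd_uid": pl.Int64, "name": pl.Utf8, "shortname": pl.Utf8,
        "class": pl.Utf8, "formula": pl.Utf8, "iso": pl.Int64, "smiles": pl.Utf8,
        "inchi": pl.Utf8, "inchikey": pl.Utf8, "adduct": pl.Utf8, "z": pl.Int64,
        "m": pl.Float64, "mz": pl.Float64, "rt": pl.Float64, "quant_group": pl.Int64,
        "probability": pl.Float64, "source_id": pl.Utf8,
    })
    id_df = pl.DataFrame(schema={
        "feature_uid": pl.Int64, "lib_uid": pl.Int64, "mz_delta": pl.Float64,
        "rt_delta": pl.Float64, "matcher": pl.Utf8, "score": pl.Float64, "iso": pl.Int64,
    })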
masster/study/defaults/study_def.py
CHANGED

@@ -96,19 +96,15 @@ class study_defaults:
         "adducts": {
             "dtype": "list[str]",
             "description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
-            "default": ["H
+            "default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
             "examples": {
-                "positive": ["H
-                "negative": [
-                    "H-1:-:0.95",
-                    "Cl:-:0.05",
-                    "CH2O2:0:0.2",
-                    "H-2-O:0:0.2",
-                ],
+                "positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
+                "negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
             },
             "validation_rules": [
-                "Format:
-                "
+                "Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
+                "Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
+                "Charge must be an integer (positive, negative, or 0 for neutral)",
                 "Probability must be between 0.0 and 1.0",
                 "Sum of all charged adduct probabilities must equal 1.0",
             ],

@@ -128,7 +124,7 @@ class study_defaults:
         """Set polarity-specific defaults for adducts if not explicitly provided."""
         # If adducts is None, set based on polarity
         if self.adducts is None:
-            if self.polarity.lower() in ["positive", "pos"]:
+            if self.polarity.lower() in ["positive", "pos", "+"]:
                 self.adducts = [
                     "+H:1:0.65",
                     "+Na:1:0.15",

@@ -136,7 +132,7 @@ class study_defaults:
                     "+K:1:0.05",
                     "-H2O:0:0.15",
                 ]
-            elif self.polarity.lower() in ["negative", "neg"]:
+            elif self.polarity.lower() in ["negative", "neg", "-"]:
                 self.adducts = [
                     "-H:-1:0.9",
                     "+Cl:-1:0.1",
masster/study/id.py
CHANGED

@@ -24,7 +24,8 @@ def lib_load(
         lib_source: either a CSV/JSON file path (str) or a Lib instance
         polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
             If None, uses study.polarity automatically.
-        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
+        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
+            If None, uses study.parameters.adducts if available.
         iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)

     Side effects:

@@ -51,6 +52,18 @@ def lib_load(
     else:
         polarity = "positive"  # Default fallback
     study.logger.debug(f"Using study polarity: {polarity}")
+
+    # Use study.parameters.adducts if adducts not explicitly provided
+    # If study.parameters.adducts is also None, lib will use its default adducts for the polarity
+    if adducts is None:
+        if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
+            adducts = study.parameters.adducts
+            if adducts:
+                study.logger.debug(f"Using study.parameters.adducts: {adducts}")
+            else:
+                study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
+        else:
+            study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")

     # Handle string input (CSV or JSON file path)
     if isinstance(lib_source, str):
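
Note: with this change, lib_load resolves adducts in three steps: the explicit adducts argument, then study.parameters.adducts, then the library's own defaults for the polarity. A hypothetical call pattern, assuming Study binds these module functions as methods the way Sample does in this release (import path, constructor, and file names are placeholders, not taken from the package):

    from masster.study.study import Study   # assumed import path

    study = Study()                          # assumed constructor
    study.lib_load("ccm_nort.json")          # adducts=None -> falls back to study.parameters.adducts
    study.lib_load("aa_nort.json",
                   adducts=["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"])  # explicit override
    study.identify()                         # RT filtering is dropped automatically if the library has no RT data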
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
     """
     Find library matches using optimized vectorized operations.

-
+    Automatically skips RT filtering if library has no RT data for the matched entries.
     """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))

     initial_match_count = len(matches)

-    # Apply RT filter if
+    # Apply RT filter if requested AND if data is available
+    # Strategy: Handle mixed RT/no-RT entries properly by treating them separately
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-        #
+        # Separate entries with and without RT data
        rt_candidates = matches.filter(pl.col("rt").is_not_null())
+        no_rt_entries = matches.filter(pl.col("rt").is_null())

         if not rt_candidates.is_empty():
             # Apply RT filtering to candidates with RT data
             rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))

-
+            # Combine RT-filtered matches with entries that have no RT data
+            # Rationale: Entries without RT can't be filtered by RT, so include them
+            if not rt_matches.is_empty() and not no_rt_entries.is_empty():
+                # Both RT matches and no-RT entries exist
+                matches = pl.concat([rt_matches, no_rt_entries])
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
+                    )
+            elif not rt_matches.is_empty():
+                # Only RT matches, no entries without RT
                 matches = rt_matches
                 if logger:
                     logger.debug(
-                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT,
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(matches)} passed RT filter"
+                    )
+            elif not no_rt_entries.is_empty():
+                # No RT matches passed filter, but there are entries without RT
+                matches = no_rt_entries
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
+                        f"using {len(matches)} entries with no RT data"
                     )
             else:
-                #
-                matches =
+                # No RT matches and no entries without RT - return empty
+                matches = pl.DataFrame()
                 if logger:
                     logger.debug(
                         f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
                     )
         else:
-            #
+            # All m/z matches have no RT data - keep all m/z matches
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
                 )
-            matches
+            # matches already contains the m/z-filtered results (which are all no_rt_entries)

     # FIX 1: Add stricter m/z validation - prioritize more accurate matches
     if not matches.is_empty():
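
Note: the reworked branching above reduces to a simple recipe: filter the library by m/z window, split the hits by RT availability, RT-filter only the entries that actually have RT, then recombine. A self-contained toy run of the same strategy (illustrative data, not masster code):

    import polars as pl

    lib_df = pl.DataFrame({
        "name": ["cmpd_A", "cmpd_B", "cmpd_C"],
        "mz":   [90.0550, 90.0555, 90.0549],
        "rt":   [65.0, 300.0, None],          # seconds; None = library entry without RT
    })
    cons_mz, cons_rt, mz_tol, rt_tol = 90.055, 60.0, 0.01, 30.0

    matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
    rt_candidates = matches.filter(pl.col("rt").is_not_null())
    no_rt_entries = matches.filter(pl.col("rt").is_null())
    rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
    final = pl.concat([rt_matches, no_rt_entries]) if len(no_rt_entries) > 0 else rt_matches
    print(final)  # cmpd_A passes the RT window, cmpd_C is kept (no RT), cmpd_B (rt=300 s) is dropped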
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
     effective_mz_tol = getattr(params, "mz_tol", 0.01)
     effective_rt_tol = getattr(params, "rt_tol", 2.0)

+    # Check if library has RT data - if not, disable RT filtering
+    if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
+        if "rt" in study.lib_df.columns:
+            # Check if library has any non-null RT values
+            rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
+            if rt_count == 0:
+                if logger:
+                    logger.info(
+                        f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
+                    )
+                effective_rt_tol = None
+
     if logger:
         logger.debug(
             f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
             if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = _format_adduct_name(components)
-                probability_multiplied = float(spec["probability"]) ** multiplier
+                probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0

                 combinations_list.append(
                     {
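
Note: the only change in _get_adducts halves the combined probability of multiplied adducts. For a doubly charged species built from "+H:1:0.65" with multiplier 2, for example, the assigned probability drops from 0.65 ** 2 = 0.4225 to (0.65 ** 2) / 2.0 ≈ 0.211. The multiplier-2 case is an illustration; the diff does not state the rationale for the halving.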
masster/study/load.py
CHANGED

@@ -191,17 +191,6 @@ def load(self, filename=None):

    _load_study5(self, filename)

-    # After loading the study, check if we have consensus features before loading consensus XML
-    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
-    # consensus_xml_path = filename.replace(".study5", ".consensusXML")
-    # if os.path.exists(consensus_xml_path):
-    # self._load_consensusXML(filename=consensus_xml_path)
-    # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
-    # else:
-    # self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    # else:
-    # self.logger.debug("No consensus features found, skipping consensusXML loading")
-
    self.filename = filename

masster/study/merge.py
CHANGED

@@ -441,9 +441,15 @@ def merge(study, **kwargs) -> None:
     cached_valid_adducts = None
     try:
         cached_adducts_df = study._get_adducts()
+        # Remove all adducts with wrong polarity
+        if study.polarity == "positive":
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") >= 0)
+        else:
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") <= 0)
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
+            study.logger.warning(f"No valid adducts found for polarity '{study.polarity}'")
             cached_valid_adducts = set()
     except Exception as e:
         study.logger.warning(f"Could not retrieve study adducts: {e}")
@@ -452,6 +458,13 @@
     # Always allow '?' adducts
     cached_valid_adducts.add("?")

+    # Bypass for single sample case
+    if len(study.samples_df) == 1:
+        study.logger.info("Single sample detected - bypassing merge algorithm and using direct feature mapping")
+        _handle_single_sample_merge(study, cached_adducts_df, cached_valid_adducts)
+        # Skip all post-processing for single sample case
+        return
+
     # Route to algorithm implementation
     if params.method == "kd":
         consensus_map = _merge_kd(study, params)
@@ -1719,6 +1732,10 @@ def _calculate_consensus_statistics(
         mz_values: m/z values from chunk consensus features
         intensity_values: Intensity values from chunk consensus features
         quality_values: Quality values from chunk consensus features
+        number_features: Number of unique features contributing
+        number_samples: Number of unique samples contributing
+        cached_adducts_df: Cached DataFrame of valid adducts for the study
+        cached_valid_adducts: Cached set of valid adduct names for the study

     Returns:
         Dictionary with consensus feature metadata
@@ -3612,6 +3629,142 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     return adduct_group_list, adduct_of_list


+def _handle_single_sample_merge(study, cached_adducts_df=None, cached_valid_adducts=None):
+    """
+    Handle merge for the special case of a single sample.
+    Directly populate consensus_df from the sample's features_df without any filtering.
+
+    Args:
+        study: Study object with single sample
+        cached_adducts_df: Pre-computed adducts DataFrame (optional)
+        cached_valid_adducts: Set of valid adduct names (optional)
+    """
+    import polars as pl
+    import uuid
+
+    if len(study.samples_df) != 1:
+        raise ValueError("_handle_single_sample_merge should only be called with exactly one sample")
+
+    # Get the single sample's features
+    sample_row = study.samples_df.row(0, named=True)
+    sample_uid = sample_row["sample_uid"]
+
+    # Filter features for this sample
+    sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+    if len(sample_features) == 0:
+        study.logger.warning("No features found for single sample")
+        study.consensus_df = pl.DataFrame()
+        study.consensus_mapping_df = pl.DataFrame()
+        return
+
+    study.logger.info(f"Creating consensus from {len(sample_features)} features in single sample")
+
+    # Create consensus features directly from sample features
+    consensus_list = []
+    mapping_list = []
+
+    # Cache valid adducts
+    valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
+    valid_adducts.add("?")  # Always allow '?' adducts
+
+    for i, feature_row in enumerate(sample_features.iter_rows(named=True)):
+        # Generate unique consensus ID
+        consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
+
+        # Handle adduct information
+        adduct = feature_row.get("adduct")
+        if adduct is None or adduct not in valid_adducts:
+            # Set default adduct based on study polarity
+            study_polarity = getattr(study, "polarity", "positive")
+            if study_polarity in ["negative", "neg"]:
+                adduct = "[M-?]1-"
+                adduct_charge = -1
+                adduct_mass_shift = -1.007825
+            else:
+                adduct = "[M+?]1+"
+                adduct_charge = 1
+                adduct_mass_shift = 1.007825
+        else:
+            # Try to get charge and mass shift from cached adducts
+            adduct_charge = 1
+            adduct_mass_shift = 1.007825
+            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
+                matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct)
+                if not matching_adduct.is_empty():
+                    adduct_row = matching_adduct.row(0, named=True)
+                    adduct_charge = adduct_row["charge"]
+                    adduct_mass_shift = adduct_row["mass_shift"]
+
+        # Calculate neutral mass
+        mz = feature_row.get("mz", 0.0)
+        if adduct_charge and adduct_mass_shift is not None:
+            adduct_mass_neutral = mz * abs(adduct_charge) - adduct_mass_shift
+        else:
+            adduct_mass_neutral = None
+
+        # Count MS2 scans
+        ms2_scans = feature_row.get("ms2_scans", [])
+        ms2_count = len(ms2_scans) if ms2_scans else 0
+
+        # Create consensus feature metadata
+        consensus_feature = {
+            "consensus_uid": i,
+            "consensus_id": consensus_id_str,
+            "quality": feature_row.get("quality", 1.0),
+            "number_samples": 1,  # Always 1 for single sample
+            "rt": feature_row.get("rt", 0.0),
+            "mz": mz,
+            "rt_min": feature_row.get("rt", 0.0),
+            "rt_max": feature_row.get("rt", 0.0),
+            "rt_mean": feature_row.get("rt", 0.0),
+            "rt_start_mean": feature_row.get("rt_start", 0.0),
+            "rt_end_mean": feature_row.get("rt_end", 0.0),
+            "rt_delta_mean": feature_row.get("rt_delta", 0.0),
+            "mz_min": mz,
+            "mz_max": mz,
+            "mz_mean": mz,
+            "mz_start_mean": feature_row.get("mz_start", 0.0),
+            "mz_end_mean": feature_row.get("mz_end", 0.0),
+            "inty_mean": feature_row.get("inty", 0.0),
+            "bl": -1.0,
+            "chrom_coherence_mean": feature_row.get("chrom_coherence", 0.0),
+            "chrom_prominence_mean": feature_row.get("chrom_prominence", 0.0),
+            "chrom_prominence_scaled_mean": feature_row.get("chrom_prominence_scaled", 0.0),
+            "chrom_height_scaled_mean": feature_row.get("chrom_height_scaled", 0.0),
+            "iso": None,  # Will be filled by find_iso() function
+            "iso_mean": feature_row.get("iso", 0.0),
+            "charge_mean": feature_row.get("charge", 0.0),
+            "number_ms2": ms2_count,
+            "adducts": [[adduct, 1, 100.0]],  # Single adduct with 100% frequency
+            "adduct_top": adduct,
+            "adduct_charge_top": adduct_charge,
+            "adduct_mass_neutral_top": adduct_mass_neutral,
+            "adduct_mass_shift_top": adduct_mass_shift,
+            "id_top_name": None,
+            "id_top_class": None,
+            "id_top_adduct": None,
+            "id_top_score": None,
+            "id_source": None,
+        }
+
+        consensus_list.append(consensus_feature)
+
+        # Create mapping entry
+        mapping_entry = {
+            "consensus_uid": i,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_row.get("feature_uid"),
+        }
+        mapping_list.append(mapping_entry)
+
+    # Create DataFrames
+    study.consensus_df = pl.DataFrame(consensus_list, strict=False)
+    study.consensus_mapping_df = pl.DataFrame(mapping_list, strict=False)
+
+    study.logger.info(f"Created {len(consensus_list)} consensus features from single sample")
+
+
 def _fast_correlation(x, y):
     """
     Fast correlation coefficient calculation for consensus matrix data.