masster 0.5.3.tar.gz → 0.5.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of masster might be problematic.

Files changed (94)
  1. {masster-0.5.3 → masster-0.5.5}/PKG-INFO +1 -1
  2. {masster-0.5.3 → masster-0.5.5}/pyproject.toml +1 -1
  3. {masster-0.5.3 → masster-0.5.5}/src/masster/_version.py +1 -1
  4. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/adducts.py +1 -1
  5. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/h5.py +11 -11
  6. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/helpers.py +2 -2
  7. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/load.py +5 -4
  8. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/processing.py +1 -1
  9. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sample.py +7 -3
  10. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/save.py +5 -0
  11. {masster-0.5.3 → masster-0.5.5}/src/masster/study/h5.py +70 -0
  12. {masster-0.5.3 → masster-0.5.5}/src/masster/study/helpers.py +2 -2
  13. {masster-0.5.3 → masster-0.5.5}/src/masster/study/plot.py +212 -98
  14. {masster-0.5.3 → masster-0.5.5}/src/masster/study/processing.py +0 -3
  15. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/wizard.py +13 -24
  16. {masster-0.5.3 → masster-0.5.5}/uv.lock +1 -1
  17. {masster-0.5.3 → masster-0.5.5}/.github/workflows/publish.yml +0 -0
  18. {masster-0.5.3 → masster-0.5.5}/.github/workflows/security.yml +0 -0
  19. {masster-0.5.3 → masster-0.5.5}/.github/workflows/test.yml +0 -0
  20. {masster-0.5.3 → masster-0.5.5}/.gitignore +0 -0
  21. {masster-0.5.3 → masster-0.5.5}/.pre-commit-config.yaml +0 -0
  22. {masster-0.5.3 → masster-0.5.5}/LICENSE +0 -0
  23. {masster-0.5.3 → masster-0.5.5}/Makefile +0 -0
  24. {masster-0.5.3 → masster-0.5.5}/README.md +0 -0
  25. {masster-0.5.3 → masster-0.5.5}/TESTING.md +0 -0
  26. {masster-0.5.3 → masster-0.5.5}/demo/example_batch_process.py +0 -0
  27. {masster-0.5.3 → masster-0.5.5}/demo/example_sample_process.py +0 -0
  28. {masster-0.5.3 → masster-0.5.5}/src/masster/__init__.py +0 -0
  29. {masster-0.5.3 → masster-0.5.5}/src/masster/chromatogram.py +0 -0
  30. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil2_01_20250602151849.sample5 +0 -0
  31. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_DDA_OT_C-MiLUT_QC_dil3_01_20250602150634.sample5 +0 -0
  32. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v6_r38_01.sample5 +0 -0
  33. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C008_v7_r37_01.sample5 +0 -0
  34. {masster-0.5.3 → masster-0.5.5}/src/masster/data/dda/20250530_VH_IQX_KW_RP_HSST3_100mm_12min_pos_v4_MS1_C-MiLUT_C017_v5_r99_01.sample5 +0 -0
  35. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/aa.csv +0 -0
  36. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/ccm.csv +0 -0
  37. {masster-0.5.3 → masster-0.5.5}/src/masster/data/libs/urine.csv +0 -0
  38. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.timeseries.data +0 -0
  39. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff +0 -0
  40. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff.scan +0 -0
  41. {masster-0.5.3 → masster-0.5.5}/src/masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.wiff2 +0 -0
  42. {masster-0.5.3 → masster-0.5.5}/src/masster/lib/__init__.py +0 -0
  43. {masster-0.5.3 → masster-0.5.5}/src/masster/lib/lib.py +0 -0
  44. {masster-0.5.3 → masster-0.5.5}/src/masster/logger.py +0 -0
  45. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/__init__.py +0 -0
  46. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/__init__.py +0 -0
  47. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_adducts_def.py +0 -0
  48. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_features_def.py +0 -0
  49. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/find_ms2_def.py +0 -0
  50. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/get_spectrum_def.py +0 -0
  51. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/defaults/sample_def.py +0 -0
  52. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/lib.py +0 -0
  53. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/parameters.py +0 -0
  54. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/plot.py +0 -0
  55. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/quant.py +0 -0
  56. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sample5_schema.json +0 -0
  57. {masster-0.5.3 → masster-0.5.5}/src/masster/sample/sciex.py +0 -0
  58. {masster-0.5.3 → masster-0.5.5}/src/masster/spectrum.py +0 -0
  59. {masster-0.5.3 → masster-0.5.5}/src/masster/study/__init__.py +0 -0
  60. {masster-0.5.3 → masster-0.5.5}/src/masster/study/analysis.py +0 -0
  61. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/__init__.py +0 -0
  62. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/align_def.py +0 -0
  63. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/export_def.py +0 -0
  64. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/fill_def.py +0 -0
  65. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/find_consensus_def.py +0 -0
  66. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/find_ms2_def.py +0 -0
  67. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/identify_def.py +0 -0
  68. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/integrate_chrom_def.py +0 -0
  69. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/integrate_def.py +0 -0
  70. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/merge_def.py +0 -0
  71. {masster-0.5.3 → masster-0.5.5}/src/masster/study/defaults/study_def.py +0 -0
  72. {masster-0.5.3 → masster-0.5.5}/src/masster/study/export.py +0 -0
  73. {masster-0.5.3 → masster-0.5.5}/src/masster/study/id.py +0 -0
  74. {masster-0.5.3 → masster-0.5.5}/src/masster/study/load.py +0 -0
  75. {masster-0.5.3 → masster-0.5.5}/src/masster/study/merge.py +0 -0
  76. {masster-0.5.3 → masster-0.5.5}/src/masster/study/parameters.py +0 -0
  77. {masster-0.5.3 → masster-0.5.5}/src/masster/study/save.py +0 -0
  78. {masster-0.5.3 → masster-0.5.5}/src/masster/study/study.py +0 -0
  79. {masster-0.5.3 → masster-0.5.5}/src/masster/study/study5_schema.json +0 -0
  80. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/README.md +0 -0
  81. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/__init__.py +0 -0
  82. {masster-0.5.3 → masster-0.5.5}/src/masster/wizard/example.py +0 -0
  83. {masster-0.5.3 → masster-0.5.5}/tests/conftest.py +0 -0
  84. {masster-0.5.3 → masster-0.5.5}/tests/test_chromatogram.py +0 -0
  85. {masster-0.5.3 → masster-0.5.5}/tests/test_defaults.py +0 -0
  86. {masster-0.5.3 → masster-0.5.5}/tests/test_imports.py +0 -0
  87. {masster-0.5.3 → masster-0.5.5}/tests/test_integration.py +0 -0
  88. {masster-0.5.3 → masster-0.5.5}/tests/test_logger.py +0 -0
  89. {masster-0.5.3 → masster-0.5.5}/tests/test_parameters.py +0 -0
  90. {masster-0.5.3 → masster-0.5.5}/tests/test_sample.py +0 -0
  91. {masster-0.5.3 → masster-0.5.5}/tests/test_spectrum.py +0 -0
  92. {masster-0.5.3 → masster-0.5.5}/tests/test_study.py +0 -0
  93. {masster-0.5.3 → masster-0.5.5}/tests/test_version.py +0 -0
  94. {masster-0.5.3 → masster-0.5.5}/tox.ini +0 -0
--- masster-0.5.3/PKG-INFO
+++ masster-0.5.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: masster
-Version: 0.5.3
+Version: 0.5.5
 Summary: Mass spectrometry data analysis package
 Project-URL: homepage, https://github.com/zamboni-lab/masster
 Project-URL: repository, https://github.com/zamboni-lab/masster
--- masster-0.5.3/pyproject.toml
+++ masster-0.5.5/pyproject.toml
@@ -1,7 +1,7 @@
 
 [project]
 name = "masster"
-version = "0.5.3"
+version = "0.5.5"
 description = "Mass spectrometry data analysis package"
 authors = [
     { name = "Zamboni Lab" }
--- masster-0.5.3/src/masster/_version.py
+++ masster-0.5.5/src/masster/_version.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 
-__version__ = "0.5.4"
+__version__ = "0.5.5"
 
 
 def get_version():
--- masster-0.5.3/src/masster/sample/adducts.py
+++ masster-0.5.5/src/masster/sample/adducts.py
@@ -473,7 +473,7 @@ def find_adducts(self, **kwargs):
     self.logger.debug(f"Min probability threshold: {min_probability}")
 
     # Generate comprehensive adduct specifications using the Sample method
-    adducts_df = self._get_adducts(
+    adducts_df = _get_adducts(self,
         adducts_list=adducts_list,
         charge_min=charge_min,
         charge_max=charge_max,
--- masster-0.5.3/src/masster/sample/h5.py
+++ masster-0.5.5/src/masster/sample/h5.py
@@ -62,8 +62,8 @@ def _save_sample5(
         return
 
     # synchronize feature_map if it exists
-    if hasattr(self, "_feature_map") and self._feature_map is not None:
-        self._features_sync()
+    #if hasattr(self, "_feature_map") and self._feature_map is not None:
+    #    self._features_sync()
 
     # if no extension is given, add .sample5
     if not filename.endswith(".sample5"):
@@ -1057,15 +1057,15 @@ def _load_sample5(self, filename: str, map: bool = False):
     # Parameters are now loaded from metadata JSON (see above)
     # Lib and lib_match are no longer saved/loaded
 
-    if map:
-        featureXML = filename.replace(".sample5", ".featureXML")
-        if os.path.exists(featureXML):
-            self._load_featureXML(featureXML)
-            self._features_sync()
-        else:
-            self.logger.warning(
-                f"Feature XML file {featureXML} not found, skipping loading.",
-            )
+    #if map:
+    #    featureXML = filename.replace(".sample5", ".featureXML")
+    #    if os.path.exists(featureXML):
+    #        self._load_featureXML(featureXML)
+    #        #self._features_sync()
+    #    else:
+    #        self.logger.warning(
+    #            f"Feature XML file {featureXML} not found, skipping loading.",
+    #        )
 
     # set self.file_path to *.sample5
     self.file_path = filename
--- masster-0.5.3/src/masster/sample/helpers.py
+++ masster-0.5.5/src/masster/sample/helpers.py
@@ -569,7 +569,7 @@ def select(
     self.logger.info(f"Selected features. Features remaining: {len(feats)}")
     return feats
 
-
+'''
 def _features_sync(self):
     """
     Synchronizes the cached FeatureMap with features_df.
@@ -675,7 +675,7 @@ def _features_sync(self):
             self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
     except Exception as e:
         self.logger.error(f"Error during feature synchronization: {e}")
-
+'''
 
 def features_delete(self, features: list | None = None):
     """
--- masster-0.5.3/src/masster/sample/load.py
+++ masster-0.5.5/src/masster/sample/load.py
@@ -46,6 +46,7 @@ import polars as pl
 from tqdm import tqdm
 
 from masster.chromatogram import Chromatogram
+from .h5 import _load_sample5
 from masster.spectrum import Spectrum
 
 # Suppress pyOpenMS warnings globally
@@ -96,13 +97,13 @@ def load(
 
     # check if file is mzML
     if filename.lower().endswith(".mzml"):
-        self._load_mzML(filename)
+        _load_mzML(self, filename)
     elif filename.lower().endswith(".wiff") or filename.lower().endswith(".wiff2"):
-        self._load_wiff(filename)
+        _load_wiff(self, filename)
     elif filename.lower().endswith(".raw"):
-        self._load_raw(filename)
+        _load_raw(self, filename)
     elif filename.lower().endswith(".sample5"):
-        self._load_sample5(filename)
+        _load_sample5(self, filename)
     # elif filename.lower().endswith(".h5"):
     #     self._load_h5(filename)
     else:
--- masster-0.5.3/src/masster/sample/processing.py
+++ masster-0.5.5/src/masster/sample/processing.py
@@ -795,7 +795,7 @@ def find_features(self, **kwargs):
     )
 
     self.features_df = df
-    self._features_sync()
+    #self._features_sync()
     self.logger.info(f"Feature detection completed. Total features: {len(df)}")
 
     # store params
--- masster-0.5.3/src/masster/sample/sample.py
+++ masster-0.5.5/src/masster/sample/sample.py
@@ -48,9 +48,9 @@ from masster.sample.defaults.find_ms2_def import find_ms2_defaults
 from masster.sample.defaults.get_spectrum_def import get_spectrum_defaults
 
 # Sample-specific imports - keeping these private, only for internal use
-# from masster.sample.h5 import _load_sample5
+from masster.sample.h5 import _load_sample5
 # from masster.sample.h5 import _load_sample5_study
-# from masster.sample.h5 import _save_sample5
+from masster.sample.h5 import _save_sample5
 # from masster.sample.helpers import _delete_ms2
 from masster.sample.helpers import _estimate_memory_usage
 from masster.sample.helpers import _get_scan_uids
@@ -263,12 +263,16 @@ class Sample:
     _get_feature_map = _get_feature_map
 
     # Additional method assignments for all imported functions
-    # Removed internal-only methods: _load_sample5, _load_sample5_study, _save_sample5, _delete_ms2, _features_sync
+    # Removed internal-only methods: _load_sample5_study, _delete_ms2, _features_sync
     _estimate_memory_usage = _estimate_memory_usage
     _get_scan_uids = _get_scan_uids
     _get_feature_uids = _get_feature_uids
     features_delete = features_delete
     features_filter = features_filter
+    _save_sample5 = _save_sample5
+    _load_sample5 = _load_sample5
+
+
     # Removed internal-only load methods: _load_featureXML, _load_ms2data, _load_mzML, _load_raw, _load_wiff
     chrom_extract = chrom_extract
     _index_file = _index_file  # Renamed from index_file to be internal-only
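
The adducts.py, load.py, and sample.py hunks above are one refactor: private helpers like `_load_sample5` and `_save_sample5` become module-level functions that take the sample as an explicit first argument, and sample.py re-attaches them as class attributes. A minimal sketch of that binding pattern (the class and loader below are simplified stand-ins, not masster's actual code):

```python
# Minimal sketch: a module-level helper bound as a method via a class attribute.
# `Sample` and `_load_sample5` here are simplified stand-ins, not masster's code.

def _load_sample5(self, filename: str) -> None:
    # Takes the instance explicitly, so it works both as a plain function call
    # (_load_sample5(obj, path)) and as a bound method (obj._load_sample5(path)).
    self.file_path = filename


class Sample:
    # A function assigned as a class attribute becomes a normal method:
    # Python's descriptor protocol binds `self` on attribute access.
    _load_sample5 = _load_sample5

    def load(self, filename: str) -> None:
        # Mirrors the load.py dispatch style after the refactor: explicit self.
        if filename.lower().endswith(".sample5"):
            _load_sample5(self, filename)
        else:
            raise ValueError(f"Unsupported file type: {filename}")


s = Sample()
s.load("run01.sample5")            # function-style call inside load()
s._load_sample5("run01.sample5")   # method-style call via the class attribute
print(s.file_path)                 # run01.sample5
```

Because functions are descriptors, both call styles stay equivalent, which lets the same helper serve the functional call sites in load.py and the method-style API on Sample.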
--- masster-0.5.3/src/masster/sample/save.py
+++ masster-0.5.5/src/masster/sample/save.py
@@ -411,6 +411,11 @@ def export_mgf(
         rt_str = f"{rt:.2f}"
         mz_str = f"{mz:.4f}"
 
+        # Initialize charge for this feature
+        charge = preferred_charge
+        if row["charge"] is not None and row["charge"] != 0:
+            charge = row["charge"]
+
         # Skip features without MS2 data (unless include_all_ms1 is True, but we already handled MS1 above)
         if row["ms2_scans"] is None:
            skip = skip + 1
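
The save.py hunk adds a per-feature charge fallback before the MS2 check. A hedged sketch of the logic in isolation (`preferred_charge` and the row mapping are modeled on the hunk, not taken from masster's API):

```python
# Sketch of the charge fallback added to export_mgf: use the feature's own
# charge when it is set and nonzero, otherwise a caller-supplied default.
def resolve_charge(row: dict, preferred_charge: int = 1) -> int:
    charge = preferred_charge
    if row.get("charge") is not None and row.get("charge") != 0:
        charge = row["charge"]
    return charge

print(resolve_charge({"charge": 2}))     # 2 -> feature charge wins
print(resolve_charge({"charge": 0}))     # 1 -> zero means unknown, use default
print(resolve_charge({"charge": None}))  # 1 -> missing, use default
```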
--- masster-0.5.3/src/masster/study/h5.py
+++ masster-0.5.5/src/masster/study/h5.py
@@ -304,6 +304,30 @@ def _save_object_columns_optimized(group, df, object_cols, logger, chunk_size):
                     serialized_chunk.append(item.to_json())
                 else:
                     serialized_chunk.append("None")
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            for item in chunk_data:
+                if item is not None:
+                    try:
+                        # Convert numpy array to nested list for JSON serialization
+                        serialized_chunk.append(json.dumps(item.tolist()))
+                    except (AttributeError, TypeError):
+                        # Fallback for non-numpy data
+                        serialized_chunk.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+                else:
+                    serialized_chunk.append("None")
         else:
             logger.warning(
                 f"Unknown object column '{col_name}', using default serialization",
@@ -564,6 +588,34 @@ def _save_dataframe_column_legacy(
             else:
                 data_as_str.append("None")
         group.create_dataset(col, data=data_as_str, compression=compression)
+    elif col == "iso":
+        # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
+    elif col == "ms1_spec":
+        # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+        data_as_json_strings = []
+        for item in data:
+            if item is not None:
+                try:
+                    # Convert numpy array to nested list for JSON serialization
+                    data_as_json_strings.append(json.dumps(item.tolist()))
+                except (AttributeError, TypeError):
+                    # Fallback for non-numpy data
+                    data_as_json_strings.append(json.dumps(list(item) if hasattr(item, '__iter__') else []))
+            else:
+                data_as_json_strings.append("None")
+        group.create_dataset(col, data=data_as_json_strings, **optimal_compression)
     else:
         logger.warning(
             f"Unexpectedly, column '{col}' has dtype '{dtype}'. Implement serialization for this column.",
@@ -666,6 +718,24 @@ def _reconstruct_object_column(data_col, col_name: str):
                 },
             )
             reconstructed_data.append(converted_adducts)
+        elif col_name == "iso":
+            # Handle isotope patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                iso_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(iso_data) if iso_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
+        elif col_name == "ms1_spec":
+            # Handle MS1 spectra patterns (numpy arrays with [mz, intensity] data)
+            try:
+                import numpy as np
+                ms1_spec_data = json.loads(item)
+                # Convert back to numpy array
+                reconstructed_data.append(np.array(ms1_spec_data) if ms1_spec_data else None)
+            except (json.JSONDecodeError, ValueError, ImportError):
+                reconstructed_data.append(None)
         else:
             # Unknown object column
             reconstructed_data.append(None)
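
The three study/h5.py hunks implement one round-trip: `iso` and `ms1_spec` columns hold numpy `[mz, intensity]` arrays, which HDF5 string datasets cannot store directly, so they are serialized to JSON text on save and rebuilt with `np.array` on load. A self-contained sketch of that round-trip (the file and dataset names are illustrative, not masster's actual schema):

```python
import json

import h5py
import numpy as np

# Save: numpy arrays -> JSON strings in an HDF5 string dataset ("None" marks nulls).
iso = [np.array([[100.05, 1.0e5], [101.05, 3.0e4]]), None]
with h5py.File("demo.h5", "w") as f:
    encoded = [json.dumps(a.tolist()) if a is not None else "None" for a in iso]
    f.create_dataset("iso", data=encoded, compression="gzip")

# Load: JSON strings -> numpy arrays again.
with h5py.File("demo.h5", "r") as f:
    decoded = []
    for item in f["iso"].asstr()[...]:
        if item == "None":
            decoded.append(None)
        else:
            data = json.loads(item)
            decoded.append(np.array(data) if data else None)

assert np.allclose(decoded[0], iso[0]) and decoded[1] is None
```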
--- masster-0.5.3/src/masster/study/helpers.py
+++ masster-0.5.5/src/masster/study/helpers.py
@@ -500,7 +500,7 @@ def align_reset(self):
 # TODO I don't get this param
 def get_consensus(self, quant="chrom_area"):
     if self.consensus_df is None:
-        self.logger.error("No consensus map found.")
+        self.logger.error("No consensus found.")
         return None
 
     # Convert Polars DataFrame to pandas for this operation since the result is used for export
@@ -613,7 +613,7 @@ def get_gaps_matrix(self, uids=None, samples=None):
     import polars as pl
 
     if self.consensus_df is None or self.consensus_df.is_empty():
-        self.logger.error("No consensus map found.")
+        self.logger.error("No consensus found.")
         return None
 
     if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
--- masster-0.5.3/src/masster/study/plot.py
+++ masster-0.5.5/src/masster/study/plot.py
@@ -564,6 +564,10 @@ def plot_consensus_2d(
     Parameters:
         filename (str, optional): Path to save the plot
         colorby (str): Column name to use for color mapping (default: "number_samples")
+            Automatically detects if column contains categorical (string) or
+            numeric data and applies appropriate color mapping:
+            - Categorical: Uses factor_cmap with distinct colors and legend
+            - Numeric: Uses LinearColorMapper with continuous colorbar
         sizeby (str): Column name to use for size mapping (default: "inty_mean")
         markersize (int): Base marker size (default: 6)
         scaling (str): Controls whether points scale with zoom. Options:
@@ -603,7 +607,7 @@ def plot_consensus_2d(
         pl.when(
             (pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
         )
-        .then((pl.col(sizeby).log10() * markersize / 12).pow(2))
+        .then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
         .otherwise(markersize)
         .alias("markersize"),
     ])
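
The exponent change from `pow(2)` to `pow(1.5)` compresses the size gap between weak and strong features. Plugging the hunk's formula into a quick check (markersize = 6, intensities spanning four decades):

```python
import math

# Marker size per the hunk: (log10(intensity) * markersize / 12) ** p
markersize = 6
for inty in (1e4, 1e6, 1e8):
    base = math.log10(inty) * markersize / 12
    print(f"inty={inty:.0e}  pow2={base**2:5.2f}  pow1.5={base**1.5:5.2f}")
# pow(2) spreads sizes 4.00 -> 16.00 (4x); pow(1.5) only 2.83 -> 8.00 (~2.8x)
```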
@@ -645,12 +649,13 @@ def plot_consensus_2d(
     from bokeh.models import HoverTool
     from bokeh.models import LinearColorMapper
     from bokeh.io.export import export_png
+    from bokeh.transform import factor_cmap
 
     try:
         from bokeh.models import ColorBar  # type: ignore[attr-defined]
     except ImportError:
         from bokeh.models.annotations import ColorBar
-    from bokeh.palettes import viridis
+    from bokeh.palettes import viridis, Category20
 
     # Import cmap for colormap handling
     from cmap import Colormap
@@ -695,61 +700,144 @@ def plot_consensus_2d(
         self.logger.warning(f"Could not interpret colormap '{cmap}': {e}, falling back to viridis")
         palette = viridis(256)
 
-    color_mapper = LinearColorMapper(
-        palette=palette,
-        low=data[colorby].min(),
-        high=data[colorby].max(),
+    # Check if colorby column contains categorical data (string/object)
+    colorby_values = data[colorby].to_list()
+    is_categorical = (
+        data_pd[colorby].dtype in ["object", "string", "category"] or
+        isinstance(colorby_values[0], str) if colorby_values else False
     )
+
+    if is_categorical:
+        # Handle categorical coloring
+        # Use natural order of unique values - don't sort to preserve correct legend mapping
+        # Sorting would break the correspondence between legend labels and point colors
+        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+
+        if len(unique_values) <= 20:
+            palette = Category20[min(20, max(3, len(unique_values)))]
+        else:
+            # For many categories, use a subset of the viridis palette
+            palette = viridis(min(256, len(unique_values)))
+
+        color_mapper = factor_cmap(colorby, palette, unique_values)
+    else:
+        # Handle numeric coloring with LinearColorMapper
+        color_mapper = LinearColorMapper(
+            palette=palette,
+            low=data[colorby].min(),
+            high=data[colorby].max(),
+        )
     # scatter plot rt vs mz
     p = bp.figure(
         width=width,
         height=height,
-        title="Consensus map",
+        title=f"Consensus features, colored by {colorby}",
     )
-    p.xaxis.axis_label = "Retention Time (min)"
-    p.yaxis.axis_label = "m/z"
+    p.xaxis.axis_label = "RT [s]"
+    p.yaxis.axis_label = "m/z [Th]"
     scatter_renderer: Any = None
-    if scaling.lower() in ["dyn", "dynamic"]:
-        # Calculate appropriate radius for dynamic scaling based on data range
-        rt_range = data["rt"].max() - data["rt"].min()
-        mz_range = data["mz"].max() - data["mz"].min()
-        # Use a fraction of the smaller dimension for radius, similar to sample plotting
-        dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+    if is_categorical:
+        # For categorical data, create separate renderers for each category
+        # This enables proper legend interactivity where each category can be toggled independently
+        unique_values = [v for v in data_pd[colorby].unique() if v is not None]
+
+        if len(unique_values) <= 20:
+            palette = Category20[min(20, max(3, len(unique_values)))]
+        else:
+            palette = viridis(min(256, len(unique_values)))
+
+        # Create a separate renderer for each category
+        for i, category in enumerate(unique_values):
+            # Filter data for this category
+            category_data = data.filter(pl.col(colorby) == category)
+            category_data_pd = category_data.to_pandas()
+            category_source = bp.ColumnDataSource(category_data_pd)
+
+            color = palette[i % len(palette)]
+
+            if scaling.lower() in ["dyn", "dynamic"]:
+                # Calculate appropriate radius for dynamic scaling
+                rt_range = data["rt"].max() - data["rt"].min()
+                mz_range = data["mz"].max() - data["mz"].min()
+                dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+                renderer = p.circle(
+                    x="rt",
+                    y="mz",
+                    radius=dynamic_radius,
+                    fill_color=color,
+                    line_color=None,
+                    alpha=alpha,
+                    source=category_source,
+                    legend_label=str(category),
+                )
+            else:
+                renderer = p.scatter(
+                    x="rt",
+                    y="mz",
+                    size="markersize",
+                    fill_color=color,
+                    line_color=None,
+                    alpha=alpha,
+                    source=category_source,
+                    legend_label=str(category),
+                )
+
+        # No single scatter_renderer for categorical data
+        scatter_renderer = None
 
-        scatter_renderer = p.circle(
-            x="rt",
-            y="mz",
-            radius=dynamic_radius,
-            fill_color={"field": colorby, "transform": color_mapper},
-            line_color=None,
-            alpha=alpha,
-            source=source,
-        )
     else:
-        scatter_renderer = p.scatter(
-            x="rt",
-            y="mz",
-            size="markersize",
-            fill_color={"field": colorby, "transform": color_mapper},
-            line_color=None,
-            alpha=alpha,
-            source=source,
-        )
+        # Handle numeric coloring - single renderer with color mapping
+        if scaling.lower() in ["dyn", "dynamic"]:
+            # Calculate appropriate radius for dynamic scaling
+            rt_range = data["rt"].max() - data["rt"].min()
+            mz_range = data["mz"].max() - data["mz"].min()
+            dynamic_radius = min(rt_range, mz_range) * 0.0005 * markersize
+
+            scatter_renderer = p.circle(
+                x="rt",
+                y="mz",
+                radius=dynamic_radius,
+                fill_color={"field": colorby, "transform": color_mapper},
+                line_color=None,
+                alpha=alpha,
+                source=source,
+            )
+        else:
+            scatter_renderer = p.scatter(
+                x="rt",
+                y="mz",
+                size="markersize",
+                fill_color={"field": colorby, "transform": color_mapper},
+                line_color=None,
+                alpha=alpha,
+                source=source,
+            )
     # add hover tool
-    # Start with base tooltips
+    # Start with base tooltips - rt and mz moved to top, removed consensus_id and iso_mean
     tooltips = [
+        ("rt", "@rt"),
+        ("mz", "@mz"),
         ("consensus_uid", "@consensus_uid"),
-        ("consensus_id", "@consensus_id"),
         ("number_samples", "@number_samples"),
         ("number_ms2", "@number_ms2"),
-        ("rt", "@rt"),
-        ("mz", "@mz"),
         ("inty_mean", "@inty_mean"),
-        ("iso_mean", "@iso_mean"),
         ("coherence_mean", "@chrom_coherence_mean"),
         ("prominence_scaled_mean", "@chrom_prominence_scaled_mean"),
     ]
 
+    # Add adduct_top if it exists in data
+    if "adduct_top" in data.columns:
+        tooltips.append(("adduct_top", "@adduct_top"))
+
+    # Add id_top_name if it exists in data
+    if "id_top_name" in data.columns:
+        tooltips.append(("id_top_name", "@id_top_name"))
+
+    # Add id_top_adduct if it exists in data
+    if "id_top_adduct" in data.columns:
+        tooltips.append(("id_top_adduct", "@id_top_adduct"))
+
     # Add id_top_* columns if they exist and have non-null values
     id_top_columns = ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score"]
     for col in id_top_columns:
@@ -764,19 +852,28 @@ def plot_consensus_2d(
 
     hover = HoverTool(
         tooltips=tooltips,
-        renderers=[scatter_renderer],
     )
+    # For categorical data, hover will work on all renderers automatically
+    # For numeric data, specify the single renderer
+    if not is_categorical and scatter_renderer:
+        hover.renderers = [scatter_renderer]
+
     p.add_tools(hover)
 
-    # add colorbar
-    color_bar = ColorBar(
-        color_mapper=color_mapper,
-        label_standoff=12,
-        location=(0, 0),
-        title=colorby,
-        ticker=BasicTicker(desired_num_ticks=8),
-    )
-    p.add_layout(color_bar, "right")
+    # add colorbar only for numeric data (LinearColorMapper)
+    if not is_categorical:
+        color_bar = ColorBar(
+            color_mapper=color_mapper,
+            label_standoff=12,
+            location=(0, 0),
+            title=colorby,
+            ticker=BasicTicker(desired_num_ticks=8),
+        )
+        p.add_layout(color_bar, "right")
+    else:
+        # For categorical data, configure the legend that was automatically created
+        p.legend.location = "top_right"
+        p.legend.click_policy = "hide"
 
     if filename is not None:
         # Convert relative paths to absolute paths using study folder as base
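
Together, the plot_consensus_2d hunks split rendering into two paths: string-typed `colorby` columns get one renderer per category with a click-to-hide legend, while numeric columns keep a single renderer with a `LinearColorMapper` and colorbar. A self-contained toy version of that branch (made-up data; the bokeh calls mirror the hunks):

```python
import bokeh.plotting as bp
from bokeh.models import ColorBar, LinearColorMapper
from bokeh.palettes import Category20, viridis

# Toy data standing in for the consensus table; try colorby = "inty" for numeric.
data = {"rt": [60, 120, 180], "mz": [150.0, 300.5, 450.2],
        "label": ["QC", "blank", "QC"], "inty": [1e5, 3e4, 8e5]}
colorby = "label"

p = bp.figure(width=400, height=300, title=f"colored by {colorby}")
if isinstance(data[colorby][0], str):
    # Categorical: one renderer per category so legend entries toggle independently
    categories = list(dict.fromkeys(data[colorby]))
    palette = Category20[max(3, len(categories))]
    for i, cat in enumerate(categories):
        idx = [j for j, v in enumerate(data[colorby]) if v == cat]
        p.scatter([data["rt"][j] for j in idx], [data["mz"][j] for j in idx],
                  size=8, color=palette[i], legend_label=cat)
    p.legend.click_policy = "hide"
else:
    # Numeric: single renderer with a continuous color mapping plus a colorbar
    mapper = LinearColorMapper(palette=viridis(256),
                               low=min(data[colorby]), high=max(data[colorby]))
    src = bp.ColumnDataSource(data)
    p.scatter("rt", "mz", size=8, source=src,
              fill_color={"field": colorby, "transform": mapper})
    p.add_layout(ColorBar(color_mapper=mapper, title=colorby), "right")
bp.save(p, "toy_consensus.html")
```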
@@ -1421,46 +1518,60 @@ def plot_rt_correction(
     p.xaxis.axis_label = f"Retention Time ({rt_unit})"
     p.yaxis.axis_label = "RT - RT_original (s)"
 
-    samples_info = None
+    # Create sample name lookup dictionary from samples_df (all in Polars)
+    sample_names_dict = {}
     if hasattr(self, "samples_df") and self.samples_df is not None:
         try:
-            samples_info = self.samples_df.to_pandas()
+            sample_name_mapping = (
+                self.samples_df
+                .filter(pl.col("sample_uid").is_in(sample_uids))
+                .select(["sample_uid", "sample_name"])
+            )
+            sample_names_dict = dict(zip(
+                sample_name_mapping["sample_uid"].to_list(),
+                sample_name_mapping["sample_name"].to_list()
+            ))
         except Exception:
-            samples_info = None
+            pass
 
     renderers = []
 
-    # Iterate samples and build curves
-    for uid in sample_uids:
-        # Select features belonging to this sample
-        try:
-            if "sample_uid" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
-            elif "sample_name" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
-            else:
-                self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
-                continue
-        except Exception as e:
-            self.logger.debug(f"Error filtering features for sample {uid}: {e}")
-            continue
+    # Check sample identifier column
+    if "sample_uid" not in self.features_df.columns:
+        if "sample_name" in self.features_df.columns:
+            sample_id_col = "sample_name"
+        else:
+            self.logger.debug("No sample identifier column in features_df")
+            return
+    else:
+        sample_id_col = "sample_uid"
 
-        if sample_feats.is_empty():
-            continue
+    # OPTIMIZED: Filter once, group once instead of per-sample filtering
+    try:
+        # Filter all data once for selected samples and required conditions
+        all_sample_feats = self.features_df.filter(
+            pl.col(sample_id_col).is_in(sample_uids)
+        )
+
+        if all_sample_feats.is_empty():
+            self.logger.warning("No features found for the selected samples.")
+            return
 
-        # Filter to only use features with filled==False
-        if "filled" in sample_feats.columns:
-            sample_feats = sample_feats.filter(~pl.col("filled"))
-            if sample_feats.is_empty():
-                continue
+        # Filter to only use features with filled==False if column exists
+        if "filled" in all_sample_feats.columns:
+            all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
+            if all_sample_feats.is_empty():
+                self.logger.warning("No non-filled features found for the selected samples.")
+                return
 
-        # Stay in Polars - much faster than pandas conversion!
-        if "rt" not in sample_feats.columns or "rt_original" not in sample_feats.columns:
-            continue
+        # Check required columns
+        if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
+            self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
+            return
 
-        # Filter nulls and add delta column in Polars
-        sample_feats = (
-            sample_feats
+        # Filter nulls, add delta column, and sort - all in one operation
+        all_sample_feats = (
+            all_sample_feats
             .filter(
                 pl.col("rt").is_not_null() &
                 pl.col("rt_original").is_not_null()
@@ -1468,33 +1579,36 @@ def plot_rt_correction(
             .with_columns([
                 (pl.col("rt") - pl.col("rt_original")).alias("delta")
             ])
-            .sort("rt")
+            .sort([sample_id_col, "rt"])
         )
 
-        if sample_feats.is_empty():
-            continue
+        if all_sample_feats.is_empty():
+            self.logger.warning("No valid RT data found for the selected samples.")
+            return
 
-        # Extract arrays directly from Polars
-        rt = sample_feats["rt"].to_numpy()
-        delta = sample_feats["delta"].to_numpy()
+        # Group by sample and process each group (much faster than individual filtering)
+        for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
+            if sample_group.is_empty():
+                continue
 
-        sample_name = str(uid)
-        if samples_info is not None:
-            try:
-                row = samples_info[samples_info["sample_uid"] == uid]
-                if not row.empty:
-                    sample_name = row.iloc[0].get("sample_name", sample_name)
-            except Exception:
-                pass
+            # Extract arrays directly from Polars
+            rt = sample_group["rt"].to_numpy()
+            delta = sample_group["delta"].to_numpy()
 
-        color = color_map.get(uid, "#000000")
+            # Get sample name efficiently from pre-built dictionary
+            sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
+            color = color_map.get(sample_uid, "#000000")
 
-        data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
-        src = ColumnDataSource(data)
+            data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
+            src = ColumnDataSource(data)
 
-        r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
-        p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
-        renderers.append(r_line)
+            r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
+            p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
+            renderers.append(r_line)
+
+    except Exception as e:
+        self.logger.error(f"Error in optimized RT correction plotting: {e}")
+        return
 
     if not renderers:
         self.logger.warning("No RT correction curves to plot for the selected samples.")
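
The plot_rt_correction rewrite replaces N per-sample `filter` passes with one `is_in` filter plus a single `group_by`, so the feature table is scanned once instead of once per sample. A minimal polars sketch of the new shape (toy data; column names follow the hunk):

```python
import polars as pl

# Toy feature table with the columns the hunk relies on.
df = pl.DataFrame({
    "sample_uid": ["s1", "s1", "s2", "s2"],
    "rt": [10.2, 20.5, 10.4, 20.1],
    "rt_original": [10.0, 20.0, 10.5, 20.3],
})
selected = ["s1", "s2"]

# Old shape: for uid in selected: df.filter(pl.col("sample_uid") == uid) -> N scans
# New shape: one filter + one group_by -> a single scan over the data
feats = (
    df.filter(pl.col("sample_uid").is_in(selected))
      .with_columns((pl.col("rt") - pl.col("rt_original")).alias("delta"))
      .sort(["sample_uid", "rt"])
)
for (uid,), group in feats.group_by("sample_uid", maintain_order=True):
    print(uid, group["delta"].to_list())
```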
--- masster-0.5.3/src/masster/study/processing.py
+++ masster-0.5.5/src/masster/study/processing.py
@@ -341,9 +341,6 @@ def _integrate_chrom_impl(self, **kwargs):
     uids = params.get("uids")
     rt_tol = params.get("rt_tol")
 
-    if self.consensus_map is None:
-        self.logger.error("No consensus map found.")
-        return
     if uids is None:
         # get all consensus_id from consensus_df
         ids = self.consensus_df["consensus_uid"].to_list()
--- masster-0.5.3/src/masster/wizard/wizard.py
+++ masster-0.5.5/src/masster/wizard/wizard.py
@@ -455,6 +455,9 @@ class Wizard:
         params_lines.append('    # === Processing Parameters ===')
         params_lines.append(f'    "adducts": {params_dict.get("adducts", [])!r},  # Adduct specifications for feature detection and annotation')
         params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type ("orbitrap", "tof", "unknown")')
+        params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # Noise threshold for feature detection')
+        params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Chromatographic peak full width at half maximum (seconds)')
+        params_lines.append(f'    "chrom_peak_snr": {params_dict.get("chrom_peak_snr", 5.0)},  # Minimum signal-to-noise ratio for chromatographic peaks')
         params_lines.append('')
 
         # Alignment & Merging
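
The Wizard hunks extend a generated-script builder: the wizard writes a standalone processing script line by line, embedding parameter values with `!r`/`repr` so strings stay quoted in the output. A small sketch of that technique (keys mirror the hunk; values are illustrative defaults, not masster's):

```python
# Build a runnable PARAMS block as text, the way the Wizard hunks do.
params_dict = {"detector_type": "orbitrap", "noise": 50.0, "chrom_fwhm": 0.5}

params_lines = ["PARAMS = {"]
params_lines.append(f'    "detector_type": {params_dict.get("detector_type", "unknown")!r},  # MS detector type')
params_lines.append(f'    "noise": {params_dict.get("noise", 50.0)},  # Noise threshold')
params_lines.append(f'    "chrom_fwhm": {params_dict.get("chrom_fwhm", 0.5)},  # Peak FWHM (s)')
params_lines.append("}")

print("\n".join(params_lines))
# PARAMS = {
#     "detector_type": 'orbitrap',  # MS detector type
#     "noise": 50.0,  # Noise threshold
#     "chrom_fwhm": 0.5,  # Peak FWHM (s)
# }
```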
@@ -643,6 +646,7 @@ class Wizard:
             '    # Step 4: Add sample5 files to study',
             '    print("\\nStep 4/7: Adding samples to study...")',
             '    study.add(str(Path(PARAMS[\'folder\']) / "*.sample5"))',
+            '    study.features_filter(study.features_select(chrom_coherence=0.1, chrom_prominence_scaled=1))',
             '    ',
             '    # Step 5: Core processing',
             '    print("\\nStep 5/7: Processing...")',
@@ -651,29 +655,14 @@ class Wizard:
             '        rt_tol=PARAMS[\'rt_tol\']',
             '    )',
             '    ',
-            '    # Merge and create consensus features',
-            '    # Use optimized method for large datasets (>500 samples)',
-            '    num_samples = len(study.samples)',
-            '    if num_samples > 500:',
-            '        print(f"  Large dataset detected ({num_samples} samples), using optimized qt_chunked + hierarchical method")',
-            '        study.merge(',
-            '            method="qt_chunked",',
-            '            dechunking="hierarchical",',
-            '            min_samples=PARAMS[\'min_samples_per_feature\'],',
-            '            threads=PARAMS[\'num_cores\'],',
-            '            rt_tol=PARAMS[\'rt_tol\'],',
-            '            mz_tol=PARAMS[\'mz_tol\']',
-            '        )',
-            '    else:',
-            '        print(f"  Using standard merge method for {num_samples} samples")',
-            '        study.merge(',
-            '            min_samples=PARAMS[\'min_samples_per_feature\'],',
-            '            threads=PARAMS[\'num_cores\'],',
-            '            rt_tol=PARAMS[\'rt_tol\'],',
-            '            mz_tol=PARAMS[\'mz_tol\']',
-            '        )',
+            '    study.merge(',
+            '        method="qt",',
+            '        min_samples=PARAMS[\'min_samples_per_feature\'],',
+            '        threads=PARAMS[\'num_cores\'],',
+            '        rt_tol=PARAMS[\'rt_tol\'],'
+            '    )',
             '    study.find_iso()',
-            '    study.fill(min_samples_rel=0.0)',
+            '    study.fill()',
             '    study.integrate()',
             '    ',
             '    # Step 6/7: Saving results',
@@ -689,8 +678,8 @@ class Wizard:
             '    study.plot_consensus_2d(filename="consensus.png")',
             '    study.plot_alignment(filename="alignment.html")',
             '    study.plot_alignment(filename="alignment.png")',
-            '    study.plot_pca(filename="pca.html")',
-            '    study.plot_pca(filename="pca.png")',
+            '    study.plot_samples_pca(filename="pca.html")',
+            '    study.plot_samples_pca(filename="pca.png")',
             '    study.plot_bpc(filename="bpc.html")',
             '    study.plot_bpc(filename="bpc.png")',
             '    study.plot_rt_correction(filename="rt_correction.html")',
--- masster-0.5.3/uv.lock
+++ masster-0.5.5/uv.lock
@@ -1393,7 +1393,7 @@ wheels = [
 
 [[package]]
 name = "masster"
-version = "0.5.3"
+version = "0.5.5"
 source = { editable = "." }
 dependencies = [
     { name = "alpharaw" },