PyPI - masster - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (25)

masster/_version.py +1 -1
masster/sample/adducts.py +1 -1
masster/sample/h5.py +11 -11
masster/sample/helpers.py +2 -2
masster/sample/load.py +10 -8
masster/sample/processing.py +1 -1
masster/sample/sample.py +7 -3
masster/study/defaults/align_def.py +0 -204
masster/study/defaults/fill_def.py +9 -1
masster/study/defaults/merge_def.py +20 -69
masster/study/export.py +25 -5
masster/study/h5.py +230 -42
masster/study/helpers.py +430 -53
masster/study/load.py +986 -158
masster/study/merge.py +683 -1076
masster/study/plot.py +95 -73
masster/study/processing.py +337 -280
masster/study/study.py +58 -135
masster/wizard/wizard.py +20 -6
{masster-0.5.1.dist-info → masster-0.5.4.dist-info}/METADATA +1 -1
{masster-0.5.1.dist-info → masster-0.5.4.dist-info}/RECORD +24 -25
masster/study/defaults/fill_chrom_def.py +0 -260
{masster-0.5.1.dist-info → masster-0.5.4.dist-info}/WHEEL +0 -0
{masster-0.5.1.dist-info → masster-0.5.4.dist-info}/entry_points.txt +0 -0
{masster-0.5.1.dist-info → masster-0.5.4.dist-info}/licenses/LICENSE +0 -0

masster/study/plot.py CHANGED

@@ -603,7 +603,7 @@ def plot_consensus_2d(
                 pl.when(
                     (pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
                 )
-                .then((pl.col(sizeby).log10() * markersize / 12).pow(2))
+                .then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
                 .otherwise(markersize)
                 .alias("markersize"),
             ])
@@ -1385,6 +1385,7 @@ def plot_rt_correction(
     """
     Plot RT correction per sample: (rt - rt_original) vs rt overlaid for selected samples.
+    Only features with filled==False are used for the RT correction plot.
     This uses the same color mapping as `plot_bpc` so curves for the same samples match.
     """
     from bokeh.plotting import figure, show, output_file
@@ -1420,74 +1421,97 @@ def plot_rt_correction(
     p.xaxis.axis_label = f"Retention Time ({rt_unit})"
     p.yaxis.axis_label = "RT - RT_original (s)"
-    samples_info = None
+    # Create sample name lookup dictionary from samples_df (all in Polars)
+    sample_names_dict = {}
     if hasattr(self, "samples_df") and self.samples_df is not None:
         try:
-            samples_info = self.samples_df.to_pandas()
+            sample_name_mapping = (
+                self.samples_df
+                .filter(pl.col("sample_uid").is_in(sample_uids))
+                .select(["sample_uid", "sample_name"])
+            )
+            sample_names_dict = dict(zip(
+                sample_name_mapping["sample_uid"].to_list(),
+                sample_name_mapping["sample_name"].to_list()
+            ))
         except Exception:
-            samples_info = None
+            pass
     renderers = []
-    # Iterate samples and build curves
-    for uid in sample_uids:
-        # Select features belonging to this sample
-        try:
-            if "sample_uid" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
-            elif "sample_name" in self.features_df.columns:
-                sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
-            else:
-                self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
-                continue
-        except Exception as e:
-            self.logger.debug(f"Error filtering features for sample {uid}: {e}")
-            continue
+    # Check sample identifier column
+    if "sample_uid" not in self.features_df.columns:
+        if "sample_name" in self.features_df.columns:
+            sample_id_col = "sample_name"
+        else:
+            self.logger.debug("No sample identifier column in features_df")
+            return
+    else:
+        sample_id_col = "sample_uid"
-        if sample_feats.is_empty():
-            continue
+    # OPTIMIZED: Filter once, group once instead of per-sample filtering
+    try:
+        # Filter all data once for selected samples and required conditions
+        all_sample_feats = self.features_df.filter(
+            pl.col(sample_id_col).is_in(sample_uids)
+        )
+        if all_sample_feats.is_empty():
+            self.logger.warning("No features found for the selected samples.")
+            return
-        # Convert to pandas for easy numeric handling
-        try:
-            df = sample_feats.to_pandas()
-        except Exception:
-            continue
+        # Filter to only use features with filled==False if column exists
+        if "filled" in all_sample_feats.columns:
+            all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
+            if all_sample_feats.is_empty():
+                self.logger.warning("No non-filled features found for the selected samples.")
+                return
-        # Need both rt and rt_original
-        if "rt" not in df.columns or "rt_original" not in df.columns:
-            continue
+        # Check required columns
+        if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
+            self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
+            return
-        # Drop NA and ensure numeric arrays
-        df = df.dropna(subset=["rt", "rt_original"]).copy()
-        if df.empty:
-            continue
+        # Filter nulls, add delta column, and sort - all in one operation
+        all_sample_feats = (
+            all_sample_feats
+            .filter(
+                pl.col("rt").is_not_null() &
+                pl.col("rt_original").is_not_null()
+            )
+            .with_columns([
+                (pl.col("rt") - pl.col("rt_original")).alias("delta")
+            ])
+            .sort([sample_id_col, "rt"])
+        )
-        rt = _np.asarray(df["rt"], dtype=float)
-        rt_orig = _np.asarray(df["rt_original"], dtype=float)
-        delta = rt - rt_orig
+        if all_sample_feats.is_empty():
+            self.logger.warning("No valid RT data found for the selected samples.")
+            return
-        # sort by rt
-        idx = _np.argsort(rt)
-        rt = rt[idx]
-        delta = delta[idx]
+        # Group by sample and process each group (much faster than individual filtering)
+        for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
+            if sample_group.is_empty():
+                continue
-        sample_name = str(uid)
-        if samples_info is not None:
-            try:
-                row = samples_info[samples_info["sample_uid"] == uid]
-                if not row.empty:
-                    sample_name = row.iloc[0].get("sample_name", sample_name)
-            except Exception:
-                pass
+            # Extract arrays directly from Polars
+            rt = sample_group["rt"].to_numpy()
+            delta = sample_group["delta"].to_numpy()
-        color = color_map.get(uid, "#000000")
+            # Get sample name efficiently from pre-built dictionary
+            sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
+            color = color_map.get(sample_uid, "#000000")
-        data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
-        src = ColumnDataSource(data)
+            data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
+            src = ColumnDataSource(data)
-        r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
-        p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
-        renderers.append(r_line)
+            r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
+            p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
+            renderers.append(r_line)
+    except Exception as e:
+        self.logger.error(f"Error in optimized RT correction plotting: {e}")
+        return
     if not renderers:
         self.logger.warning("No RT correction curves to plot for the selected samples.")
@@ -1759,21 +1783,26 @@ def plot_consensus_stats(
     import polars as pl
     import numpy as np
-    # Check if consensus_df exists and has data
-    if self.consensus_df is None or self.consensus_df.is_empty():
-        self.logger.error("No consensus data available. Run merge/find_consensus first.")
+    # Get the consensus statistics data using the new helper method
+    data_df = self.get_consensus_stats()
+    if data_df is None or data_df.is_empty():
+        self.logger.error("No consensus statistics data available.")
         return
-    # Get all columns and their data types - work with original dataframe
-    data_df = self.consensus_df.clone()
+    # Remove consensus_uid column for plotting (keep only numeric columns)
+    if "consensus_uid" in data_df.columns:
+        data_df_clean = data_df.drop("consensus_uid")
+    else:
+        data_df_clean = data_df
-    # Define specific columns to plot in the exact order requested
+    # Define specific columns to plot in the exact order requested (excluding consensus_uid)
     desired_columns = [
         "rt",
         "rt_delta_mean",
         "mz",
-        "mz_range",  # mz_max-mz_min (will be calculated)
-        "log10_inty_mean",  # log10(inty_mean) (will be calculated)
+        "mz_range",  # mz_max-mz_min
+        "log10_inty_mean",  # log10(inty_mean)
         "number_samples",
         "number_ms2",
         "charge_mean",
@@ -1783,20 +1812,13 @@ def plot_consensus_stats(
         "chrom_prominence_scaled_mean"
     ]
-    # Calculate derived columns if they don't exist
-    if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
-        data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
-    if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
-        data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
     # Filter to only include columns that exist in the dataframe, preserving order
-    numeric_columns = [col for col in desired_columns if col in data_df.columns]
+    numeric_columns = [col for col in desired_columns if col in data_df_clean.columns]
     # Check if the numeric columns are actually numeric
     final_numeric_columns = []
     for col in numeric_columns:
-        dtype = data_df[col].dtype
+        dtype = data_df_clean[col].dtype
         if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                     pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
                     pl.Float32, pl.Float64]:
@@ -1805,13 +1827,13 @@ def plot_consensus_stats(
     numeric_columns = final_numeric_columns
     if len(numeric_columns) == 0:
-        self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df.columns)}")
+        self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df_clean.columns)}")
         return
     self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
-    # Work directly with Polars - no conversion to pandas needed
-    data_df_clean = data_df.select(numeric_columns)
+    # Select only the numeric columns for plotting
+    data_df_clean = data_df_clean.select(numeric_columns)
     # Check if all numeric columns are empty
     all_columns_empty = True

masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

Potentially problematic release.

masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl