masster: masster-0.5.1-py3-none-any.whl → masster-0.5.3-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/plot.py CHANGED
@@ -1385,6 +1385,7 @@ def plot_rt_correction(
1385
1385
  """
1386
1386
  Plot RT correction per sample: (rt - rt_original) vs rt overlaid for selected samples.
1387
1387
 
1388
+ Only features with filled==False are used for the RT correction plot.
1388
1389
  This uses the same color mapping as `plot_bpc` so curves for the same samples match.
1389
1390
  """
1390
1391
  from bokeh.plotting import figure, show, output_file
@@ -1447,29 +1448,35 @@ def plot_rt_correction(
1447
1448
  if sample_feats.is_empty():
1448
1449
  continue
1449
1450
 
1450
- # Convert to pandas for easy numeric handling
1451
- try:
1452
- df = sample_feats.to_pandas()
1453
- except Exception:
1454
- continue
1451
+ # Filter to only use features with filled==False
1452
+ if "filled" in sample_feats.columns:
1453
+ sample_feats = sample_feats.filter(~pl.col("filled"))
1454
+ if sample_feats.is_empty():
1455
+ continue
1455
1456
 
1456
- # Need both rt and rt_original
1457
- if "rt" not in df.columns or "rt_original" not in df.columns:
1457
+ # Stay in Polars - much faster than pandas conversion!
1458
+ if "rt" not in sample_feats.columns or "rt_original" not in sample_feats.columns:
1458
1459
  continue
1459
1460
 
1460
- # Drop NA and ensure numeric arrays
1461
- df = df.dropna(subset=["rt", "rt_original"]).copy()
1462
- if df.empty:
1463
- continue
1461
+ # Filter nulls and add delta column in Polars
1462
+ sample_feats = (
1463
+ sample_feats
1464
+ .filter(
1465
+ pl.col("rt").is_not_null() &
1466
+ pl.col("rt_original").is_not_null()
1467
+ )
1468
+ .with_columns([
1469
+ (pl.col("rt") - pl.col("rt_original")).alias("delta")
1470
+ ])
1471
+ .sort("rt")
1472
+ )
1464
1473
 
1465
- rt = _np.asarray(df["rt"], dtype=float)
1466
- rt_orig = _np.asarray(df["rt_original"], dtype=float)
1467
- delta = rt - rt_orig
1474
+ if sample_feats.is_empty():
1475
+ continue
1468
1476
 
1469
- # sort by rt
1470
- idx = _np.argsort(rt)
1471
- rt = rt[idx]
1472
- delta = delta[idx]
1477
+ # Extract arrays directly from Polars
1478
+ rt = sample_feats["rt"].to_numpy()
1479
+ delta = sample_feats["delta"].to_numpy()
1473
1480
 
1474
1481
  sample_name = str(uid)
1475
1482
  if samples_info is not None:
@@ -1759,21 +1766,26 @@ def plot_consensus_stats(
1759
1766
  import polars as pl
1760
1767
  import numpy as np
1761
1768
 
1762
- # Check if consensus_df exists and has data
1763
- if self.consensus_df is None or self.consensus_df.is_empty():
1764
- self.logger.error("No consensus data available. Run merge/find_consensus first.")
1769
+ # Get the consensus statistics data using the new helper method
1770
+ data_df = self.get_consensus_stats()
1771
+
1772
+ if data_df is None or data_df.is_empty():
1773
+ self.logger.error("No consensus statistics data available.")
1765
1774
  return
1766
1775
 
1767
- # Get all columns and their data types - work with original dataframe
1768
- data_df = self.consensus_df.clone()
1776
+ # Remove consensus_uid column for plotting (keep only numeric columns)
1777
+ if "consensus_uid" in data_df.columns:
1778
+ data_df_clean = data_df.drop("consensus_uid")
1779
+ else:
1780
+ data_df_clean = data_df
1769
1781
 
1770
- # Define specific columns to plot in the exact order requested
1782
+ # Define specific columns to plot in the exact order requested (excluding consensus_uid)
1771
1783
  desired_columns = [
1772
1784
  "rt",
1773
1785
  "rt_delta_mean",
1774
1786
  "mz",
1775
- "mz_range", # mz_max-mz_min (will be calculated)
1776
- "log10_inty_mean", # log10(inty_mean) (will be calculated)
1787
+ "mz_range", # mz_max-mz_min
1788
+ "log10_inty_mean", # log10(inty_mean)
1777
1789
  "number_samples",
1778
1790
  "number_ms2",
1779
1791
  "charge_mean",
@@ -1783,20 +1795,13 @@ def plot_consensus_stats(
1783
1795
  "chrom_prominence_scaled_mean"
1784
1796
  ]
1785
1797
 
1786
- # Calculate derived columns if they don't exist
1787
- if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
1788
- data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
1789
-
1790
- if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
1791
- data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
1792
-
1793
1798
  # Filter to only include columns that exist in the dataframe, preserving order
1794
- numeric_columns = [col for col in desired_columns if col in data_df.columns]
1799
+ numeric_columns = [col for col in desired_columns if col in data_df_clean.columns]
1795
1800
 
1796
1801
  # Check if the numeric columns are actually numeric
1797
1802
  final_numeric_columns = []
1798
1803
  for col in numeric_columns:
1799
- dtype = data_df[col].dtype
1804
+ dtype = data_df_clean[col].dtype
1800
1805
  if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
1801
1806
  pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
1802
1807
  pl.Float32, pl.Float64]:
@@ -1805,13 +1810,13 @@ def plot_consensus_stats(
1805
1810
  numeric_columns = final_numeric_columns
1806
1811
 
1807
1812
  if len(numeric_columns) == 0:
1808
- self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df.columns)}")
1813
+ self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df_clean.columns)}")
1809
1814
  return
1810
1815
 
1811
1816
  self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
1812
1817
 
1813
- # Work directly with Polars - no conversion to pandas needed
1814
- data_df_clean = data_df.select(numeric_columns)
1818
+ # Select only the numeric columns for plotting
1819
+ data_df_clean = data_df_clean.select(numeric_columns)
1815
1820
 
1816
1821
  # Check if all numeric columns are empty
1817
1822
  all_columns_empty = True