masster 0.5.1__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/plot.py CHANGED
@@ -603,7 +603,7 @@ def plot_consensus_2d(
603
603
  pl.when(
604
604
  (pl.col(sizeby).is_not_null()) & (pl.col(sizeby).is_finite()) & (pl.col(sizeby) > 0),
605
605
  )
606
- .then((pl.col(sizeby).log10() * markersize / 12).pow(2))
606
+ .then((pl.col(sizeby).log10() * markersize / 12).pow(1.5))
607
607
  .otherwise(markersize)
608
608
  .alias("markersize"),
609
609
  ])
@@ -1385,6 +1385,7 @@ def plot_rt_correction(
1385
1385
  """
1386
1386
  Plot RT correction per sample: (rt - rt_original) vs rt overlaid for selected samples.
1387
1387
 
1388
+ Only features with filled==False are used for the RT correction plot.
1388
1389
  This uses the same color mapping as `plot_bpc` so curves for the same samples match.
1389
1390
  """
1390
1391
  from bokeh.plotting import figure, show, output_file
@@ -1420,74 +1421,97 @@ def plot_rt_correction(
1420
1421
  p.xaxis.axis_label = f"Retention Time ({rt_unit})"
1421
1422
  p.yaxis.axis_label = "RT - RT_original (s)"
1422
1423
 
1423
- samples_info = None
1424
+ # Create sample name lookup dictionary from samples_df (all in Polars)
1425
+ sample_names_dict = {}
1424
1426
  if hasattr(self, "samples_df") and self.samples_df is not None:
1425
1427
  try:
1426
- samples_info = self.samples_df.to_pandas()
1428
+ sample_name_mapping = (
1429
+ self.samples_df
1430
+ .filter(pl.col("sample_uid").is_in(sample_uids))
1431
+ .select(["sample_uid", "sample_name"])
1432
+ )
1433
+ sample_names_dict = dict(zip(
1434
+ sample_name_mapping["sample_uid"].to_list(),
1435
+ sample_name_mapping["sample_name"].to_list()
1436
+ ))
1427
1437
  except Exception:
1428
- samples_info = None
1438
+ pass
1429
1439
 
1430
1440
  renderers = []
1431
1441
 
1432
- # Iterate samples and build curves
1433
- for uid in sample_uids:
1434
- # Select features belonging to this sample
1435
- try:
1436
- if "sample_uid" in self.features_df.columns:
1437
- sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
1438
- elif "sample_name" in self.features_df.columns:
1439
- sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
1440
- else:
1441
- self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
1442
- continue
1443
- except Exception as e:
1444
- self.logger.debug(f"Error filtering features for sample {uid}: {e}")
1445
- continue
1442
+ # Check sample identifier column
1443
+ if "sample_uid" not in self.features_df.columns:
1444
+ if "sample_name" in self.features_df.columns:
1445
+ sample_id_col = "sample_name"
1446
+ else:
1447
+ self.logger.debug("No sample identifier column in features_df")
1448
+ return
1449
+ else:
1450
+ sample_id_col = "sample_uid"
1446
1451
 
1447
- if sample_feats.is_empty():
1448
- continue
1452
+ # OPTIMIZED: Filter once, group once instead of per-sample filtering
1453
+ try:
1454
+ # Filter all data once for selected samples and required conditions
1455
+ all_sample_feats = self.features_df.filter(
1456
+ pl.col(sample_id_col).is_in(sample_uids)
1457
+ )
1458
+
1459
+ if all_sample_feats.is_empty():
1460
+ self.logger.warning("No features found for the selected samples.")
1461
+ return
1449
1462
 
1450
- # Convert to pandas for easy numeric handling
1451
- try:
1452
- df = sample_feats.to_pandas()
1453
- except Exception:
1454
- continue
1463
+ # Filter to only use features with filled==False if column exists
1464
+ if "filled" in all_sample_feats.columns:
1465
+ all_sample_feats = all_sample_feats.filter(~pl.col("filled"))
1466
+ if all_sample_feats.is_empty():
1467
+ self.logger.warning("No non-filled features found for the selected samples.")
1468
+ return
1455
1469
 
1456
- # Need both rt and rt_original
1457
- if "rt" not in df.columns or "rt_original" not in df.columns:
1458
- continue
1470
+ # Check required columns
1471
+ if "rt" not in all_sample_feats.columns or "rt_original" not in all_sample_feats.columns:
1472
+ self.logger.error("Required columns 'rt' or 'rt_original' not found in features_df.")
1473
+ return
1459
1474
 
1460
- # Drop NA and ensure numeric arrays
1461
- df = df.dropna(subset=["rt", "rt_original"]).copy()
1462
- if df.empty:
1463
- continue
1475
+ # Filter nulls, add delta column, and sort - all in one operation
1476
+ all_sample_feats = (
1477
+ all_sample_feats
1478
+ .filter(
1479
+ pl.col("rt").is_not_null() &
1480
+ pl.col("rt_original").is_not_null()
1481
+ )
1482
+ .with_columns([
1483
+ (pl.col("rt") - pl.col("rt_original")).alias("delta")
1484
+ ])
1485
+ .sort([sample_id_col, "rt"])
1486
+ )
1464
1487
 
1465
- rt = _np.asarray(df["rt"], dtype=float)
1466
- rt_orig = _np.asarray(df["rt_original"], dtype=float)
1467
- delta = rt - rt_orig
1488
+ if all_sample_feats.is_empty():
1489
+ self.logger.warning("No valid RT data found for the selected samples.")
1490
+ return
1468
1491
 
1469
- # sort by rt
1470
- idx = _np.argsort(rt)
1471
- rt = rt[idx]
1472
- delta = delta[idx]
1492
+ # Group by sample and process each group (much faster than individual filtering)
1493
+ for (sample_uid,), sample_group in all_sample_feats.group_by(sample_id_col):
1494
+ if sample_group.is_empty():
1495
+ continue
1473
1496
 
1474
- sample_name = str(uid)
1475
- if samples_info is not None:
1476
- try:
1477
- row = samples_info[samples_info["sample_uid"] == uid]
1478
- if not row.empty:
1479
- sample_name = row.iloc[0].get("sample_name", sample_name)
1480
- except Exception:
1481
- pass
1497
+ # Extract arrays directly from Polars
1498
+ rt = sample_group["rt"].to_numpy()
1499
+ delta = sample_group["delta"].to_numpy()
1482
1500
 
1483
- color = color_map.get(uid, "#000000")
1501
+ # Get sample name efficiently from pre-built dictionary
1502
+ sample_name = sample_names_dict.get(sample_uid, str(sample_uid))
1503
+ color = color_map.get(sample_uid, "#000000")
1484
1504
 
1485
- data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
1486
- src = ColumnDataSource(data)
1505
+ data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt), "sample_color": [color] * len(rt)}
1506
+ src = ColumnDataSource(data)
1487
1507
 
1488
- r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
1489
- p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
1490
- renderers.append(r_line)
1508
+ r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
1509
+ p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
1510
+ renderers.append(r_line)
1511
+
1512
+ except Exception as e:
1513
+ self.logger.error(f"Error in optimized RT correction plotting: {e}")
1514
+ return
1491
1515
 
1492
1516
  if not renderers:
1493
1517
  self.logger.warning("No RT correction curves to plot for the selected samples.")
@@ -1759,21 +1783,26 @@ def plot_consensus_stats(
1759
1783
  import polars as pl
1760
1784
  import numpy as np
1761
1785
 
1762
- # Check if consensus_df exists and has data
1763
- if self.consensus_df is None or self.consensus_df.is_empty():
1764
- self.logger.error("No consensus data available. Run merge/find_consensus first.")
1786
+ # Get the consensus statistics data using the new helper method
1787
+ data_df = self.get_consensus_stats()
1788
+
1789
+ if data_df is None or data_df.is_empty():
1790
+ self.logger.error("No consensus statistics data available.")
1765
1791
  return
1766
1792
 
1767
- # Get all columns and their data types - work with original dataframe
1768
- data_df = self.consensus_df.clone()
1793
+ # Remove consensus_uid column for plotting (keep only numeric columns)
1794
+ if "consensus_uid" in data_df.columns:
1795
+ data_df_clean = data_df.drop("consensus_uid")
1796
+ else:
1797
+ data_df_clean = data_df
1769
1798
 
1770
- # Define specific columns to plot in the exact order requested
1799
+ # Define specific columns to plot in the exact order requested (excluding consensus_uid)
1771
1800
  desired_columns = [
1772
1801
  "rt",
1773
1802
  "rt_delta_mean",
1774
1803
  "mz",
1775
- "mz_range", # mz_max-mz_min (will be calculated)
1776
- "log10_inty_mean", # log10(inty_mean) (will be calculated)
1804
+ "mz_range", # mz_max-mz_min
1805
+ "log10_inty_mean", # log10(inty_mean)
1777
1806
  "number_samples",
1778
1807
  "number_ms2",
1779
1808
  "charge_mean",
@@ -1783,20 +1812,13 @@ def plot_consensus_stats(
1783
1812
  "chrom_prominence_scaled_mean"
1784
1813
  ]
1785
1814
 
1786
- # Calculate derived columns if they don't exist
1787
- if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
1788
- data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
1789
-
1790
- if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
1791
- data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
1792
-
1793
1815
  # Filter to only include columns that exist in the dataframe, preserving order
1794
- numeric_columns = [col for col in desired_columns if col in data_df.columns]
1816
+ numeric_columns = [col for col in desired_columns if col in data_df_clean.columns]
1795
1817
 
1796
1818
  # Check if the numeric columns are actually numeric
1797
1819
  final_numeric_columns = []
1798
1820
  for col in numeric_columns:
1799
- dtype = data_df[col].dtype
1821
+ dtype = data_df_clean[col].dtype
1800
1822
  if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
1801
1823
  pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
1802
1824
  pl.Float32, pl.Float64]:
@@ -1805,13 +1827,13 @@ def plot_consensus_stats(
1805
1827
  numeric_columns = final_numeric_columns
1806
1828
 
1807
1829
  if len(numeric_columns) == 0:
1808
- self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df.columns)}")
1830
+ self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df_clean.columns)}")
1809
1831
  return
1810
1832
 
1811
1833
  self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
1812
1834
 
1813
- # Work directly with Polars - no conversion to pandas needed
1814
- data_df_clean = data_df.select(numeric_columns)
1835
+ # Select only the numeric columns for plotting
1836
+ data_df_clean = data_df_clean.select(numeric_columns)
1815
1837
 
1816
1838
  # Check if all numeric columns are empty
1817
1839
  all_columns_empty = True