masster 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/plot.py CHANGED
@@ -7,148 +7,225 @@ import holoviews as hv
7
7
  import numpy as np
8
8
  import panel
9
9
  import polars as pl
10
-
11
- from bokeh.io.export import export_png
12
- from bokeh.models import ColumnDataSource
13
- from bokeh.models import HoverTool
14
- from bokeh.palettes import Turbo256
15
- from bokeh.plotting import figure
16
- from bokeh.plotting import output_file
17
- from bokeh.plotting import show
18
10
  from tqdm import tqdm
19
11
 
20
12
  hv.extension("bokeh")
21
13
 
22
14
 
23
- def plot_alignment(self, filename=None):
24
- import matplotlib.pyplot as plt
25
- import numpy as np
15
+ # Replace any unaliased import that could be shadowed:
16
+ # from bokeh.layouts import row
17
+ from bokeh.layouts import row as bokeh_row
26
18
 
27
- if self.features_maps is None or len(self.features_maps) == 0:
28
- self.load_features()
29
19
 
30
- feature_maps = self.features_maps
31
- ref_index = self.alignment_ref_index
32
- if ref_index is None:
33
- self.logger.error("No alignment performed yet.")
34
- return
20
+ def plot_alignment(self, maps: bool = True, filename: str | None = None, width: int = 450, height: int = 450, markersize: int = 3):
21
+ """Visualize retention time alignment using two synchronized Bokeh scatter plots.
35
22
 
36
- fmaps = [
37
- feature_maps[ref_index],
38
- *feature_maps[:ref_index],
39
- *feature_maps[ref_index + 1 :],
40
- ]
23
+ - When ``maps=True`` the function reads ``self.features_maps`` (list of FeatureMap)
24
+ and builds two side-by-side plots: Original RT (left) and Current/Aligned RT (right).
25
+ - When ``maps=False`` the function uses ``self.features_df`` and expects an
26
+ ``rt_original`` column (before) and ``rt`` column (after).
41
27
 
42
- fig = plt.figure(figsize=(12, 6))
28
+ Parameters
29
+ - maps: whether to use feature maps (default True).
30
+ - filename: optional HTML file path to save the plot.
31
+ - width/height: pixel size of each subplot.
32
+ - markersize: base marker size.
43
33
 
44
- ax = fig.add_subplot(1, 2, 1)
45
- ax.set_title("Feature maps before alignment")
46
- ax.set_ylabel("m/z")
47
- ax.set_xlabel("RT")
34
+ Returns
35
+ - Bokeh layout (row) containing the two synchronized plots.
36
+ """
37
+ # Local imports so the module can be used even if bokeh isn't needed elsewhere
38
+ from bokeh.models import ColumnDataSource, HoverTool
39
+ from bokeh.plotting import figure, show, output_file
40
+ from bokeh.palettes import Turbo256
41
+ import pandas as pd
42
+
43
+ # Build the before/after tabular data used for plotting
44
+ before_data: list[dict[str, Any]] = []
45
+ after_data: list[dict[str, Any]] = []
46
+
47
+ if maps:
48
+ # Ensure feature maps are loaded
49
+ if self.features_maps is None or len(self.features_maps) == 0:
50
+ self.load_features()
51
+
52
+ fmaps = self.features_maps or []
53
+
54
+ if not fmaps:
55
+ self.logger.error("No feature maps available for plotting.")
56
+ return
57
+
58
+ # Reference (first) sample: use current RT for both before and after
59
+ ref = fmaps[0]
60
+ ref_rt = [f.getRT() for f in ref]
61
+ ref_mz = [f.getMZ() for f in ref]
62
+ ref_inty = [f.getIntensity() for f in ref]
63
+ max_ref_inty = max(ref_inty) if ref_inty else 1
64
+
65
+ # sample metadata
66
+ if hasattr(self, 'samples_df') and self.samples_df is not None and not self.samples_df.is_empty():
67
+ samples_info = self.samples_df.to_pandas()
68
+ ref_sample_uid = samples_info.iloc[0]['sample_uid'] if 'sample_uid' in samples_info.columns else 'Reference_UID'
69
+ ref_sample_name = samples_info.iloc[0]['sample_name'] if 'sample_name' in samples_info.columns else 'Reference'
70
+ else:
71
+ ref_sample_uid = 'Reference_UID'
72
+ ref_sample_name = 'Reference'
73
+
74
+ for rt, mz, inty in zip(ref_rt, ref_mz, ref_inty):
75
+ before_data.append({'rt': rt, 'mz': mz, 'inty': inty, 'alpha': inty / max_ref_inty, 'sample_idx': 0, 'sample_name': ref_sample_name, 'sample_uid': ref_sample_uid, 'size': markersize + 2})
76
+ after_data.append({'rt': rt, 'mz': mz, 'inty': inty, 'alpha': inty / max_ref_inty, 'sample_idx': 0, 'sample_name': ref_sample_name, 'sample_uid': ref_sample_uid, 'size': markersize + 2})
77
+
78
+ # Remaining samples
79
+ for sample_idx, fm in enumerate(fmaps[1:], start=1):
80
+ mz_vals = []
81
+ inty_vals = []
82
+ original_rt = []
83
+ aligned_rt = []
84
+
85
+ for f in fm:
86
+ try:
87
+ orig = f.getMetaValue('original_RT')
88
+ except Exception:
89
+ orig = None
90
+
91
+ if orig is None:
92
+ original_rt.append(f.getRT())
93
+ else:
94
+ original_rt.append(orig)
95
+
96
+ aligned_rt.append(f.getRT())
97
+ mz_vals.append(f.getMZ())
98
+ inty_vals.append(f.getIntensity())
99
+
100
+ if not inty_vals:
101
+ continue
102
+
103
+ max_inty = max(inty_vals)
104
+
105
+ if hasattr(self, 'samples_df') and self.samples_df is not None and not self.samples_df.is_empty():
106
+ samples_info = self.samples_df.to_pandas()
107
+ if sample_idx < len(samples_info):
108
+ sample_name = samples_info.iloc[sample_idx].get('sample_name', f'Sample {sample_idx}')
109
+ sample_uid = samples_info.iloc[sample_idx].get('sample_uid', f'Sample_{sample_idx}_UID')
110
+ else:
111
+ sample_name = f'Sample {sample_idx}'
112
+ sample_uid = f'Sample_{sample_idx}_UID'
113
+ else:
114
+ sample_name = f'Sample {sample_idx}'
115
+ sample_uid = f'Sample_{sample_idx}_UID'
48
116
 
49
- # use alpha value to display feature intensity
50
- ax.scatter(
51
- [f.getRT() for f in fmaps[0]],
52
- [f.getMZ() for f in fmaps[0]],
53
- alpha=np.asarray([f.getIntensity() for f in fmaps[0]]) / max([f.getIntensity() for f in fmaps[0]]),
54
- s=4,
55
- )
117
+ for rt, mz, inty in zip(original_rt, mz_vals, inty_vals):
118
+ before_data.append({'rt': rt, 'mz': mz, 'inty': inty, 'alpha': inty / max_inty, 'sample_idx': sample_idx, 'sample_name': sample_name, 'sample_uid': sample_uid, 'size': markersize})
56
119
 
57
- for fm in fmaps[1:]:
58
- ax.scatter(
59
- [f.getMetaValue("original_RT") for f in fm],
60
- [f.getMZ() for f in fm],
61
- alpha=np.asarray([f.getIntensity() for f in fm]) / max([f.getIntensity() for f in fm]),
62
- s=2, # Set symbol size to 3
63
- )
120
+ for rt, mz, inty in zip(aligned_rt, mz_vals, inty_vals):
121
+ after_data.append({'rt': rt, 'mz': mz, 'inty': inty, 'alpha': inty / max_inty, 'sample_idx': sample_idx, 'sample_name': sample_name, 'sample_uid': sample_uid, 'size': markersize})
64
122
 
65
- ax = fig.add_subplot(1, 2, 2)
66
- ax.set_title("Feature maps after alignment")
67
- ax.set_ylabel("m/z")
68
- ax.set_xlabel("RT")
69
-
70
- for fm in fmaps:
71
- ax.scatter(
72
- [f.getRT() for f in fm],
73
- [f.getMZ() for f in fm],
74
- alpha=np.asarray([f.getIntensity() for f in fm]) / max([f.getIntensity() for f in fm]),
75
- s=2, # Set symbol size to 3
76
- )
123
+ else:
124
+ # Use features_df
125
+ if self.features_df is None or self.features_df.is_empty():
126
+ self.logger.error("No features_df found. Load features first.")
127
+ return
77
128
 
78
- fig.tight_layout()
129
+ required_cols = ['rt', 'mz', 'inty']
130
+ missing = [c for c in required_cols if c not in self.features_df.columns]
131
+ if missing:
132
+ self.logger.error(f"Missing required columns in features_df: {missing}")
133
+ return
79
134
 
135
+ if 'rt_original' not in self.features_df.columns:
136
+ self.logger.error("Column 'rt_original' not found in features_df. Alignment may not have been performed.")
137
+ return
80
138
 
81
- def plot_alignment_bokeh(self, filename=None):
82
- from bokeh.plotting import figure, show, output_file
83
- from bokeh.layouts import gridplot
139
+ features_pd = self.features_df.to_pandas()
84
140
 
85
- feature_maps = self.features_maps
86
- ref_index = self.alignment_ref_index
87
- if ref_index is None:
88
- self.logger.warning("No alignment performed yet.")
89
- return
141
+ sample_col = 'sample_uid' if 'sample_uid' in features_pd.columns else 'sample_name'
142
+ if sample_col not in features_pd.columns:
143
+ self.logger.error("No sample identifier column found in features_df.")
144
+ return
90
145
 
91
- fmaps = [
92
- feature_maps[ref_index],
93
- *feature_maps[:ref_index],
94
- *feature_maps[ref_index + 1 :],
95
- ]
146
+ samples = features_pd[sample_col].unique()
96
147
 
97
- # Create Bokeh figures
98
- p1 = figure(
99
- title="Feature maps before alignment",
100
- width=600,
101
- height=400,
102
- )
103
- p1.xaxis.axis_label = "RT"
104
- p1.yaxis.axis_label = "m/z"
105
- p2 = figure(
106
- title="Feature maps after alignment",
107
- width=600,
108
- height=400,
109
- )
110
- p2.xaxis.axis_label = "RT"
111
- p2.yaxis.axis_label = "m/z"
112
-
113
- # Plot before alignment
114
- p1.scatter(
115
- x=[f.getRT() for f in fmaps[0]],
116
- y=[f.getMZ() for f in fmaps[0]],
117
- size=4,
118
- alpha=[f.getIntensity() / max([f.getIntensity() for f in fmaps[0]]) for f in fmaps[0]],
119
- color="blue",
120
- )
148
+ for sample_idx, sample in enumerate(samples):
149
+ sample_data = features_pd[features_pd[sample_col] == sample]
150
+ max_inty = sample_data['inty'].max() if sample_data['inty'].max() > 0 else 1
151
+ sample_name = str(sample)
152
+ sample_uid = sample if sample_col == 'sample_uid' else (sample_data['sample_uid'].iloc[0] if 'sample_uid' in sample_data.columns else sample)
121
153
 
122
- for fm in fmaps[1:]:
123
- p1.scatter(
124
- x=[f.getMetaValue("original_RT") for f in fm],
125
- y=[f.getMZ() for f in fm],
126
- size=2,
127
- alpha=[f.getIntensity() / max([f.getIntensity() for f in fm]) for f in fm],
128
- color="green",
129
- )
154
+ for _, row in sample_data.iterrows():
155
+ before_data.append({'rt': row['rt_original'], 'mz': row['mz'], 'inty': row['inty'], 'alpha': row['inty'] / max_inty, 'sample_idx': sample_idx, 'sample_name': sample_name, 'sample_uid': sample_uid, 'size': markersize + 2 if sample_idx == 0 else markersize})
156
+ after_data.append({'rt': row['rt'], 'mz': row['mz'], 'inty': row['inty'], 'alpha': row['inty'] / max_inty, 'sample_idx': sample_idx, 'sample_name': sample_name, 'sample_uid': sample_uid, 'size': markersize + 2 if sample_idx == 0 else markersize})
130
157
 
131
- # Plot after alignment
132
- for fm in fmaps:
133
- p2.scatter(
134
- x=[f.getRT() for f in fm],
135
- y=[f.getMZ() for f in fm],
136
- size=2,
137
- alpha=[f.getIntensity() / max([f.getIntensity() for f in fm]) for f in fm],
138
- color="red",
139
- )
158
+ # Ensure dataframes exist even if empty
159
+ before_df = pd.DataFrame(before_data)
160
+ after_df = pd.DataFrame(after_data)
140
161
 
141
- # Arrange plots in a grid
142
- # Link the x_range and y_range of both plots for synchronized zooming/panning
143
- p2.x_range = p1.x_range
144
- p2.y_range = p1.y_range
162
+ # Create ColumnDataSources (safe even for empty dfs)
163
+ from bokeh.models import ColumnDataSource
145
164
 
146
- grid = gridplot([[p1, p2]])
165
+ before_source = ColumnDataSource(before_df)
166
+ after_source = ColumnDataSource(after_df)
147
167
 
148
- # Output to file and show
168
+ # Create Bokeh figures
169
+ p1 = figure(width=width, height=height, title='Original RT', x_axis_label='Retention Time (s)', y_axis_label='m/z', tools='pan,wheel_zoom,box_zoom,reset,save')
170
+ p1.outline_line_color = None
171
+ p1.background_fill_color = 'white'
172
+ p1.border_fill_color = 'white'
173
+ p1.min_border = 0
174
+
175
+ p2 = figure(width=width, height=height, title='Current RT', x_axis_label='Retention Time (s)', y_axis_label='m/z', tools='pan,wheel_zoom,box_zoom,reset,save', x_range=p1.x_range, y_range=p1.y_range)
176
+ p2.outline_line_color = None
177
+ p2.background_fill_color = 'white'
178
+ p2.border_fill_color = 'white'
179
+ p2.min_border = 0
180
+
181
+ # Color mapping using Turbo256
182
+ unique_samples = sorted(list(set(before_df['sample_idx'].tolist()))) if not before_df.empty else []
183
+ colors = Turbo256
184
+ color_map: dict[int, str] = {}
185
+ n = max(1, len(unique_samples))
186
+ step = max(1, 256 // n)
187
+ for i, sample_idx in enumerate(unique_samples):
188
+ color_map[sample_idx] = colors[(i * step) % 256]
189
+
190
+ renderers_before = []
191
+ renderers_after = []
192
+
193
+ for sample_idx in unique_samples:
194
+ sb = before_df[before_df['sample_idx'] == sample_idx]
195
+ sa = after_df[after_df['sample_idx'] == sample_idx]
196
+ color = color_map.get(sample_idx, '#000000')
197
+
198
+ if not sb.empty:
199
+ src = ColumnDataSource(sb)
200
+ r = p1.scatter('rt', 'mz', size='size', color=color, alpha='alpha', source=src)
201
+ renderers_before.append(r)
202
+
203
+ if not sa.empty:
204
+ src = ColumnDataSource(sa)
205
+ r = p2.scatter('rt', 'mz', size='size', color=color, alpha='alpha', source=src)
206
+ renderers_after.append(r)
207
+
208
+ # Add hover tools
209
+ hover1 = HoverTool(tooltips=[('Sample UID', '@sample_uid'), ('Sample Name', '@sample_name'), ('RT', '@rt{0.00}'), ('m/z', '@mz{0.0000}'), ('Intensity', '@inty{0.0e+0}')], renderers=renderers_before)
210
+ p1.add_tools(hover1)
211
+
212
+ hover2 = HoverTool(tooltips=[('Sample UID', '@sample_uid'), ('Sample Name', '@sample_name'), ('RT', '@rt{0.00}'), ('m/z', '@mz{0.0000}'), ('Intensity', '@inty{0.0e+0}')], renderers=renderers_after)
213
+ p2.add_tools(hover2)
214
+
215
+ # Create layout with both plots side by side
216
+ # Use the aliased bokeh_row and set sizing_mode, width and height to avoid validation warnings.
217
+ layout = bokeh_row(p1, p2, sizing_mode='fixed', width=width, height=height)
218
+
219
+ # Output and show
149
220
  if filename:
221
+ from bokeh.plotting import output_file, show
150
222
  output_file(filename)
151
- show(grid)
223
+ show(layout)
224
+ else:
225
+ from bokeh.plotting import show
226
+ show(layout)
227
+
228
+ return layout
152
229
 
153
230
 
154
231
  def plot_consensus_2d(
@@ -163,11 +240,11 @@ def plot_consensus_2d(
163
240
  width=900,
164
241
  height=900,
165
242
  mz_range=None,
166
- rt_range=None
243
+ rt_range=None,
167
244
  ):
168
245
  """
169
246
  Plot consensus features in a 2D scatter plot with retention time vs m/z.
170
-
247
+
171
248
  Parameters:
172
249
  filename (str, optional): Path to save the plot
173
250
  colorby (str): Column name to use for color mapping (default: "number_samples")
@@ -187,13 +264,13 @@ def plot_consensus_2d(
187
264
  self.logger.error("No consensus map found.")
188
265
  return
189
266
  data = self.consensus_df.clone()
190
-
267
+
191
268
  # Filter by mz_range and rt_range if provided
192
269
  if mz_range is not None:
193
270
  data = data.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
194
271
  if rt_range is not None:
195
272
  data = data.filter((pl.col("rt") >= rt_range[0]) & (pl.col("rt") <= rt_range[1]))
196
-
273
+
197
274
  if colorby not in data.columns:
198
275
  self.logger.error(f"Column {colorby} not found in consensus_df.")
199
276
  return
@@ -339,16 +416,16 @@ def plot_samples_2d(
339
416
  alpha="inty",
340
417
  cmap="Turbo256",
341
418
  max_features=50000,
342
- width=900,
343
- height=900,
419
+ width=600,
420
+ height=600,
344
421
  mz_range=None,
345
- rt_range=None
422
+ rt_range=None,
346
423
  ):
347
424
  """
348
425
  Plot all feature maps for sample_uid in parameter uids in an overlaid scatter plot.
349
426
  Each sample is a different color. Alpha scales with intensity.
350
427
  OPTIMIZED VERSION: Uses vectorized operations and batch processing.
351
-
428
+
352
429
  Parameters:
353
430
  samples: Sample UIDs to plot
354
431
  filename (str, optional): Path to save the plot
@@ -366,6 +443,12 @@ def plot_samples_2d(
366
443
  rt_range (tuple, optional): Retention time range for filtering features (min_rt, max_rt)
367
444
  """
368
445
 
446
+ # Local bokeh imports to avoid heavy top-level dependency
447
+ from bokeh.plotting import figure, show, output_file
448
+ from bokeh.io.export import export_png
449
+ from bokeh.models import ColumnDataSource, HoverTool
450
+ from bokeh.palettes import Turbo256
451
+
369
452
  sample_uids = self._get_sample_uids(samples)
370
453
 
371
454
  if not sample_uids:
@@ -385,7 +468,7 @@ def plot_samples_2d(
385
468
 
386
469
  # OPTIMIZATION 1: Batch filter all features for selected samples at once
387
470
  features_batch = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
388
-
471
+
389
472
  # Filter by mz_range and rt_range if provided
390
473
  if mz_range is not None:
391
474
  features_batch = features_batch.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
@@ -457,7 +540,10 @@ def plot_samples_2d(
457
540
  color_values = {}
458
541
  sample_names = {}
459
542
 
460
- for uid in sample_uids:
543
+ # Decide whether to show tqdm based on log level (show for INFO/DEBUG/TRACE)
544
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
545
+
546
+ for uid in tqdm(sample_uids, desc="Plotting BPCs", disable=tqdm_disable):
461
547
  sample_data = features_pd[features_pd["sample_uid"] == uid]
462
548
  if sample_data.empty:
463
549
  continue
@@ -527,7 +613,9 @@ def plot_samples_2d(
527
613
  p.add_tools(hover)
528
614
 
529
615
  # Remove legend from plot
530
- p.legend.visible = False
616
+ # Only set legend properties if a legend was actually created to avoid Bokeh warnings
617
+ if getattr(p, "legend", None) and len(p.legend) > 0:
618
+ p.legend.visible = False
531
619
  if filename:
532
620
  if filename.endswith(".html"):
533
621
  output_file(filename)
@@ -542,6 +630,441 @@ def plot_samples_2d(
542
630
  return
543
631
 
544
632
 
633
+ def plot_bpc(
634
+ self,
635
+ samples=None,
636
+ title: str | None = None,
637
+ filename: str | None = None,
638
+ width: int = 1000,
639
+ height: int = 300,
640
+ rt_unit: str = "s",
641
+ original: bool = False,
642
+ ):
643
+ """
644
+ Plot Base Peak Chromatograms (BPC) for selected samples overlayed using Bokeh.
645
+
646
+ This collects per-sample BPCs via `get_bpc(self, sample=uid)` and overlays them.
647
+ Colors are mapped per-sample using the same Turbo256 palette as `plot_samples_2d`.
648
+ Parameters:
649
+ original (bool): If True, attempt to map RTs back to original RTs using `features_df`.
650
+ If False (default), return current/aligned RTs.
651
+ """
652
+ # Local imports to avoid heavy top-level deps / circular imports
653
+ from bokeh.plotting import figure, show, output_file
654
+ from bokeh.models import ColumnDataSource, HoverTool
655
+ from bokeh.io.export import export_png
656
+ from bokeh.palettes import Turbo256
657
+ from masster.study.helpers import get_bpc
658
+
659
+ sample_uids = self._get_sample_uids(samples)
660
+ if not sample_uids:
661
+ self.logger.error("No valid sample_uids provided for BPC plotting.")
662
+ return
663
+
664
+ # Debug: show which sample_uids we will process
665
+ self.logger.debug(f"plot_bpc: sample_uids={sample_uids}")
666
+
667
+ colors = Turbo256
668
+ n = max(1, len(sample_uids))
669
+ step = max(1, 256 // n)
670
+ color_map = {uid: colors[(i * step) % 256] for i, uid in enumerate(sample_uids)}
671
+
672
+ # If plotting original (uncorrected) RTs, use the requested title.
673
+ if original:
674
+ plot_title = "Base Peak Chromatograms (uncorrected)"
675
+ else:
676
+ plot_title = title or "Base Peak Chromatograms"
677
+
678
+ p = figure(width=width, height=height, title=plot_title, tools="pan,wheel_zoom,box_zoom,reset,save")
679
+ p.xaxis.axis_label = f"Retention Time ({rt_unit})"
680
+ p.yaxis.axis_label = "Intensity"
681
+
682
+ renderers = []
683
+
684
+ # Build sample name mapping once
685
+ samples_info = None
686
+ if hasattr(self, "samples_df") and self.samples_df is not None:
687
+ try:
688
+ samples_info = self.samples_df.to_pandas()
689
+ except Exception:
690
+ samples_info = None
691
+
692
+ for uid in sample_uids:
693
+ try:
694
+ chrom = get_bpc(self, sample=uid, rt_unit=rt_unit, label=None, original=original)
695
+ except Exception as e:
696
+ # log and skip samples we can't compute BPC for
697
+ self.logger.debug(f"Skipping sample {uid} for BPC: {e}")
698
+ continue
699
+
700
+ # extract arrays
701
+ try:
702
+ # prefer Chromatogram API
703
+ chrom_dict = chrom.to_dict() if hasattr(chrom, "to_dict") else {"rt": getattr(chrom, "rt"), "inty": getattr(chrom, "inty")}
704
+ rt = chrom_dict.get("rt")
705
+ inty = chrom_dict.get("inty")
706
+ except Exception:
707
+ try:
708
+ rt = chrom.rt
709
+ inty = chrom.inty
710
+ except Exception as e:
711
+ self.logger.debug(f"Invalid chromatogram for sample {uid}: {e}")
712
+ continue
713
+
714
+ if rt is None or inty is None:
715
+ continue
716
+
717
+ # Ensure numpy arrays
718
+ import numpy as _np
719
+
720
+ rt = _np.asarray(rt)
721
+ inty = _np.asarray(inty)
722
+ if rt.size == 0 or inty.size == 0:
723
+ continue
724
+
725
+ # Sort by rt
726
+ idx = _np.argsort(rt)
727
+ rt = rt[idx]
728
+ inty = inty[idx]
729
+
730
+ sample_name = str(uid)
731
+ if samples_info is not None:
732
+ try:
733
+ row = samples_info[samples_info["sample_uid"] == uid]
734
+ if not row.empty:
735
+ sample_name = row.iloc[0].get("sample_name", sample_name)
736
+ except Exception:
737
+ pass
738
+ # Determine color for this sample early so we can log it
739
+ color = color_map.get(uid, "#000000")
740
+
741
+ # Debug: log sample processing details
742
+ self.logger.debug(
743
+ f"Processing BPC for sample_uid={uid}, sample_name={sample_name}, rt_len={rt.size}, color={color}"
744
+ )
745
+
746
+ data = {"rt": rt, "inty": inty, "sample": [sample_name] * len(rt)}
747
+ src = ColumnDataSource(data)
748
+
749
+ r_line = p.line("rt", "inty", source=src, line_width=1, color=color, legend_label=str(sample_name))
750
+ r_points = p.scatter("rt", "inty", source=src, size=2, color=color, alpha=0.6)
751
+ renderers.append(r_line)
752
+
753
+ if not renderers:
754
+ self.logger.warning("No BPC curves to plot for the selected samples.")
755
+ return
756
+
757
+ hover = HoverTool(tooltips=[("sample", "@sample"), ("rt", "@rt{0.00}"), ("inty", "@inty{0.0e+0}")], renderers=renderers)
758
+ p.add_tools(hover)
759
+
760
+ # Only set legend properties if a legend was actually created to avoid Bokeh warnings
761
+ if getattr(p, "legend", None) and len(p.legend) > 0:
762
+ p.legend.visible = False
763
+
764
+ if filename:
765
+ if filename.endswith(".html"):
766
+ output_file(filename)
767
+ show(p)
768
+ elif filename.endswith(".png"):
769
+ try:
770
+ export_png(p, filename=filename)
771
+ except Exception:
772
+ # fallback to saving HTML
773
+ output_file(filename.replace(".png", ".html"))
774
+ show(p)
775
+ else:
776
+ output_file(filename)
777
+ show(p)
778
+ else:
779
+ show(p)
780
+
781
+ return p
782
+
783
+
784
+ def plot_eic(
785
+ self,
786
+ mz,
787
+ mz_tol=0.01,
788
+ samples=None,
789
+ title: str | None = None,
790
+ filename: str | None = None,
791
+ width: int = 1000,
792
+ height: int = 300,
793
+ rt_unit: str = "s",
794
+ original: bool = False,
795
+ ):
796
+ """
797
+ Plot Extracted Ion Chromatograms (EIC) for a target m/z (± mz_tol) for selected samples.
798
+
799
+ Parameters mirror `plot_bpc` with additional `mz` and `mz_tol` arguments. The function
800
+ retrieves a Sample object for each sample UID, calls `sample.get_eic(mz, mz_tol)`, and
801
+ overlays the resulting chromatograms.
802
+ """
803
+ # Local imports to avoid heavy top-level deps / circular imports
804
+ from bokeh.plotting import figure, show, output_file
805
+ from bokeh.models import ColumnDataSource, HoverTool
806
+ from bokeh.io.export import export_png
807
+ from bokeh.palettes import Turbo256
808
+ from masster.study.helpers import get_eic
809
+
810
+ if mz is None:
811
+ self.logger.error("mz must be provided for EIC plotting")
812
+ return
813
+
814
+ sample_uids = self._get_sample_uids(samples)
815
+ if not sample_uids:
816
+ self.logger.error("No valid sample_uids provided for EIC plotting.")
817
+ return
818
+
819
+ colors = Turbo256
820
+ n = max(1, len(sample_uids))
821
+ step = max(1, 256 // n)
822
+ color_map = {uid: colors[(i * step) % 256] for i, uid in enumerate(sample_uids)}
823
+
824
+ plot_title = title or f"Extracted Ion Chromatograms (m/z={mz:.4f} ± {mz_tol})"
825
+
826
+ p = figure(width=width, height=height, title=plot_title, tools="pan,wheel_zoom,box_zoom,reset,save")
827
+ p.xaxis.axis_label = f"Retention Time ({rt_unit})"
828
+ p.yaxis.axis_label = "Intensity"
829
+
830
+ renderers = []
831
+
832
+ # Build sample name mapping once
833
+ samples_info = None
834
+ if hasattr(self, "samples_df") and self.samples_df is not None:
835
+ try:
836
+ samples_info = self.samples_df.to_pandas()
837
+ except Exception:
838
+ samples_info = None
839
+
840
+ for uid in sample_uids:
841
+ try:
842
+ chrom = get_eic(self, sample=uid, mz=mz, mz_tol=mz_tol, rt_unit=rt_unit, label=None)
843
+ except Exception as e:
844
+ # log and skip samples we can't compute EIC for
845
+ self.logger.debug(f"Skipping sample {uid} for EIC: {e}")
846
+ continue
847
+
848
+ # extract arrays
849
+ try:
850
+ # prefer Chromatogram API
851
+ chrom_dict = chrom.to_dict() if hasattr(chrom, "to_dict") else {"rt": getattr(chrom, "rt"), "inty": getattr(chrom, "inty")}
852
+ rt = chrom_dict.get("rt")
853
+ inty = chrom_dict.get("inty")
854
+ except Exception:
855
+ try:
856
+ rt = chrom.rt
857
+ inty = chrom.inty
858
+ except Exception as e:
859
+ self.logger.debug(f"Invalid chromatogram for sample {uid}: {e}")
860
+ continue
861
+
862
+ if rt is None or inty is None:
863
+ continue
864
+
865
+ import numpy as _np
866
+
867
+ rt = _np.asarray(rt)
868
+ inty = _np.asarray(inty)
869
+ if rt.size == 0 or inty.size == 0:
870
+ continue
871
+
872
+ # Sort by rt
873
+ idx = _np.argsort(rt)
874
+ rt = rt[idx]
875
+ inty = inty[idx]
876
+
877
+ sample_name = str(uid)
878
+ if samples_info is not None:
879
+ try:
880
+ row = samples_info[samples_info["sample_uid"] == uid]
881
+ if not row.empty:
882
+ sample_name = row.iloc[0].get("sample_name", sample_name)
883
+ except Exception:
884
+ pass
885
+
886
+ color = color_map.get(uid, "#000000")
887
+
888
+ data = {"rt": rt, "inty": inty, "sample": [sample_name] * len(rt)}
889
+ src = ColumnDataSource(data)
890
+
891
+ r_line = p.line("rt", "inty", source=src, line_width=1, color=color, legend_label=str(sample_name))
892
+ p.scatter("rt", "inty", source=src, size=2, color=color, alpha=0.6)
893
+ renderers.append(r_line)
894
+
895
+ if not renderers:
896
+ self.logger.warning("No EIC curves to plot for the selected samples.")
897
+ return
898
+
899
+ hover = HoverTool(tooltips=[("sample", "@sample"), ("rt", "@rt{0.00}"), ("inty", "@inty{0.0e+0}")], renderers=renderers)
900
+ p.add_tools(hover)
901
+
902
+ if getattr(p, "legend", None) and len(p.legend) > 0:
903
+ p.legend.visible = False
904
+
905
+ if filename:
906
+ if filename.endswith(".html"):
907
+ output_file(filename)
908
+ show(p)
909
+ elif filename.endswith(".png"):
910
+ try:
911
+ export_png(p, filename=filename)
912
+ except Exception:
913
+ output_file(filename.replace(".png", ".html"))
914
+ show(p)
915
+ else:
916
+ output_file(filename)
917
+ show(p)
918
+ else:
919
+ show(p)
920
+
921
+ return p
922
+
923
+
924
+ def plot_rt_correction(
925
+ self,
926
+ samples=None,
927
+ title: str | None = None,
928
+ filename: str | None = None,
929
+ width: int = 1000,
930
+ height: int = 300,
931
+ rt_unit: str = "s",
932
+ ):
933
+ """
934
+ Plot RT correction per sample: (rt - rt_original) vs rt overlaid for selected samples.
935
+
936
+ This uses the same color mapping as `plot_bpc` so curves for the same samples match.
937
+ """
938
+ from bokeh.plotting import figure, show, output_file
939
+ from bokeh.models import ColumnDataSource, HoverTool
940
+ from bokeh.palettes import Turbo256
941
+ import numpy as _np
942
+
943
+ # Validate features dataframe
944
+ if self.features_df is None or self.features_df.is_empty():
945
+ self.logger.error("No features_df found. Load features first.")
946
+ return
947
+
948
+ if "rt_original" not in self.features_df.columns:
949
+ self.logger.error("Column 'rt_original' not found in features_df. Alignment/backup RTs missing.")
950
+ return
951
+
952
+ sample_uids = self._get_sample_uids(samples)
953
+ if not sample_uids:
954
+ self.logger.error("No valid sample_uids provided for RT correction plotting.")
955
+ return
956
+
957
+ # Color mapping like plot_bpc
958
+ colors = Turbo256
959
+ n = max(1, len(sample_uids))
960
+ step = max(1, 256 // n)
961
+ color_map = {uid: colors[(i * step) % 256] for i, uid in enumerate(sample_uids)}
962
+
963
+ p = figure(width=width, height=height, title=title or "RT correction", tools="pan,wheel_zoom,box_zoom,reset,save")
964
+ p.xaxis.axis_label = f"Retention Time ({rt_unit})"
965
+ p.yaxis.axis_label = "RT - RT_original (s)"
966
+
967
+ samples_info = None
968
+ if hasattr(self, "samples_df") and self.samples_df is not None:
969
+ try:
970
+ samples_info = self.samples_df.to_pandas()
971
+ except Exception:
972
+ samples_info = None
973
+
974
+ renderers = []
975
+
976
+ # Iterate samples and build curves
977
+ for uid in sample_uids:
978
+ # Select features belonging to this sample
979
+ try:
980
+ if "sample_uid" in self.features_df.columns:
981
+ sample_feats = self.features_df.filter(pl.col("sample_uid") == uid)
982
+ elif "sample_name" in self.features_df.columns:
983
+ sample_feats = self.features_df.filter(pl.col("sample_name") == uid)
984
+ else:
985
+ self.logger.debug("No sample identifier column in features_df; skipping sample filtering")
986
+ continue
987
+ except Exception as e:
988
+ self.logger.debug(f"Error filtering features for sample {uid}: {e}")
989
+ continue
990
+
991
+ if sample_feats.is_empty():
992
+ continue
993
+
994
+ # Convert to pandas for easy numeric handling
995
+ try:
996
+ df = sample_feats.to_pandas()
997
+ except Exception:
998
+ continue
999
+
1000
+ # Need both rt and rt_original
1001
+ if "rt" not in df.columns or "rt_original" not in df.columns:
1002
+ continue
1003
+
1004
+ # Drop NA and ensure numeric arrays
1005
+ df = df.dropna(subset=["rt", "rt_original"]).copy()
1006
+ if df.empty:
1007
+ continue
1008
+
1009
+ rt = _np.asarray(df["rt"], dtype=float)
1010
+ rt_orig = _np.asarray(df["rt_original"], dtype=float)
1011
+ delta = rt - rt_orig
1012
+
1013
+ # sort by rt
1014
+ idx = _np.argsort(rt)
1015
+ rt = rt[idx]
1016
+ delta = delta[idx]
1017
+
1018
+ sample_name = str(uid)
1019
+ if samples_info is not None:
1020
+ try:
1021
+ row = samples_info[samples_info["sample_uid"] == uid]
1022
+ if not row.empty:
1023
+ sample_name = row.iloc[0].get("sample_name", sample_name)
1024
+ except Exception:
1025
+ pass
1026
+
1027
+ color = color_map.get(uid, "#000000")
1028
+
1029
+ data = {"rt": rt, "delta": delta, "sample": [sample_name] * len(rt)}
1030
+ src = ColumnDataSource(data)
1031
+
1032
+ r_line = p.line("rt", "delta", source=src, line_width=1, color=color)
1033
+ p.scatter("rt", "delta", source=src, size=2, color=color, alpha=0.6)
1034
+ renderers.append(r_line)
1035
+
1036
+ if not renderers:
1037
+ self.logger.warning("No RT correction curves to plot for the selected samples.")
1038
+ return
1039
+
1040
+ hover = HoverTool(tooltips=[("sample", "@sample"), ("rt", "@rt{0.00}"), ("rt - rt_original", "@delta{0.00}")], renderers=renderers)
1041
+ p.add_tools(hover)
1042
+
1043
+ # Only set legend properties if a legend was actually created to avoid Bokeh warnings
1044
+ if getattr(p, "legend", None) and len(p.legend) > 0:
1045
+ p.legend.visible = False
1046
+
1047
+ if filename:
1048
+ if filename.endswith(".html"):
1049
+ output_file(filename)
1050
+ show(p)
1051
+ elif filename.endswith(".png"):
1052
+ try:
1053
+ from bokeh.io.export import export_png
1054
+
1055
+ export_png(p, filename=filename)
1056
+ except Exception:
1057
+ output_file(filename.replace(".png", ".html"))
1058
+ show(p)
1059
+ else:
1060
+ output_file(filename)
1061
+ show(p)
1062
+ else:
1063
+ show(p)
1064
+
1065
+ return p
1066
+
1067
+
545
1068
  def plot_chrom(
546
1069
  self,
547
1070
  uids=None,
@@ -560,6 +1083,9 @@ def plot_chrom(
560
1083
  self.logger.error("No chromatogram data found.")
561
1084
  return
562
1085
 
1086
+ # Local import for color palette
1087
+ from bokeh.palettes import Turbo256
1088
+
563
1089
  # Assign a fixed color to each sample/column
564
1090
  sample_names = [col for col in chroms.columns if col not in ["consensus_uid"]]
565
1091
  if not sample_names:
@@ -569,12 +1095,12 @@ def plot_chrom(
569
1095
 
570
1096
  plots = []
571
1097
  self.logger.info(f"Plotting {chroms.shape[0]} chromatograms...")
572
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
1098
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
573
1099
  for row in tqdm(
574
1100
  chroms.iter_rows(named=True),
575
1101
  total=chroms.shape[0],
576
1102
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Plot chromatograms",
577
- disable=tdqm_disable,
1103
+ disable=tqdm_disable,
578
1104
  ):
579
1105
  consensus_uid = row["consensus_uid"] # Get consensus_uid from the row
580
1106
  consensus_id = consensus_uid # Use the same value for consensus_id
@@ -698,3 +1224,592 @@ def plot_chrom(
698
1224
  # In a server context, return the panel object instead of showing or saving directly
699
1225
  # return panel.panel(layout)
700
1226
  panel.panel(layout).show()
1227
+
1228
+
1229
def plot_consensus_stats(
    self,
    filename=None,
    width=1200,
    height=1200,
    alpha=0.6,
    markersize=3,
):
    """
    Plot a scatter plot matrix (SPLOM) of consensus statistics using Bokeh.

    Diagonal cells show per-column histograms; off-diagonal cells show pairwise
    scatter plots with linked axes and hover tooltips (consensus id, rt, mz).

    Parameters:
        filename (str, optional): Output filename for saving the plot
        width (int): Overall width of the plot (default: 1200).
            NOTE(review): currently unused — subplot sizes below fix the
            overall layout at roughly 1200x1200.
        height (int): Overall height of the plot (default: 1200).
            NOTE(review): currently unused — see `width`.
        alpha (float): Point transparency (default: 0.6)
        markersize (int): Size of points (default: 3)

    Returns:
        The Bokeh gridplot layout, or None if no usable consensus data exists.
    """
    # Local imports keep Bokeh plotting machinery off the module import path.
    from bokeh.layouts import gridplot
    from bokeh.models import ColumnDataSource, HoverTool
    from bokeh.plotting import figure, show, output_file

    # Check if consensus_df exists and has data
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.error("No consensus data available. Run merge/find_consensus first.")
        return

    # Candidate statistics to plot; only those present in consensus_df are used.
    columns = [
        "rt",
        "mz",
        "number_samples",
        "log10_quality",
        "mz_delta_mean",
        "rt_delta_mean",
        "chrom_coherence_mean",
        "chrom_prominence_scaled_mean",
        "inty_mean",
        "number_ms2",
    ]

    # Check which columns exist in the dataframe and compute missing ones
    available_columns = self.consensus_df.columns
    data_df = self.consensus_df.clone()

    # Derive log10_quality from quality if only the raw column exists.
    if "quality" in available_columns and "log10_quality" not in available_columns:
        data_df = data_df.with_columns(
            pl.col("quality").log10().alias("log10_quality"),
        )

    # Filter columns that actually exist
    final_columns = [col for col in columns if col in data_df.columns]

    if len(final_columns) < 2:
        self.logger.error(f"Need at least 2 columns for SPLOM. Available: {final_columns}")
        return

    self.logger.debug(f"Creating SPLOM with columns: {final_columns}")

    # Add important ID columns for tooltips even if not plotting them
    tooltip_columns = []
    for id_col in ["consensus_uid", "consensus_id"]:
        if id_col in data_df.columns and id_col not in final_columns:
            tooltip_columns.append(id_col)

    # Select plotting columns plus tooltip columns
    all_columns = final_columns + tooltip_columns
    data_pd = data_df.select(all_columns).to_pandas()

    # Remove any infinite or NaN values (histogram/scatter would choke on them).
    data_pd = data_pd.replace([np.inf, -np.inf], np.nan).dropna()

    if data_pd.empty:
        self.logger.error("No valid data after removing NaN/infinite values.")
        return

    # One shared source so hover/selection is consistent across subplots.
    source = ColumnDataSource(data_pd)

    n_vars = len(final_columns)

    # Fixed dimensions - override user input to ensure consistent layout
    # NOTE(review): total_width/total_height are never referenced below; the
    # layout is actually determined by the per-subplot sizes that follow.
    total_width = 1200
    total_height = 1200

    # Calculate plot sizes to ensure uniform inner plot areas.
    # First column needs extra width for y-axis labels; last row extra height
    # for x-axis labels — outer sizes differ so inner plot areas stay equal.
    plot_width_first = 180  # Wider to account for y-axis labels
    plot_width_others = 120  # Standard width for other columns
    plot_height_normal = 120  # Standard height
    plot_height_last = 155  # Taller last row to accommodate x-axis labels

    # Create grid of plots with variable outer sizes but equal inner areas
    plots = []

    for i, y_var in enumerate(final_columns):
        row = []
        for j, x_var in enumerate(final_columns):
            # Determine if this plot needs axis labels
            has_x_label = i == n_vars - 1  # bottom row
            has_y_label = j == 0  # left column

            # First column wider to accommodate y-axis labels, ensuring equal inner plot areas
            current_width = plot_width_first if has_y_label else plot_width_others
            current_height = plot_height_last if has_x_label else plot_height_normal

            p = figure(
                width=current_width,
                height=current_height,
                title=None,  # No title on any plot
                toolbar_location=None,
                # Adjusted borders - first column has more space, others minimal
                min_border_left=70 if has_y_label else 15,
                min_border_bottom=50 if has_x_label else 15,
                min_border_right=15,
                min_border_top=15,
            )

            # Ensure subplot background and border are explicitly white so the plot looks
            # correct in dark and light themes.
            p.outline_line_color = None
            p.border_fill_color = "white"
            p.border_fill_alpha = 1.0
            p.background_fill_color = "white"

            # Remove axis lines to eliminate black lines between plots
            p.xaxis.axis_line_color = None
            p.yaxis.axis_line_color = None

            # Keep subtle grid lines for data reference
            p.grid.visible = True
            p.grid.grid_line_color = "#E0E0E0"  # Light gray grid lines

            # Set axis labels and formatting
            if has_x_label:  # bottom row
                p.xaxis.axis_label = x_var
                p.xaxis.axis_label_text_font_size = "12pt"
                p.xaxis.major_label_text_font_size = "9pt"
                p.xaxis.axis_label_standoff = 15
            else:
                # Hide tick labels/ticks on inner plots entirely.
                p.xaxis.major_label_text_font_size = "0pt"
                p.xaxis.minor_tick_line_color = None
                p.xaxis.major_tick_line_color = None

            if has_y_label:  # left column
                p.yaxis.axis_label = y_var
                p.yaxis.axis_label_text_font_size = "10pt"  # Smaller y-axis title
                p.yaxis.major_label_text_font_size = "8pt"
                p.yaxis.axis_label_standoff = 12
            else:
                p.yaxis.major_label_text_font_size = "0pt"
                p.yaxis.minor_tick_line_color = None
                p.yaxis.major_tick_line_color = None

            if i == j:
                # Diagonal: histogram of the variable's distribution
                hist, edges = np.histogram(data_pd[x_var], bins=30)
                p.quad(
                    top=hist,
                    bottom=0,
                    left=edges[:-1],
                    right=edges[1:],
                    fill_color="green",
                    line_color="white",
                    alpha=alpha,
                )
            else:
                # Off-diagonal: scatter plot
                scatter = p.scatter(
                    x=x_var,
                    y=y_var,
                    size=markersize,
                    alpha=alpha,
                    color="blue",
                    source=source,
                )

                # Add hover tool restricted to this scatter renderer
                hover = HoverTool(
                    tooltips=[
                        (x_var, f"@{x_var}{{0.0000}}"),
                        (y_var, f"@{y_var}{{0.0000}}"),
                        (
                            "consensus_uid",
                            "@consensus_uid"
                            if "consensus_uid" in data_pd.columns
                            else "@consensus_id"
                            if "consensus_id" in data_pd.columns
                            else "N/A",
                        ),
                        ("rt", "@rt{0.00}" if "rt" in data_pd.columns else "N/A"),
                        ("mz", "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"),
                    ],
                    renderers=[scatter],
                )
                p.add_tools(hover)

            row.append(p)
        plots.append(row)

    # Link axes for same variables: share x-ranges within a column and
    # y-ranges within a row so pan/zoom stays synchronized.
    for i in range(n_vars):
        for j in range(n_vars):
            if i != j:  # Don't link diagonal plots
                # Link x-axis to other plots in same column
                for k in range(n_vars):
                    if k != i and k != j:
                        plots[i][j].x_range = plots[k][j].x_range

                # Link y-axis to other plots in same row
                for k in range(n_vars):
                    if k != j and k != i:
                        plots[i][j].y_range = plots[i][k].y_range

    # Create grid layout and force overall background/border to white so the outer
    # container doesn't show dark UI colors in night mode.
    grid = gridplot(plots)

    # Set overall background and border to white when supported
    if hasattr(grid, "background_fill_color"):
        grid.background_fill_color = "white"
    if hasattr(grid, "border_fill_color"):
        grid.border_fill_color = "white"

    # Output and show
    if filename:
        output_file(filename)

    show(grid)
    return grid
1459
+
1460
+
1461
def plot_pca(
    self,
    filename=None,
    width=400,
    height=400,
    alpha=0.8,
    markersize=6,
    n_components=2,
    color_by=None,
    title="PCA of Consensus Matrix",
):
    """
    Plot PCA (Principal Component Analysis) of the consensus matrix using Bokeh.

    The consensus matrix is transposed so samples become rows, standardized,
    and projected onto the first two principal components. Points are colored
    by `color_by` when given (categorical or numeric), otherwise per sample
    via the Turbo256 palette, falling back to plain blue.

    Parameters:
        filename (str, optional): Output filename for saving the plot
        width (int): Plot width (default: 400)
        height (int): Plot height (default: 400)
        alpha (float): Point transparency (default: 0.8)
        markersize (int): Size of points (default: 6)
        n_components (int): Number of PCA components to compute (default: 2).
            When 1, PC2 is plotted as zeros and reported as 0% variance.
        color_by (str, optional): Column from samples_df to color points by
        title (str): Plot title (default: "PCA of Consensus Matrix")

    Returns:
        The Bokeh figure, or None when required data is missing.
    """
    from bokeh.models import ColumnDataSource, HoverTool, ColorBar, LinearColorMapper
    from bokeh.plotting import figure, show, output_file
    from bokeh.palettes import Category20, viridis, Turbo256
    from bokeh.transform import factor_cmap
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    import pandas as pd
    import numpy as np

    # Check if consensus matrix and samples_df exist
    try:
        consensus_matrix = self.get_consensus_matrix()
        samples_df = self.samples_df
    except Exception as e:
        self.logger.error(f"Error getting consensus matrix or samples_df: {e}")
        return

    if consensus_matrix is None or consensus_matrix.shape[0] == 0:
        self.logger.error("No consensus matrix available. Run merge/find_consensus first.")
        return

    if samples_df is None or samples_df.is_empty():
        self.logger.error("No samples dataframe available.")
        return

    self.logger.info(f"Performing PCA on consensus matrix with shape: {consensus_matrix.shape}")

    # Convert consensus matrix to numpy regardless of backing type
    # (pandas exposes .values, polars exposes .to_numpy()).
    if hasattr(consensus_matrix, "values"):
        matrix_data = consensus_matrix.values
    elif hasattr(consensus_matrix, "to_numpy"):
        matrix_data = consensus_matrix.to_numpy()
    else:
        matrix_data = np.array(consensus_matrix)

    # Transpose matrix so samples are rows and features are columns
    matrix_data = matrix_data.T

    # Handle missing values by replacing with 0
    matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)

    # Standardize the data so high-intensity features don't dominate the PCA
    scaler = StandardScaler()
    matrix_scaled = scaler.fit_transform(matrix_data)

    # Perform PCA
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(matrix_scaled)

    # Get explained variance ratios
    explained_var = pca.explained_variance_ratio_

    # Guard against n_components == 1: explained_var then has a single entry,
    # but the title and axis labels below always report a PC2 share.
    pc2_var = float(explained_var[1]) if len(explained_var) > 1 else 0.0

    self.logger.info(f"PCA explained variance ratios: {explained_var}")

    # Convert samples_df to pandas for easier manipulation
    samples_pd = samples_df.to_pandas()

    # Create dataframe with PCA results and sample information
    pca_df = pd.DataFrame({
        "PC1": pca_result[:, 0],
        "PC2": pca_result[:, 1] if n_components > 1 else np.zeros(len(pca_result)),
    })

    # Add sample information to PCA dataframe (only when counts line up)
    if len(samples_pd) == len(pca_df):
        for col in samples_pd.columns:
            pca_df[col] = samples_pd[col].values
    else:
        self.logger.warning(
            f"Sample count mismatch: samples_df has {len(samples_pd)} rows, "
            f"but consensus matrix has {len(pca_df)} samples"
        )

    # Prepare color mapping
    color_mapper = None

    if color_by and color_by in pca_df.columns:
        unique_values = pca_df[color_by].unique()

        # Handle categorical vs numeric coloring
        if pca_df[color_by].dtype in ["object", "string", "category"]:
            # Categorical coloring
            if len(unique_values) <= 20:
                palette = Category20[min(20, max(3, len(unique_values)))]
            else:
                palette = viridis(min(256, len(unique_values)))
            # pandas .unique() returns an ndarray; pass a plain list of factors
            color_mapper = factor_cmap(color_by, palette, list(unique_values))
        else:
            # Numeric coloring
            palette = viridis(256)
            color_mapper = LinearColorMapper(
                palette=palette,
                low=pca_df[color_by].min(),
                high=pca_df[color_by].max(),
            )

    # Create Bokeh plot
    p = figure(
        width=width,
        height=height,
        title=f"{title} (PC1: {explained_var[0]:.1%}, PC2: {pc2_var:.1%})",
        tools="pan,wheel_zoom,box_zoom,reset,save",
    )

    p.xaxis.axis_label = f"PC1 ({explained_var[0]:.1%} variance)"
    p.yaxis.axis_label = f"PC2 ({pc2_var:.1%} variance)"

    # Create data source
    source = ColumnDataSource(pca_df)

    # Create scatter plot
    if color_mapper:
        if isinstance(color_mapper, LinearColorMapper):
            scatter = p.scatter(
                "PC1",
                "PC2",
                size=markersize,
                alpha=alpha,
                color={"field": color_by, "transform": color_mapper},
                source=source,
            )
            # Add colorbar for numeric coloring
            color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
            p.add_layout(color_bar, "right")
        else:
            scatter = p.scatter(
                "PC1",
                "PC2",
                size=markersize,
                alpha=alpha,
                color=color_mapper,
                source=source,
                legend_field=color_by,
            )
    else:
        # If no color_by provided, color points by sample similar to plot_samples_2d
        if "sample_uid" in pca_df.columns or "sample_name" in pca_df.columns:
            # Choose the identifier to map colors by
            id_col = "sample_uid" if "sample_uid" in pca_df.columns else "sample_name"
            sample_ids = list(pd.unique(pca_df[id_col]))
            colors = Turbo256
            color_map = {uid: colors[i * (256 // max(1, len(sample_ids)))] for i, uid in enumerate(sample_ids)}
            # Map colors into dataframe
            pca_df["color"] = [color_map[x] for x in pca_df[id_col]]
            # Rebuild the ColumnDataSource so it carries the new color column
            source = ColumnDataSource(pca_df)
            scatter = p.scatter(
                "PC1",
                "PC2",
                size=markersize,
                alpha=alpha,
                color="color",
                source=source,
            )
        else:
            scatter = p.scatter(
                "PC1",
                "PC2",
                size=markersize,
                alpha=alpha,
                color="blue",
                source=source,
            )

    # Create comprehensive hover tooltips with all sample information
    tooltip_list = []

    # Columns to exclude from tooltips (file paths and internal/plot fields)
    excluded_cols = {"file_source", "file_path", "sample_path", "map_id", "PC1", "PC2", "ms1", "ms2"}

    # Add all sample dataframe columns to tooltips, skipping excluded ones
    for col in samples_pd.columns:
        if col in excluded_cols:
            continue
        if col in pca_df.columns:
            if pca_df[col].dtype in ["float64", "float32"]:
                tooltip_list.append((col, f"@{col}{{0.00}}"))
            else:
                tooltip_list.append((col, f"@{col}"))

    hover = HoverTool(
        tooltips=tooltip_list,
        renderers=[scatter],
    )
    p.add_tools(hover)

    # Add legend if using categorical coloring
    if color_mapper and not isinstance(color_mapper, LinearColorMapper) and color_by:
        # Only set legend properties if legends exist (avoid Bokeh warning when none created)
        if getattr(p, "legend", None) and len(p.legend) > 0:
            p.legend.location = "top_left"
            p.legend.click_policy = "hide"

    # Output and show
    if filename:
        output_file(filename)

    show(p)
    return p
1686
+
1687
def plot_tic(
    self,
    samples=None,
    title: str | None = None,
    filename: str | None = None,
    width: int = 1000,
    height: int = 300,
    rt_unit: str = "s",
    original: bool = False,
):
    """
    Overlay Total Ion Chromatograms (TIC) for the selected samples in one Bokeh figure.

    Parameters and behavior mirror `plot_bpc` but use per-sample TICs (get_tic).
    Saves to HTML or PNG when `filename` is given (PNG falls back to HTML if
    export fails); otherwise the plot is shown directly. Returns the figure.
    """
    # Imports stay function-local to avoid heavy top-level deps / circular imports.
    from bokeh.plotting import figure, show, output_file
    from bokeh.models import ColumnDataSource, HoverTool
    from bokeh.io.export import export_png
    from bokeh.palettes import Turbo256
    from masster.study.helpers import get_tic
    import numpy as _np

    selected_uids = self._get_sample_uids(samples)
    if not selected_uids:
        self.logger.error("No valid sample_uids provided for TIC plotting.")
        return

    # Spread sample colors evenly across the Turbo256 palette.
    palette = Turbo256
    stride = max(1, 256 // max(1, len(selected_uids)))
    color_map = {suid: palette[(pos * stride) % 256] for pos, suid in enumerate(selected_uids)}

    p = figure(
        width=width,
        height=height,
        title=title or "Total Ion Chromatograms",
        tools="pan,wheel_zoom,box_zoom,reset,save",
    )
    p.xaxis.axis_label = f"Retention Time ({rt_unit})"
    p.yaxis.axis_label = "Intensity"

    # Resolve the sample_uid -> sample_name mapping once, up front.
    samples_info = None
    if hasattr(self, "samples_df") and self.samples_df is not None:
        try:
            samples_info = self.samples_df.to_pandas()
        except Exception:
            samples_info = None

    line_renderers = []
    for suid in selected_uids:
        try:
            chrom = get_tic(self, sample=suid, label=None)
        except Exception as e:
            self.logger.debug(f"Skipping sample {suid} for TIC: {e}")
            continue

        # Extract rt/intensity arrays from whatever object get_tic returned.
        try:
            if hasattr(chrom, "to_dict"):
                chrom_dict = chrom.to_dict()
            else:
                chrom_dict = {"rt": getattr(chrom, "rt"), "inty": getattr(chrom, "inty")}
            rt = chrom_dict.get("rt")
            inty = chrom_dict.get("inty")
        except Exception:
            try:
                rt = chrom.rt
                inty = chrom.inty
            except Exception as e:
                self.logger.debug(f"Invalid chromatogram for sample {suid}: {e}")
                continue

        if rt is None or inty is None:
            continue

        rt = _np.asarray(rt)
        inty = _np.asarray(inty)
        if rt.size == 0 or inty.size == 0:
            continue

        # Draw points in retention-time order.
        order = _np.argsort(rt)
        rt = rt[order]
        inty = inty[order]

        # Prefer the human-readable sample name when samples_df provides one.
        sample_name = str(suid)
        if samples_info is not None:
            try:
                matches = samples_info[samples_info["sample_uid"] == suid]
                if not matches.empty:
                    sample_name = matches.iloc[0].get("sample_name", sample_name)
            except Exception:
                pass

        line_color = color_map.get(suid, "#000000")
        src = ColumnDataSource(
            {"rt": rt, "inty": inty, "sample": [sample_name] * len(rt)},
        )

        trace = p.line("rt", "inty", source=src, line_width=1, color=line_color, legend_label=str(sample_name))
        p.scatter("rt", "inty", source=src, size=2, color=line_color, alpha=0.6)
        line_renderers.append(trace)

    if not line_renderers:
        self.logger.warning("No TIC curves to plot for the selected samples.")
        return

    hover = HoverTool(tooltips=[("sample", "@sample"), ("rt", "@rt{0.00}"), ("inty", "@inty{0.0e+0}")], renderers=line_renderers)
    p.add_tools(hover)

    # Only set legend properties if a legend was actually created to avoid Bokeh warnings
    if getattr(p, "legend", None) and len(p.legend) > 0:
        p.legend.visible = False

    if filename:
        if filename.endswith(".html"):
            output_file(filename)
            show(p)
        elif filename.endswith(".png"):
            try:
                export_png(p, filename=filename)
            except Exception:
                # Headless PNG export needs a webdriver; fall back to HTML.
                output_file(filename.replace(".png", ".html"))
                show(p)
        else:
            output_file(filename)
            show(p)
    else:
        show(p)

    return p