masster 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/plot.py CHANGED
@@ -226,8 +226,7 @@ def _isolated_show_panel_notebook(panel_obj):
226
226
 
227
227
  def plot_alignment(
228
228
  self,
229
- samples=None,
230
- maps: bool = True,
229
+ samples=50,
231
230
  filename: str | None = None,
232
231
  width: int = 450,
233
232
  height: int = 450,
@@ -235,322 +234,172 @@ def plot_alignment(
235
234
  ):
236
235
  """Visualize retention time alignment using two synchronized Bokeh scatter plots.
237
236
 
238
- - When ``maps=True`` the function reads ``self.features_maps`` (list of FeatureMap)
239
- and builds two side-by-side plots: Original RT (left) and Current/Aligned RT (right).
240
- - When ``maps=False`` the function uses ``self.features_df`` and expects an
241
- ``rt_original`` column (before) and ``rt`` column (after).
237
+ Uses ``features_df`` to create side-by-side plots showing Original RT (left)
238
+ and Current/Aligned RT (right). If no alignment has been performed yet,
239
+ both plots show the current RT values.
242
240
 
243
- Parameters
241
+ Parameters:
244
242
  - samples: List of sample identifiers (sample_uids or sample_names), or single int for random selection, or None for all samples.
245
- - maps: whether to use feature maps (default True).
246
243
  - filename: optional HTML file path to save the plot.
247
244
  - width/height: pixel size of each subplot.
248
245
  - markersize: base marker size.
249
246
 
250
- Returns
247
+ Returns:
251
248
  - Bokeh layout (row) containing the two synchronized plots.
252
249
  """
253
250
  # Local imports so the module can be used even if bokeh isn't needed elsewhere
254
251
  from bokeh.models import ColumnDataSource, HoverTool
255
- from bokeh.plotting import figure, show, output_file
252
+ from bokeh.plotting import figure
256
253
  import pandas as pd
257
254
 
258
- # Get sample_uids to filter by if specified
259
- sample_uids = self._get_sample_uids(samples) if samples is not None else None
255
+ # Check if features_df exists
256
+ if self.features_df is None or self.features_df.is_empty():
257
+ self.logger.error("No features_df found. Load features first.")
258
+ return
260
259
 
261
- # Build the before/after tabular data used for plotting
262
- before_data: list[dict[str, Any]] = []
263
- after_data: list[dict[str, Any]] = []
260
+ # Check required columns
261
+ required_cols = ["rt", "mz", "inty"]
262
+ missing = [c for c in required_cols if c not in self.features_df.columns]
263
+ if missing:
264
+ self.logger.error(f"Missing required columns in features_df: {missing}")
265
+ return
264
266
 
265
- if maps:
266
- # Ensure feature maps are loaded
267
- if self.features_maps is None or len(self.features_maps) == 0:
268
- self.load_features()
267
+ # Check if alignment has been performed
268
+ has_alignment = "rt_original" in self.features_df.columns
269
+ if not has_alignment:
270
+ self.logger.warning("Column 'rt_original' not found - alignment has not been performed yet.")
271
+ self.logger.info("Showing current RT values for both plots. Run align() first to see alignment comparison.")
272
+
273
+ # Get sample_uids to filter by if specified
274
+ sample_uids = self._get_sample_uids(samples) if samples is not None else None
269
275
 
270
- fmaps = self.features_maps or []
276
+ # Start with full features_df
277
+ features_df = self.features_df
271
278
 
272
- if not fmaps:
273
- self.logger.error("No feature maps available for plotting.")
279
+ # Filter by selected samples if specified
280
+ if sample_uids is not None:
281
+ features_df = features_df.filter(pl.col("sample_uid").is_in(sample_uids))
282
+ if features_df.is_empty():
283
+ self.logger.error("No features found for the selected samples.")
274
284
  return
275
285
 
276
- # Filter feature maps by selected samples if specified
277
- if sample_uids is not None:
278
- # Create mapping from sample_uid to map_id and filter accordingly
279
- if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
280
- samples_info = self.samples_df.to_pandas()
281
-
282
- # Filter samples_info to only selected sample_uids and get their map_ids
283
- selected_samples = samples_info[samples_info["sample_uid"].isin(sample_uids)]
284
- if selected_samples.empty:
285
- self.logger.error("No matching samples found for the provided sample_uids.")
286
- return
287
-
288
- # Get the map_ids for selected samples
289
- selected_map_ids = selected_samples["map_id"].tolist()
290
-
291
- # Filter feature maps based on map_ids
292
- filtered_maps = []
293
- for map_id in selected_map_ids:
294
- if 0 <= map_id < len(fmaps):
295
- filtered_maps.append(fmaps[map_id])
296
-
297
- fmaps = filtered_maps
298
- samples_info = selected_samples.reset_index(drop=True)
299
-
300
- if not fmaps:
301
- self.logger.error("No feature maps found for the selected samples.")
302
- return
303
- else:
304
- self.logger.warning("Cannot filter feature maps: no samples_df available")
286
+ # Determine sample column
287
+ sample_col = "sample_uid" if "sample_uid" in features_df.columns else "sample_name"
288
+ if sample_col not in features_df.columns:
289
+ self.logger.error("No sample identifier column found in features_df.")
290
+ return
305
291
 
306
- if not fmaps:
307
- self.logger.error("No feature maps available after filtering.")
308
- return
292
+ # Get unique samples
293
+ samples_list = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
309
294
 
310
- # Reference (first) sample: use current RT for both before and after
311
- ref = fmaps[0]
312
- ref_rt = [f.getRT() for f in ref]
313
- ref_mz = [f.getMZ() for f in ref]
314
- ref_inty = [f.getIntensity() for f in ref]
315
- max_ref_inty = max(ref_inty) if ref_inty else 1
316
-
317
- # Get sample metadata for reference (first) sample
318
- if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
319
- if 'samples_info' not in locals():
320
- samples_info = self.samples_df.to_pandas()
321
- ref_sample_uid = (
322
- samples_info.iloc[0]["sample_uid"] if "sample_uid" in samples_info.columns else "Reference_UID"
323
- )
324
- ref_sample_name = (
325
- samples_info.iloc[0]["sample_name"] if "sample_name" in samples_info.columns else "Reference"
326
- )
327
- else:
328
- ref_sample_uid = "Reference_UID"
329
- ref_sample_name = "Reference"
295
+ # Build plotting data
296
+ before_data: list[dict[str, Any]] = []
297
+ after_data: list[dict[str, Any]] = []
298
+
299
+ for sample_idx, sample in enumerate(samples_list):
300
+ # Filter sample data
301
+ sample_data = features_df.filter(pl.col(sample_col) == sample)
302
+
303
+ # Sample data if too large for performance
304
+ max_points_per_sample = 10000
305
+ if sample_data.height > max_points_per_sample:
306
+ self.logger.info(f"Sample {sample}: Sampling {max_points_per_sample} points from {sample_data.height} features for performance")
307
+ sample_data = sample_data.sample(n=max_points_per_sample, seed=42)
308
+
309
+ # Calculate max intensity for alpha scaling
310
+ max_inty = sample_data.select(pl.col("inty").max()).item() or 1
311
+
312
+ # Get sample information
313
+ sample_name = str(sample)
314
+ sample_uid = sample if sample_col == "sample_uid" else sample_data.select(pl.col("sample_uid")).item() if "sample_uid" in sample_data.columns else sample
315
+
316
+ # Select columns to process
317
+ cols_to_select = ["rt", "mz", "inty"]
318
+ if has_alignment:
319
+ cols_to_select.append("rt_original")
320
+
321
+ sample_dict = sample_data.select(cols_to_select).to_dicts()
322
+
323
+ for row_dict in sample_dict:
324
+ rt_original = row_dict.get("rt_original", row_dict["rt"]) if has_alignment else row_dict["rt"]
325
+ rt_current = row_dict["rt"]
326
+ mz = row_dict["mz"]
327
+ inty = row_dict["inty"]
328
+ alpha = inty / max_inty
329
+ size = markersize + 2 if sample_idx == 0 else markersize
330
330
 
331
- for rt, mz, inty in zip(ref_rt, ref_mz, ref_inty):
332
331
  before_data.append({
333
- "rt": rt,
332
+ "rt": rt_original,
334
333
  "mz": mz,
335
334
  "inty": inty,
336
- "alpha": inty / max_ref_inty,
337
- "sample_idx": 0,
338
- "sample_name": ref_sample_name,
339
- "sample_uid": ref_sample_uid,
340
- "size": markersize + 2,
335
+ "alpha": alpha,
336
+ "sample_idx": sample_idx,
337
+ "sample_name": sample_name,
338
+ "sample_uid": sample_uid,
339
+ "size": size,
341
340
  })
342
341
  after_data.append({
343
- "rt": rt,
342
+ "rt": rt_current,
344
343
  "mz": mz,
345
344
  "inty": inty,
346
- "alpha": inty / max_ref_inty,
347
- "sample_idx": 0,
348
- "sample_name": ref_sample_name,
349
- "sample_uid": ref_sample_uid,
350
- "size": markersize + 2,
345
+ "alpha": alpha,
346
+ "sample_idx": sample_idx,
347
+ "sample_name": sample_name,
348
+ "sample_uid": sample_uid,
349
+ "size": size,
351
350
  })
352
351
 
353
- # Remaining samples - now using filtered feature maps and samples_info
354
- for sample_idx, fm in enumerate(fmaps[1:], start=1):
355
- mz_vals = []
356
- inty_vals = []
357
- original_rt = []
358
- aligned_rt = []
359
-
360
- for f in fm:
361
- try:
362
- orig = f.getMetaValue("original_RT")
363
- except Exception:
364
- orig = None
365
-
366
- if orig is None:
367
- original_rt.append(f.getRT())
368
- else:
369
- original_rt.append(orig)
370
-
371
- aligned_rt.append(f.getRT())
372
- mz_vals.append(f.getMZ())
373
- inty_vals.append(f.getIntensity())
374
-
375
- if not inty_vals:
376
- continue
377
-
378
- max_inty = max(inty_vals)
379
-
380
- # Get sample metadata from filtered samples_info
381
- if hasattr(self, "samples_df") and self.samples_df is not None and not self.samples_df.is_empty():
382
- # Use filtered samples_info if it exists from the filtering above
383
- if 'samples_info' in locals() and sample_idx < len(samples_info):
384
- sample_name = samples_info.iloc[sample_idx].get("sample_name", f"Sample {sample_idx}")
385
- sample_uid = samples_info.iloc[sample_idx].get("sample_uid", f"Sample_{sample_idx}_UID")
386
- else:
387
- # Fallback to original samples_df if filtered samples_info is not available
388
- all_samples_info = self.samples_df.to_pandas()
389
- if sample_idx < len(all_samples_info):
390
- sample_name = all_samples_info.iloc[sample_idx].get("sample_name", f"Sample {sample_idx}")
391
- sample_uid = all_samples_info.iloc[sample_idx].get("sample_uid", f"Sample_{sample_idx}_UID")
392
- else:
393
- sample_name = f"Sample {sample_idx}"
394
- sample_uid = f"Sample_{sample_idx}_UID"
395
- else:
396
- sample_name = f"Sample {sample_idx}"
397
- sample_uid = f"Sample_{sample_idx}_UID"
398
-
399
- for rt, mz, inty in zip(original_rt, mz_vals, inty_vals):
400
- before_data.append({
401
- "rt": rt,
402
- "mz": mz,
403
- "inty": inty,
404
- "alpha": inty / max_inty,
405
- "sample_idx": sample_idx,
406
- "sample_name": sample_name,
407
- "sample_uid": sample_uid,
408
- "size": markersize,
409
- })
410
-
411
- for rt, mz, inty in zip(aligned_rt, mz_vals, inty_vals):
412
- after_data.append({
413
- "rt": rt,
414
- "mz": mz,
415
- "inty": inty,
416
- "alpha": inty / max_inty,
417
- "sample_idx": sample_idx,
418
- "sample_name": sample_name,
419
- "sample_uid": sample_uid,
420
- "size": markersize,
421
- })
422
-
423
- else:
424
- # Use features_df
425
- if self.features_df is None or self.features_df.is_empty():
426
- self.logger.error("No features_df found. Load features first.")
427
- return
428
-
429
- required_cols = ["rt", "mz", "inty"]
430
- missing = [c for c in required_cols if c not in self.features_df.columns]
431
- if missing:
432
- self.logger.error(f"Missing required columns in features_df: {missing}")
433
- return
434
-
435
- if "rt_original" not in self.features_df.columns:
436
- self.logger.error("Column 'rt_original' not found in features_df. Alignment may not have been performed.")
437
- return
438
-
439
- # Use Polars instead of pandas
440
- features_df = self.features_df
441
-
442
- # Filter by selected samples if specified
443
- if sample_uids is not None:
444
- features_df = features_df.filter(pl.col("sample_uid").is_in(sample_uids))
445
- if features_df.is_empty():
446
- self.logger.error("No features found for the selected samples.")
447
- return
448
-
449
- sample_col = "sample_uid" if "sample_uid" in features_df.columns else "sample_name"
450
- if sample_col not in features_df.columns:
451
- self.logger.error("No sample identifier column found in features_df.")
452
- return
453
-
454
- # Get unique samples using Polars
455
- samples = features_df.select(pl.col(sample_col)).unique().to_series().to_list()
456
-
457
- for sample_idx, sample in enumerate(samples):
458
- # Filter sample data using Polars
459
- sample_data = features_df.filter(pl.col(sample_col) == sample)
460
-
461
- # Calculate max intensity using Polars
462
- max_inty = sample_data.select(pl.col("inty").max()).item()
463
- max_inty = max_inty if max_inty and max_inty > 0 else 1
352
+ # Check if we have any data to plot
353
+ if not before_data:
354
+ self.logger.error("No data to plot.")
355
+ return
464
356
 
465
- sample_name = str(sample)
466
- # Get sample_uid - if sample_col is 'sample_uid', use sample directly
467
- if sample_col == "sample_uid":
468
- sample_uid = sample
469
- else:
470
- # Try to get sample_uid from the first row if it exists
471
- if "sample_uid" in sample_data.columns:
472
- sample_uid = sample_data.select(pl.col("sample_uid")).item()
473
- else:
474
- sample_uid = sample
475
-
476
- # Convert to dict for iteration - more efficient than row-by-row processing
477
- sample_dict = sample_data.select(["rt_original", "rt", "mz", "inty"]).to_dicts()
478
-
479
- for row_dict in sample_dict:
480
- rt_original = row_dict["rt_original"]
481
- rt_current = row_dict["rt"]
482
- mz = row_dict["mz"]
483
- inty = row_dict["inty"]
484
- alpha = inty / max_inty
485
- size = markersize + 2 if sample_idx == 0 else markersize
486
-
487
- before_data.append({
488
- "rt": rt_original,
489
- "mz": mz,
490
- "inty": inty,
491
- "alpha": alpha,
492
- "sample_idx": sample_idx,
493
- "sample_name": sample_name,
494
- "sample_uid": sample_uid,
495
- "size": size,
496
- })
497
- after_data.append({
498
- "rt": rt_current,
499
- "mz": mz,
500
- "inty": inty,
501
- "alpha": alpha,
502
- "sample_idx": sample_idx,
503
- "sample_name": sample_name,
504
- "sample_uid": sample_uid,
505
- "size": size,
506
- })
507
-
508
- # Get sample colors from samples_df using sample indices
509
- # Extract unique sample information from the dictionaries we created
510
- if before_data:
511
- # Create mapping from sample_idx to sample_uid more efficiently
512
- sample_idx_to_uid = {}
513
- for item in before_data:
514
- if item["sample_idx"] not in sample_idx_to_uid:
515
- sample_idx_to_uid[item["sample_idx"]] = item["sample_uid"]
516
- else:
517
- sample_idx_to_uid = {}
357
+ # Get sample colors from samples_df
358
+ sample_idx_to_uid = {}
359
+ for item in before_data:
360
+ if item["sample_idx"] not in sample_idx_to_uid:
361
+ sample_idx_to_uid[item["sample_idx"]] = item["sample_uid"]
518
362
 
519
- # Get colors from samples_df
363
+ # Get colors from samples_df if available
520
364
  sample_uids_list = list(sample_idx_to_uid.values())
365
+ color_map: dict[int, str] = {}
366
+
521
367
  if sample_uids_list and hasattr(self, "samples_df") and self.samples_df is not None:
522
- sample_colors = (
523
- self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids_list))
524
- .select(["sample_uid", "sample_color"])
525
- .to_dict(as_series=False)
526
- )
527
- uid_to_color = dict(zip(sample_colors["sample_uid"], sample_colors["sample_color"]))
368
+ try:
369
+ sample_colors = (
370
+ self.samples_df.filter(pl.col("sample_uid").is_in(sample_uids_list))
371
+ .select(["sample_uid", "sample_color"])
372
+ .to_dict(as_series=False)
373
+ )
374
+ uid_to_color = dict(zip(sample_colors["sample_uid"], sample_colors["sample_color"]))
375
+
376
+ for sample_idx, sample_uid in sample_idx_to_uid.items():
377
+ color_map[sample_idx] = uid_to_color.get(sample_uid, "#1f77b4")
378
+ except Exception:
379
+ # Fallback to default colors if sample colors not available
380
+ for sample_idx in sample_idx_to_uid.keys():
381
+ color_map[sample_idx] = "#1f77b4"
528
382
  else:
529
- uid_to_color = {}
530
-
531
- # Create color map for sample indices
532
- color_map: dict[int, str] = {}
533
- for sample_idx, sample_uid in sample_idx_to_uid.items():
534
- color_map[sample_idx] = uid_to_color.get(sample_uid, "#1f77b4") # fallback to blue
535
-
536
- # Add sample_color to data dictionaries before creating DataFrames
537
- if before_data:
538
- for item in before_data:
539
- item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
383
+ # Default colors
384
+ for sample_idx in sample_idx_to_uid.keys():
385
+ color_map[sample_idx] = "#1f77b4"
540
386
 
541
- if after_data:
542
- for item in after_data:
543
- item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
387
+ # Add sample_color to data
388
+ for item in before_data + after_data:
389
+ item["sample_color"] = color_map.get(item["sample_idx"], "#1f77b4")
544
390
 
545
- # Now create DataFrames with the sample_color already included
546
- before_df = pd.DataFrame(before_data) if before_data else pd.DataFrame()
547
- after_df = pd.DataFrame(after_data) if after_data else pd.DataFrame()
391
+ # Create DataFrames
392
+ before_df = pd.DataFrame(before_data)
393
+ after_df = pd.DataFrame(after_data)
548
394
 
549
395
  # Create Bokeh figures
396
+ title_before = "Original RT" if has_alignment else "Current RT (No Alignment)"
397
+ title_after = "Aligned RT" if has_alignment else "Current RT (Copy)"
398
+
550
399
  p1 = figure(
551
400
  width=width,
552
401
  height=height,
553
- title="Original RT",
402
+ title=title_before,
554
403
  x_axis_label="Retention Time (s)",
555
404
  y_axis_label="m/z",
556
405
  tools="pan,wheel_zoom,box_zoom,reset,save",
@@ -563,7 +412,7 @@ def plot_alignment(
563
412
  p2 = figure(
564
413
  width=width,
565
414
  height=height,
566
- title="Current RT",
415
+ title=title_after,
567
416
  x_axis_label="Retention Time (s)",
568
417
  y_axis_label="m/z",
569
418
  tools="pan,wheel_zoom,box_zoom,reset,save",
@@ -575,16 +424,15 @@ def plot_alignment(
575
424
  p2.border_fill_color = "white"
576
425
  p2.min_border = 0
577
426
 
578
- # Get unique sample indices for iteration
579
- unique_samples = sorted(list({item["sample_idx"] for item in before_data})) if before_data else []
580
-
427
+ # Plot data by sample
428
+ unique_samples = sorted(list({item["sample_idx"] for item in before_data}))
581
429
  renderers_before = []
582
430
  renderers_after = []
583
431
 
584
432
  for sample_idx in unique_samples:
585
433
  sb = before_df[before_df["sample_idx"] == sample_idx]
586
434
  sa = after_df[after_df["sample_idx"] == sample_idx]
587
- color = color_map.get(sample_idx, "#000000")
435
+ color = color_map.get(sample_idx, "#1f77b4")
588
436
 
589
437
  if not sb.empty:
590
438
  src = ColumnDataSource(sb)
@@ -623,8 +471,7 @@ def plot_alignment(
623
471
  )
624
472
  p2.add_tools(hover2)
625
473
 
626
- # Create layout with both plots side by side
627
- # Use the aliased bokeh_row and set sizing_mode, width and height to avoid validation warnings.
474
+ # Create layout
628
475
  layout = bokeh_row(p1, p2, sizing_mode="fixed", width=width, height=height)
629
476
 
630
477
  # Apply consistent save/display behavior
@@ -878,7 +725,7 @@ def plot_consensus_2d(
878
725
 
879
726
  def plot_samples_2d(
880
727
  self,
881
- samples=None,
728
+ samples=100,
882
729
  filename=None,
883
730
  markersize=2,
884
731
  size="dynamic",
@@ -1112,7 +959,7 @@ def plot_samples_2d(
1112
959
 
1113
960
  def plot_bpc(
1114
961
  self,
1115
- samples=None,
962
+ samples=100,
1116
963
  title: str | None = None,
1117
964
  filename: str | None = None,
1118
965
  width: int = 1000,
@@ -1288,7 +1135,7 @@ def plot_eic(
1288
1135
  self,
1289
1136
  mz,
1290
1137
  mz_tol=None,
1291
- samples=None,
1138
+ samples=100,
1292
1139
  title: str | None = None,
1293
1140
  filename: str | None = None,
1294
1141
  width: int = 1000,
@@ -1457,7 +1304,7 @@ def plot_eic(
1457
1304
 
1458
1305
  def plot_rt_correction(
1459
1306
  self,
1460
- samples=None,
1307
+ samples=200,
1461
1308
  title: str | None = None,
1462
1309
  filename: str | None = None,
1463
1310
  width: int = 1000,
@@ -1611,7 +1458,7 @@ def plot_rt_correction(
1611
1458
  def plot_chrom(
1612
1459
  self,
1613
1460
  uids=None,
1614
- samples=None,
1461
+ samples=100,
1615
1462
  filename=None,
1616
1463
  aligned=True,
1617
1464
  width=800,
@@ -2309,7 +2156,7 @@ def plot_pca(
2309
2156
 
2310
2157
  def plot_tic(
2311
2158
  self,
2312
- samples=None,
2159
+ samples=100,
2313
2160
  title: str | None = None,
2314
2161
  filename: str | None = None,
2315
2162
  width: int = 1000,