masster-0.3.10-py3-none-any.whl → masster-0.3.11-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

This version of masster has been flagged as a potentially problematic release.

masster/study/plot.py CHANGED
@@ -7,14 +7,6 @@ import holoviews as hv
  import numpy as np
  import panel
  import polars as pl
-
- from bokeh.io.export import export_png
- from bokeh.models import ColumnDataSource
- from bokeh.models import HoverTool
- from bokeh.palettes import Turbo256
- from bokeh.plotting import figure
- from bokeh.plotting import output_file
- from bokeh.plotting import show
  from tqdm import tqdm

  hv.extension("bokeh")
@@ -163,11 +155,11 @@ def plot_consensus_2d(
  width=900,
  height=900,
  mz_range=None,
- rt_range=None
+ rt_range=None,
  ):
  """
  Plot consensus features in a 2D scatter plot with retention time vs m/z.
-
+
  Parameters:
  filename (str, optional): Path to save the plot
  colorby (str): Column name to use for color mapping (default: "number_samples")
@@ -187,13 +179,13 @@ def plot_consensus_2d(
  self.logger.error("No consensus map found.")
  return
  data = self.consensus_df.clone()
-
+
  # Filter by mz_range and rt_range if provided
  if mz_range is not None:
  data = data.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
  if rt_range is not None:
  data = data.filter((pl.col("rt") >= rt_range[0]) & (pl.col("rt") <= rt_range[1]))
-
+
  if colorby not in data.columns:
  self.logger.error(f"Column {colorby} not found in consensus_df.")
  return
@@ -342,13 +334,13 @@ def plot_samples_2d(
  width=900,
  height=900,
  mz_range=None,
- rt_range=None
+ rt_range=None,
  ):
  """
  Plot all feature maps for sample_uid in parameter uids in an overlaid scatter plot.
  Each sample is a different color. Alpha scales with intensity.
  OPTIMIZED VERSION: Uses vectorized operations and batch processing.
-
+
  Parameters:
  samples: Sample UIDs to plot
  filename (str, optional): Path to save the plot
@@ -366,6 +358,12 @@ def plot_samples_2d(
  rt_range (tuple, optional): Retention time range for filtering features (min_rt, max_rt)
  """

+ # Local bokeh imports to avoid heavy top-level dependency
+ from bokeh.plotting import figure, show, output_file
+ from bokeh.io.export import export_png
+ from bokeh.models import ColumnDataSource, HoverTool
+ from bokeh.palettes import Turbo256
+
  sample_uids = self._get_sample_uids(samples)

  if not sample_uids:
@@ -385,7 +383,7 @@ def plot_samples_2d(

  # OPTIMIZATION 1: Batch filter all features for selected samples at once
  features_batch = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
-
+
  # Filter by mz_range and rt_range if provided
  if mz_range is not None:
  features_batch = features_batch.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
@@ -560,6 +558,9 @@ def plot_chrom(
  self.logger.error("No chromatogram data found.")
  return

+ # Local import for color palette
+ from bokeh.palettes import Turbo256
+
  # Assign a fixed color to each sample/column
  sample_names = [col for col in chroms.columns if col not in ["consensus_uid"]]
  if not sample_names:
@@ -569,12 +570,12 @@ def plot_chrom(

  plots = []
  self.logger.info(f"Plotting {chroms.shape[0]} chromatograms...")
- tdqm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
  for row in tqdm(
  chroms.iter_rows(named=True),
  total=chroms.shape[0],
  desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Plot chromatograms",
- disable=tdqm_disable,
+ disable=tqdm_disable,
  ):
  consensus_uid = row["consensus_uid"] # Get consensus_uid from the row
  consensus_id = consensus_uid # Use the same value for consensus_id
@@ -698,3 +699,438 @@ def plot_chrom(
  # In a server context, return the panel object instead of showing or saving directly
  # return panel.panel(layout)
  panel.panel(layout).show()
+
+
+ def plot_consensus_stats(
+ self,
+ filename=None,
+ width=1200,
+ height=1200,
+ alpha=0.6,
+ markersize=3,
+ ):
+ """
+ Plot a scatter plot matrix (SPLOM) of consensus statistics using Bokeh.
+
+ Parameters:
+ filename (str, optional): Output filename for saving the plot
+ width (int): Overall width of the plot (default: 1200)
+ height (int): Overall height of the plot (default: 1200)
+ alpha (float): Point transparency (default: 0.6)
+ markersize (int): Size of points (default: 5)
+ """
+ from bokeh.layouts import gridplot
+ from bokeh.models import ColumnDataSource, HoverTool
+ from bokeh.plotting import figure, show, output_file
+
+ # Check if consensus_df exists and has data
+ if self.consensus_df is None or self.consensus_df.is_empty():
+ self.logger.error("No consensus data available. Run merge/find_consensus first.")
+ return
+
+ # Define the columns to plot
+ columns = [
+ "rt",
+ "mz",
+ "number_samples",
+ "log10_quality",
+ "mz_delta_mean",
+ "rt_delta_mean",
+ "chrom_coherence_mean",
+ "chrom_prominence_scaled_mean",
+ "inty_mean",
+ "number_ms2",
+ ]
+
+ # Check which columns exist in the dataframe and compute missing ones
+ available_columns = self.consensus_df.columns
+ data_df = self.consensus_df.clone()
+
+ # Add log10_quality if quality exists
+ if "quality" in available_columns and "log10_quality" not in available_columns:
+ data_df = data_df.with_columns(
+ pl.col("quality").log10().alias("log10_quality"),
+ )
+
+ # Filter columns that actually exist
+ final_columns = [col for col in columns if col in data_df.columns]
+
+ if len(final_columns) < 2:
+ self.logger.error(f"Need at least 2 columns for SPLOM. Available: {final_columns}")
+ return
+
+ self.logger.debug(f"Creating SPLOM with columns: {final_columns}")
+
+ # Add important ID columns for tooltips even if not plotting them
+ tooltip_columns = []
+ for id_col in ["consensus_uid", "consensus_id"]:
+ if id_col in data_df.columns and id_col not in final_columns:
+ tooltip_columns.append(id_col)
+
+ # Select plotting columns plus tooltip columns
+ all_columns = final_columns + tooltip_columns
+ data_pd = data_df.select(all_columns).to_pandas()
+
+ # Remove any infinite or NaN values
+ data_pd = data_pd.replace([np.inf, -np.inf], np.nan).dropna()
+
+ if data_pd.empty:
+ self.logger.error("No valid data after removing NaN/infinite values.")
+ return
+
+ source = ColumnDataSource(data_pd)
+
+ n_vars = len(final_columns)
+
+ # Fixed dimensions - override user input to ensure consistent layout
+ total_width = 1200
+ total_height = 1200
+
+ # Calculate plot sizes to ensure uniform inner plot areas
+ # First column needs extra width for y-axis labels
+ plot_width_first = 180 # Wider to account for y-axis labels
+ plot_width_others = 120 # Standard width for other columns
+ plot_height_normal = 120 # Standard height
+ plot_height_last = 155 # Taller last row to accommodate x-axis labels while keeping inner plot area same size
+
+ # Create grid of plots with variable outer sizes but equal inner areas
+ plots = []
+
+ for i, y_var in enumerate(final_columns):
+ row = []
+ for j, x_var in enumerate(final_columns):
+ # Determine if this plot needs axis labels
+ has_x_label = i == n_vars - 1 # bottom row
+ has_y_label = j == 0 # left column
+
+ # First column wider to accommodate y-axis labels, ensuring equal inner plot areas
+ current_width = plot_width_first if has_y_label else plot_width_others
+ current_height = plot_height_last if has_x_label else plot_height_normal
+
+ p = figure(
+ width=current_width,
+ height=current_height,
+ title=None, # No title on any plot
+ toolbar_location=None,
+ # Adjusted borders - first column has more space, others minimal
+ min_border_left=70 if has_y_label else 15,
+ min_border_bottom=50 if has_x_label else 15,
+ min_border_right=15,
+ min_border_top=15,
+ )
+
+ # Ensure subplot background and border are explicitly white so the plot looks
+ # correct in dark and light themes.
+ p.outline_line_color = None
+ p.border_fill_color = "white"
+ p.border_fill_alpha = 1.0
+ p.background_fill_color = "white"
+
+ # Remove axis lines to eliminate black lines between plots
+ p.xaxis.axis_line_color = None
+ p.yaxis.axis_line_color = None
+
+ # Keep subtle grid lines for data reference
+ p.grid.visible = True
+ p.grid.grid_line_color = "#E0E0E0" # Light gray grid lines
+
+ # Set axis labels and formatting
+ if has_x_label: # bottom row
+ p.xaxis.axis_label = x_var
+ p.xaxis.axis_label_text_font_size = "12pt"
+ p.xaxis.major_label_text_font_size = "9pt"
+ p.xaxis.axis_label_standoff = 15
+ else:
+ p.xaxis.major_label_text_font_size = "0pt"
+ p.xaxis.minor_tick_line_color = None
+ p.xaxis.major_tick_line_color = None
+
+ if has_y_label: # left column
+ p.yaxis.axis_label = y_var
+ p.yaxis.axis_label_text_font_size = "10pt" # Smaller y-axis title
+ p.yaxis.major_label_text_font_size = "8pt"
+ p.yaxis.axis_label_standoff = 12
+ else:
+ p.yaxis.major_label_text_font_size = "0pt"
+ p.yaxis.minor_tick_line_color = None
+ p.yaxis.major_tick_line_color = None
+
+ if i == j:
+ # Diagonal: histogram
+ hist, edges = np.histogram(data_pd[x_var], bins=30)
+ p.quad(
+ top=hist,
+ bottom=0,
+ left=edges[:-1],
+ right=edges[1:],
+ fill_color="green",
+ line_color="white",
+ alpha=alpha,
+ )
+ else:
+ # Off-diagonal: scatter plot
+ scatter = p.scatter(
+ x=x_var,
+ y=y_var,
+ size=markersize,
+ alpha=alpha,
+ color="blue",
+ source=source,
+ )
+
+ # Add hover tool
+ hover = HoverTool(
+ tooltips=[
+ (x_var, f"@{x_var}{{0.0000}}"),
+ (y_var, f"@{y_var}{{0.0000}}"),
+ (
+ "consensus_uid",
+ "@consensus_uid"
+ if "consensus_uid" in data_pd.columns
+ else "@consensus_id"
+ if "consensus_id" in data_pd.columns
+ else "N/A",
+ ),
+ ("rt", "@rt{0.00}" if "rt" in data_pd.columns else "N/A"),
+ ("mz", "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"),
+ ],
+ renderers=[scatter],
+ )
+ p.add_tools(hover)
+
+ row.append(p)
+ plots.append(row)
+
+ # Link axes for same variables
+ for i in range(n_vars):
+ for j in range(n_vars):
+ if i != j: # Don't link diagonal plots
+ # Link x-axis to other plots in same column
+ for k in range(n_vars):
+ if k != i and k != j:
+ plots[i][j].x_range = plots[k][j].x_range
+
+ # Link y-axis to other plots in same row
+ for k in range(n_vars):
+ if k != j and k != i:
+ plots[i][j].y_range = plots[i][k].y_range
+
+ # Create grid layout and force overall background/border to white so the outer
+ # container doesn't show dark UI colors in night mode.
+ grid = gridplot(plots)
+
+ # Set overall background and border to white when supported
+ if hasattr(grid, "background_fill_color"):
+ grid.background_fill_color = "white"
+ if hasattr(grid, "border_fill_color"):
+ grid.border_fill_color = "white"
+
+ # Output and show
+ if filename:
+ output_file(filename)
+
+ show(grid)
+ return grid
+
+
+ def plot_pca(
+ self,
+ filename=None,
+ width=600,
+ height=600,
+ alpha=0.8,
+ markersize=8,
+ n_components=2,
+ color_by=None,
+ title="PCA of Consensus Matrix",
+ ):
+ """
+ Plot PCA (Principal Component Analysis) of the consensus matrix using Bokeh.
+
+ Parameters:
+ filename (str, optional): Output filename for saving the plot
+ width (int): Plot width (default: 800)
+ height (int): Plot height (default: 600)
+ alpha (float): Point transparency (default: 0.8)
+ markersize (int): Size of points (default: 8)
+ n_components (int): Number of PCA components to compute (default: 2)
+ color_by (str, optional): Column from samples_df to color points by
+ title (str): Plot title (default: "PCA of Consensus Matrix")
+ """
+ from bokeh.models import ColumnDataSource, HoverTool, ColorBar, LinearColorMapper
+ from bokeh.plotting import figure, show, output_file
+ from bokeh.palettes import Category20, viridis
+ from bokeh.transform import factor_cmap
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+ import pandas as pd
+ import numpy as np
+
+ # Check if consensus matrix and samples_df exist
+ try:
+ consensus_matrix = self.get_consensus_matrix()
+ samples_df = self.samples_df
+ except Exception as e:
+ self.logger.error(f"Error getting consensus matrix or samples_df: {e}")
+ return
+
+ if consensus_matrix is None or consensus_matrix.shape[0] == 0:
+ self.logger.error("No consensus matrix available. Run merge/find_consensus first.")
+ return
+
+ if samples_df is None or samples_df.is_empty():
+ self.logger.error("No samples dataframe available.")
+ return
+
+ self.logger.info(f"Performing PCA on consensus matrix with shape: {consensus_matrix.shape}")
+
+ # Convert consensus matrix to numpy if it's not already
+ if hasattr(consensus_matrix, "values"):
+ matrix_data = consensus_matrix.values
+ elif hasattr(consensus_matrix, "to_numpy"):
+ matrix_data = consensus_matrix.to_numpy()
+ else:
+ matrix_data = np.array(consensus_matrix)
+
+ # Transpose matrix so samples are rows and features are columns
+ matrix_data = matrix_data.T
+
+ # Handle missing values by replacing with 0
+ matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)
+
+ # Standardize the data
+ scaler = StandardScaler()
+ matrix_scaled = scaler.fit_transform(matrix_data)
+
+ # Perform PCA
+ pca = PCA(n_components=n_components)
+ pca_result = pca.fit_transform(matrix_scaled)
+
+ # Get explained variance ratios
+ explained_var = pca.explained_variance_ratio_
+
+ self.logger.info(f"PCA explained variance ratios: {explained_var}")
+
+ # Convert samples_df to pandas for easier manipulation
+ samples_pd = samples_df.to_pandas()
+
+ # Create dataframe with PCA results and sample information
+ pca_df = pd.DataFrame({
+ "PC1": pca_result[:, 0],
+ "PC2": pca_result[:, 1] if n_components > 1 else np.zeros(len(pca_result)),
+ })
+
+ # Add sample information to PCA dataframe
+ if len(samples_pd) == len(pca_df):
+ for col in samples_pd.columns:
+ pca_df[col] = samples_pd[col].values
+ else:
+ self.logger.warning(
+ f"Sample count mismatch: samples_df has {len(samples_pd)} rows, "
+ f"but consensus matrix has {len(pca_df)} samples"
+ )
+
+ # Prepare color mapping
+ color_column = None
+ color_mapper = None
+
+ if color_by and color_by in pca_df.columns:
+ color_column = color_by
+ unique_values = pca_df[color_by].unique()
+
+ # Handle categorical vs numeric coloring
+ if pca_df[color_by].dtype in ["object", "string", "category"]:
+ # Categorical coloring
+ if len(unique_values) <= 20:
+ palette = Category20[min(20, max(3, len(unique_values)))]
+ else:
+ palette = viridis(min(256, len(unique_values)))
+ color_mapper = factor_cmap(color_by, palette, unique_values)
+ else:
+ # Numeric coloring
+ palette = viridis(256)
+ color_mapper = LinearColorMapper(
+ palette=palette,
+ low=pca_df[color_by].min(),
+ high=pca_df[color_by].max(),
+ )
+
+ # Create Bokeh plot
+ p = figure(
+ width=width,
+ height=height,
+ title=f"{title} (PC1: {explained_var[0]:.1%}, PC2: {explained_var[1]:.1%})",
+ tools="pan,wheel_zoom,box_zoom,reset,save",
+ )
+
+ p.xaxis.axis_label = f"PC1 ({explained_var[0]:.1%} variance)"
+ p.yaxis.axis_label = f"PC2 ({explained_var[1]:.1%} variance)"
+
+ # Create data source
+ source = ColumnDataSource(pca_df)
+
+ # Create scatter plot
+ if color_mapper:
+ if isinstance(color_mapper, LinearColorMapper):
+ scatter = p.scatter(
+ "PC1",
+ "PC2",
+ size=markersize,
+ alpha=alpha,
+ color={"field": color_by, "transform": color_mapper},
+ source=source,
+ )
+ # Add colorbar for numeric coloring
+ color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
+ p.add_layout(color_bar, "right")
+ else:
+ scatter = p.scatter(
+ "PC1",
+ "PC2",
+ size=markersize,
+ alpha=alpha,
+ color=color_mapper,
+ source=source,
+ legend_field=color_by,
+ )
+ else:
+ scatter = p.scatter(
+ "PC1",
+ "PC2",
+ size=markersize,
+ alpha=alpha,
+ color="blue",
+ source=source,
+ )
+
+ # Create comprehensive hover tooltips with all sample information
+ tooltip_list = [
+ ("PC1", "@PC1{0.00}"),
+ ("PC2", "@PC2{0.00}"),
+ ]
+
+ # Add all sample dataframe columns to tooltips
+ for col in samples_pd.columns:
+ if col in pca_df.columns:
+ if pca_df[col].dtype in ["float64", "float32"]:
+ tooltip_list.append((col, f"@{col}{{0.00}}"))
+ else:
+ tooltip_list.append((col, f"@{col}"))
+
+ hover = HoverTool(
+ tooltips=tooltip_list,
+ renderers=[scatter],
+ )
+ p.add_tools(hover)
+
+ # Add legend if using categorical coloring
+ if color_mapper and not isinstance(color_mapper, LinearColorMapper) and color_by:
+ p.legend.location = "top_left"
+ p.legend.click_policy = "hide"
+
+ # Output and show
+ if filename:
+ output_file(filename)
+
+ show(p)
+ return p
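
For orientation, here is a minimal usage sketch of the two plotting methods added in this release. The class name, import path, data-loading steps, and the example column name are assumptions for illustration only; the diff above confirms only the method names and their parameters.

# Usage sketch (illustrative, not from the package docs): `Study`, its import
# path, and the loading/merging calls are assumed; only plot_consensus_stats()
# and plot_pca() and their parameters come from the diff above.
from masster import Study  # assumed import path

study = Study()  # constructor arguments omitted / assumed
# ... add samples, detect features, and merge/find consensus here ...

# New in 0.3.11: Bokeh scatter-plot matrix (SPLOM) of consensus statistics
study.plot_consensus_stats(filename="consensus_splom.html", alpha=0.6, markersize=3)

# New in 0.3.11: PCA of the consensus matrix, colored by a samples_df column
# ("sample_group" is a placeholder column name)
study.plot_pca(filename="pca.html", n_components=2, color_by="sample_group")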