masster 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/docs/SCX_API_Documentation.md +0 -0
- masster/docs/SCX_DLL_Analysis.md +0 -0
- masster/logger.py +92 -78
- masster/sample/defaults/find_features_def.py +90 -94
- masster/sample/defaults/sample_def.py +15 -0
- masster/sample/h5.py +2 -2
- masster/sample/helpers.py +137 -136
- masster/sample/lib.py +11 -11
- masster/sample/load.py +13 -9
- masster/sample/plot.py +167 -60
- masster/sample/processing.py +150 -153
- masster/sample/sample.py +4 -4
- masster/sample/sample5_schema.json +62 -62
- masster/sample/save.py +16 -13
- masster/sample/sciex.py +187 -176
- masster/study/defaults/align_def.py +224 -6
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/study_def.py +2 -2
- masster/study/export.py +144 -131
- masster/study/h5.py +193 -133
- masster/study/helpers.py +293 -245
- masster/study/helpers_optimized.py +99 -57
- masster/study/load.py +51 -25
- masster/study/plot.py +453 -17
- masster/study/processing.py +197 -123
- masster/study/save.py +7 -7
- masster/study/study.py +97 -88
- masster/study/study5_schema.json +82 -82
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/METADATA +1 -1
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/RECORD +34 -32
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/WHEEL +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/entry_points.txt +0 -0
- {masster-0.3.9.dist-info → masster-0.3.11.dist-info}/licenses/LICENSE +0 -0
masster/study/plot.py
CHANGED
|
@@ -7,14 +7,6 @@ import holoviews as hv
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
import panel
|
|
9
9
|
import polars as pl
|
|
10
|
-
|
|
11
|
-
from bokeh.io.export import export_png
|
|
12
|
-
from bokeh.models import ColumnDataSource
|
|
13
|
-
from bokeh.models import HoverTool
|
|
14
|
-
from bokeh.palettes import Turbo256
|
|
15
|
-
from bokeh.plotting import figure
|
|
16
|
-
from bokeh.plotting import output_file
|
|
17
|
-
from bokeh.plotting import show
|
|
18
10
|
from tqdm import tqdm
|
|
19
11
|
|
|
20
12
|
hv.extension("bokeh")
|
|
@@ -163,11 +155,11 @@ def plot_consensus_2d(
|
|
|
163
155
|
width=900,
|
|
164
156
|
height=900,
|
|
165
157
|
mz_range=None,
|
|
166
|
-
rt_range=None
|
|
158
|
+
rt_range=None,
|
|
167
159
|
):
|
|
168
160
|
"""
|
|
169
161
|
Plot consensus features in a 2D scatter plot with retention time vs m/z.
|
|
170
|
-
|
|
162
|
+
|
|
171
163
|
Parameters:
|
|
172
164
|
filename (str, optional): Path to save the plot
|
|
173
165
|
colorby (str): Column name to use for color mapping (default: "number_samples")
|
|
@@ -187,13 +179,13 @@ def plot_consensus_2d(
|
|
|
187
179
|
self.logger.error("No consensus map found.")
|
|
188
180
|
return
|
|
189
181
|
data = self.consensus_df.clone()
|
|
190
|
-
|
|
182
|
+
|
|
191
183
|
# Filter by mz_range and rt_range if provided
|
|
192
184
|
if mz_range is not None:
|
|
193
185
|
data = data.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
|
|
194
186
|
if rt_range is not None:
|
|
195
187
|
data = data.filter((pl.col("rt") >= rt_range[0]) & (pl.col("rt") <= rt_range[1]))
|
|
196
|
-
|
|
188
|
+
|
|
197
189
|
if colorby not in data.columns:
|
|
198
190
|
self.logger.error(f"Column {colorby} not found in consensus_df.")
|
|
199
191
|
return
|
|
@@ -342,13 +334,13 @@ def plot_samples_2d(
|
|
|
342
334
|
width=900,
|
|
343
335
|
height=900,
|
|
344
336
|
mz_range=None,
|
|
345
|
-
rt_range=None
|
|
337
|
+
rt_range=None,
|
|
346
338
|
):
|
|
347
339
|
"""
|
|
348
340
|
Plot all feature maps for sample_uid in parameter uids in an overlaid scatter plot.
|
|
349
341
|
Each sample is a different color. Alpha scales with intensity.
|
|
350
342
|
OPTIMIZED VERSION: Uses vectorized operations and batch processing.
|
|
351
|
-
|
|
343
|
+
|
|
352
344
|
Parameters:
|
|
353
345
|
samples: Sample UIDs to plot
|
|
354
346
|
filename (str, optional): Path to save the plot
|
|
@@ -366,6 +358,12 @@ def plot_samples_2d(
|
|
|
366
358
|
rt_range (tuple, optional): Retention time range for filtering features (min_rt, max_rt)
|
|
367
359
|
"""
|
|
368
360
|
|
|
361
|
+
# Local bokeh imports to avoid heavy top-level dependency
|
|
362
|
+
from bokeh.plotting import figure, show, output_file
|
|
363
|
+
from bokeh.io.export import export_png
|
|
364
|
+
from bokeh.models import ColumnDataSource, HoverTool
|
|
365
|
+
from bokeh.palettes import Turbo256
|
|
366
|
+
|
|
369
367
|
sample_uids = self._get_sample_uids(samples)
|
|
370
368
|
|
|
371
369
|
if not sample_uids:
|
|
@@ -385,7 +383,7 @@ def plot_samples_2d(
|
|
|
385
383
|
|
|
386
384
|
# OPTIMIZATION 1: Batch filter all features for selected samples at once
|
|
387
385
|
features_batch = self.features_df.filter(pl.col("sample_uid").is_in(sample_uids))
|
|
388
|
-
|
|
386
|
+
|
|
389
387
|
# Filter by mz_range and rt_range if provided
|
|
390
388
|
if mz_range is not None:
|
|
391
389
|
features_batch = features_batch.filter((pl.col("mz") >= mz_range[0]) & (pl.col("mz") <= mz_range[1]))
|
|
@@ -560,6 +558,9 @@ def plot_chrom(
|
|
|
560
558
|
self.logger.error("No chromatogram data found.")
|
|
561
559
|
return
|
|
562
560
|
|
|
561
|
+
# Local import for color palette
|
|
562
|
+
from bokeh.palettes import Turbo256
|
|
563
|
+
|
|
563
564
|
# Assign a fixed color to each sample/column
|
|
564
565
|
sample_names = [col for col in chroms.columns if col not in ["consensus_uid"]]
|
|
565
566
|
if not sample_names:
|
|
@@ -569,12 +570,12 @@ def plot_chrom(
|
|
|
569
570
|
|
|
570
571
|
plots = []
|
|
571
572
|
self.logger.info(f"Plotting {chroms.shape[0]} chromatograms...")
|
|
572
|
-
|
|
573
|
+
tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
|
|
573
574
|
for row in tqdm(
|
|
574
575
|
chroms.iter_rows(named=True),
|
|
575
576
|
total=chroms.shape[0],
|
|
576
577
|
desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Plot chromatograms",
|
|
577
|
-
disable=
|
|
578
|
+
disable=tqdm_disable,
|
|
578
579
|
):
|
|
579
580
|
consensus_uid = row["consensus_uid"] # Get consensus_uid from the row
|
|
580
581
|
consensus_id = consensus_uid # Use the same value for consensus_id
|
|
@@ -698,3 +699,438 @@ def plot_chrom(
|
|
|
698
699
|
# In a server context, return the panel object instead of showing or saving directly
|
|
699
700
|
# return panel.panel(layout)
|
|
700
701
|
panel.panel(layout).show()
|
|
702
|
+
|
|
703
|
+
|
|
704
|
+
def plot_consensus_stats(
    self,
    filename=None,
    width=1200,
    height=1200,
    alpha=0.6,
    markersize=3,
):
    """
    Plot a scatter plot matrix (SPLOM) of consensus statistics using Bokeh.

    Every statistic that is present in ``self.consensus_df`` becomes one row
    and one column of the grid. Diagonal cells show a 30-bin histogram of the
    statistic, off-diagonal cells show a scatter plot of the column statistic
    (x) against the row statistic (y). ``log10_quality`` is derived from
    ``quality`` when only the latter exists. Rows containing NaN or infinite
    values in any selected column are dropped before plotting.

    Parameters:
        filename (str, optional): Output filename for saving the plot
        width (int): Accepted for API compatibility; currently ignored because
            the cell sizes are fixed to keep the inner plot areas uniform
            (default: 1200)
        height (int): Accepted for API compatibility; currently ignored for the
            same reason as ``width`` (default: 1200)
        alpha (float): Point transparency (default: 0.6)
        markersize (int): Size of points (default: 3)

    Returns:
        The Bokeh grid layout, or None when there is no usable consensus data
        (an error is logged in that case).
    """
    # Validate the data before importing bokeh so that problems with the study
    # are reported even when the plotting stack is slow or unavailable.
    if self.consensus_df is None or self.consensus_df.is_empty():
        self.logger.error("No consensus data available. Run merge/find_consensus first.")
        return

    # Statistics shown in the matrix, in display order
    columns = [
        "rt",
        "mz",
        "number_samples",
        "log10_quality",
        "mz_delta_mean",
        "rt_delta_mean",
        "chrom_coherence_mean",
        "chrom_prominence_scaled_mean",
        "inty_mean",
        "number_ms2",
    ]

    available_columns = self.consensus_df.columns
    data_df = self.consensus_df.clone()

    # Derive log10_quality if only the raw quality column exists
    if "quality" in available_columns and "log10_quality" not in available_columns:
        data_df = data_df.with_columns(
            pl.col("quality").log10().alias("log10_quality"),
        )

    # Keep only the statistics that actually exist
    final_columns = [col for col in columns if col in data_df.columns]

    if len(final_columns) < 2:
        self.logger.error(f"Need at least 2 columns for SPLOM. Available: {final_columns}")
        return

    self.logger.debug(f"Creating SPLOM with columns: {final_columns}")

    # ID columns are carried along for the hover tooltips even though they are
    # not plotted themselves
    tooltip_columns = [
        id_col
        for id_col in ["consensus_uid", "consensus_id"]
        if id_col in data_df.columns and id_col not in final_columns
    ]

    data_pd = data_df.select(final_columns + tooltip_columns).to_pandas()

    # Remove any infinite or NaN values
    data_pd = data_pd.replace([np.inf, -np.inf], np.nan).dropna()

    if data_pd.empty:
        self.logger.error("No valid data after removing NaN/infinite values.")
        return

    # Local bokeh imports to avoid a heavy top-level dependency
    from bokeh.layouts import gridplot
    from bokeh.models import ColumnDataSource, HoverTool
    from bokeh.plotting import figure, show, output_file

    source = ColumnDataSource(data_pd)
    n_vars = len(final_columns)

    # Fixed cell sizes: the first column and the last row are larger so that
    # the axis labels fit while all inner plot areas stay the same size.
    plot_width_first = 180
    plot_width_others = 120
    plot_height_normal = 120
    plot_height_last = 155

    # Tooltip entries that do not depend on the cell are computed once
    if "consensus_uid" in data_pd.columns:
        id_tooltip = "@consensus_uid"
    elif "consensus_id" in data_pd.columns:
        id_tooltip = "@consensus_id"
    else:
        id_tooltip = "N/A"
    rt_tooltip = "@rt{0.00}" if "rt" in data_pd.columns else "N/A"
    mz_tooltip = "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"

    plots = []

    for i, y_var in enumerate(final_columns):
        row = []
        for j, x_var in enumerate(final_columns):
            has_x_label = i == n_vars - 1  # bottom row
            has_y_label = j == 0  # left column

            p = figure(
                width=plot_width_first if has_y_label else plot_width_others,
                height=plot_height_last if has_x_label else plot_height_normal,
                title=None,
                toolbar_location=None,
                # Labelled cells get wider borders, all others minimal ones
                min_border_left=70 if has_y_label else 15,
                min_border_bottom=50 if has_x_label else 15,
                min_border_right=15,
                min_border_top=15,
            )

            # Explicit white background and border so the plot looks correct
            # in dark and light themes.
            p.outline_line_color = None
            p.border_fill_color = "white"
            p.border_fill_alpha = 1.0
            p.background_fill_color = "white"

            # No axis lines, otherwise black lines appear between the cells
            p.xaxis.axis_line_color = None
            p.yaxis.axis_line_color = None

            # Keep subtle grid lines for data reference
            p.grid.visible = True
            p.grid.grid_line_color = "#E0E0E0"

            if has_x_label:
                p.xaxis.axis_label = x_var
                p.xaxis.axis_label_text_font_size = "12pt"
                p.xaxis.major_label_text_font_size = "9pt"
                p.xaxis.axis_label_standoff = 15
            else:
                p.xaxis.major_label_text_font_size = "0pt"
                p.xaxis.minor_tick_line_color = None
                p.xaxis.major_tick_line_color = None

            if has_y_label:
                p.yaxis.axis_label = y_var
                p.yaxis.axis_label_text_font_size = "10pt"
                p.yaxis.major_label_text_font_size = "8pt"
                p.yaxis.axis_label_standoff = 12
            else:
                p.yaxis.major_label_text_font_size = "0pt"
                p.yaxis.minor_tick_line_color = None
                p.yaxis.major_tick_line_color = None

            if i == j:
                # Diagonal: histogram
                hist, edges = np.histogram(data_pd[x_var], bins=30)
                p.quad(
                    top=hist,
                    bottom=0,
                    left=edges[:-1],
                    right=edges[1:],
                    fill_color="green",
                    line_color="white",
                    alpha=alpha,
                )
            else:
                # Off-diagonal: scatter plot
                scatter = p.scatter(
                    x=x_var,
                    y=y_var,
                    size=markersize,
                    alpha=alpha,
                    color="blue",
                    source=source,
                )

                hover = HoverTool(
                    tooltips=[
                        (x_var, f"@{x_var}{{0.0000}}"),
                        (y_var, f"@{y_var}{{0.0000}}"),
                        ("consensus_uid", id_tooltip),
                        ("rt", rt_tooltip),
                        ("mz", mz_tooltip),
                    ],
                    renderers=[scatter],
                )
                p.add_tools(hover)

            row.append(p)
        plots.append(row)

    # Link axes of the same variable: all scatter plots in a column share one
    # x range and all scatter plots in a row share one y range. The diagonal
    # histograms are left out because their y axis shows counts.
    for idx in range(n_vars):
        off_diagonal = [k for k in range(n_vars) if k != idx]
        if len(off_diagonal) < 2:
            continue  # a single scatter plot has nothing to be linked to
        anchor = off_diagonal[-1]
        shared_x = plots[anchor][idx].x_range
        shared_y = plots[idx][anchor].y_range
        for k in off_diagonal:
            plots[k][idx].x_range = shared_x
            plots[idx][k].y_range = shared_y

    grid = gridplot(plots)

    # Force the outer container to white when the layout supports it, so it
    # does not show dark UI colors in night mode.
    if hasattr(grid, "background_fill_color"):
        grid.background_fill_color = "white"
    if hasattr(grid, "border_fill_color"):
        grid.border_fill_color = "white"

    # Output and show
    if filename:
        output_file(filename)

    show(grid)
    return grid
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
def plot_pca(
    self,
    filename=None,
    width=600,
    height=600,
    alpha=0.8,
    markersize=8,
    n_components=2,
    color_by=None,
    title="PCA of Consensus Matrix",
):
    """
    Plot PCA (Principal Component Analysis) of the consensus matrix using Bokeh.

    The consensus matrix is transposed so that each sample is one observation,
    missing/infinite values are replaced by 0, the features are standardized
    and the first two principal components are shown as a scatter plot. When
    only one component is computed, PC2 is plotted as zeros.

    Parameters:
        filename (str, optional): Output filename for saving the plot
        width (int): Plot width (default: 600)
        height (int): Plot height (default: 600)
        alpha (float): Point transparency (default: 0.8)
        markersize (int): Size of points (default: 8)
        n_components (int): Number of PCA components to compute (default: 2)
        color_by (str, optional): Column from samples_df to color points by
        title (str): Plot title (default: "PCA of Consensus Matrix")

    Returns:
        The Bokeh figure, or None when the required data is missing (an error
        is logged in that case).
    """
    # Validate the inputs first; the heavy plotting/ML imports are deferred
    # until we know there is something to plot.
    try:
        consensus_matrix = self.get_consensus_matrix()
        samples_df = self.samples_df
    except Exception as e:
        self.logger.error(f"Error getting consensus matrix or samples_df: {e}")
        return

    if consensus_matrix is None or consensus_matrix.shape[0] == 0:
        self.logger.error("No consensus matrix available. Run merge/find_consensus first.")
        return

    if samples_df is None or samples_df.is_empty():
        self.logger.error("No samples dataframe available.")
        return

    from bokeh.models import ColumnDataSource, HoverTool, ColorBar, LinearColorMapper
    from bokeh.plotting import figure, show, output_file
    from bokeh.palettes import Category20, viridis
    from bokeh.transform import factor_cmap
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler
    import pandas as pd
    import numpy as np

    self.logger.info(f"Performing PCA on consensus matrix with shape: {consensus_matrix.shape}")

    # Convert consensus matrix to numpy if it's not already
    if hasattr(consensus_matrix, "values"):
        matrix_data = consensus_matrix.values
    elif hasattr(consensus_matrix, "to_numpy"):
        matrix_data = consensus_matrix.to_numpy()
    else:
        matrix_data = np.array(consensus_matrix)

    # Transpose matrix so samples are rows and features are columns
    matrix_data = matrix_data.T

    # Handle missing values by replacing with 0
    matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)

    # Standardize the data
    matrix_scaled = StandardScaler().fit_transform(matrix_data)

    # Perform PCA
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(matrix_scaled)
    explained_var = pca.explained_variance_ratio_

    self.logger.info(f"PCA explained variance ratios: {explained_var}")

    # Decide on the number of *computed* components rather than on the
    # requested n_components: the request may be 1, a variance fraction or
    # 'mle', and indexing a component that does not exist raises IndexError.
    n_computed = pca_result.shape[1]
    if n_computed == 0:
        self.logger.error("PCA returned no components; nothing to plot.")
        return
    has_pc2 = n_computed > 1

    # Convert samples_df to pandas for easier manipulation
    samples_pd = samples_df.to_pandas()

    # Create dataframe with PCA results; PC2 is padded with zeros when absent
    pca_df = pd.DataFrame({
        "PC1": pca_result[:, 0],
        "PC2": pca_result[:, 1] if has_pc2 else np.zeros(len(pca_result)),
    })

    # Add sample information to PCA dataframe (rows are matched by position)
    if len(samples_pd) == len(pca_df):
        for col in samples_pd.columns:
            pca_df[col] = samples_pd[col].values
    else:
        self.logger.warning(
            f"Sample count mismatch: samples_df has {len(samples_pd)} rows, "
            f"but consensus matrix has {len(pca_df)} samples"
        )

    # Prepare color mapping
    color_mapper = None

    if color_by and color_by in pca_df.columns:
        unique_values = pca_df[color_by].unique()

        # Handle categorical vs numeric coloring
        if pca_df[color_by].dtype in ["object", "string", "category"]:
            # Categorical coloring
            if len(unique_values) <= 20:
                # Category20 only defines palettes for 3..20 colors
                palette = Category20[min(20, max(3, len(unique_values)))]
            else:
                palette = viridis(min(256, len(unique_values)))
            color_mapper = factor_cmap(color_by, palette, list(unique_values))
        else:
            # Numeric coloring
            color_mapper = LinearColorMapper(
                palette=viridis(256),
                low=pca_df[color_by].min(),
                high=pca_df[color_by].max(),
            )

    # Titles and labels only mention PC2's variance when PC2 was computed
    if has_pc2:
        plot_title = f"{title} (PC1: {explained_var[0]:.1%}, PC2: {explained_var[1]:.1%})"
        y_label = f"PC2 ({explained_var[1]:.1%} variance)"
    else:
        plot_title = f"{title} (PC1: {explained_var[0]:.1%})"
        y_label = "PC2 (not computed)"

    # Create Bokeh plot
    p = figure(
        width=width,
        height=height,
        title=plot_title,
        tools="pan,wheel_zoom,box_zoom,reset,save",
    )

    p.xaxis.axis_label = f"PC1 ({explained_var[0]:.1%} variance)"
    p.yaxis.axis_label = y_label

    # Create data source
    source = ColumnDataSource(pca_df)

    # Create scatter plot
    is_numeric_mapper = isinstance(color_mapper, LinearColorMapper)
    if color_mapper is None:
        scatter = p.scatter(
            "PC1",
            "PC2",
            size=markersize,
            alpha=alpha,
            color="blue",
            source=source,
        )
    elif is_numeric_mapper:
        scatter = p.scatter(
            "PC1",
            "PC2",
            size=markersize,
            alpha=alpha,
            color={"field": color_by, "transform": color_mapper},
            source=source,
        )
        # Add colorbar for numeric coloring
        color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
        p.add_layout(color_bar, "right")
    else:
        scatter = p.scatter(
            "PC1",
            "PC2",
            size=markersize,
            alpha=alpha,
            color=color_mapper,
            source=source,
            legend_field=color_by,
        )

    # Create comprehensive hover tooltips with all sample information
    tooltip_list = [
        ("PC1", "@PC1{0.00}"),
        ("PC2", "@PC2{0.00}"),
    ]

    # Add all sample dataframe columns to tooltips. The field name is wrapped
    # in braces so that column names with spaces or punctuation still resolve.
    for col in samples_pd.columns:
        if col in pca_df.columns:
            if pca_df[col].dtype in ["float64", "float32"]:
                tooltip_list.append((col, f"@{{{col}}}{{0.00}}"))
            else:
                tooltip_list.append((col, f"@{{{col}}}"))

    hover = HoverTool(
        tooltips=tooltip_list,
        renderers=[scatter],
    )
    p.add_tools(hover)

    # Configure the legend if using categorical coloring
    if color_mapper is not None and not is_numeric_mapper:
        p.legend.location = "top_left"
        p.legend.click_policy = "hide"

    # Output and show
    if filename:
        output_file(filename)

    show(p)
    return p
|