masster 0.4.22__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/sample/adducts.py +1 -1
- masster/sample/load.py +10 -9
- masster/sample/plot.py +1 -1
- masster/sample/processing.py +4 -4
- masster/sample/sample.py +29 -32
- masster/study/analysis.py +1762 -0
- masster/study/export.py +5 -3
- masster/study/helpers.py +153 -80
- masster/study/id.py +3 -3
- masster/study/load.py +17 -52
- masster/study/merge.py +316 -313
- masster/study/parameters.py +3 -3
- masster/study/plot.py +398 -43
- masster/study/processing.py +4 -4
- masster/study/save.py +8 -4
- masster/study/study.py +97 -139
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/METADATA +54 -14
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/RECORD +22 -21
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/WHEEL +0 -0
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/entry_points.txt +0 -0
- {masster-0.4.22.dist-info → masster-0.5.0.dist-info}/licenses/LICENSE +0 -0
masster/study/parameters.py
CHANGED
|
@@ -8,7 +8,7 @@ similar to the sample parameters module but for study-level operations.
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def
|
|
11
|
+
def update_history(self, keys, value):
|
|
12
12
|
"""
|
|
13
13
|
Store parameters in a nested dictionary structure.
|
|
14
14
|
|
|
@@ -74,10 +74,10 @@ def update_parameters(self, **kwargs):
|
|
|
74
74
|
# Check if it's a parameter defaults instance
|
|
75
75
|
if hasattr(value, "to_dict") and callable(getattr(value, "to_dict")):
|
|
76
76
|
# Store the parameter object
|
|
77
|
-
self.
|
|
77
|
+
self.update_history([key], value.to_dict())
|
|
78
78
|
else:
|
|
79
79
|
# Store individual parameter
|
|
80
|
-
self.
|
|
80
|
+
self.update_history([key], value)
|
|
81
81
|
|
|
82
82
|
|
|
83
83
|
def get_parameters_property(self):
|
masster/study/plot.py
CHANGED
|
@@ -308,7 +308,7 @@ def plot_alignment(
|
|
|
308
308
|
self.logger.info("Showing current RT values for both plots. Run align() first to see alignment comparison.")
|
|
309
309
|
|
|
310
310
|
# Get sample_uids to filter by if specified
|
|
311
|
-
sample_uids = self.
|
|
311
|
+
sample_uids = self._get_samples_uids(samples) if samples is not None else None
|
|
312
312
|
|
|
313
313
|
# Start with full features_df
|
|
314
314
|
features_df = self.features_df
|
|
@@ -836,7 +836,7 @@ def plot_samples_2d(
|
|
|
836
836
|
from bokeh.io.export import export_png
|
|
837
837
|
from bokeh.models import ColumnDataSource, HoverTool
|
|
838
838
|
|
|
839
|
-
sample_uids = self.
|
|
839
|
+
sample_uids = self._get_samples_uids(samples)
|
|
840
840
|
|
|
841
841
|
if not sample_uids:
|
|
842
842
|
self.logger.error("No valid sample_uids provided.")
|
|
@@ -1053,7 +1053,7 @@ def plot_bpc(
|
|
|
1053
1053
|
from bokeh.io.export import export_png
|
|
1054
1054
|
from masster.study.helpers import get_bpc
|
|
1055
1055
|
|
|
1056
|
-
sample_uids = self.
|
|
1056
|
+
sample_uids = self._get_samples_uids(samples)
|
|
1057
1057
|
if not sample_uids:
|
|
1058
1058
|
self.logger.error("No valid sample_uids provided for BPC plotting.")
|
|
1059
1059
|
return
|
|
@@ -1238,7 +1238,7 @@ def plot_eic(
|
|
|
1238
1238
|
self.logger.error("mz must be provided for EIC plotting")
|
|
1239
1239
|
return
|
|
1240
1240
|
|
|
1241
|
-
sample_uids = self.
|
|
1241
|
+
sample_uids = self._get_samples_uids(samples)
|
|
1242
1242
|
if not sample_uids:
|
|
1243
1243
|
self.logger.error("No valid sample_uids provided for EIC plotting.")
|
|
1244
1244
|
return
|
|
@@ -1400,7 +1400,7 @@ def plot_rt_correction(
|
|
|
1400
1400
|
self.logger.error("Column 'rt_original' not found in features_df. Alignment/backup RTs missing.")
|
|
1401
1401
|
return
|
|
1402
1402
|
|
|
1403
|
-
sample_uids = self.
|
|
1403
|
+
sample_uids = self._get_samples_uids(samples)
|
|
1404
1404
|
if not sample_uids:
|
|
1405
1405
|
self.logger.error("No valid sample_uids provided for RT correction plotting.")
|
|
1406
1406
|
return
|
|
@@ -1537,7 +1537,7 @@ def plot_chrom(
|
|
|
1537
1537
|
height=300,
|
|
1538
1538
|
):
|
|
1539
1539
|
cons_uids = self._get_consensus_uids(uids)
|
|
1540
|
-
sample_uids = self.
|
|
1540
|
+
sample_uids = self._get_samples_uids(samples)
|
|
1541
1541
|
|
|
1542
1542
|
chroms = self.get_chrom(uids=cons_uids, samples=sample_uids)
|
|
1543
1543
|
|
|
@@ -1723,18 +1723,32 @@ def plot_chrom(
|
|
|
1723
1723
|
def plot_consensus_stats(
|
|
1724
1724
|
self,
|
|
1725
1725
|
filename=None,
|
|
1726
|
-
width=1200
|
|
1726
|
+
width=840, # Reduced from 1200 (30% smaller)
|
|
1727
1727
|
height=None,
|
|
1728
1728
|
alpha=0.6,
|
|
1729
1729
|
bins=30,
|
|
1730
1730
|
n_cols=4,
|
|
1731
1731
|
):
|
|
1732
1732
|
"""
|
|
1733
|
-
Plot histograms/distributions for
|
|
1733
|
+
Plot histograms/distributions for specific consensus statistics in the requested order.
|
|
1734
|
+
|
|
1735
|
+
Shows the following properties in order:
|
|
1736
|
+
1. rt: Retention time
|
|
1737
|
+
2. rt_delta_mean: Mean retention time delta
|
|
1738
|
+
3. mz: Mass-to-charge ratio
|
|
1739
|
+
4. mz_range: Mass range (mz_max - mz_min)
|
|
1740
|
+
5. log10_inty_mean: Log10 of mean intensity
|
|
1741
|
+
6. number_samples: Number of samples
|
|
1742
|
+
7. number_ms2: Number of MS2 spectra
|
|
1743
|
+
8. charge_mean: Mean charge
|
|
1744
|
+
9. quality: Feature quality
|
|
1745
|
+
10. chrom_coherence_mean: Mean chromatographic coherence
|
|
1746
|
+
11. chrom_height_scaled_mean: Mean scaled chromatographic height
|
|
1747
|
+
12. chrom_prominence_scaled_mean: Mean scaled chromatographic prominence
|
|
1734
1748
|
|
|
1735
1749
|
Parameters:
|
|
1736
1750
|
filename (str, optional): Output filename for saving the plot
|
|
1737
|
-
width (int): Overall width of the plot (default:
|
|
1751
|
+
width (int): Overall width of the plot (default: 840)
|
|
1738
1752
|
height (int, optional): Overall height of the plot (auto-calculated if None)
|
|
1739
1753
|
alpha (float): Histogram transparency (default: 0.6)
|
|
1740
1754
|
bins (int): Number of histogram bins (default: 30)
|
|
@@ -1753,24 +1767,48 @@ def plot_consensus_stats(
|
|
|
1753
1767
|
# Get all columns and their data types - work with original dataframe
|
|
1754
1768
|
data_df = self.consensus_df.clone()
|
|
1755
1769
|
|
|
1756
|
-
#
|
|
1757
|
-
|
|
1758
|
-
|
|
1770
|
+
# Define specific columns to plot in the exact order requested
|
|
1771
|
+
desired_columns = [
|
|
1772
|
+
"rt",
|
|
1773
|
+
"rt_delta_mean",
|
|
1774
|
+
"mz",
|
|
1775
|
+
"mz_range", # mz_max-mz_min (will be calculated)
|
|
1776
|
+
"log10_inty_mean", # log10(inty_mean) (will be calculated)
|
|
1777
|
+
"number_samples",
|
|
1778
|
+
"number_ms2",
|
|
1779
|
+
"charge_mean",
|
|
1780
|
+
"quality",
|
|
1781
|
+
"chrom_coherence_mean",
|
|
1782
|
+
"chrom_height_scaled_mean",
|
|
1783
|
+
"chrom_prominence_scaled_mean"
|
|
1784
|
+
]
|
|
1785
|
+
|
|
1786
|
+
# Calculate derived columns if they don't exist
|
|
1787
|
+
if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
|
|
1788
|
+
data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
|
|
1789
|
+
|
|
1790
|
+
if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
|
|
1791
|
+
data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
|
|
1759
1792
|
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1793
|
+
# Filter to only include columns that exist in the dataframe, preserving order
|
|
1794
|
+
numeric_columns = [col for col in desired_columns if col in data_df.columns]
|
|
1795
|
+
|
|
1796
|
+
# Check if the numeric columns are actually numeric
|
|
1797
|
+
final_numeric_columns = []
|
|
1798
|
+
for col in numeric_columns:
|
|
1799
|
+
dtype = data_df[col].dtype
|
|
1800
|
+
if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
|
|
1801
|
+
pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
|
|
1802
|
+
pl.Float32, pl.Float64]:
|
|
1803
|
+
final_numeric_columns.append(col)
|
|
1804
|
+
|
|
1805
|
+
numeric_columns = final_numeric_columns
|
|
1768
1806
|
|
|
1769
1807
|
if len(numeric_columns) == 0:
|
|
1770
|
-
self.logger.error("
|
|
1808
|
+
self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df.columns)}")
|
|
1771
1809
|
return
|
|
1772
1810
|
|
|
1773
|
-
self.logger.debug(f"Creating distribution plots for {len(numeric_columns)}
|
|
1811
|
+
self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
|
|
1774
1812
|
|
|
1775
1813
|
# Work directly with Polars - no conversion to pandas needed
|
|
1776
1814
|
data_df_clean = data_df.select(numeric_columns)
|
|
@@ -1798,25 +1836,29 @@ def plot_consensus_stats(
|
|
|
1798
1836
|
|
|
1799
1837
|
# Auto-calculate height if not provided
|
|
1800
1838
|
if height is None:
|
|
1801
|
-
plot_height = 300
|
|
1802
|
-
height = plot_height * n_rows +
|
|
1839
|
+
plot_height = 210 # Reduced from 300 (30% smaller)
|
|
1840
|
+
height = plot_height * n_rows + 56 # Reduced from 80 (30% smaller)
|
|
1803
1841
|
else:
|
|
1804
|
-
plot_height = (height -
|
|
1842
|
+
plot_height = (height - 56) // n_rows # Reduced padding (30% smaller)
|
|
1805
1843
|
|
|
1806
|
-
plot_width = (width -
|
|
1844
|
+
plot_width = (width - 56) // n_cols # Reduced padding (30% smaller)
|
|
1807
1845
|
|
|
1808
1846
|
# Create plots grid
|
|
1809
1847
|
plots = []
|
|
1810
1848
|
current_row = []
|
|
1811
1849
|
|
|
1812
1850
|
for i, col in enumerate(numeric_columns):
|
|
1851
|
+
# Check if this column should use log scale for y-axis
|
|
1852
|
+
y_axis_type = "log" if col in ["number_samples", "number_ms2"] else "linear"
|
|
1853
|
+
|
|
1813
1854
|
# Create histogram for this column
|
|
1814
1855
|
p = figure(
|
|
1815
1856
|
width=plot_width,
|
|
1816
1857
|
height=plot_height,
|
|
1817
1858
|
title=col,
|
|
1818
1859
|
toolbar_location="above",
|
|
1819
|
-
tools="pan,wheel_zoom,box_zoom,reset,save"
|
|
1860
|
+
tools="pan,wheel_zoom,box_zoom,reset,save",
|
|
1861
|
+
y_axis_type=y_axis_type
|
|
1820
1862
|
)
|
|
1821
1863
|
|
|
1822
1864
|
# Set white background
|
|
@@ -1840,10 +1882,19 @@ def plot_consensus_stats(
|
|
|
1840
1882
|
values_array = valid_values.to_numpy()
|
|
1841
1883
|
hist, edges = np.histogram(values_array, bins=bins)
|
|
1842
1884
|
|
|
1885
|
+
# Handle log y-axis: replace zero counts with small positive values
|
|
1886
|
+
if y_axis_type == "log":
|
|
1887
|
+
# Replace zero counts with a small value (1e-1) to make them visible on log scale
|
|
1888
|
+
hist_log_safe = np.where(hist == 0, 0.1, hist)
|
|
1889
|
+
bottom_val = 0.1 # Use small positive value for bottom on log scale
|
|
1890
|
+
else:
|
|
1891
|
+
hist_log_safe = hist
|
|
1892
|
+
bottom_val = 0
|
|
1893
|
+
|
|
1843
1894
|
# Create histogram bars
|
|
1844
1895
|
p.quad(
|
|
1845
|
-
top=
|
|
1846
|
-
bottom=
|
|
1896
|
+
top=hist_log_safe,
|
|
1897
|
+
bottom=bottom_val,
|
|
1847
1898
|
left=edges[:-1],
|
|
1848
1899
|
right=edges[1:],
|
|
1849
1900
|
fill_color="steelblue",
|
|
@@ -1852,11 +1903,16 @@ def plot_consensus_stats(
|
|
|
1852
1903
|
)
|
|
1853
1904
|
|
|
1854
1905
|
# Style the plot
|
|
1855
|
-
p.title.text_font_size = "12pt
|
|
1856
|
-
p.xaxis.axis_label =
|
|
1857
|
-
p.
|
|
1858
|
-
p.grid.
|
|
1859
|
-
p.grid.
|
|
1906
|
+
p.title.text_font_size = "10pt" # Reduced from 12pt
|
|
1907
|
+
p.xaxis.axis_label = "" # Remove x-axis title
|
|
1908
|
+
p.grid.grid_line_alpha = 0.3 # Show y-axis grid with transparency
|
|
1909
|
+
p.grid.grid_line_color = "gray"
|
|
1910
|
+
p.grid.grid_line_dash = [6, 4] # Dashed grid lines
|
|
1911
|
+
p.xgrid.visible = False # Hide x-axis grid
|
|
1912
|
+
p.outline_line_color = None # Remove gray border around plot area
|
|
1913
|
+
|
|
1914
|
+
# Remove y-axis label but keep y-axis visible
|
|
1915
|
+
p.yaxis.axis_label = ""
|
|
1860
1916
|
|
|
1861
1917
|
current_row.append(p)
|
|
1862
1918
|
|
|
@@ -1868,14 +1924,12 @@ def plot_consensus_stats(
|
|
|
1868
1924
|
plots.append(current_row)
|
|
1869
1925
|
current_row = []
|
|
1870
1926
|
|
|
1871
|
-
# Create grid layout
|
|
1872
|
-
grid = gridplot(plots)
|
|
1927
|
+
# Create grid layout with white background
|
|
1928
|
+
grid = gridplot(plots, toolbar_location="above", merge_tools=True)
|
|
1873
1929
|
|
|
1874
|
-
#
|
|
1875
|
-
|
|
1876
|
-
|
|
1877
|
-
if hasattr(grid, "border_fill_color"):
|
|
1878
|
-
grid.border_fill_color = "white"
|
|
1930
|
+
# The background should be white by default in Bokeh
|
|
1931
|
+
# Individual plots already have white backgrounds set above
|
|
1932
|
+
|
|
1879
1933
|
|
|
1880
1934
|
# Apply consistent save/display behavior
|
|
1881
1935
|
if filename is not None:
|
|
@@ -1895,7 +1949,7 @@ def plot_consensus_stats(
|
|
|
1895
1949
|
return grid
|
|
1896
1950
|
|
|
1897
1951
|
|
|
1898
|
-
def
|
|
1952
|
+
def plot_samples_pca(
|
|
1899
1953
|
self,
|
|
1900
1954
|
filename=None,
|
|
1901
1955
|
width=500,
|
|
@@ -2035,6 +2089,7 @@ def plot_pca(
|
|
|
2035
2089
|
tools="pan,wheel_zoom,box_zoom,reset,save",
|
|
2036
2090
|
)
|
|
2037
2091
|
|
|
2092
|
+
p.grid.visible = False
|
|
2038
2093
|
p.xaxis.axis_label = f"PC1 ({explained_var[0]:.1%} variance)"
|
|
2039
2094
|
p.yaxis.axis_label = f"PC2 ({explained_var[1]:.1%} variance)"
|
|
2040
2095
|
|
|
@@ -2159,6 +2214,293 @@ def plot_pca(
|
|
|
2159
2214
|
return p
|
|
2160
2215
|
|
|
2161
2216
|
|
|
2217
|
+
def plot_samples_umap(
    self,
    filename=None,
    width=500,
    height=450,
    alpha=0.8,
    markersize=6,
    n_components=2,
    colorby=None,
    title="UMAP of Consensus Matrix",
    n_neighbors=15,
    min_dist=0.1,
    metric="euclidean",
    random_state=42,
):
    """
    Plot a UMAP (Uniform Manifold Approximation and Projection) embedding of the
    consensus matrix using Bokeh.

    The consensus matrix is transposed so that each sample is one observation,
    non-finite values are replaced by 0, the features are standardized and the
    result is embedded with ``umap.UMAP``. The first two components are drawn as
    a scatter plot; the columns of ``samples_df`` are attached to every point and
    shown in the hover tooltip.

    Parameters:
        filename (str, optional): Output filename for saving the plot. Relative
            paths are resolved against ``self.folder``. When None, the plot is
            shown in the notebook instead.
        width (int): Plot width (default: 500)
        height (int): Plot height (default: 450)
        alpha (float): Point transparency (default: 0.8)
        markersize (int): Size of points (default: 6)
        n_components (int): Number of UMAP components to compute (default: 2).
            Only the first two are plotted; with a single component the second
            axis is filled with zeros.
        colorby (str, optional): Column from samples_df to color points by.
            Object/string/category columns get a categorical palette and a
            legend; every other dtype gets a linear color scale and a color bar.
            When omitted (or not found), points use the ``sample_color`` column.
        title (str): Plot title (default: "UMAP of Consensus Matrix")
        n_neighbors (int): Number of neighbors for UMAP (default: 15)
        min_dist (float): Minimum distance for UMAP (default: 0.1)
        metric (str): Distance metric for UMAP (default: "euclidean")
        random_state (int or None): Random state for reproducibility (default: 42).
            - Use an integer (e.g., 42) for reproducible results (single-threaded)
            - Use None for faster computation with multiple cores (non-reproducible)

    Returns:
        The Bokeh figure, or None when UMAP is not installed or the study has no
        usable consensus matrix / samples table (an error is logged in that case).
    """
    try:
        import umap
    except ImportError:
        self.logger.error("UMAP not available. Please install umap-learn: pip install umap-learn")
        return

    import os

    import numpy as np
    import pandas as pd
    from bokeh.models import ColorBar, ColumnDataSource, HoverTool, LinearColorMapper
    from bokeh.palettes import Category20, viridis
    from bokeh.plotting import figure
    from bokeh.transform import factor_cmap
    from sklearn.preprocessing import StandardScaler

    # Check if consensus matrix and samples_df exist
    try:
        consensus_matrix = self.get_consensus_matrix()
        samples_df = self.samples_df
    except Exception as e:
        self.logger.error(f"Error getting consensus matrix or samples_df: {e}")
        return

    if consensus_matrix is None or consensus_matrix.shape[0] == 0:
        self.logger.error("No consensus matrix available. Run merge/find_consensus first.")
        return

    if samples_df is None or samples_df.is_empty():
        self.logger.error("No samples dataframe available.")
        return

    self.logger.debug(f"Performing UMAP on consensus matrix with shape: {consensus_matrix.shape}")

    # Extract only the sample columns (exclude consensus_uid column)
    sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
    if not sample_cols:
        # Without sample columns the transposed matrix would have zero rows and
        # the scaler/UMAP would fail with an unhelpful error.
        self.logger.error("Consensus matrix contains no sample columns.")
        return

    # Convert consensus matrix to numpy, excluding the consensus_uid column
    if hasattr(consensus_matrix, "select"):
        # Polars DataFrame
        matrix_data = consensus_matrix.select(sample_cols).to_numpy()
    else:
        # Pandas DataFrame or other - drop consensus_uid column
        matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
        if hasattr(matrix_sample_data, "values"):
            matrix_data = matrix_sample_data.values
        elif hasattr(matrix_sample_data, "to_numpy"):
            matrix_data = matrix_sample_data.to_numpy()
        else:
            matrix_data = np.array(matrix_sample_data)

    # Transpose matrix so samples are rows and features are columns
    matrix_data = matrix_data.T

    # Handle missing values by replacing with 0
    matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)

    # Standardize the data
    matrix_scaled = StandardScaler().fit_transform(matrix_data)

    # A fixed seed forces umap-learn onto a single thread, so request exactly one
    # job in that case; without a seed let UMAP use all cores, as documented above.
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
        n_jobs=1 if random_state is not None else -1,
    )
    umap_result = reducer.fit_transform(matrix_scaled)

    self.logger.debug(f"UMAP completed with shape: {umap_result.shape}")

    # Convert samples_df to pandas for easier manipulation
    samples_pd = samples_df.to_pandas()

    # Create dataframe with UMAP results and sample information
    umap_df = pd.DataFrame({
        "UMAP1": umap_result[:, 0],
        "UMAP2": umap_result[:, 1] if n_components > 1 else np.zeros(len(umap_result)),
    })

    # Sample metadata is attached positionally, so it is only added when the row
    # counts agree.
    if len(samples_pd) == len(umap_df):
        for col in samples_pd.columns:
            umap_df[col] = samples_pd[col].values
    else:
        self.logger.warning(
            f"Sample count mismatch: samples_df has {len(samples_pd)} rows, "
            f"but consensus matrix has {len(umap_df)} samples",
        )

    # Prepare color mapping
    color_mapper = None

    if colorby and colorby in umap_df.columns:
        # Handle categorical vs numeric coloring
        if umap_df[colorby].dtype in ["object", "string", "category"]:
            # Categorical coloring; null entries cannot be used as factors
            factors = umap_df[colorby].dropna().unique().tolist()
            if len(factors) <= 20:
                palette = Category20[min(20, max(3, len(factors)))]
            else:
                palette = viridis(min(256, len(factors)))
            color_mapper = factor_cmap(colorby, palette, factors)
        else:
            # Numeric coloring
            color_mapper = LinearColorMapper(
                palette=viridis(256),
                low=umap_df[colorby].min(),
                high=umap_df[colorby].max(),
            )
    elif colorby:
        self.logger.warning(f"Column '{colorby}' not found in samples_df; using default sample colors.")

    # Create Bokeh plot
    p = figure(
        width=width,
        height=height,
        title=f"{title}",
        tools="pan,wheel_zoom,box_zoom,reset,save",
    )

    p.grid.visible = False
    p.xaxis.axis_label = "UMAP1"
    p.yaxis.axis_label = "UMAP2"

    # Create scatter plot
    if color_mapper:
        source = ColumnDataSource(umap_df)
        if isinstance(color_mapper, LinearColorMapper):
            scatter = p.scatter(
                "UMAP1",
                "UMAP2",
                size=markersize,
                alpha=alpha,
                color={"field": colorby, "transform": color_mapper},
                source=source,
            )
            # Add colorbar for numeric coloring
            color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
            p.add_layout(color_bar, "right")
        else:
            scatter = p.scatter(
                "UMAP1",
                "UMAP2",
                size=markersize,
                alpha=alpha,
                color=color_mapper,
                source=source,
                legend_field=colorby,
            )
    else:
        # No usable colorby: use the per-sample colors. They were copied row-aligned
        # from samples_df above, so they can be read straight from umap_df.
        has_sample_id = "sample_uid" in umap_df.columns or "sample_name" in umap_df.columns
        if has_sample_id and "sample_color" in umap_df.columns:
            fallback_color = "#1f77b4"  # fallback to blue
            umap_df["color"] = [
                value if isinstance(value, str) and value else fallback_color
                for value in umap_df["sample_color"]
            ]
            point_color = "color"
        else:
            point_color = "blue"

        source = ColumnDataSource(umap_df)
        scatter = p.scatter(
            "UMAP1",
            "UMAP2",
            size=markersize,
            alpha=alpha,
            color=point_color,
            source=source,
        )

    # Create comprehensive hover tooltips with all sample information
    tooltip_list = []

    # Columns to exclude from tooltips (file paths and internal/plot fields)
    excluded_cols = {"file_source", "file_path", "sample_path", "map_id", "UMAP1", "UMAP2", "ms1", "ms2", "size"}

    # Add all sample dataframe columns to tooltips, skipping excluded ones
    for col in samples_pd.columns:
        if col in excluded_cols or col not in umap_df.columns:
            continue
        if col == "sample_color":
            # Display sample_color as a colored swatch
            tooltip_list.append(("color", "$color[swatch]:sample_color"))
        elif umap_df[col].dtype in ["float64", "float32"]:
            tooltip_list.append((col, f"@{col}{{0.00}}"))
        else:
            tooltip_list.append((col, f"@{col}"))

    hover = HoverTool(
        tooltips=tooltip_list,
        renderers=[scatter],
    )
    p.add_tools(hover)

    # Add legend if using categorical coloring
    if color_mapper and not isinstance(color_mapper, LinearColorMapper) and colorby:
        # Only set legend properties if legends exist (avoid Bokeh warning when none created)
        if getattr(p, "legend", None) and len(p.legend) > 0:
            p.legend.location = "top_left"
            p.legend.click_policy = "hide"

    # Apply consistent save/display behavior
    if filename is not None:
        # Convert relative paths to absolute paths using study folder as base
        if not os.path.isabs(filename):
            filename = os.path.join(self.folder, filename)

        # Convert to absolute path for logging
        abs_filename = os.path.abspath(filename)

        # Use isolated file saving
        _isolated_save_plot(p, filename, abs_filename, self.logger, "UMAP Plot")
    else:
        # Show in notebook when no filename provided
        _isolated_show_notebook(p)
    return p
|
|
2502
|
+
|
|
2503
|
+
|
|
2162
2504
|
def plot_tic(
|
|
2163
2505
|
self,
|
|
2164
2506
|
samples=100,
|
|
@@ -2179,7 +2521,7 @@ def plot_tic(
|
|
|
2179
2521
|
from bokeh.io.export import export_png
|
|
2180
2522
|
from masster.study.helpers import get_tic
|
|
2181
2523
|
|
|
2182
|
-
sample_uids = self.
|
|
2524
|
+
sample_uids = self._get_samples_uids(samples)
|
|
2183
2525
|
if not sample_uids:
|
|
2184
2526
|
self.logger.error("No valid sample_uids provided for TIC plotting.")
|
|
2185
2527
|
return
|
|
@@ -2312,3 +2654,16 @@ def plot_tic(
|
|
|
2312
2654
|
_isolated_show_notebook(p)
|
|
2313
2655
|
|
|
2314
2656
|
return p
|
|
2657
|
+
|
|
2658
|
+
|
|
2659
|
+
def plot_pca(self, *args, **kwargs):
    """Deprecated alias kept for backward compatibility: use plot_samples_pca instead.

    Emits a DeprecationWarning attributed to the caller, then forwards every
    positional and keyword argument unchanged to ``plot_samples_pca`` and
    returns its result.
    """
    from warnings import warn

    message = "plot_pca is deprecated, use plot_samples_pca instead"
    # stacklevel=2 makes the warning point at the code that called plot_pca.
    warn(message, category=DeprecationWarning, stacklevel=2)
    result = self.plot_samples_pca(*args, **kwargs)
    return result
|
|
2664
|
+
|
|
2665
|
+
def plot_umap(self, *args, **kwargs):
    """Deprecated alias kept for backward compatibility: use plot_samples_umap instead.

    Emits a DeprecationWarning attributed to the caller, then forwards every
    positional and keyword argument unchanged to ``plot_samples_umap`` and
    returns its result.
    """
    from warnings import warn

    message = "plot_umap is deprecated, use plot_samples_umap instead"
    # stacklevel=2 makes the warning point at the code that called plot_umap.
    warn(message, category=DeprecationWarning, stacklevel=2)
    result = self.plot_samples_umap(*args, **kwargs)
    return result
|
masster/study/processing.py
CHANGED
|
@@ -166,7 +166,7 @@ def align(self, **kwargs):
|
|
|
166
166
|
# end of parameter initialization
|
|
167
167
|
|
|
168
168
|
# Store parameters in the Study object
|
|
169
|
-
self.
|
|
169
|
+
self.update_history(["align"], params.to_dict())
|
|
170
170
|
self.logger.debug("Parameters stored to align")
|
|
171
171
|
|
|
172
172
|
# Generate temporary feature maps on-demand from features_df instead of using cached data
|
|
@@ -370,7 +370,7 @@ def find_ms2(self, **kwargs):
|
|
|
370
370
|
# end of parameter initialization
|
|
371
371
|
|
|
372
372
|
# Store parameters in the Study object
|
|
373
|
-
self.
|
|
373
|
+
self.update_history(["find_ms2"], params.to_dict())
|
|
374
374
|
self.logger.debug("Parameters stored to find_ms2")
|
|
375
375
|
|
|
376
376
|
data = []
|
|
@@ -551,7 +551,7 @@ def _integrate_chrom_impl(self, **kwargs):
|
|
|
551
551
|
# end of parameter initialization
|
|
552
552
|
|
|
553
553
|
# Store parameters in the Study object
|
|
554
|
-
self.
|
|
554
|
+
self.update_history(["integrate_chrom"], params.to_dict())
|
|
555
555
|
self.logger.debug("Parameters stored to integrate_chrom")
|
|
556
556
|
|
|
557
557
|
# Get parameter values for use in the method
|
|
@@ -769,7 +769,7 @@ def integrate(self, **kwargs):
|
|
|
769
769
|
# end of parameter initialization
|
|
770
770
|
|
|
771
771
|
# Store parameters in the Study object
|
|
772
|
-
self.
|
|
772
|
+
self.update_history(["integrate"], params.to_dict())
|
|
773
773
|
self.logger.debug("Parameters stored to integrate")
|
|
774
774
|
|
|
775
775
|
# Call the original integrate_chrom function with extracted parameters
|
masster/study/save.py
CHANGED
|
@@ -59,13 +59,16 @@ def save(self, filename=None, add_timestamp=True, compress=False):
|
|
|
59
59
|
|
|
60
60
|
# Use compressed mode for large datasets
|
|
61
61
|
if compress:
|
|
62
|
-
|
|
62
|
+
from masster.study.h5 import _save_study5_compressed
|
|
63
|
+
_save_study5_compressed(self, filename)
|
|
63
64
|
else:
|
|
64
|
-
|
|
65
|
+
from masster.study.h5 import _save_study5
|
|
66
|
+
_save_study5(self, filename)
|
|
65
67
|
|
|
66
68
|
if self.consensus_map is not None:
|
|
67
69
|
# save the features as a separate file
|
|
68
|
-
|
|
70
|
+
from masster.study.save import _save_consensusXML
|
|
71
|
+
_save_consensusXML(self, filename=filename.replace(".study5", ".consensusXML"))
|
|
69
72
|
self.filename = filename
|
|
70
73
|
|
|
71
74
|
|
|
@@ -211,4 +214,5 @@ def save_consensus(self, **kwargs):
|
|
|
211
214
|
if self.consensus_map is None:
|
|
212
215
|
self.logger.error("No consensus map found.")
|
|
213
216
|
return
|
|
214
|
-
|
|
217
|
+
from masster.study.save import _save_consensusXML
|
|
218
|
+
_save_consensusXML(self, **kwargs)
|