PyPI - masster - Versions diffs - 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl - Mend

masster 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of masster might be problematic. Click here for more details.

Files changed (24)

masster/_version.py +1 -1
masster/sample/adducts.py +1 -1
masster/sample/load.py +10 -9
masster/sample/plot.py +1 -1
masster/sample/processing.py +4 -4
masster/sample/sample.py +29 -32
masster/sample/save.py +0 -2
masster/study/analysis.py +1762 -0
masster/study/export.py +8 -6
masster/study/helpers.py +153 -80
masster/study/id.py +3 -3
masster/study/load.py +56 -55
masster/study/merge.py +316 -313
masster/study/parameters.py +3 -3
masster/study/plot.py +491 -203
masster/study/processing.py +109 -15
masster/study/save.py +8 -4
masster/study/study.py +97 -139
masster/wizard/wizard.py +8 -8
{masster-0.4.21.dist-info → masster-0.5.0.dist-info}/METADATA +54 -14
{masster-0.4.21.dist-info → masster-0.5.0.dist-info}/RECORD +24 -23
{masster-0.4.21.dist-info → masster-0.5.0.dist-info}/WHEEL +0 -0
{masster-0.4.21.dist-info → masster-0.5.0.dist-info}/entry_points.txt +0 -0
{masster-0.4.21.dist-info → masster-0.5.0.dist-info}/licenses/LICENSE +0 -0

masster/study/plot.py CHANGED

@@ -308,7 +308,7 @@ def plot_alignment(
         self.logger.info("Showing current RT values for both plots. Run align() first to see alignment comparison.")
     # Get sample_uids to filter by if specified
-    sample_uids = self._get_sample_uids(samples) if samples is not None else None
+    sample_uids = self._get_samples_uids(samples) if samples is not None else None
     # Start with full features_df
     features_df = self.features_df
@@ -836,7 +836,7 @@ def plot_samples_2d(
     from bokeh.io.export import export_png
     from bokeh.models import ColumnDataSource, HoverTool
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.error("No valid sample_uids provided.")
@@ -1053,7 +1053,7 @@ def plot_bpc(
     from bokeh.io.export import export_png
     from masster.study.helpers import get_bpc
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.error("No valid sample_uids provided for BPC plotting.")
         return
@@ -1238,7 +1238,7 @@ def plot_eic(
         self.logger.error("mz must be provided for EIC plotting")
         return
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.error("No valid sample_uids provided for EIC plotting.")
         return
@@ -1400,7 +1400,7 @@ def plot_rt_correction(
         self.logger.error("Column 'rt_original' not found in features_df. Alignment/backup RTs missing.")
         return
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.error("No valid sample_uids provided for RT correction plotting.")
         return
@@ -1537,7 +1537,7 @@ def plot_chrom(
     height=300,
 ):
     cons_uids = self._get_consensus_uids(uids)
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     chroms = self.get_chrom(uids=cons_uids, samples=sample_uids)
@@ -1723,226 +1723,213 @@ def plot_chrom(
 def plot_consensus_stats(
     self,
     filename=None,
-    width=1200,
-    height=1200,
+    width=840,  # Reduced from 1200 (30% smaller)
+    height=None,
     alpha=0.6,
-    markersize=3,
+    bins=30,
+    n_cols=4,
 ):
     """
-    Plot a scatter plot matrix (SPLOM) of consensus statistics using Bokeh.
+    Plot histograms/distributions for specific consensus statistics in the requested order.
+    Shows the following properties in order:
+    1. rt: Retention time
+    2. rt_delta_mean: Mean retention time delta
+    3. mz: Mass-to-charge ratio
+    4. mz_range: Mass range (mz_max - mz_min)
+    5. log10_inty_mean: Log10 of mean intensity
+    6. number_samples: Number of samples
+    7. number_ms2: Number of MS2 spectra
+    8. charge_mean: Mean charge
+    9. quality: Feature quality
+    10. chrom_coherence_mean: Mean chromatographic coherence
+    11. chrom_height_scaled_mean: Mean scaled chromatographic height
+    12. chrom_prominence_scaled_mean: Mean scaled chromatographic prominence
     Parameters:
         filename (str, optional): Output filename for saving the plot
-        width (int): Overall width of the plot (default: 1200)
-        height (int): Overall height of the plot (default: 1200)
-        alpha (float): Point transparency (default: 0.6)
-        markersize (int): Size of points (default: 5)
+        width (int): Overall width of the plot (default: 840)
+        height (int, optional): Overall height of the plot (auto-calculated if None)
+        alpha (float): Histogram transparency (default: 0.6)
+        bins (int): Number of histogram bins (default: 30)
+        n_cols (int): Number of columns in the grid layout (default: 4)
     """
     from bokeh.layouts import gridplot
-    from bokeh.models import ColumnDataSource, HoverTool
-    from bokeh.plotting import figure, show, output_file
+    from bokeh.plotting import figure
+    import polars as pl
+    import numpy as np
     # Check if consensus_df exists and has data
     if self.consensus_df is None or self.consensus_df.is_empty():
         self.logger.error("No consensus data available. Run merge/find_consensus first.")
         return
-    # Define the columns to plot
-    columns = [
-        "rt",
-        "mz",
-        "number_samples",
-        "log10_quality",
-        "mz_delta_mean",
-        "rt_delta_mean",
-        "chrom_coherence_mean",
-        "chrom_prominence_scaled_mean",
-        "inty_mean",
-        "number_ms2",
-    ]
-    # Check which columns exist in the dataframe and compute missing ones
-    available_columns = self.consensus_df.columns
+    # Get all columns and their data types - work with original dataframe
     data_df = self.consensus_df.clone()
-    # Add log10_quality if quality exists
-    if "quality" in available_columns and "log10_quality" not in available_columns:
-        data_df = data_df.with_columns(
-            pl.col("quality").log10().alias("log10_quality"),
-        )
-    # Filter columns that actually exist
-    final_columns = [col for col in columns if col in data_df.columns]
+    # Define specific columns to plot in the exact order requested
+    desired_columns = [
+        "rt",
+        "rt_delta_mean",
+        "mz",
+        "mz_range",  # mz_max-mz_min (will be calculated)
+        "log10_inty_mean",  # log10(inty_mean) (will be calculated)
+        "number_samples",
+        "number_ms2",
+        "charge_mean",
+        "quality",
+        "chrom_coherence_mean",
+        "chrom_height_scaled_mean",
+        "chrom_prominence_scaled_mean"
+    ]
+    # Calculate derived columns if they don't exist
+    if "mz_range" not in data_df.columns and "mz_max" in data_df.columns and "mz_min" in data_df.columns:
+        data_df = data_df.with_columns((pl.col("mz_max") - pl.col("mz_min")).alias("mz_range"))
+    if "log10_inty_mean" not in data_df.columns and "inty_mean" in data_df.columns:
+        data_df = data_df.with_columns(pl.col("inty_mean").log10().alias("log10_inty_mean"))
+    # Filter to only include columns that exist in the dataframe, preserving order
+    numeric_columns = [col for col in desired_columns if col in data_df.columns]
+    # Check if the numeric columns are actually numeric
+    final_numeric_columns = []
+    for col in numeric_columns:
+        dtype = data_df[col].dtype
+        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                    pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
+                    pl.Float32, pl.Float64]:
+            final_numeric_columns.append(col)
+    numeric_columns = final_numeric_columns
-    if len(final_columns) < 2:
-        self.logger.error(f"Need at least 2 columns for SPLOM. Available: {final_columns}")
+    if len(numeric_columns) == 0:
+        self.logger.error(f"None of the requested consensus statistics columns were found or are numeric. Available columns: {list(data_df.columns)}")
         return
-    self.logger.debug(f"Creating SPLOM with columns: {final_columns}")
-    # Add important ID columns for tooltips even if not plotting them
-    tooltip_columns = []
-    for id_col in ["consensus_uid", "consensus_id"]:
-        if id_col in data_df.columns and id_col not in final_columns:
-            tooltip_columns.append(id_col)
+    self.logger.debug(f"Creating distribution plots for {len(numeric_columns)} specific consensus columns: {numeric_columns}")
-    # Select plotting columns plus tooltip columns
-    all_columns = final_columns + tooltip_columns
-    data_pd = data_df.select(all_columns).to_pandas()
+    # Work directly with Polars - no conversion to pandas needed
+    data_df_clean = data_df.select(numeric_columns)
-    # Remove any infinite or NaN values
-    data_pd = data_pd.replace([np.inf, -np.inf], np.nan).dropna()
-    if data_pd.empty:
-        self.logger.error("No valid data after removing NaN/infinite values.")
+    # Check if all numeric columns are empty
+    all_columns_empty = True
+    for col in numeric_columns:
+        # Check if column has any non-null, finite values
+        non_null_count = data_df_clean[col].filter(
+            data_df_clean[col].is_not_null() &
+            (data_df_clean[col].is_finite() if data_df_clean[col].dtype in [pl.Float32, pl.Float64] else pl.lit(True))
+        ).len()
+        if non_null_count > 0:
+            all_columns_empty = False
+            break
+    if all_columns_empty:
+        self.logger.error("All numeric columns contain only NaN/infinite values.")
         return
-    source = ColumnDataSource(data_pd)
-    n_vars = len(final_columns)
-    # Fixed dimensions - override user input to ensure consistent layout
-    total_width = 1200
-    total_height = 1200
-    # Calculate plot sizes to ensure uniform inner plot areas
-    # First column needs extra width for y-axis labels
-    plot_width_first = 180  # Wider to account for y-axis labels
-    plot_width_others = 120  # Standard width for other columns
-    plot_height_normal = 120  # Standard height
-    plot_height_last = 155  # Taller last row to accommodate x-axis labels while keeping inner plot area same size
+    # Calculate grid dimensions
+    n_plots = len(numeric_columns)
+    n_rows = (n_plots + n_cols - 1) // n_cols  # Ceiling division
+    # Auto-calculate height if not provided
+    if height is None:
+        plot_height = 210  # Reduced from 300 (30% smaller)
+        height = plot_height * n_rows + 56  # Reduced from 80 (30% smaller)
+    else:
+        plot_height = (height - 56) // n_rows  # Reduced padding (30% smaller)
+    plot_width = (width - 56) // n_cols  # Reduced padding (30% smaller)
-    # Create grid of plots with variable outer sizes but equal inner areas
+    # Create plots grid
     plots = []
+    current_row = []
+    for i, col in enumerate(numeric_columns):
+        # Check if this column should use log scale for y-axis
+        y_axis_type = "log" if col in ["number_samples", "number_ms2"] else "linear"
+        # Create histogram for this column
+        p = figure(
+            width=plot_width,
+            height=plot_height,
+            title=col,
+            toolbar_location="above",
+            tools="pan,wheel_zoom,box_zoom,reset,save",
+            y_axis_type=y_axis_type
+        )
+        # Set white background
+        p.background_fill_color = "white"
+        p.border_fill_color = "white"
+        # Calculate histogram using Polars
+        # Get valid (non-null, finite) values for this column
+        if data_df_clean[col].dtype in [pl.Float32, pl.Float64]:
+            valid_values = data_df_clean.filter(
+                data_df_clean[col].is_not_null() & data_df_clean[col].is_finite()
+            )[col]
+        else:
+            valid_values = data_df_clean.filter(data_df_clean[col].is_not_null())[col]
+        if valid_values.len() == 0:
+            self.logger.warning(f"No valid values for column {col}")
+            continue
+        # Convert to numpy for histogram calculation
+        values_array = valid_values.to_numpy()
+        hist, edges = np.histogram(values_array, bins=bins)
+        # Handle log y-axis: replace zero counts with small positive values
+        if y_axis_type == "log":
+            # Replace zero counts with a small value (1e-1) to make them visible on log scale
+            hist_log_safe = np.where(hist == 0, 0.1, hist)
+            bottom_val = 0.1  # Use small positive value for bottom on log scale
+        else:
+            hist_log_safe = hist
+            bottom_val = 0
+        # Create histogram bars
+        p.quad(
+            top=hist_log_safe,
+            bottom=bottom_val,
+            left=edges[:-1],
+            right=edges[1:],
+            fill_color="steelblue",
+            line_color="white",
+            alpha=alpha,
+        )
+        # Style the plot
+        p.title.text_font_size = "10pt"  # Reduced from 12pt
+        p.xaxis.axis_label = ""  # Remove x-axis title
+        p.grid.grid_line_alpha = 0.3  # Show y-axis grid with transparency
+        p.grid.grid_line_color = "gray"
+        p.grid.grid_line_dash = [6, 4]  # Dashed grid lines
+        p.xgrid.visible = False  # Hide x-axis grid
+        p.outline_line_color = None  # Remove gray border around plot area
+        # Remove y-axis label but keep y-axis visible
+        p.yaxis.axis_label = ""
+        current_row.append(p)
+        # If we've filled a row or reached the end, add the row to plots
+        if len(current_row) == n_cols or i == n_plots - 1:
+            # Fill remaining spots in the last row with None if needed
+            while len(current_row) < n_cols and i == n_plots - 1:
+                current_row.append(None)
+            plots.append(current_row)
+            current_row = []
+    # Create grid layout with white background
+    grid = gridplot(plots, toolbar_location="above", merge_tools=True)
+    # The background should be white by default in Bokeh
+    # Individual plots already have white backgrounds set above
-    for i, y_var in enumerate(final_columns):
-        row = []
-        for j, x_var in enumerate(final_columns):
-            # Determine if this plot needs axis labels
-            has_x_label = i == n_vars - 1  # bottom row
-            has_y_label = j == 0  # left column
-            # First column wider to accommodate y-axis labels, ensuring equal inner plot areas
-            current_width = plot_width_first if has_y_label else plot_width_others
-            current_height = plot_height_last if has_x_label else plot_height_normal
-            p = figure(
-                width=current_width,
-                height=current_height,
-                title=None,  # No title on any plot
-                toolbar_location=None,
-                # Adjusted borders - first column has more space, others minimal
-                min_border_left=70 if has_y_label else 15,
-                min_border_bottom=50 if has_x_label else 15,
-                min_border_right=15,
-                min_border_top=15,
-            )
-            # Ensure subplot background and border are explicitly white so the plot looks
-            # correct in dark and light themes.
-            p.outline_line_color = None
-            p.border_fill_color = "white"
-            p.border_fill_alpha = 1.0
-            p.background_fill_color = "white"
-            # Remove axis lines to eliminate black lines between plots
-            p.xaxis.axis_line_color = None
-            p.yaxis.axis_line_color = None
-            # Keep subtle grid lines for data reference
-            p.grid.visible = True
-            p.grid.grid_line_color = "#E0E0E0"  # Light gray grid lines
-            # Set axis labels and formatting
-            if has_x_label:  # bottom row
-                p.xaxis.axis_label = x_var
-                p.xaxis.axis_label_text_font_size = "12pt"
-                p.xaxis.major_label_text_font_size = "9pt"
-                p.xaxis.axis_label_standoff = 15
-            else:
-                p.xaxis.major_label_text_font_size = "0pt"
-                p.xaxis.minor_tick_line_color = None
-                p.xaxis.major_tick_line_color = None
-            if has_y_label:  # left column
-                p.yaxis.axis_label = y_var
-                p.yaxis.axis_label_text_font_size = "10pt"  # Smaller y-axis title
-                p.yaxis.major_label_text_font_size = "8pt"
-                p.yaxis.axis_label_standoff = 12
-            else:
-                p.yaxis.major_label_text_font_size = "0pt"
-                p.yaxis.minor_tick_line_color = None
-                p.yaxis.major_tick_line_color = None
-            if i == j:
-                # Diagonal: histogram
-                hist, edges = np.histogram(data_pd[x_var], bins=30)
-                p.quad(
-                    top=hist,
-                    bottom=0,
-                    left=edges[:-1],
-                    right=edges[1:],
-                    fill_color="green",
-                    line_color="white",
-                    alpha=alpha,
-                )
-            else:
-                # Off-diagonal: scatter plot
-                scatter = p.scatter(
-                    x=x_var,
-                    y=y_var,
-                    size=markersize,
-                    alpha=alpha,
-                    color="blue",
-                    source=source,
-                )
-                # Add hover tool
-                hover = HoverTool(
-                    tooltips=[
-                        (x_var, f"@{x_var}{{0.0000}}"),
-                        (y_var, f"@{y_var}{{0.0000}}"),
-                        (
-                            "consensus_uid",
-                            "@consensus_uid"
-                            if "consensus_uid" in data_pd.columns
-                            else "@consensus_id"
-                            if "consensus_id" in data_pd.columns
-                            else "N/A",
-                        ),
-                        ("rt", "@rt{0.00}" if "rt" in data_pd.columns else "N/A"),
-                        ("mz", "@mz{0.0000}" if "mz" in data_pd.columns else "N/A"),
-                    ],
-                    renderers=[scatter],
-                )
-                p.add_tools(hover)
-            row.append(p)
-        plots.append(row)
-    # Link axes for same variables
-    for i in range(n_vars):
-        for j in range(n_vars):
-            if i != j:  # Don't link diagonal plots
-                # Link x-axis to other plots in same column
-                for k in range(n_vars):
-                    if k != i and k != j:
-                        plots[i][j].x_range = plots[k][j].x_range
-                # Link y-axis to other plots in same row
-                for k in range(n_vars):
-                    if k != j and k != i:
-                        plots[i][j].y_range = plots[i][k].y_range
-    # Create grid layout and force overall background/border to white so the outer
-    # container doesn't show dark UI colors in night mode.
-    grid = gridplot(plots)
-    # Set overall background and border to white when supported
-    if hasattr(grid, "background_fill_color"):
-        grid.background_fill_color = "white"
-    if hasattr(grid, "border_fill_color"):
-        grid.border_fill_color = "white"
     # Apply consistent save/display behavior
     if filename is not None:
@@ -1962,7 +1949,7 @@ def plot_consensus_stats(
     return grid
-def plot_pca(
+def plot_samples_pca(
     self,
     filename=None,
     width=500,
@@ -2102,6 +2089,7 @@ def plot_pca(
         tools="pan,wheel_zoom,box_zoom,reset,save",
     )
+    p.grid.visible = False
     p.xaxis.axis_label = f"PC1 ({explained_var[0]:.1%} variance)"
     p.yaxis.axis_label = f"PC2 ({explained_var[1]:.1%} variance)"
@@ -2226,6 +2214,293 @@ def plot_pca(
     return p
+def plot_samples_umap(
+    self,
+    filename=None,
+    width=500,
+    height=450,
+    alpha=0.8,
+    markersize=6,
+    n_components=2,
+    colorby=None,
+    title="UMAP of Consensus Matrix",
+    n_neighbors=15,
+    min_dist=0.1,
+    metric="euclidean",
+    random_state=42,
+):
+    """
+    Plot UMAP (Uniform Manifold Approximation and Projection) of the consensus matrix using Bokeh.
+    Parameters:
+        filename (str, optional): Output filename for saving the plot
+        width (int): Plot width (default: 500)
+        height (int): Plot height (default: 450)
+        alpha (float): Point transparency (default: 0.8)
+        markersize (int): Size of points (default: 6)
+        n_components (int): Number of UMAP components to compute (default: 2)
+        colorby (str, optional): Column from samples_df to color points by
+        title (str): Plot title (default: "UMAP of Consensus Matrix")
+        n_neighbors (int): Number of neighbors for UMAP (default: 15)
+        min_dist (float): Minimum distance for UMAP (default: 0.1)
+        metric (str): Distance metric for UMAP (default: "euclidean")
+        random_state (int or None): Random state for reproducibility (default: 42).
+            - Use an integer (e.g., 42) for reproducible results (slower, single-threaded)
+            - Use None for faster computation with multiple cores (non-reproducible)
+    Note:
+        Setting random_state forces single-threaded computation but ensures reproducible results.
+        Set random_state=None to enable parallel processing for faster computation.
+    """
+    try:
+        import umap
+    except ImportError:
+        self.logger.error("UMAP not available. Please install umap-learn: pip install umap-learn")
+        return
+    from bokeh.models import ColumnDataSource, HoverTool, ColorBar, LinearColorMapper
+    from bokeh.plotting import figure
+    from bokeh.palettes import Category20, viridis
+    from bokeh.transform import factor_cmap
+    from sklearn.preprocessing import StandardScaler
+    import pandas as pd
+    import numpy as np
+    # Check if consensus matrix and samples_df exist
+    try:
+        consensus_matrix = self.get_consensus_matrix()
+        samples_df = self.samples_df
+    except Exception as e:
+        self.logger.error(f"Error getting consensus matrix or samples_df: {e}")
+        return
+    if consensus_matrix is None or consensus_matrix.shape[0] == 0:
+        self.logger.error("No consensus matrix available. Run merge/find_consensus first.")
+        return
+    if samples_df is None or samples_df.is_empty():
+        self.logger.error("No samples dataframe available.")
+        return
+    self.logger.debug(f"Performing UMAP on consensus matrix with shape: {consensus_matrix.shape}")
+    # Extract only the sample columns (exclude consensus_uid column)
+    sample_cols = [col for col in consensus_matrix.columns if col != "consensus_uid"]
+    # Convert consensus matrix to numpy, excluding the consensus_uid column
+    if hasattr(consensus_matrix, "select"):
+        # Polars DataFrame
+        matrix_data = consensus_matrix.select(sample_cols).to_numpy()
+    else:
+        # Pandas DataFrame or other - drop consensus_uid column
+        matrix_sample_data = consensus_matrix.drop(columns=["consensus_uid"], errors="ignore")
+        if hasattr(matrix_sample_data, "values"):
+            matrix_data = matrix_sample_data.values
+        elif hasattr(matrix_sample_data, "to_numpy"):
+            matrix_data = matrix_sample_data.to_numpy()
+        else:
+            matrix_data = np.array(matrix_sample_data)
+    # Transpose matrix so samples are rows and features are columns
+    matrix_data = matrix_data.T
+    # Handle missing values by replacing with 0
+    matrix_data = np.nan_to_num(matrix_data, nan=0.0, posinf=0.0, neginf=0.0)
+    # Standardize the data
+    scaler = StandardScaler()
+    matrix_scaled = scaler.fit_transform(matrix_data)
+    # Perform UMAP
+    reducer = umap.UMAP(
+        n_components=n_components,
+        n_neighbors=n_neighbors,
+        min_dist=min_dist,
+        metric=metric,
+        random_state=random_state,
+        n_jobs=1
+    )
+    umap_result = reducer.fit_transform(matrix_scaled)
+    self.logger.debug(f"UMAP completed with shape: {umap_result.shape}")
+    # Convert samples_df to pandas for easier manipulation
+    samples_pd = samples_df.to_pandas()
+    # Create dataframe with UMAP results and sample information
+    umap_df = pd.DataFrame({
+        "UMAP1": umap_result[:, 0],
+        "UMAP2": umap_result[:, 1] if n_components > 1 else np.zeros(len(umap_result)),
+    })
+    # Add sample information to UMAP dataframe
+    if len(samples_pd) == len(umap_df):
+        for col in samples_pd.columns:
+            umap_df[col] = samples_pd[col].values
+    else:
+        self.logger.warning(
+            f"Sample count mismatch: samples_df has {len(samples_pd)} rows, "
+            f"but consensus matrix has {len(umap_df)} samples",
+        )
+    # Prepare color mapping
+    color_column = None
+    color_mapper = None
+    if colorby and colorby in umap_df.columns:
+        color_column = colorby
+        unique_values = umap_df[colorby].unique()
+        # Handle categorical vs numeric coloring
+        if umap_df[colorby].dtype in ["object", "string", "category"]:
+            # Categorical coloring
+            if len(unique_values) <= 20:
+                palette = Category20[min(20, max(3, len(unique_values)))]
+            else:
+                palette = viridis(min(256, len(unique_values)))
+            color_mapper = factor_cmap(colorby, palette, unique_values)
+        else:
+            # Numeric coloring
+            palette = viridis(256)
+            color_mapper = LinearColorMapper(
+                palette=palette,
+                low=umap_df[colorby].min(),
+                high=umap_df[colorby].max(),
+            )
+    # Create Bokeh plot
+    p = figure(
+        width=width,
+        height=height,
+        title=f"{title}",
+        tools="pan,wheel_zoom,box_zoom,reset,save",
+    )
+    p.grid.visible = False
+    p.xaxis.axis_label = "UMAP1"
+    p.yaxis.axis_label = "UMAP2"
+    # Create data source
+    source = ColumnDataSource(umap_df)
+    # Create scatter plot
+    if color_mapper:
+        if isinstance(color_mapper, LinearColorMapper):
+            scatter = p.scatter(
+                "UMAP1",
+                "UMAP2",
+                size=markersize,
+                alpha=alpha,
+                color={"field": colorby, "transform": color_mapper},
+                source=source,
+            )
+            # Add colorbar for numeric coloring
+            color_bar = ColorBar(color_mapper=color_mapper, width=8, location=(0, 0))
+            p.add_layout(color_bar, "right")
+        else:
+            scatter = p.scatter(
+                "UMAP1",
+                "UMAP2",
+                size=markersize,
+                alpha=alpha,
+                color=color_mapper,
+                source=source,
+                legend_field=colorby,
+            )
+    else:
+        # If no color_by provided, use sample_color column from samples_df
+        if "sample_uid" in umap_df.columns or "sample_name" in umap_df.columns:
+            # Choose the identifier to map colors by
+            id_col = "sample_uid" if "sample_uid" in umap_df.columns else "sample_name"
+            # Get colors from samples_df based on the identifier
+            if id_col == "sample_uid":
+                sample_colors = (
+                    self.samples_df.filter(pl.col("sample_uid").is_in(umap_df[id_col].unique()))
+                    .select(["sample_uid", "sample_color"])
+                    .to_dict(as_series=False)
+                )
+                color_map = dict(zip(sample_colors["sample_uid"], sample_colors["sample_color"]))
+            else:  # sample_name
+                sample_colors = (
+                    self.samples_df.filter(pl.col("sample_name").is_in(umap_df[id_col].unique()))
+                    .select(["sample_name", "sample_color"])
+                    .to_dict(as_series=False)
+                )
+                color_map = dict(zip(sample_colors["sample_name"], sample_colors["sample_color"]))
+            # Map colors into dataframe
+            umap_df["color"] = [color_map.get(x, "#1f77b4") for x in umap_df[id_col]]  # fallback to blue
+            # Update the ColumnDataSource with new color column
+            source = ColumnDataSource(umap_df)
+            scatter = p.scatter(
+                "UMAP1",
+                "UMAP2",
+                size=markersize,
+                alpha=alpha,
+                color="color",
+                source=source,
+            )
+        else:
+            scatter = p.scatter(
+                "UMAP1",
+                "UMAP2",
+                size=markersize,
+                alpha=alpha,
+                color="blue",
+                source=source,
+            )
+    # Create comprehensive hover tooltips with all sample information
+    tooltip_list = []
+    # Columns to exclude from tooltips (file paths and internal/plot fields)
+    excluded_cols = {"file_source", "file_path", "sample_path", "map_id", "UMAP1", "UMAP2", "ms1", "ms2", "size"}
+    # Add all sample dataframe columns to tooltips, skipping excluded ones
+    for col in samples_pd.columns:
+        if col in excluded_cols:
+            continue
+        if col in umap_df.columns:
+            if col == "sample_color":
+                # Display sample_color as a colored swatch
+                tooltip_list.append(("color", "$color[swatch]:sample_color"))
+            elif umap_df[col].dtype in ["float64", "float32"]:
+                tooltip_list.append((col, f"@{col}{{0.00}}"))
+            else:
+                tooltip_list.append((col, f"@{col}"))
+    hover = HoverTool(
+        tooltips=tooltip_list,
+        renderers=[scatter],
+    )
+    p.add_tools(hover)
+    # Add legend if using categorical coloring
+    if color_mapper and not isinstance(color_mapper, LinearColorMapper) and colorby:
+        # Only set legend properties if legends exist (avoid Bokeh warning when none created)
+        if getattr(p, "legend", None) and len(p.legend) > 0:
+            p.legend.location = "top_left"
+            p.legend.click_policy = "hide"
+    # Apply consistent save/display behavior
+    if filename is not None:
+        # Convert relative paths to absolute paths using study folder as base
+        import os
+        if not os.path.isabs(filename):
+            filename = os.path.join(self.folder, filename)
+        # Convert to absolute path for logging
+        abs_filename = os.path.abspath(filename)
+        # Use isolated file saving
+        _isolated_save_plot(p, filename, abs_filename, self.logger, "UMAP Plot")
+    else:
+        # Show in notebook when no filename provided
+        _isolated_show_notebook(p)
+    return p
 def plot_tic(
     self,
     samples=100,
@@ -2246,7 +2521,7 @@ def plot_tic(
     from bokeh.io.export import export_png
     from masster.study.helpers import get_tic
-    sample_uids = self._get_sample_uids(samples)
+    sample_uids = self._get_samples_uids(samples)
     if not sample_uids:
         self.logger.error("No valid sample_uids provided for TIC plotting.")
         return
@@ -2379,3 +2654,16 @@ def plot_tic(
         _isolated_show_notebook(p)
     return p
+def plot_pca(self, *args, **kwargs):
+    """Deprecated: Use plot_samples_pca instead."""
+    import warnings
+    warnings.warn("plot_pca is deprecated, use plot_samples_pca instead", DeprecationWarning, stacklevel=2)
+    return self.plot_samples_pca(*args, **kwargs)
+def plot_umap(self, *args, **kwargs):
+    """Deprecated: Use plot_samples_umap instead."""
+    import warnings
+    warnings.warn("plot_umap is deprecated, use plot_samples_umap instead", DeprecationWarning, stacklevel=2)
+    return self.plot_samples_umap(*args, **kwargs)

masster 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl

Potentially problematic release.

masster 0.4.21__py3-none-any.whl → 0.5.0__py3-none-any.whl