wsi-toolbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
+ import multiprocessing
+ from typing import Callable
+
+ import numpy as np
+
+
+ def reorder_clusters_by_pca(clusters: np.ndarray, pca_values: np.ndarray) -> np.ndarray:
+     """
+     Reorder cluster IDs based on PCA distribution for consistent visualization.
+
+     The goal is to ensure that when clusters are plotted in a violin plot (left to right),
+     the distribution rises gradually from the left and steeply on the right.
+
+     Algorithm:
+     1. Sort clusters by their mean PCA1 value
+     2. Check whether the median of the sorted means is below or above the midpoint
+     3. If median > midpoint, flip the order so lower cluster IDs have lower PCA values
+
+     This ensures consistent ordering regardless of PCA sign ambiguity.
+
+     Args:
+         clusters: Cluster labels array [N]
+         pca_values: PCA1 values array [N] (first principal component)
+
+     Returns:
+         Reordered cluster labels with the same shape
+     """
+     unique_clusters = [c for c in np.unique(clusters) if c >= 0]
+     if len(unique_clusters) <= 1:
+         return clusters
+
+     # 1. Compute mean PCA1 for each cluster
+     cluster_means = {}
+     for c in unique_clusters:
+         cluster_means[c] = np.mean(pca_values[clusters == c])
+
+     # 2. Sort clusters by mean PCA1
+     sorted_clusters = sorted(unique_clusters, key=lambda c: cluster_means[c])
+     sorted_means = [cluster_means[c] for c in sorted_clusters]
+
+     # 3. Check distribution: flip if the median is on the higher side
+     midpoint = (sorted_means[0] + sorted_means[-1]) / 2
+     median_mean = np.median(sorted_means)
+
+     if median_mean > midpoint:
+         sorted_clusters = sorted_clusters[::-1]
+
+     # 4. Build remapping
+     old_to_new = {old: new for new, old in enumerate(sorted_clusters)}
+
+     # 5. Apply remapping (preserve -1 for filtered)
+     return np.array([old_to_new.get(c, c) for c in clusters])
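
As a quick illustration of the reordering rule described in the docstring (toy arrays, not taken from the package):

    clusters = np.array([0, 0, 1, 1, 2, 2, -1])
    pca1 = np.array([0.9, 1.1, -0.2, 0.0, 0.4, 0.5, 0.3])

    # Cluster means on PCA1: 0 -> 1.0, 1 -> -0.1, 2 -> 0.45
    # New IDs follow PCA1 from low to high; -1 (filtered) is left untouched
    print(reorder_clusters_by_pca(clusters, pca1))  # -> [2 2 0 0 1 1 -1]
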
+
+
+ def find_optimal_components(features, threshold=0.95):
+     """Pick the smallest number of PCA components whose cumulative explained variance reaches threshold (capped at n_samples - 1)."""
+     # Lazy import: sklearn is slow to load (~600ms), defer until needed
+     from sklearn.decomposition import PCA  # noqa: PLC0415
+
+     pca = PCA()
+     pca.fit(features)
+     explained_variance = pca.explained_variance_ratio_
+     # Select the number of dimensions at which the cumulative explained variance ratio exceeds the threshold (e.g. 95%)
+     cumulative_variance = np.cumsum(explained_variance)
+     optimal_n = np.argmax(cumulative_variance >= threshold) + 1
+     return min(optimal_n, len(features) - 1)
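
A worked example of the cumulative-variance rule (the variance ratios below are made up):

    explained = np.array([0.60, 0.25, 0.08, 0.05, 0.02])
    cumulative = np.cumsum(explained)        # [0.60, 0.85, 0.93, 0.98, 1.00]
    n = np.argmax(cumulative >= 0.95) + 1    # first index reaching 0.95 -> 4 components
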
+
+
+ def process_edges_batch(batch_indices, all_indices, h, use_umap_embs, pca=None):
+     """Process a batch of nodes and their edges"""
+     edges = []
+     weights = []
+
+     for i in batch_indices:
+         for j in all_indices[i]:
+             if i == j:  # skip self loop
+                 continue
+
+             if use_umap_embs:
+                 distance = np.linalg.norm(h[i] - h[j])
+                 weight = np.exp(-distance)
+             else:
+                 # Weight each PCA dimension by the square root of its explained variance ratio
+                 explained_variance_ratio = pca.explained_variance_ratio_
+                 weighted_diff = (h[i] - h[j]) * np.sqrt(explained_variance_ratio[: len(h[i])])
+                 distance = np.linalg.norm(weighted_diff)
+                 weight = np.exp(-distance / distance.mean())
+
+             edges.append((i, j))
+             weights.append(weight)
+
+     return edges, weights
+
+
+ def leiden_cluster(
+     features: np.ndarray,
+     resolution: float = 1.0,
+     n_jobs: int = -1,
+     on_progress: Callable[[str], None] | None = None,
+ ) -> np.ndarray:
+     """
+     Perform Leiden clustering on feature embeddings.
+
+     Args:
+         features: Feature matrix (n_samples, n_features)
+         resolution: Leiden clustering resolution parameter
+         n_jobs: Number of parallel jobs (-1 = all CPUs)
+         on_progress: Optional callback for progress updates, receives message string
+
+     Returns:
+         np.ndarray: Cluster labels for each sample
+     """
+     # Lazy import: sklearn/igraph/networkx are slow to load, defer until needed
+     import igraph as ig  # noqa: PLC0415
+     import leidenalg as la  # noqa: PLC0415
+     import networkx as nx  # noqa: PLC0415
+     from joblib import Parallel, delayed  # noqa: PLC0415
+     from sklearn.decomposition import PCA  # noqa: PLC0415
+     from sklearn.neighbors import NearestNeighbors  # noqa: PLC0415
+
+     if n_jobs < 0:
+         n_jobs = multiprocessing.cpu_count()
+     n_samples = features.shape[0]
+
+     def _progress(msg: str):
+         if on_progress:
+             on_progress(msg)
+
+     # 1. PCA
+     _progress("Processing PCA")
+     n_components = find_optimal_components(features)
+     pca = PCA(n_components)
+     target_features = pca.fit_transform(features)
+
+     # 2. KNN
+     _progress("Processing KNN")
+     k = int(np.sqrt(len(target_features)))
+     nn = NearestNeighbors(n_neighbors=k).fit(target_features)
+     distances, indices = nn.kneighbors(target_features)
+
+     # 3. Build graph
+     _progress("Building graph")
+     G = nx.Graph()
+     G.add_nodes_from(range(n_samples))
+
+     batch_size = max(1, n_samples // n_jobs)
+     batches = [list(range(i, min(i + batch_size, n_samples))) for i in range(0, n_samples, batch_size)]
+     results = Parallel(n_jobs=n_jobs)(
+         [delayed(process_edges_batch)(batch, indices, target_features, False, pca) for batch in batches]
+     )
+
+     for batch_edges, batch_weights in results:
+         for (i, j), weight in zip(batch_edges, batch_weights):
+             G.add_edge(i, j, weight=weight)
+
+     # 4. Leiden clustering
+     _progress("Leiden clustering")
+     edges = list(G.edges())
+     weights = [G[u][v]["weight"] for u, v in edges]
+     ig_graph = ig.Graph(n=n_samples, edges=edges, edge_attrs={"weight": weights})
+
+     partition = la.find_partition(
+         ig_graph,
+         la.RBConfigurationVertexPartition,
+         weights="weight",
+         resolution_parameter=resolution,
+     )
+
+     # 5. Finalize
+     _progress("Finalizing")
+     clusters = np.full(n_samples, -1)
+     for i, community in enumerate(partition):
+         for node in community:
+             clusters[node] = i
+
+     return clusters
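
A minimal usage sketch for this module (the feature matrix is random data with made-up shapes; only leiden_cluster and reorder_clusters_by_pca come from the file above):

    import numpy as np
    from sklearn.decomposition import PCA

    features = np.random.rand(1000, 768).astype(np.float32)  # e.g. 1,000 patch embeddings
    clusters = leiden_cluster(features, resolution=1.0, on_progress=print)

    # Optional: make cluster IDs follow the PCA1 axis so plots are ordered consistently
    pca1 = PCA(n_components=1).fit_transform(features)[:, 0]
    clusters = reorder_clusters_by_pca(clusters, pca1)
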
@@ -0,0 +1,232 @@
+ """
+ HDF5 path utilities for consistent namespace and filter handling
+ """
+
+ from pathlib import Path
+
+ import h5py
+
+ # Reserved namespace names that cannot be used
+ # These conflict with existing HDF5 structure (e.g., model/features, model/metadata)
+ RESERVED_NAMESPACES = frozenset({"features", "metadata", "latent_features"})
+
+
+ def validate_namespace(namespace: str) -> bool:
+     """
+     Validate namespace string
+
+     Args:
+         namespace: Namespace to validate
+
+     Returns:
+         True if valid, False if invalid
+     """
+     if not namespace:
+         return False
+
+     if namespace in RESERVED_NAMESPACES:
+         return False
+
+     return True
+
+
+ def normalize_filename(path: str) -> str:
+     """
+     Normalize filename for use in namespace
+
+     Args:
+         path: File path
+
+     Returns:
+         Normalized name (stem only, forbidden chars replaced)
+     """
+     name = Path(path).stem
+     # Replace forbidden characters
+     name = name.replace("+", "_")  # + is reserved for separator
+     name = name.replace("/", "_")  # path separator
+     return name
+
+
+ def build_namespace(input_paths: list[str]) -> str:
+     """
+     Build namespace from input file paths
+
+     Note: No validation here - auto-generated namespaces always contain "+".
+     Validation happens in build_cluster_path(), which performs the final path assembly.
+
+     Args:
+         input_paths: List of HDF5 file paths
+
+     Returns:
+         Namespace string
+         - Single file: "default"
+         - Multiple files: "file1+file2+..." (sorted, normalized)
+     """
+     if len(input_paths) == 1:
+         return "default"
+
+     # Normalize and sort filenames
+     names = sorted([normalize_filename(p) for p in input_paths])
+     return "+".join(names)
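
For example, with hypothetical input files:

    build_namespace(["/data/001.h5"])                    # -> "default"
    build_namespace(["/data/002.h5", "/data/001+x.h5"])  # -> "001_x+002" (normalized and sorted)
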
+
+
+ def build_cluster_path(
+     model_name: str,
+     namespace: str = "default",
+     filters: list[list[int]] | None = None,
+     dataset: str = "clusters",
+ ) -> str:
+     """
+     Build HDF5 path for clustering data
+
+     Args:
+         model_name: Model name (e.g., "uni", "gigapath")
+         namespace: Namespace (e.g., "default", "001+002")
+         filters: Nested list of cluster filters, e.g., [[1,2,3], [0,1]]
+         dataset: Dataset name ("clusters", "umap", "pca1", "pca2", "pca3")
+
+     Returns:
+         Full HDF5 path
+
+     Raises:
+         ValueError: If namespace is invalid or reserved
+
+     Examples:
+         >>> build_cluster_path("uni", "default")
+         'uni/default/clusters'
+
+         >>> build_cluster_path("uni", "default", [[1,2,3]])
+         'uni/default/filter/1+2+3/clusters'
+
+         >>> build_cluster_path("uni", "default", [[1,2,3], [0,1]])
+         'uni/default/filter/1+2+3/filter/0+1/clusters'
+
+         >>> build_cluster_path("uni", "001+002", [[5]])
+         'uni/001+002/filter/5/clusters'
+     """
+     # Validate namespace
+     if not validate_namespace(namespace):
+         raise ValueError(f"Invalid namespace '{namespace}'. Reserved names: {', '.join(sorted(RESERVED_NAMESPACES))}")
+
+     path = f"{model_name}/{namespace}"
+
+     if filters:
+         for filter_ids in filters:
+             filter_str = "+".join(map(str, sorted(filter_ids)))
+             path += f"/filter/{filter_str}"
+
+     path += f"/{dataset}"
+     return path
+
+
+ def parse_cluster_path(path: str) -> dict:
+     """
+     Parse cluster path into components
+
+     Args:
+         path: HDF5 path (e.g., "uni/default/filter/1+2+3/clusters")
+
+     Returns:
+         Dict with keys: model_name, namespace, filters, dataset
+
+     Examples:
+         >>> parse_cluster_path("uni/default/clusters")
+         {'model_name': 'uni', 'namespace': 'default', 'filters': [], 'dataset': 'clusters'}
+
+         >>> parse_cluster_path("uni/default/filter/1+2+3/clusters")
+         {'model_name': 'uni', 'namespace': 'default', 'filters': [[1,2,3]], 'dataset': 'clusters'}
+     """
+     parts = path.split("/")
+
+     result = {"model_name": parts[0], "namespace": parts[1], "filters": [], "dataset": parts[-1]}
+
+     # Parse filter hierarchy
+     i = 2
+     while i < len(parts) - 1:
+         if parts[i] == "filter":
+             filter_str = parts[i + 1]
+             filter_ids = [int(x) for x in filter_str.split("+")]
+             result["filters"].append(filter_ids)
+             i += 2
+         else:
+             i += 1
+
+     return result
+
+
+ def list_namespaces(h5_file, model_name: str) -> list[str]:
+     """
+     List all namespaces in HDF5 file for given model
+
+     Args:
+         h5_file: h5py.File object (opened)
+         model_name: Model name
+
+     Returns:
+         List of namespace strings
+     """
+     if model_name not in h5_file:
+         return []
+
+     namespaces = []
+     for key in h5_file[model_name].keys():
+         if isinstance(h5_file[f"{model_name}/{key}"], h5py.Group):
+             # Check if it contains a 'clusters' dataset
+             if "clusters" in h5_file[f"{model_name}/{key}"]:
+                 namespaces.append(key)
+
+     return namespaces
+
+
+ def list_filters(h5_file, model_name: str, namespace: str) -> list[str]:
+     """
+     List all filter paths under a namespace
+
+     Args:
+         h5_file: h5py.File object (opened)
+         model_name: Model name
+         namespace: Namespace
+
+     Returns:
+         List of filter strings (e.g., ["1+2+3", "5"])
+     """
+     base_path = f"{model_name}/{namespace}/filter"
+     if base_path not in h5_file:
+         return []
+
+     filters = []
+
+     def visit_filters(name, obj):
+         if isinstance(obj, h5py.Group) and "clusters" in obj:
+             # Extract filter string from full path
+             rel_path = name.replace(base_path + "/", "")
+             # Remove '/filter/' segments to get just the IDs
+             filter_str = rel_path.replace("/filter/", "/")
+             filters.append(filter_str)
+
+     h5_file[base_path].visititems(visit_filters)
+
+     return filters
+
+
+ def ensure_groups(h5file: h5py.File, path: str) -> None:
+     """
+     Ensure all parent groups exist for a given path.
+
+     Args:
+         h5file: Open h5py.File object
+         path: Full path to dataset (e.g., "model/namespace/clusters")
+
+     Example:
+         >>> with h5py.File("data.h5", "a") as f:
+         ...     ensure_groups(f, "uni/default/filter/1+2/clusters")
+         ...     f.create_dataset("uni/default/filter/1+2/clusters", data=clusters)
+     """
+     parts = path.split("/")
+     group_parts = parts[:-1]  # Exclude the dataset name
+
+     current = ""
+     for part in group_parts:
+         current = f"{current}/{part}" if current else part
+         if current not in h5file:
+             h5file.create_group(current)
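
A short sketch of how these helpers fit together (the file name and cluster labels are hypothetical):

    import h5py
    import numpy as np

    clusters = np.array([0, 0, 1, 2, 1])
    path = build_cluster_path("uni", "default")   # 'uni/default/clusters'

    with h5py.File("example.h5", "a") as f:
        ensure_groups(f, path)
        f.create_dataset(path, data=clusters)
        print(list_namespaces(f, "uni"))          # ['default']
        print(parse_cluster_path(path))           # {'model_name': 'uni', 'namespace': 'default', ...}
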
@@ -0,0 +1,227 @@
+ """
+ Plotting utilities for 2D scatter plots and 1D violin plots
+ """
+
+ import numpy as np
+ from matplotlib import pyplot as plt
+
+ from ..common import _get_cluster_color
+
+
+ def plot_scatter_2d(
+     coords_list: list[np.ndarray],
+     clusters_list: list[np.ndarray],
+     filenames: list[str],
+     title: str = "2D Projection",
+     figsize: tuple = (12, 8),
+     xlabel: str = "Dimension 1",
+     ylabel: str = "Dimension 2",
+ ):
+     """
+     Plot 2D scatter plot from single or multiple files
+
+     Unified plotting logic that works for both single and multiple files.
+
+     Args:
+         coords_list: List of coordinate arrays (one per file)
+         clusters_list: List of cluster arrays (one per file)
+         filenames: List of file names for legend
+         title: Plot title
+         figsize: Figure size
+         xlabel: X-axis label
+         ylabel: Y-axis label
+
+     Returns:
+         matplotlib Figure
+     """
+
+     markers = ["o", "s", "^", "D", "v", "<", ">", "p", "*", "h"]
+
+     # Get all unique clusters (same namespace = same clusters)
+     all_unique_clusters = sorted(np.unique(np.concatenate(clusters_list)))
+     cluster_to_color = {cluster_id: _get_cluster_color(cluster_id) for cluster_id in all_unique_clusters}
+
+     fig, ax = plt.subplots(figsize=figsize)
+
+     # Single file: simpler legend (no file markers)
+     if len(coords_list) == 1:
+         for cluster_id in all_unique_clusters:
+             mask = clusters_list[0] == cluster_id
+             if np.sum(mask) > 0:
+                 if cluster_id == -1:
+                     color = "black"
+                     label = "Noise"
+                     size = 12
+                 else:
+                     color = cluster_to_color[cluster_id]
+                     label = f"Cluster {cluster_id}"
+                     size = 7
+                 ax.scatter(
+                     coords_list[0][mask, 0],
+                     coords_list[0][mask, 1],
+                     s=size,
+                     c=[color],
+                     label=label,
+                     alpha=0.8,
+                 )
+     else:
+         # Multiple files: show both cluster colors and file markers
+         # Create handles for cluster legend (colors)
+         cluster_handles = []
+         for cluster_id in all_unique_clusters:
+             if cluster_id < 0:  # Skip noise
+                 continue
+             handle = plt.Line2D(
+                 [0],
+                 [0],
+                 marker="o",
+                 color="w",
+                 markerfacecolor=cluster_to_color[cluster_id],
+                 markersize=8,
+                 label=f"Cluster {cluster_id}",
+             )
+             cluster_handles.append(handle)
+
+         # Create handles for file legend (markers)
+         file_handles = []
+         for i, filename in enumerate(filenames):
+             marker = markers[i % len(markers)]
+             handle = plt.Line2D(
+                 [0], [0], marker=marker, color="w", markerfacecolor="gray", markersize=8, label=filename
+             )
+             file_handles.append(handle)
+
+         # Plot all data: cluster-first, then file-specific markers
+         for cluster_id in all_unique_clusters:
+             for i, (coords, clusters, filename) in enumerate(zip(coords_list, clusters_list, filenames)):
+                 mask = clusters == cluster_id
+                 if np.sum(mask) > 0:  # Only plot if this file has patches in this cluster
+                     marker = markers[i % len(markers)]
+                     ax.scatter(
+                         coords[mask, 0],
+                         coords[mask, 1],
+                         marker=marker,
+                         c=[cluster_to_color[cluster_id]],
+                         s=10,
+                         alpha=0.6,
+                     )
+
+         # Add legends for multiple files
+         legend1 = ax.legend(handles=cluster_handles, title="Clusters", loc="upper left", bbox_to_anchor=(1.02, 1))
+         ax.add_artist(legend1)
+         ax.legend(handles=file_handles, title="Sources", loc="upper left", bbox_to_anchor=(1.02, 0.5))
+
+     # Draw cluster numbers at centroids
+     all_coords_combined = np.concatenate(coords_list)
+     all_clusters_combined = np.concatenate(clusters_list)
+     for cluster_id in all_unique_clusters:
+         if cluster_id < 0:  # Skip noise cluster
+             continue
+         cluster_points = all_coords_combined[all_clusters_combined == cluster_id]
+         if len(cluster_points) < 1:
+             continue
+         centroid_x = np.mean(cluster_points[:, 0])
+         centroid_y = np.mean(cluster_points[:, 1])
+         ax.text(
+             centroid_x,
+             centroid_y,
+             str(cluster_id),
+             fontsize=12,
+             fontweight="bold",
+             ha="center",
+             va="center",
+             bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"),
+         )
+
+     # Single file: show legend normally
+     if len(coords_list) == 1:
+         ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
+
+     ax.set_title(title)
+     ax.set_xlabel(xlabel)
+     ax.set_ylabel(ylabel)
+     plt.tight_layout()
+
+     return fig
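
A usage sketch with random toy data (the two source file names and the label arrays are hypothetical):

    import numpy as np

    coords_a, coords_b = np.random.rand(200, 2), np.random.rand(150, 2)
    clusters_a, clusters_b = np.random.randint(0, 4, 200), np.random.randint(0, 4, 150)

    fig = plot_scatter_2d(
        [coords_a, coords_b],
        [clusters_a, clusters_b],
        filenames=["001.h5", "002.h5"],
        title="UMAP projection",
        xlabel="UMAP1",
        ylabel="UMAP2",
    )
    fig.savefig("scatter.png", dpi=150)
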
+
+
+ def plot_violin_1d(
+     values_list: list[np.ndarray],
+     clusters_list: list[np.ndarray],
+     title: str = "Distribution by Cluster",
+     ylabel: str = "Value",
+     figsize: tuple = (12, 8),
+ ):
+     """
+     Plot 1D violin plot with cluster distribution
+
+     Args:
+         values_list: List of 1D value arrays (one per file)
+         clusters_list: List of cluster arrays (one per file)
+         title: Plot title
+         ylabel: Y-axis label
+         figsize: Figure size
+
+     Returns:
+         matplotlib Figure
+     """
+
+     # Combine all data
+     all_values = np.concatenate(values_list)
+     all_clusters = np.concatenate(clusters_list)
+
+     # Show all clusters except noise (-1)
+     cluster_ids = sorted([c for c in np.unique(all_clusters) if c >= 0])
+
+     # Prepare violin plot data
+     data = []
+     labels = []
+
+     # Add "All" first
+     data.append(all_values)
+     labels.append("All")
+
+     # Then add each cluster
+     for cluster_id in cluster_ids:
+         cluster_mask = all_clusters == cluster_id
+         cluster_values = all_values[cluster_mask]
+         if len(cluster_values) > 0:
+             data.append(cluster_values)
+             labels.append(f"Cluster {cluster_id}")
+
+     if len(data) == 0:
+         raise ValueError("No data for specified clusters")
+
+     # Create plot
+     # Lazy import: seaborn is slow to load (~500ms), defer until needed
+     import seaborn as sns  # noqa: PLC0415
+
+     fig = plt.figure(figsize=figsize)
+     sns.set_style("whitegrid")
+     ax = plt.subplot(111)
+
+     # Prepare colors: gray for "All", then cluster colors
+     palette = ["gray"]  # Color for "All"
+     for cluster_id in cluster_ids:
+         color = _get_cluster_color(cluster_id)
+         palette.append(color)
+
+     sns.violinplot(data=data, ax=ax, inner="box", cut=0, zorder=1, alpha=0.5, palette=palette)
+
+     # Scatter: first is "All" with gray, then clusters
+     for i, d in enumerate(data):
+         x = np.random.normal(i, 0.05, size=len(d))
+         if i == 0:
+             color = "gray"  # All
+         else:
+             color = _get_cluster_color(cluster_ids[i - 1])
+         ax.scatter(x, d, alpha=0.8, s=5, color=color, zorder=2)
+
+     ax.set_xticks(np.arange(0, len(labels)))
+     ax.set_xticklabels(labels)
+     ax.set_ylabel(ylabel)
+     ax.set_title(title)
+     ax.grid(axis="y", linestyle="--", alpha=0.7)
+     plt.tight_layout()
+
+     return fig
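
A matching sketch for the violin plot (again with random toy data; using PCA1 as the plotted value is just an example):

    values = np.random.randn(350)
    clusters = np.random.randint(0, 4, 350)

    fig = plot_violin_1d([values], [clusters], title="PCA1 by cluster", ylabel="PCA1")
    fig.savefig("violin.png", dpi=150)
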