wsi-toolbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,394 @@
1
+ """
2
+ Preview generation commands using Template Method Pattern
3
+ """
4
+
5
+ import h5py
6
+ import numpy as np
7
+ from matplotlib import colors as mcolors
8
+ from matplotlib import pyplot as plt
9
+ from PIL import Image, ImageFont
10
+
11
+ from ..utils import create_frame, get_platform_font
12
+ from ..utils.hdf5_paths import build_cluster_path
13
+ from . import _get, _get_cluster_color, _progress
14
+
15
+
16
class BasePreviewCommand:
    """
    Base class for preview commands using Template Method Pattern

    Subclasses must implement:
    - _prepare(f, **kwargs): Prepare data (frames, scores, etc.)
    - _get_frame(index, data, f): Get frame for specific patch
    """

    def __init__(self, size: int = 64, font_size: int = 16, model_name: str | None = None, rotate: bool = False):
        """
        Initialize preview command

        Args:
            size: Thumbnail patch size
            font_size: Font size for labels
            model_name: Model name (None to use global default)
            rotate: Whether to rotate patches 180 degrees
        """
        self.size = size
        self.font_size = font_size
        self.model_name = _get("model_name", model_name)
        self.rotate = rotate

    def __call__(self, hdf5_path: str, **kwargs) -> Image.Image:
        """
        Template method - common workflow shared by all preview commands.

        Args:
            hdf5_path: Path to HDF5 file
            **kwargs: Forwarded to the subclass's _prepare()

        Returns:
            PIL.Image: Assembled thumbnail image
        """
        side = self.size

        with h5py.File(hdf5_path, "r") as f:
            cols, rows, patch_count, patch_size = self._load_metadata(f)

            # Subclass-specific preparation (frames, scores, overlays, ...)
            data = self._prepare(f, **kwargs)

            canvas = Image.new("RGB", (cols * side, rows * side), (0, 0, 0))

            # Common rendering loop over every stored patch
            for idx in _progress(range(patch_count)):
                # Pixel coordinate -> grid cell -> thumbnail pixel position
                x, y = f["coordinates"][idx] // patch_size * side
                patch = Image.fromarray(f["patches"][idx]).resize((side, side))

                if self.rotate:
                    # 180-degree rotation: spin the patch and mirror its grid position
                    patch = patch.rotate(180)
                    x = (cols - 1) * side - x
                    y = (rows - 1) * side - y

                # Subclass-specific overlay frame (may be None)
                frame = self._get_frame(idx, data, f)
                if frame:
                    patch.paste(frame, (0, 0), frame)

                canvas.paste(patch, (x, y, x + side, y + side))

        return canvas

    def _load_metadata(self, f: h5py.File):
        """Read grid geometry (cols, rows) and patch info (count, size) from metadata."""
        keys = ("cols", "rows", "patch_count", "patch_size")
        return tuple(f[f"metadata/{key}"][()] for key in keys)

    def _prepare(self, f: h5py.File, **kwargs):
        """
        Prepare data for rendering (implemented by subclass).

        Args:
            f: HDF5 file handle
            **kwargs: Subclass-specific arguments

        Returns:
            Any data structure needed by _get_frame()
        """
        raise NotImplementedError

    def _get_frame(self, index: int, data, f: h5py.File):
        """
        Get frame for a specific patch (implemented by subclass).

        Args:
            index: Patch index
            data: Data prepared by _prepare()
            f: HDF5 file handle

        Returns:
            PIL.Image or None: Frame overlay
        """
        raise NotImplementedError
124
+
125
+
126
class PreviewClustersCommand(BasePreviewCommand):
    """
    Generate thumbnail with cluster visualization

    Usage:
        cmd = PreviewClustersCommand(size=64)
        image = cmd(hdf5_path='data.h5', namespace='default')
    """

    def _prepare(self, f: h5py.File, namespace: str = "default", filter_path: str = ""):
        """
        Prepare cluster frames

        Args:
            f: HDF5 file handle
            namespace: Namespace (e.g., "default", "001+002")
            filter_path: Filter path (e.g., "1+2+3" or "1+2+3/0+1")

        Returns:
            dict with 'clusters' and 'frames'

        Raises:
            RuntimeError: If the cluster dataset does not exist in the file.
        """
        # Parse filter path: "1+2/0+1" -> [[1, 2], [0, 1]]; None means unfiltered
        filters = None
        if filter_path:
            filters = [[int(x) for x in part.split("+")] for part in filter_path.split("/")]

        cluster_path = build_cluster_path(self.model_name, namespace, filters)
        if cluster_path not in f:
            raise RuntimeError(f"{cluster_path} does not exist in HDF5 file")

        clusters = f[cluster_path][:]

        # One labeled frame per cluster id; -1 marks unclustered patches.
        # Union with {-1} (instead of appending) avoids building the -1 frame
        # twice when -1 already appears in the cluster data.
        font = ImageFont.truetype(font=get_platform_font(), size=self.font_size)
        frames = {}
        for cluster in sorted(set(np.unique(clusters).tolist()) | {-1}):
            # Unclustered patches get a dark neutral frame
            color = mcolors.rgb2hex(_get_cluster_color(cluster)[:3]) if cluster >= 0 else "#111"
            frames[cluster] = create_frame(self.size, color, f"{cluster}", font)

        return {"clusters": clusters, "frames": frames}

    def _get_frame(self, index: int, data, f: h5py.File):
        """Return the frame for the cluster at index, or None when unclustered (-1)."""
        cluster = data["clusters"][index]
        return data["frames"][cluster] if cluster >= 0 else None
182
+
183
+
184
class PreviewScoresCommand(BasePreviewCommand):
    """
    Generate thumbnail with PCA visualization

    Usage:
        cmd = PreviewScoresCommand(size=64)
        image = cmd(hdf5_path='data.h5', score_name='pca1', namespace='default')
    """

    def _prepare(
        self,
        f: h5py.File,
        score_name: str,
        namespace: str = "default",
        filter_path: str = "",
        cmap_name: str = "viridis",
        invert: bool = False,
    ):
        """
        Prepare PCA visualization data.

        Args:
            f: HDF5 file handle
            score_name: Score dataset name (e.g., 'pca1', 'pca2')
            namespace: Namespace (e.g., "default", "001+002")
            filter_path: Filter path (e.g., "1+2+3" or "1+2+3/0+1")
            cmap_name: Colormap name
            invert: Invert scores (1 - score)

        Returns:
            dict with 'scores', 'cmap', and 'font'

        Raises:
            RuntimeError: If the score dataset does not exist in the file.
        """
        # "1+2/0+1" -> [[1, 2], [0, 1]]; None when no filtering was requested
        filters = None
        if filter_path:
            filters = [[int(token) for token in segment.split("+")] for segment in filter_path.split("/")]

        # Resolve the hierarchical dataset location for this score
        score_path = build_cluster_path(self.model_name, namespace, filters, dataset=score_name)
        if score_path not in f:
            raise RuntimeError(f"{score_path} does not exist in HDF5 file")

        scores = f[score_path][:]

        # Multi-component scores: keep only the first component
        if scores.ndim > 1:
            scores = scores[:, 0]

        # Optionally flip the scale (1 - score)
        if invert:
            scores = 1 - scores

        return {
            "scores": scores,
            "cmap": plt.get_cmap(cmap_name),
            "font": ImageFont.truetype(font=get_platform_font(), size=self.font_size),
        }

    def _get_frame(self, index: int, data, f: h5py.File):
        """Build a colored, value-labeled frame for the score at index (None for NaN)."""
        score = data["scores"][index]
        if np.isnan(score):
            return None

        hex_color = mcolors.rgb2hex(data["cmap"](score)[:3])
        return create_frame(self.size, hex_color, f"{score:.3f}", data["font"])
256
+
257
+
258
class PreviewLatentPCACommand(BasePreviewCommand):
    """
    Generate thumbnail with latent PCA visualization

    Usage:
        cmd = PreviewLatentPCACommand(size=64)
        image = cmd(hdf5_path='data.h5', alpha=0.5)
    """

    def _prepare(self, f: h5py.File, alpha: float = 0.5):
        """
        Prepare latent PCA visualization data

        Args:
            f: HDF5 file handle
            alpha: Transparency of overlay (0.0-1.0)

        Returns:
            dict with 'overlays', 'alpha_mask', and 'latent_size'

        Raises:
            ValueError: If the latent token count is not a perfect square.
        """
        # Lazy import: sklearn is slow to load (~600ms), defer until needed
        from sklearn.decomposition import PCA  # noqa: PLC0415
        from sklearn.preprocessing import MinMaxScaler  # noqa: PLC0415

        # Load latent features
        h = f[f"{self.model_name}/latent_features"][()]  # B, L(16x16), EMB(1024)
        h = h.astype(np.float32)
        s = h.shape

        # Estimate original latent grid edge (l = sqrt(L)); the token count must
        # be a perfect square so tokens can be laid out on an l x l grid.
        latent_size = int(np.sqrt(s[1]))
        # Explicit raise instead of `assert`: still validated under `python -O`
        if latent_size**2 != s[1]:
            raise ValueError(f"latent token count {s[1]} is not a perfect square")
        if self.size % latent_size != 0:
            print(f"WARNING: {self.size} is not divisible by {latent_size}")

        # Apply PCA jointly across all tokens of all patches
        pca = PCA(n_components=3)
        latent_pca = pca.fit_transform(h.reshape(s[0] * s[1], s[-1]))  # B*L, 3

        # Normalize to [0, 1]
        scaler = MinMaxScaler()
        latent_pca = scaler.fit_transform(latent_pca)

        # Reshape back to per-patch grids and convert to RGB bytes
        latent_pca = latent_pca.reshape(s[0], latent_size, latent_size, 3)
        overlays = (latent_pca * 255).astype(np.uint8)  # B, l, l, 3

        # Uniform alpha mask shared by every overlay
        alpha_mask = Image.new("L", (self.size, self.size), int(alpha * 255))

        return {"overlays": overlays, "alpha_mask": alpha_mask, "latent_size": latent_size}

    def _get_frame(self, index: int, data, f: h5py.File):
        """
        Get latent PCA overlay as a frame for patch at index

        Args:
            index: Patch index
            data: Data prepared by _prepare()
            f: HDF5 file handle

        Returns:
            PIL.Image: RGBA overlay image
        """
        # Get overlay for this patch
        overlay = Image.fromarray(data["overlays"][index]).convert("RGBA")
        # NEAREST keeps the latent grid cells crisp instead of blurring them
        overlay = overlay.resize((self.size, self.size), Image.NEAREST)

        # Apply alpha mask to make it a translucent overlay
        overlay.putalpha(data["alpha_mask"])

        return overlay
331
+
332
+
333
class PreviewLatentClusterCommand(BasePreviewCommand):
    """
    Generate thumbnail with latent cluster visualization

    Usage:
        cmd = PreviewLatentClusterCommand(size=64)
        image = cmd(hdf5_path='data.h5', alpha=0.5)
    """

    def _prepare(self, f: h5py.File, alpha: float = 0.5):
        """
        Prepare latent cluster visualization data.

        Args:
            f: HDF5 file handle
            alpha: Transparency of overlay (0.0-1.0)

        Returns:
            dict with 'overlays', 'alpha_mask', and 'latent_size'
        """
        # Per-token cluster assignments, shape (B, L) with L = l * l tokens
        clusters = f[f"{self.model_name}/latent_clusters"][()]
        shape = clusters.shape

        # The token count must be a perfect square to form an l x l grid
        latent_size = int(np.sqrt(shape[1]))
        assert latent_size**2 == shape[1]
        if self.size % latent_size != 0:
            print(f"WARNING: {self.size} is not divisible by {latent_size}")

        # Map cluster ids to RGBA colors via the tab20 palette, then lay the
        # colored tokens out on their latent grid as byte images
        colored = plt.get_cmap("tab20")(clusters)
        colored = colored.reshape(shape[0], latent_size, latent_size, 4)
        overlays = (colored * 255).astype(np.uint8)  # B, l, l, 4

        # Uniform transparency shared by every overlay
        alpha_mask = Image.new("L", (self.size, self.size), int(alpha * 255))

        return {"overlays": overlays, "alpha_mask": alpha_mask, "latent_size": latent_size}

    def _get_frame(self, index: int, data, f: h5py.File):
        """
        Get latent cluster overlay as a frame for patch at index.

        Args:
            index: Patch index
            data: Data prepared by _prepare()
            f: HDF5 file handle

        Returns:
            PIL.Image: RGBA overlay image
        """
        # NEAREST resize keeps each latent cell a solid block of color
        tile = Image.fromarray(data["overlays"][index]).convert("RGBA")
        tile = tile.resize((self.size, self.size), Image.NEAREST)
        tile.putalpha(data["alpha_mask"])
        return tile
@@ -0,0 +1,171 @@
1
+ """
2
+ Show HDF5 file structure command
3
+ """
4
+
5
+ import h5py
6
+ from pydantic import BaseModel
7
+
8
+ from ..utils.hdf5_paths import list_namespaces
9
+
10
+
11
class ShowResult(BaseModel):
    """Result of show command"""

    # Number of patches stored in the file (None when metadata group is absent)
    patch_count: int | None = None
    # Patch edge length in pixels (None when metadata group is absent)
    patch_size: int | None = None
    # Model group names found at the file root (e.g. "uni", "gigapath", "virchow2");
    # pydantic copies mutable defaults per instance, so `[]`/`{}` here are safe
    models: list[str] = []
    # Per-model list of clustering namespaces discovered in the file
    namespaces: dict[str, list[str]] = {}
18
+
19
+
20
class ShowCommand:
    """
    Show HDF5 file structure and contents

    Usage:
        cmd = ShowCommand(verbose=True)
        result = cmd("data.h5")
    """

    def __init__(self, verbose: bool = False):
        # verbose: additionally print the individual cluster ids per namespace
        self.verbose = verbose

    def __call__(self, hdf5_path: str) -> ShowResult:
        """
        Show HDF5 file structure

        Args:
            hdf5_path: Path to HDF5 file

        Returns:
            ShowResult: Structure information

        Side effects:
            Prints a human-readable report to stdout while populating the result.
        """
        result = ShowResult()

        with h5py.File(hdf5_path, "r") as f:
            self._print_header(hdf5_path)
            self._print_basic_info(f, result)
            self._print_models(f, result)
            # _print_namespaces/_print_scores rely on result.models being
            # populated by _print_models above — the call order matters.
            self._print_namespaces(f, result)
            self._print_scores(f, result)
            self._print_footer()

        return result

    def _print_header(self, path: str):
        # Banner with the file path between '=' rules
        print(f"\n{'=' * 60}")
        print(f"HDF5 File: {path}")
        print(f"{'=' * 60}\n")

    def _print_footer(self):
        # Closing rule matching the header banner
        print(f"{'=' * 60}\n")

    def _print_basic_info(self, f: h5py.File, result: ShowResult):
        # Fill in patch metadata when present; otherwise the report shows "None"
        if "metadata/patch_count" in f:
            result.patch_count = int(f["metadata/patch_count"][()])
            result.patch_size = int(f["metadata/patch_size"][()])

        print("Basic Info:")
        print(f"  Patch Count: {result.patch_count}")
        print(f"  Patch Size: {result.patch_size}px")
        print(f"  Grid: {f['metadata/cols'][()]} x {f['metadata/rows'][()]} (cols x rows)")
        if "metadata/mpp" in f:
            mpp = f["metadata/mpp"][()]
            # NOTE(review): the "(estimated)" suffix is shown whenever mpp > 0,
            # i.e. for every positive value — verify whether the intended
            # condition was a sentinel (e.g. mpp < 0) or a separate flag.
            print(f"  MPP: {mpp:.4f}" + (" (estimated)" if mpp > 0 else ""))
        print()

    def _print_models(self, f: h5py.File, result: ShowResult):
        # Only known model group names are reported; anything else at the root
        # (e.g. "metadata", "patches") is ignored
        available_models = [k for k in f.keys() if k in ["uni", "gigapath", "virchow2"]]
        result.models = available_models

        if available_models:
            print("Available Models:")
            for model in available_models:
                has_features = f"{model}/features" in f
                has_latent = f"{model}/latent_features" in f
                # "x features" marks a model group with no features dataset
                feat_str = "features" if has_features else "x features"
                latent_str = ", latent" if has_latent else ""

                if has_features:
                    feat_shape = f[f"{model}/features"].shape
                    feat_str += f" {feat_shape}"

                print(f"  {model:12s} {feat_str}{latent_str}")
            print()

    def _print_namespaces(self, f: h5py.File, result: ShowResult):
        # Depends on result.models having been filled by _print_models
        available_models = result.models

        for model in available_models:
            namespaces = list_namespaces(f, model)
            if not namespaces:
                continue

            result.namespaces[model] = namespaces

            print(f"{model.upper()} Namespaces:")
            for ns in namespaces:
                cluster_path = f"{model}/{ns}/clusters"
                if cluster_path in f:
                    clusters = f[cluster_path][:]
                    # Negative ids mean "unclustered"; exclude them from the id list
                    unique_clusters = [c for c in sorted(set(clusters)) if c >= 0]
                    n_clustered = sum(clusters >= 0)
                    n_total = len(clusters)

                    umap_path = f"{model}/{ns}/umap"
                    # "o"/"x" as a compact present/absent marker
                    has_umap = "o" if umap_path in f else "x"

                    # NOTE(review): this conditional is a no-op — both branches
                    # yield ns; was a different display name intended here?
                    ns_display = "default" if ns == "default" else ns
                    print(f"  {ns_display}/")
                    print(f"    clusters: {len(unique_clusters)} clusters, {n_clustered}/{n_total} patches")
                    if self.verbose:
                        # Show at most the first 10 cluster ids
                        cluster_list = ", ".join(map(str, unique_clusters[:10]))
                        if len(unique_clusters) > 10:
                            cluster_list += f", ... ({len(unique_clusters)} total)"
                        print(f"      [{cluster_list}]")
                    print(f"    umap: {has_umap}")

                    # Check filters (nested filter groups below this namespace)
                    filter_base = f"{model}/{ns}/filter"
                    if filter_base in f:
                        filters = self._list_filters_recursive(f, filter_base)
                        if filters:
                            print("    filters:")
                            for filter_path in sorted(filters):
                                full_path = f"{filter_base}/{filter_path}/clusters"
                                if full_path in f:
                                    fclusters = f[full_path][:]
                                    funique = [c for c in sorted(set(fclusters)) if c >= 0]
                                    fn_clustered = sum(fclusters >= 0)
                                    print(f"      {filter_path}/ -> {len(funique)} clusters, {fn_clustered} patches")
            print()

    def _print_scores(self, f: h5py.File, result: ShowResult):
        for model in result.models:
            # Score datasets are stored as "scores_<name>" inside the model group
            score_datasets = [k for k in f.get(model, {}).keys() if k.startswith("scores_")]
            if score_datasets:
                print(f"{model.upper()} Scores:")
                for score in score_datasets:
                    # NOTE(review): replace() removes every occurrence of
                    # "scores_", not just the prefix — removeprefix() would be
                    # safer if a score name could contain that substring.
                    score_name = score.replace("scores_", "")
                    print(f"  {score_name}")
                print()

    def _list_filters_recursive(self, f: h5py.File, base_path: str, prefix: str = "") -> list[str]:
        """Recursively list all filter paths"""
        # Collects relative paths like "1+2" and "1+2/filter/0+1" for every
        # filter group that contains a "clusters" dataset.
        filters = []
        if base_path not in f:
            return filters

        for key in f[base_path].keys():
            current_path = f"{prefix}{key}"
            item_path = f"{base_path}/{key}"

            if isinstance(f[item_path], h5py.Group):
                # Only groups that actually carry cluster results are listed
                if "clusters" in f[item_path]:
                    filters.append(current_path)

                # Descend into nested "filter" groups, extending the prefix
                nested_base = f"{item_path}/filter"
                if nested_base in f:
                    nested = self._list_filters_recursive(f, nested_base, f"{current_path}/filter/")
                    filters.extend(nested)

        return filters
+ return filters