wsi-toolbox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wsi_toolbox/__init__.py +122 -0
- wsi_toolbox/app.py +874 -0
- wsi_toolbox/cli.py +599 -0
- wsi_toolbox/commands/__init__.py +66 -0
- wsi_toolbox/commands/clustering.py +198 -0
- wsi_toolbox/commands/data_loader.py +219 -0
- wsi_toolbox/commands/dzi.py +160 -0
- wsi_toolbox/commands/patch_embedding.py +196 -0
- wsi_toolbox/commands/pca.py +206 -0
- wsi_toolbox/commands/preview.py +394 -0
- wsi_toolbox/commands/show.py +171 -0
- wsi_toolbox/commands/umap_embedding.py +174 -0
- wsi_toolbox/commands/wsi.py +223 -0
- wsi_toolbox/common.py +148 -0
- wsi_toolbox/models.py +30 -0
- wsi_toolbox/utils/__init__.py +109 -0
- wsi_toolbox/utils/analysis.py +174 -0
- wsi_toolbox/utils/hdf5_paths.py +232 -0
- wsi_toolbox/utils/plot.py +227 -0
- wsi_toolbox/utils/progress.py +207 -0
- wsi_toolbox/utils/seed.py +26 -0
- wsi_toolbox/utils/st.py +55 -0
- wsi_toolbox/utils/white.py +121 -0
- wsi_toolbox/watcher.py +256 -0
- wsi_toolbox/wsi_files.py +619 -0
- wsi_toolbox-0.2.0.dist-info/METADATA +253 -0
- wsi_toolbox-0.2.0.dist-info/RECORD +30 -0
- wsi_toolbox-0.2.0.dist-info/WHEEL +4 -0
- wsi_toolbox-0.2.0.dist-info/entry_points.txt +3 -0
- wsi_toolbox-0.2.0.dist-info/licenses/LICENSE +21 -0
wsi_toolbox/commands/umap_embedding.py
ADDED
@@ -0,0 +1,174 @@
"""
UMAP embedding command for dimensionality reduction
"""

import h5py
import numpy as np
from pydantic import BaseModel

from ..utils.hdf5_paths import build_cluster_path, build_namespace, ensure_groups
from . import _get, _progress, get_config
from .data_loader import MultipleContext


class UmapResult(BaseModel):
    """Result of UMAP embedding operation"""

    n_samples: int
    n_components: int
    namespace: str
    target_path: str
    skipped: bool = False


class UmapCommand:
    """
    Compute UMAP embeddings from features

    Usage:
        # Basic UMAP
        cmd = UmapCommand()
        result = cmd('data.h5')  # → uni/default/umap

        # Multi-file UMAP
        cmd = UmapCommand()
        result = cmd(['001.h5', '002.h5'])  # → uni/001+002/umap

        # UMAP for filtered data
        cmd = UmapCommand(parent_filters=[[1,2,3]])
        result = cmd('data.h5')  # → uni/default/filter/1+2+3/umap
    """

    def __init__(
        self,
        namespace: str | None = None,
        parent_filters: list[list[int]] | None = None,
        n_components: int = 2,
        n_neighbors: int = 15,
        min_dist: float = 0.1,
        metric: str = "euclidean",
        overwrite: bool = False,
        model_name: str | None = None,
    ):
        """
        Initialize UMAP command

        Args:
            namespace: Explicit namespace (None = auto-generate from input paths)
            parent_filters: Hierarchical filters, e.g., [[1,2,3]]
            n_components: Number of UMAP dimensions (default: 2)
            n_neighbors: UMAP n_neighbors parameter (default: 15)
            min_dist: UMAP min_dist parameter (default: 0.1)
            metric: UMAP metric (default: "euclidean")
            overwrite: Whether to overwrite existing UMAP coordinates
            model_name: Model name (None to use global default)
        """
        self.namespace = namespace
        self.parent_filters = parent_filters or []
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.metric = metric
        self.overwrite = overwrite
        self.model_name = _get("model_name", model_name)

        # Validate model
        if self.model_name not in ["uni", "gigapath", "virchow2"]:
            raise ValueError(f"Invalid model: {self.model_name}")

        # Internal state
        self.hdf5_paths = []
        self.umap_embeddings = None

    def __call__(self, hdf5_paths: str | list[str]) -> UmapResult:
        """Execute UMAP embedding"""
        import umap  # noqa: PLC0415 - lazy load, umap is slow to import

        # Normalize to list
        if isinstance(hdf5_paths, str):
            hdf5_paths = [hdf5_paths]
        self.hdf5_paths = hdf5_paths

        # Determine namespace
        if self.namespace is None:
            self.namespace = build_namespace(hdf5_paths)
        elif "+" in self.namespace:
            raise ValueError("Namespace cannot contain '+' (reserved for multi-file auto-generated namespaces)")

        # Build target path
        target_path = build_cluster_path(self.model_name, self.namespace, filters=self.parent_filters, dataset="umap")

        # Check if already exists
        if not self.overwrite:
            with h5py.File(hdf5_paths[0], "r") as f:
                if target_path in f:
                    umap_coords = f[target_path][:]
                    n_samples = np.sum(~np.isnan(umap_coords[:, 0]))
                    if get_config().verbose:
                        print(f"UMAP already exists at {target_path}")
                    return UmapResult(
                        n_samples=n_samples,
                        n_components=self.n_components,
                        namespace=self.namespace,
                        target_path=target_path,
                        skipped=True,
                    )

        # Execute with progress tracking
        with _progress(total=3, desc="UMAP") as pbar:
            # Load features
            pbar.set_description("Loading features")
            ctx = MultipleContext(hdf5_paths, self.model_name, self.namespace, self.parent_filters)
            features = ctx.load_features(source="features")
            pbar.update(1)

            # Compute UMAP
            pbar.set_description("Computing UMAP")
            reducer = umap.UMAP(
                n_components=self.n_components,
                n_neighbors=self.n_neighbors,
                min_dist=self.min_dist,
                metric=self.metric,
            )
            self.umap_embeddings = reducer.fit_transform(features)
            pbar.update(1)

            # Write results
            pbar.set_description("Writing results")
            self._write_results(ctx, target_path)
            pbar.update(1)

        # Verbose output after progress bar closes
        if get_config().verbose:
            print(f"Computing UMAP: {len(features)} samples → {self.n_components}D")
            print(f"Wrote {target_path} to {len(hdf5_paths)} file(s)")

        return UmapResult(
            n_samples=len(features), n_components=self.n_components, namespace=self.namespace, target_path=target_path
        )

    def _write_results(self, ctx: MultipleContext, target_path: str):
        """Write UMAP coordinates to HDF5 files"""
        for file_slice in ctx:
            umap_coords = file_slice.slice(self.umap_embeddings)

            with h5py.File(file_slice.hdf5_path, "a") as f:
                ensure_groups(f, target_path)

                if target_path in f:
                    del f[target_path]

                # Fill with NaN for filtered patches
                full_umap = np.full((len(file_slice.mask), self.n_components), np.nan, dtype=umap_coords.dtype)
                full_umap[file_slice.mask] = umap_coords

                ds = f.create_dataset(target_path, data=full_umap)
                ds.attrs["n_components"] = self.n_components
                ds.attrs["n_neighbors"] = self.n_neighbors
                ds.attrs["min_dist"] = self.min_dist
                ds.attrs["metric"] = self.metric
                ds.attrs["model"] = self.model_name

    def get_embeddings(self):
        """Get computed UMAP embeddings"""
        return self.umap_embeddings
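A minimal end-to-end sketch of the command above; the import path follows this wheel's layout, and the .h5 filenames are placeholders:

from wsi_toolbox.commands.umap_embedding import UmapCommand

cmd = UmapCommand(n_neighbors=15, min_dist=0.1)
result = cmd(['001.h5', '002.h5'])  # writes uni/001+002/umap into each file
if not result.skipped:
    coords = cmd.get_embeddings()  # (n_samples, n_components) array
print(result.target_path)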

wsi_toolbox/commands/wsi.py
ADDED
@@ -0,0 +1,223 @@
"""
WSI to HDF5 conversion command
"""

import os
from typing import Callable

import cv2
import h5py
import numpy as np
from pydantic import BaseModel

from ..utils.white import create_white_detector
from ..wsi_files import create_wsi_file
from . import _progress, get_config


class Wsi2HDF5Result(BaseModel):
    """Result of WSI to HDF5 conversion"""

    mpp: float
    original_mpp: float
    scale: int
    patch_count: int
    patch_size: int
    cols: int
    rows: int
    output_path: str


class Wsi2HDF5Command:
    """
    Convert a WSI image to HDF5 format with patch extraction

    Usage:
        # Set global config once
        commands.set_default_progress('tqdm')

        # Create and run command
        cmd = Wsi2HDF5Command(patch_size=256, engine='auto')
        result = cmd(input_path='image.ndpi', output_path='output.h5')
    """

    def __init__(
        self,
        patch_size: int = 256,
        engine: str = "auto",
        mpp: float = 0.5,
        rotate: bool = False,
        white_detector: Callable[[np.ndarray], bool] | None = None,
    ):
        """
        Initialize WSI to HDF5 converter

        Args:
            patch_size: Size of patches to extract
            engine: WSI reader engine ('auto', 'openslide', 'tifffile', 'standard')
            mpp: Microns per pixel (for standard images)
            rotate: Whether to rotate patches 180 degrees
            white_detector: Function that takes an (H, W, 3) array and returns bool.
                If None, uses legacy is_white_patch with default params.

        Note:
            progress and verbose are controlled by global config:
            - commands.set_default_progress('tqdm')
            - commands.set_verbose(True/False)
        """
        self.patch_size = patch_size
        self.engine = engine
        self.mpp = mpp
        self.rotate = rotate

        # Set white detection function
        if white_detector is None:
            # Default: use ptp method with default threshold
            self._is_white_patch = create_white_detector("ptp")
        else:
            self._is_white_patch = white_detector

    def __call__(self, input_path: str, output_path: str) -> Wsi2HDF5Result:
        """
        Execute WSI to HDF5 conversion

        Args:
            input_path: Path to input WSI file
            output_path: Path to output HDF5 file

        Returns:
            Wsi2HDF5Result: Metadata including mpp, scale, patch_count
        """
        # Create WSI reader
        wsi = create_wsi_file(input_path, engine=self.engine, mpp=self.mpp)

        # Calculate scale based on mpp
        original_mpp = wsi.get_mpp()

        if 0.360 < original_mpp < 0.660:
            # Scan is already close to the 0.5 mpp target
            scale = 1
        elif original_mpp < 0.360:
            # Higher-resolution scan: downsample by 2 to approach 0.5 mpp
            scale = 2
        else:
            raise RuntimeError(f"Invalid mpp: {original_mpp:.6f}")

        mpp = original_mpp * scale

        # Get image dimensions
        W, H = wsi.get_original_size()
        S = self.patch_size  # Scaled patch size
        T = S * scale  # Original patch size

        x_patch_count = W // T
        y_patch_count = H // T
        width = (W // T) * T
        row_count = H // T
        coordinates = []

        try:
            # Create HDF5 file
            with h5py.File(output_path, "w") as f:
                # Write metadata (both as datasets and attrs for migration)
                f.create_dataset("metadata/original_mpp", data=original_mpp)
                f.create_dataset("metadata/original_width", data=W)
                f.create_dataset("metadata/original_height", data=H)
                f.create_dataset("metadata/image_level", data=0)
                f.create_dataset("metadata/mpp", data=mpp)
                f.create_dataset("metadata/scale", data=scale)
                f.create_dataset("metadata/patch_size", data=S)
                f.create_dataset("metadata/cols", data=x_patch_count)
                f.create_dataset("metadata/rows", data=y_patch_count)

                # Also save as attrs for future migration
                f.attrs["original_mpp"] = original_mpp
                f.attrs["original_width"] = W
                f.attrs["original_height"] = H
                f.attrs["image_level"] = 0
                f.attrs["mpp"] = mpp
                f.attrs["scale"] = scale
                f.attrs["patch_size"] = S
                f.attrs["cols"] = x_patch_count
                f.attrs["rows"] = y_patch_count

                # Create patches dataset
                total_patches = f.create_dataset(
                    "patches",
                    shape=(x_patch_count * y_patch_count, S, S, 3),
                    dtype=np.uint8,
                    chunks=(1, S, S, 3),
                    compression="gzip",
                    compression_opts=9,
                )

                # Extract patches row by row
                cursor = 0
                tq = _progress(range(row_count))
                for row in tq:
                    # Read one row
                    image = wsi.read_region((0, row * T, width, T))
                    image = cv2.resize(image, (width // scale, S), interpolation=cv2.INTER_LANCZOS4)

                    # Reshape into patches
                    patches = image.reshape(1, S, x_patch_count, S, 3)  # (y, h, x, w, 3)
                    patches = patches.transpose(0, 2, 1, 3, 4)  # (y, x, h, w, 3)
                    patches = patches[0]

                    # Filter white patches and collect valid ones
                    batch = []
                    for col, patch in enumerate(patches):
                        if self._is_white_patch(patch):
                            continue

                        if self.rotate:
                            patch = cv2.rotate(patch, cv2.ROTATE_180)
                            coordinates.append(((x_patch_count - 1 - col) * S, (y_patch_count - 1 - row) * S))
                        else:
                            coordinates.append((col * S, row * S))

                        batch.append(patch)

                    # Write batch (guard: a fully white row yields an empty batch)
                    if batch:
                        batch = np.array(batch)
                        total_patches[cursor : cursor + len(batch), ...] = batch
                        cursor += len(batch)

                    tq.set_description(f"Selected {len(batch)}/{len(patches)} patches (row {row}/{y_patch_count})")
                    tq.refresh()

                # Resize to actual patch count and save coordinates
                patch_count = len(coordinates)
                f.create_dataset("coordinates", data=coordinates)
                f["patches"].resize((patch_count, S, S, 3))
                f.create_dataset("metadata/patch_count", data=patch_count)
                f.attrs["patch_count"] = patch_count

        except BaseException:
            # Clean up incomplete file on error (including Ctrl-C)
            if os.path.exists(output_path):
                os.remove(output_path)
            raise

        # Verbose output after progress bar closes
        if get_config().verbose:
            print(f"Original mpp: {original_mpp:.6f}")
            print(f"Image mpp: {mpp:.6f}")
            print(f"Target resolution: {W} x {H}")
            print(f"Obtained resolution: {x_patch_count * S} x {y_patch_count * S}")
            print(f"Scale: {scale}")
            print(f"Patch size: {T}")
            print(f"Scaled patch size: {S}")
            print(f"Row count: {y_patch_count}")
            print(f"Col count: {x_patch_count}")
            print(f"{patch_count} patches were selected.")

        return Wsi2HDF5Result(
            mpp=mpp,
            original_mpp=original_mpp,
            scale=scale,
            patch_count=patch_count,
            patch_size=S,
            cols=x_patch_count,
            rows=y_patch_count,
            output_path=output_path,
        )
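As a sketch of the white_detector hook above: the callable receives each (H, W, 3) patch and returns True to skip it. The detector and threshold below are illustrative, not the package default ('ptp'):

import numpy as np

def brightness_white_detector(patch: np.ndarray) -> bool:
    # Treat a patch as background if its mean intensity is very high
    return patch.mean() > 230

cmd = Wsi2HDF5Command(patch_size=256, white_detector=brightness_white_detector)
result = cmd(input_path='image.ndpi', output_path='output.h5')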
wsi_toolbox/common.py
ADDED
@@ -0,0 +1,148 @@
"""
Global configuration and settings for WSI-toolbox
"""

from functools import partial
from typing import Callable

from matplotlib import pyplot as plt
from pydantic import BaseModel, Field

from .models import MODEL_NAMES, create_foundation_model
from .utils.progress import Progress


# === Global Configuration (Pydantic) ===
class Config(BaseModel):
    """Global configuration for commands"""

    progress: str = Field(default="tqdm", description="Progress bar backend")
    model_name: str = Field(default="uni", description="Default model name")
    model_generator: Callable | None = Field(default=None, description="Model generator function")
    verbose: bool = Field(default=True, description="Verbose output")
    device: str = Field(default="cuda", description="Device for computation")
    cluster_cmap: str = Field(default="tab20", description="Cluster colormap name")

    class Config:
        arbitrary_types_allowed = True


# Global config instance
_config = Config()


def get_config() -> Config:
    """Get global configuration instance"""
    return _config


def set_default_progress(backend: str):
    """Set global default progress backend ('tqdm', 'streamlit', etc.)"""
    _config.progress = backend


def set_default_model(name: str, generator: Callable, label: str | None = None):
    """Set custom model generator as default

    Args:
        name: Model name (used for file paths, etc.)
        generator: Callable that returns a model instance (e.g., lambda: MyModel())
        label: Display label (defaults to name if not provided)

    Example:
        >>> set_default_model('resnet', lambda: torchvision.models.resnet50())
        >>> set_default_model('custom', create_my_model, label='My Custom Model')
    """
    _config.model_name = name
    _config.model_generator = generator


def set_default_model_preset(preset_name: str):
    """Set default model from preset ('uni', 'gigapath', 'virchow2')

    Args:
        preset_name: One of 'uni', 'gigapath', 'virchow2'
    """
    if preset_name not in MODEL_NAMES:
        raise ValueError(f"Invalid preset: {preset_name}. Must be one of {MODEL_NAMES}")

    _config.model_name = preset_name
    _config.model_generator = partial(create_foundation_model, preset_name)


def create_default_model():
    """Create a new model instance using the registered generator.

    Returns:
        torch.nn.Module: Fresh model instance

    Raises:
        RuntimeError: If no model generator is registered

    Example:
        >>> set_default_model_preset('uni')
        >>> model = create_default_model()  # Creates new UNI model instance
    """
    if _config.model_generator is None:
        raise RuntimeError(
            "No model generator registered. Call set_default_model() or set_default_model_preset() first."
        )
    return _config.model_generator()


def set_default_device(device: str):
    """Set global default device ('cuda', 'cpu')"""
    _config.device = device


def set_verbose(verbose: bool):
    """Set global verbosity"""
    _config.verbose = verbose


def set_default_cluster_cmap(cmap_name: str):
    """Set global cluster colormap ('tab20', 'tab10', 'Set1', etc.)"""
    _config.cluster_cmap = cmap_name


def _get_cluster_color(cluster_id: int):
    """
    Get color for cluster ID using global colormap

    Args:
        cluster_id: Cluster ID

    Returns:
        Color in matplotlib format (array or string)
    """
    cmap = plt.get_cmap(_config.cluster_cmap)
    return cmap(cluster_id % cmap.N)  # Modulo by palette size to handle colormaps with limited colors


def _get(key: str, value):
    """Get value or fall back to global default"""
    if value is not None:
        return value
    return getattr(_config, key)


def _progress(iterable=None, total=None, desc="", **kwargs):
    """Create a progress bar using global config backend"""
    return Progress(iterable=iterable, backend=_config.progress, total=total, desc=desc, **kwargs)


__all__ = [
    "Config",
    "get_config",
    "set_default_progress",
    "set_default_model",
    "set_default_model_preset",
    "create_default_model",
    "set_default_device",
    "set_verbose",
    "set_default_cluster_cmap",
    "_get_cluster_color",
    "_get",
    "_progress",
]
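A short sketch of the per-call/global fallback that _get implements; the values are illustrative:

from wsi_toolbox.common import set_default_model_preset, set_verbose, _get

set_default_model_preset('gigapath')  # registers partial(create_foundation_model, 'gigapath')
set_verbose(False)

print(_get('model_name', None))        # 'gigapath' -- falls back to the global config
print(_get('model_name', 'virchow2'))  # 'virchow2' -- an explicit value wins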
wsi_toolbox/models.py
ADDED
@@ -0,0 +1,30 @@
MODEL_NAMES = ["uni", "gigapath", "virchow2"]


def create_foundation_model(model_name: str):
    """
    Create a foundation model instance by preset name.

    Args:
        model_name: One of 'uni', 'gigapath', 'virchow2'

    Returns:
        torch.nn.Module: Model instance (not moved to device, not in eval mode)
    """
    # Lazy import: timm/torch are slow to load (~2s), defer until model creation
    import timm  # noqa: PLC0415
    import torch  # noqa: PLC0415
    from timm.layers import SwiGLUPacked  # noqa: PLC0415

    if model_name == "uni":
        return timm.create_model("hf-hub:MahmoodLab/uni", pretrained=True, dynamic_img_size=True, init_values=1e-5)

    if model_name == "gigapath":
        return timm.create_model("hf_hub:prov-gigapath/prov-gigapath", pretrained=True, dynamic_img_size=True)

    if model_name == "virchow2":
        return timm.create_model(
            "hf-hub:paige-ai/Virchow2", pretrained=True, mlp_layer=SwiGLUPacked, act_layer=torch.nn.SiLU
        )

    raise ValueError(f"Invalid model_name: {model_name}. Must be one of {MODEL_NAMES}")

wsi_toolbox/utils/__init__.py
ADDED
@@ -0,0 +1,109 @@
import sys

import numpy as np
from matplotlib import colors as mcolors
from matplotlib import pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from PIL import Image, ImageDraw
from PIL.Image import Image as ImageType


def yes_no_prompt(question):
    print(f"{question} [Y/n]: ", end="")
    response = input().lower()
    return response == "" or response.startswith("y")


def get_platform_font():
    if sys.platform == "win32":
        # Windows
        font_path = "C:\\Windows\\Fonts\\msgothic.ttc"  # MS Gothic
    elif sys.platform == "darwin":
        # macOS
        font_path = "/System/Library/Fonts/Supplemental/Arial.ttf"
    else:
        # Linux
        # font_path = '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf'  # TODO: propagation
        font_path = "/usr/share/fonts/TTF/DejaVuSans.ttf"
    return font_path


def create_frame(size, color, text, font):
    frame = Image.new("RGBA", (size, size), (0, 0, 0, 0))
    draw = ImageDraw.Draw(frame)
    draw.rectangle((0, 0, size, size), outline=color, width=4)
    text_color = "white" if mcolors.rgb_to_hsv(mcolors.hex2color(color))[2] < 0.9 else "black"
    bbox = np.array(draw.textbbox((0, 0), text, font=font))
    draw.rectangle((4, 4, bbox[2] + 4, bbox[3] + 4), fill=color)
    draw.text((1, 1), text, font=font, fill=text_color)
    return frame


def safe_del(hdf_file, key_path):
    """
    Safely delete a dataset from an HDF5 file if it exists

    Args:
        hdf_file: h5py.File object
        key_path: Dataset path to delete
    """
    if key_path in hdf_file:
        del hdf_file[key_path]


def hover_images_on_scatters(scatters, imagess, ax=None, offset=(150, 30)):
    if ax is None:
        ax = plt.gca()
    fig = ax.figure

    def as_image(image_or_path):
        if isinstance(image_or_path, np.ndarray):
            return image_or_path
        if isinstance(image_or_path, ImageType):
            return image_or_path
        if isinstance(image_or_path, str):
            return Image.open(image_or_path)
        raise RuntimeError("Invalid param", image_or_path)

    imagebox = OffsetImage(as_image(imagess[0][0]), zoom=0.5)
    imagebox.image.axes = ax
    annot = AnnotationBbox(
        imagebox,
        xy=(0, 0),
        # xybox=(256, 256),
        # xycoords='data',
        boxcoords="offset points",
        # boxcoords=('axes fraction', 'data'),
        pad=0.1,
        arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=-0.3"),
        zorder=100,
    )
    annot.set_visible(False)
    ax.add_artist(annot)

    def hover(event):
        vis = annot.get_visible()
        if event.inaxes != ax:
            return
        for n, (sc, ii) in enumerate(zip(scatters, imagess)):
            cont, index = sc.contains(event)
            if cont:
                i = index["ind"][0]
                pos = sc.get_offsets()[i]
                annot.xy = pos
                annot.xybox = pos + np.array(offset)
                image = as_image(ii[i])
                # text = unique_code[n]
                # annot.set_text(text)
                # annot.get_bbox_patch().set_facecolor(cmap(int(text)/10))
                imagebox.set_data(image)
                annot.set_visible(True)
                fig.canvas.draw_idle()
                return

        if vis:
            annot.set_visible(False)
            fig.canvas.draw_idle()
            return

    fig.canvas.mpl_connect("motion_notify_event", hover)
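A self-contained sketch wiring hover_images_on_scatters to a scatter plot; the points and thumbnails are synthetic:

import numpy as np
from matplotlib import pyplot as plt
from wsi_toolbox.utils import hover_images_on_scatters

xy = np.random.rand(30, 2)
thumbs = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(30)]

fig, ax = plt.subplots()
sc = ax.scatter(xy[:, 0], xy[:, 1])
hover_images_on_scatters([sc], [thumbs], ax=ax)  # one image list per scatter
plt.show()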