wsi-toolbox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,198 @@
+"""
+Clustering command for WSI features
+"""
+
+import h5py
+import numpy as np
+from pydantic import BaseModel
+
+from ..utils.analysis import leiden_cluster, reorder_clusters_by_pca
+from ..utils.hdf5_paths import build_cluster_path, build_namespace, ensure_groups
+from . import _get, _progress, get_config
+from .data_loader import MultipleContext
+
+
+class ClusteringResult(BaseModel):
+    """Result of clustering operation"""
+
+    cluster_count: int
+    feature_count: int
+    target_path: str
+    skipped: bool = False
+
+
+class ClusteringCommand:
+    """
+    Perform Leiden clustering on features or UMAP coordinates
+
+    Input:
+        - features (from <model>/features)
+        - namespace + filters (recursive hierarchy)
+        - source: "features" or "umap"
+        - resolution: clustering resolution
+
+    Output:
+        - clusters written to deepest level
+        - metadata (resolution, source) saved as HDF5 attributes
+
+    Example hierarchy:
+        uni/default/filter/1+2+3/filter/4+5/clusters
+            ↑ with attributes: resolution=1.0, source="features"
+
+    Usage:
+        # Basic clustering
+        cmd = ClusteringCommand(resolution=1.0)
+        result = cmd('data.h5')  # → uni/default/clusters
+
+        # Filtered clustering
+        cmd = ClusteringCommand(parent_filters=[[1,2,3], [4,5]])
+        result = cmd('data.h5')  # → uni/default/filter/1+2+3/filter/4+5/clusters
+
+        # UMAP-based clustering
+        cmd = ClusteringCommand(source="umap")
+        result = cmd('data.h5')  # → uses uni/default/umap
+    """
+
+    def __init__(
+        self,
+        resolution: float = 1.0,
+        namespace: str | None = None,
+        parent_filters: list[list[int]] | None = None,
+        source: str = "features",
+        sort_clusters: bool = True,
+        overwrite: bool = False,
+        model_name: str | None = None,
+    ):
+        """
+        Args:
+            resolution: Leiden clustering resolution
+            namespace: Explicit namespace (None = auto-generate)
+            parent_filters: Hierarchical filters, e.g., [[1,2,3], [4,5]]
+            source: "features" or "umap"
+            sort_clusters: Reorder cluster IDs by PCA distribution (default: True)
+            overwrite: Overwrite existing clusters
+            model_name: Model name (None = use global default)
+        """
+        self.resolution = resolution
+        self.namespace = namespace
+        self.parent_filters = parent_filters or []
+        self.source = source
+        self.sort_clusters = sort_clusters
+        self.overwrite = overwrite
+        self.model_name = _get("model_name", model_name)
+
+        # Validate
+        if self.model_name not in ["uni", "gigapath", "virchow2"]:
+            raise ValueError(f"Invalid model: {self.model_name}")
+        if self.source not in ["features", "umap"]:
+            raise ValueError(f"Invalid source: {self.source}")
+
+        # Internal state
+        self.hdf5_paths = []
+        self.clusters = None
+
+    def __call__(self, hdf5_paths: str | list[str]) -> ClusteringResult:
+        """
+        Execute clustering
+
+        Args:
+            hdf5_paths: Single HDF5 path or list of paths
+
+        Returns:
+            ClusteringResult
+        """
+        # Normalize to list
+        if isinstance(hdf5_paths, str):
+            hdf5_paths = [hdf5_paths]
+        self.hdf5_paths = hdf5_paths
+
+        # Determine namespace
+        if self.namespace is None:
+            self.namespace = build_namespace(hdf5_paths)
+        elif "+" in self.namespace:
+            raise ValueError("Namespace cannot contain '+' (reserved for multi-file auto-generated namespaces)")
+
+        # Build target path
+        target_path = build_cluster_path(
+            self.model_name, self.namespace, filters=self.parent_filters, dataset="clusters"
+        )
+
+        # Check if already exists
+        if not self.overwrite:
+            with h5py.File(hdf5_paths[0], "r") as f:
+                if target_path in f:
+                    clusters = f[target_path][:]
+                    cluster_count = len([c for c in set(clusters) if c >= 0])
+                    if get_config().verbose:
+                        print(f"Clusters already exist at {target_path}")
+                    return ClusteringResult(
+                        cluster_count=cluster_count,
+                        feature_count=np.sum(clusters >= 0),
+                        target_path=target_path,
+                        skipped=True,
+                    )
+
+        # Execute with progress tracking
+        # Total: 1 (load) + 5 (clustering steps) + 1 (write) = 7
+        with _progress(total=7, desc="Clustering") as pbar:
+            # Load data
+            pbar.set_description("Loading data")
+            ctx = MultipleContext(hdf5_paths, self.model_name, self.namespace, self.parent_filters)
+            data = ctx.load_features(source=self.source)
+            pbar.update(1)
+
+            # Perform clustering using analysis module
+            def on_progress(msg: str):
+                pbar.set_description(msg)
+                pbar.update(1)
+
+            self.clusters = leiden_cluster(
+                data,
+                resolution=self.resolution,
+                on_progress=on_progress,
+            )
+
+            # Reorder cluster IDs by PCA distribution for consistent visualization
+            if self.sort_clusters:
+                pbar.set_description("Sorting clusters")
+                features = ctx.load_features(source="features")
+                from sklearn.decomposition import PCA  # noqa: PLC0415
+
+                pca = PCA(n_components=1)
+                pca1 = pca.fit_transform(features).flatten()
+                self.clusters = reorder_clusters_by_pca(self.clusters, pca1)
+
+            cluster_count = len(set(self.clusters))
+
+            # Write results
+            pbar.set_description("Writing results")
+            self._write_results(ctx, target_path)
+            pbar.update(1)
+
+        # Verbose output after progress bar closes
+        if get_config().verbose:
+            print(f"Loaded {len(data)} samples from {self.source}")
+            print(f"Found {cluster_count} clusters")
+            print(f"Wrote {target_path} to {len(hdf5_paths)} file(s)")
+
+        return ClusteringResult(cluster_count=cluster_count, feature_count=len(data), target_path=target_path)
+
+    def _write_results(self, ctx: MultipleContext, target_path: str):
+        """Write clustering results to HDF5 files"""
+        for file_slice in ctx:
+            clusters = file_slice.slice(self.clusters)
+
+            with h5py.File(file_slice.hdf5_path, "a") as f:
+                ensure_groups(f, target_path)
+
+                if target_path in f:
+                    del f[target_path]
+
+                # Fill with -1 for filtered patches
+                full_clusters = np.full(len(file_slice.mask), -1, dtype=clusters.dtype)
+                full_clusters[file_slice.mask] = clusters
+
+                ds = f.create_dataset(target_path, data=full_clusters)
+                ds.attrs["resolution"] = self.resolution
+                ds.attrs["source"] = self.source
+                ds.attrs["model"] = self.model_name
@@ -0,0 +1,219 @@
+"""
+Multi-file context for HDF5 operations with namespace and filter support.
+"""
+
+from dataclasses import dataclass
+
+import h5py
+import numpy as np
+
+from ..utils.hdf5_paths import build_cluster_path
+
+
+@dataclass
+class FileSlice:
+    """Context for a single file in multi-file operations."""
+
+    hdf5_path: str
+    mask: np.ndarray
+    start: int
+    end: int
+
+    def slice(self, data: np.ndarray) -> np.ndarray:
+        """Extract this file's portion from concatenated data."""
+        return data[self.start : self.end]
+
+    @property
+    def count(self) -> int:
+        """Number of masked (active) samples in this file."""
+        return self.end - self.start
+
+
+class MultipleContext:
+    """
+    Multi-file context for HDF5 operations with namespace + filters.
+
+    Handles the common pattern of:
+    1. Loading existing clusters at each filter level
+    2. Building cumulative mask
+    3. Loading features/UMAP coordinates with the mask
+    4. Iterating over files for writing results
+
+    Usage:
+        ctx = MultipleContext(hdf5_paths, model_name, namespace, filters)
+        data = ctx.load_features()
+
+        results = some_computation(data)
+
+        for file_slice in ctx:
+            file_results = file_slice.slice(results)
+            with h5py.File(file_slice.hdf5_path, "a") as f:
+                # write file_results with file_slice.mask
+    """
+
+    def __init__(
+        self,
+        hdf5_paths: list[str],
+        model_name: str,
+        namespace: str,
+        parent_filters: list[list[int]] | None = None,
+    ):
+        """
+        Args:
+            hdf5_paths: List of HDF5 file paths
+            model_name: Model name (e.g., "uni")
+            namespace: Namespace (e.g., "default", "001+002")
+            parent_filters: Hierarchical filters, e.g., [[1,2,3], [4,5]]
+        """
+        self.hdf5_paths = hdf5_paths
+        self.model_name = model_name
+        self.namespace = namespace
+        self.parent_filters = parent_filters or []
+
+        # Populated after load_features()
+        self._masks: list[np.ndarray] | None = None
+        self._total_count: int = 0
+
+    def load_features(self, source: str = "features") -> np.ndarray:
+        """
+        Load features or UMAP coordinates with filtering.
+
+        Args:
+            source: "features" or "umap"
+
+        Returns:
+            Concatenated and normalized features/UMAP coordinates
+        """
+        data_list = []
+        self._masks = []
+
+        for hdf5_path in self.hdf5_paths:
+            with h5py.File(hdf5_path, "r") as f:
+                patch_count = f["metadata/patch_count"][()]
+
+                # Build cumulative mask from filters
+                mask = self._build_mask(f, patch_count)
+
+                # Validate mask length
+                if len(mask) != patch_count:
+                    raise RuntimeError(f"Mask length mismatch in {hdf5_path}: expected {patch_count}, got {len(mask)}")
+
+                self._masks.append(mask)
+
+                # Load data based on source
+                if source == "umap":
+                    umap_path = build_cluster_path(
+                        self.model_name,
+                        self.namespace,
+                        filters=self.parent_filters if self.parent_filters else None,
+                        dataset="umap",
+                    )
+                    if umap_path not in f:
+                        raise RuntimeError(f"UMAP coordinates not found at {umap_path}. Run 'wsi-toolbox umap' first.")
+                    data = f[umap_path][mask]
+                    if np.any(np.isnan(data)):
+                        raise RuntimeError(f"NaN values in UMAP coordinates at {umap_path}")
+                else:
+                    feature_path = f"{self.model_name}/features"
+                    if feature_path not in f:
+                        raise RuntimeError(f"Features not found at {feature_path} in {hdf5_path}")
+                    data = f[feature_path][mask]
+
+                data_list.append(data)
+
+        # Concatenate and normalize
+        # Lazy import: sklearn is slow to load (~600ms), defer until needed
+        from sklearn.preprocessing import StandardScaler  # noqa: PLC0415
+
+        data = np.concatenate(data_list)
+        self._total_count = len(data)
+
+        scaler = StandardScaler()
+        data = scaler.fit_transform(data)
+
+        return data
+
+    def __iter__(self):
+        """Iterate over files yielding FileSlice for each."""
+        if self._masks is None:
+            raise RuntimeError("Call load_features() before iterating")
+
+        cursor = 0
+        for hdf5_path, mask in zip(self.hdf5_paths, self._masks):
+            count = np.sum(mask)
+            yield FileSlice(
+                hdf5_path=hdf5_path,
+                mask=mask,
+                start=cursor,
+                end=cursor + count,
+            )
+            cursor += count
+
+    def __len__(self) -> int:
+        """Total number of samples across all files."""
+        return self._total_count
+
+    @property
+    def masks(self) -> list[np.ndarray]:
+        """Get masks (for backward compatibility)."""
+        if self._masks is None:
+            raise RuntimeError("Call load_features() before accessing masks")
+        return self._masks
+
+    def _build_mask(self, f: h5py.File, patch_count: int) -> np.ndarray:
+        """
+        Build the cumulative mask from hierarchical filters.
+
+        Strategy: only read the deepest existing cluster level.
+        - If filters = [[1,2,3], [4,5]], read the clusters stored at filter/1+2+3
+        - Those clusters are already restricted to [1,2,3], so we only need to filter by [4,5]
+        """
+        if not self.parent_filters:
+            # No filtering
+            return np.ones(patch_count, dtype=bool)
+
+        # Get the deepest cluster path (parent of where we'll write new clusters)
+        # If filters = [[1,2,3], [4,5]], we need clusters at filter/1+2+3/
+        parent_cluster_path = build_cluster_path(
+            self.model_name,
+            self.namespace,
+            filters=self.parent_filters[:-1] if len(self.parent_filters) > 1 else None,
+            dataset="clusters",
+        )
+
+        if parent_cluster_path not in f:
+            raise RuntimeError(
+                f"Parent clusters not found at {parent_cluster_path}. Run clustering at parent level first."
+            )
+
+        clusters = f[parent_cluster_path][:]
+
+        # Filter by the last filter only (because previous filters are already applied)
+        last_filter = self.parent_filters[-1]
+        mask = np.isin(clusters, last_filter)
+
+        return mask
+
+    def get_parent_cluster_info(self, hdf5_path: str) -> tuple[np.ndarray, np.ndarray]:
+        """
+        Get parent clusters and mask for a single file
+
+        Returns:
+            (clusters, mask): Parent cluster values and boolean mask
+        """
+        with h5py.File(hdf5_path, "r") as f:
+            patch_count = f["metadata/patch_count"][()]
+            mask = self._build_mask(f, patch_count)
+
+            if self.parent_filters:
+                parent_cluster_path = build_cluster_path(
+                    self.model_name,
+                    self.namespace,
+                    filters=self.parent_filters[:-1] if len(self.parent_filters) > 1 else None,
+                    dataset="clusters",
+                )
+                clusters = f[parent_cluster_path][:]
+            else:
+                clusters = None
+
+        return clusters, mask
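
The Usage docstring above stops at a comment; the sketch below completes the read-compute-write pattern in the same spirit as ClusteringCommand._write_results. The import path, the per-patch "scores" computation, and the target dataset name are illustrative assumptions, and the parent filter requires clusters to already exist at uni/default/clusters:

    # Sketch only: import path, score computation, and output path are assumptions.
    import h5py
    import numpy as np

    from wsi_toolbox.commands.data_loader import MultipleContext  # hypothetical module path

    ctx = MultipleContext(["data.h5"], model_name="uni", namespace="default", parent_filters=[[1, 2]])
    data = ctx.load_features(source="features")  # standardized features of patches in root clusters 1 and 2

    scores = data.mean(axis=1)  # stand-in for some real per-patch computation

    for file_slice in ctx:
        file_scores = file_slice.slice(scores)  # this file's share of the concatenated results
        with h5py.File(file_slice.hdf5_path, "a") as f:
            # Expand back to the full patch grid; filtered-out patches get NaN
            full = np.full(len(file_slice.mask), np.nan, dtype=file_scores.dtype)
            full[file_slice.mask] = file_scores
            f.create_dataset("uni/default/filter/1+2/scores", data=full)  # hypothetical target path
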
@@ -0,0 +1,160 @@
+"""
+DZI export command for Deep Zoom Image format
+"""
+
+from pathlib import Path
+
+from PIL import Image
+from pydantic import BaseModel
+
+from ..wsi_files import PyramidalWSIFile, WSIFile, create_wsi_file
+from . import _progress, get_config
+
+
+class DziResult(BaseModel):
+    """Result of DZI export"""
+
+    dzi_path: str
+    max_level: int
+    tile_size: int
+    overlap: int
+    width: int
+    height: int
+
+
+class DziCommand:
+    """
+    Export WSI to DZI (Deep Zoom Image) format
+
+    Usage:
+        cmd = DziCommand(tile_size=256, overlap=0, jpeg_quality=90)
+        result = cmd(wsi_path='slide.svs', output_dir='output', name='slide')
+
+        # Or with existing WSIFile instance
+        wsi = create_wsi_file('slide.svs')
+        result = cmd(wsi_file=wsi, output_dir='output', name='slide')
+    """
+
+    def __init__(
+        self,
+        tile_size: int = 256,
+        overlap: int = 0,
+        jpeg_quality: int = 90,
+        format: str = "jpeg",
+    ):
+        """
+        Initialize DZI export command
+
+        Args:
+            tile_size: Tile size in pixels (default: 256)
+            overlap: Overlap in pixels (default: 0)
+            jpeg_quality: JPEG compression quality (0-100)
+            format: Image format ("jpeg" or "png")
+        """
+        self.tile_size = tile_size
+        self.overlap = overlap
+        self.jpeg_quality = jpeg_quality
+        self.format = format
+
+    def __call__(
+        self,
+        wsi_path: str | None = None,
+        wsi_file: WSIFile | None = None,
+        output_dir: str = ".",
+        name: str = "slide",
+    ) -> DziResult:
+        """
+        Export WSI to DZI format
+
+        Args:
+            wsi_path: Path to WSI file (either this or wsi_file required)
+            wsi_file: WSIFile instance (either this or wsi_path required)
+            output_dir: Output directory
+            name: Base name for DZI files
+
+        Returns:
+            DziResult: Export metadata
+        """
+        # Get or create WSIFile
+        if wsi_file is None:
+            if wsi_path is None:
+                raise ValueError("Either wsi_path or wsi_file must be provided")
+            wsi_file = create_wsi_file(wsi_path)
+
+        # Check if pyramidal (DZI supported)
+        if not isinstance(wsi_file, PyramidalWSIFile):
+            raise TypeError(
+                f"DZI export requires PyramidalWSIFile, got {type(wsi_file).__name__}. "
+                "StandardImage does not support DZI export."
+            )
+
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Get dimensions
+        width, height = wsi_file.get_original_size()
+        max_level = wsi_file.get_dzi_max_level()
+
+        if get_config().verbose:
+            print(f"Original size: {width}x{height}")
+            print(f"Tile size: {self.tile_size}, Overlap: {self.overlap}")
+            print(f"Max zoom level: {max_level}")
+
+        # Setup directories
+        dzi_path = output_dir / f"{name}.dzi"
+        files_dir = output_dir / f"{name}_files"
+        files_dir.mkdir(exist_ok=True)
+
+        # Generate all levels
+        for level in range(max_level, -1, -1):
+            self._generate_level(wsi_file, files_dir, level)
+
+        # Write DZI XML
+        dzi_xml = wsi_file.get_dzi_xml(self.tile_size, self.overlap, self.format)
+        with open(dzi_path, "w", encoding="utf-8") as f:
+            f.write(dzi_xml)
+
+        if get_config().verbose:
+            print(f"DZI export complete: {dzi_path}")
+
+        return DziResult(
+            dzi_path=str(dzi_path),
+            max_level=max_level,
+            tile_size=self.tile_size,
+            overlap=self.overlap,
+            width=width,
+            height=height,
+        )
+
+    def _generate_level(
+        self,
+        wsi_file: PyramidalWSIFile,
+        files_dir: Path,
+        level: int,
+    ):
+        """Generate all tiles for a single level."""
+        level_dir = files_dir / str(level)
+        level_dir.mkdir(exist_ok=True)
+
+        level_width, level_height, cols, rows = wsi_file.get_dzi_level_info(level, self.tile_size)
+
+        if get_config().verbose:
+            print(f"Level {level}: {level_width}x{level_height}, {cols}x{rows} tiles")
+
+        ext = "png" if self.format == "png" else "jpeg"
+
+        tq = _progress(range(rows))
+        for row in tq:
+            tq.set_description(f"Level {level}: row {row + 1}/{rows}")
+            for col in range(cols):
+                tile_path = level_dir / f"{col}_{row}.{ext}"
+
+                # Get tile from WSIFile
+                tile_array = wsi_file.get_dzi_tile(level, col, row, self.tile_size, self.overlap)
+
+                # Save tile
+                img = Image.fromarray(tile_array)
+                if self.format == "png":
+                    img.save(tile_path, "PNG")
+                else:
+                    img.save(tile_path, "JPEG", quality=self.jpeg_quality)
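
A short usage sketch based on the DziCommand docstring above; the import path and 'slide.svs' are assumptions, and the slide must be a pyramidal format readable by create_wsi_file (otherwise the command raises TypeError):

    # Sketch only: import path and input file are assumptions, not shown in this diff.
    from wsi_toolbox.commands.dzi import DziCommand  # hypothetical module path

    cmd = DziCommand(tile_size=256, overlap=0, jpeg_quality=90)
    result = cmd(wsi_path="slide.svs", output_dir="out", name="slide")

    # Expected on-disk layout, following the code above:
    #   out/slide.dzi                              <- Deep Zoom descriptor XML
    #   out/slide_files/<level>/<col>_<row>.jpeg   <- tiles for level = 0 .. result.max_level
    print(result.max_level, result.width, result.height)
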