PyPI - patchworks - Versions diffs - 0.2.0__py3-none-any.whl - Mend

patchworks 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

patchworks/__init__.py +48 -0
patchworks/_chunks.py +258 -0
patchworks/_cluster.py +93 -0
patchworks/_core.py +352 -0
patchworks/_io.py +218 -0
patchworks/_merge.py +405 -0
patchworks/_relabel.py +83 -0
patchworks/plugins/__init__.py +1 -0
patchworks/plugins/cellpose.py +188 -0
patchworks-0.2.0.dist-info/METADATA +294 -0
patchworks-0.2.0.dist-info/RECORD +12 -0
patchworks-0.2.0.dist-info/WHEEL +4 -0

patchworks/_merge.py ADDED Viewed

@@ -0,0 +1,405 @@
+"""Zarr-native label merge: boundary scan → scipy CC → parallel relabel.
+Three steps, all zarr-native with no dask task graph:
+  1. Scan thin boundary slabs → touching label pairs (O(n_faces × face_area))
+  2. scipy sparse connected_components on pairs → relabeling LUT
+  3. Apply LUT to each chunk in parallel via multiprocessing.Pool
+Trade-off: touching-label merge only (overlap_depth=0 semantics for merge).
+IoU-overlap merge is not supported here. Keep overlap > 0 during segmentation
+for boundary-cell context; trim the halo before staging so chunk boundaries
+in the staged zarr are clean for this merge.
+Public API
+----------
+``merge_tile_labels(labeled, write_to, ...)`` — standalone merge for labeled
+dask arrays or pre-staged zarr stores. Use this directly if you already have
+per-tile labels and just need the boundary-stitching step.
+"""
+from __future__ import annotations
+import logging
+import os
+import tempfile
+from contextlib import nullcontext as _nullcontext
+from itertools import product as _iproduct
+from multiprocessing import Pool as _Pool
+from pathlib import Path
+from typing import Any, Union
+import dask.array as da
+import numpy as np
+import zarr
+try:
+    from tqdm.auto import tqdm as _tqdm
+except ImportError:
+    _tqdm = None
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# True when the installed zarr major version is >= 3; _create_zarr_label_array
+# uses it to choose between group.create_array and group.zeros.
+_ZARR_V3 = int(zarr.__version__.split(".")[0]) >= 3
+_LUT_WARN_THRESHOLD = 100_000_000  # warn when max_label > 100 M (LUT > 800 MB)
+# Per-worker globals set by _init_worker.
+# LUT is memory-mapped from disk so it is shared read-only across all workers
+# (OS page cache, no per-process copy). Passing the LUT directly via pickle
+# would deserialize N separate copies — e.g. 4 workers × 800 MB = 3.2 GB wasted.
+_merge_lut: "np.ndarray | None" = None  # relabeling LUT loaded with mmap_mode="r"
+_merge_lut_path: "str | None" = None  # path of the .npy file backing _merge_lut
+_merge_staged_path: "str | None" = None  # input zarr store read by the worker
+_merge_staged_comp: "str | None" = None  # array name inside the input store
+_merge_out_path: "str | None" = None  # output zarr store written by the worker
+_merge_out_comp: "str | None" = None  # array name inside the output store
+def _init_worker(lut_path, staged_path, staged_comp, out_path, out_comp):
+    """Populate the per-process globals read by ``_relabel_chunk_worker``.
+    The LUT is opened with ``mmap_mode="r"`` instead of being passed in, so
+    each process maps the same ``.npy`` file rather than holding its own copy.
+    """
+    global _merge_lut, _merge_lut_path
+    global _merge_staged_path, _merge_staged_comp
+    global _merge_out_path, _merge_out_comp
+    # Load first: if the file cannot be opened, no global is touched.
+    _merge_lut = np.load(lut_path, mmap_mode="r")
+    _merge_lut_path = lut_path
+    _merge_staged_path, _merge_staged_comp = staged_path, staged_comp
+    _merge_out_path, _merge_out_comp = out_path, out_comp
+def _relabel_chunk_worker(chunk_slice: tuple) -> None:
+    """Relabel one chunk of the staged array through the LUT and write it out.
+    Reads ``chunk_slice`` from the staged store, maps every id through the
+    memory-mapped LUT installed by ``_init_worker`` and writes the result, cast
+    to int32, to the same slice of the output array. Ids beyond the end of the
+    LUT are kept unchanged.
+    Parameters
+    ----------
+    chunk_slice:
+        Tuple of slices selecting the region to process.
+    """
+    src = zarr.open_group(_merge_staged_path, mode="r")[_merge_staged_comp]
+    dst = zarr.open_group(_merge_out_path, mode="r+")[_merge_out_comp]
+    block = np.asarray(src[chunk_slice], dtype=np.int64)
+    max_b = int(block.max())
+    if max_b == 0:
+        # Nothing above background in this chunk: copy through without a lookup.
+        dst[chunk_slice] = block.astype(np.int32)
+        return
+    lut = _merge_lut
+    n_lut = len(lut)
+    if max_b < n_lut:
+        out = lut[block]
+    else:
+        # Some ids fall outside the LUT; they map to themselves. Only gather the
+        # in-range ids — extending the LUT with np.concatenate would copy the
+        # whole memory-mapped table into RAM for every such chunk.
+        in_range = block < n_lut
+        out = block.copy()
+        out[in_range] = lut[block[in_range]]
+    dst[chunk_slice] = out.astype(np.int32)
+def _boundary_face_specs(
+    shape: tuple[int, ...], chunk_shape: tuple[int, ...]
+) -> list[tuple[int, int]]:
+    """List every interior chunk boundary as an ``(axis, position)`` pair.
+    ``position`` is the index of the first element after the boundary, i.e.
+    each multiple of the chunk size that is strictly inside the axis extent.
+    Axes are visited in order, positions ascending within each axis.
+    Raises
+    ------
+    ValueError
+        If an axis with interior extent has a non-positive chunk size (the
+        boundary walk would otherwise never terminate).
+    """
+    specs: list[tuple[int, int]] = []
+    for ax, (s, cs) in enumerate(zip(shape, chunk_shape)):
+        if s <= cs:
+            # A single chunk (or an empty axis) has no interior boundary.
+            continue
+        if cs <= 0:
+            raise ValueError(
+                f"chunk size must be positive, got {cs} on axis {ax}"
+            )
+        specs.extend((ax, pos) for pos in range(cs, s, cs))
+    return specs
+def _scan_touching_pairs(
+    zarr_path: str, component: str, chunk_shape: tuple[int, ...]
+) -> np.ndarray:
+    """Collect label pairs that face each other across chunk boundaries.
+    For every interior boundary, the two element layers on either side of it
+    are read one chunk-sized tile at a time (never the whole face in a single
+    read). Positions where both layers hold different non-zero ids contribute
+    a pair. Returns a de-duplicated ``(N, 2)`` int64 array with the smaller id
+    first in each row; ``(0, 2)`` when nothing touches.
+    """
+    labels = zarr.open_group(zarr_path, mode="r")[component]
+    dims = labels.shape
+    ndim = labels.ndim
+    found: list[np.ndarray] = []
+    for axis, cut in _boundary_face_specs(dims, chunk_shape):
+        # Tile the remaining (in-face) axes in chunk-sized steps.
+        others = [d for d in range(ndim) if d != axis]
+        tile_starts = [range(0, dims[d], chunk_shape[d]) for d in others]
+        for starts in _iproduct(*tile_starts):
+            index: list = [slice(None)] * ndim
+            index[axis] = slice(cut - 1, cut + 1)  # one layer on each side
+            for d, start in zip(others, starts):
+                index[d] = slice(start, min(start + chunk_shape[d], dims[d]))
+            slab = np.moveaxis(np.asarray(labels[tuple(index)]), axis, 0)
+            before = slab[0].ravel().astype(np.int64)
+            after = slab[1].ravel().astype(np.int64)
+            touching = (before > 0) & (after > 0) & (before != after)
+            if not touching.any():
+                continue
+            lo = np.minimum(before[touching], after[touching])
+            hi = np.maximum(before[touching], after[touching])
+            found.append(np.unique(np.stack([lo, hi], axis=1), axis=0))
+    if found:
+        return np.unique(np.vstack(found), axis=0)
+    return np.empty((0, 2), dtype=np.int64)
+def _build_relabel_lut(pairs: np.ndarray, max_label: int) -> np.ndarray:
+    """Turn touching pairs into a relabeling lookup table.
+    The pairs are treated as undirected edges between label ids; every id is
+    mapped to the smallest id in its connected component. Ids without edges
+    map to themselves. The result is an int64 array of length
+    ``max_label + 1``. Pairs that mention an id above ``max_label`` are
+    ignored.
+    """
+    if max_label > _LUT_WARN_THRESHOLD:
+        logger.warning(
+            "_build_relabel_lut: max_label=%d → LUT ~%.0f MB. "
+            "Memory use is bounded but large LUTs slow the merge.",
+            max_label, max_label * 8 / 1024**2,
+        )
+    size = max_label + 1
+    identity = np.arange(size, dtype=np.int64)
+    if max_label == 0 or len(pairs) == 0:
+        return identity
+    from scipy.sparse import csr_matrix
+    from scipy.sparse.csgraph import connected_components
+    in_range = (pairs[:, 0] < size) & (pairs[:, 1] < size)
+    edges = pairs[in_range]
+    if len(edges) == 0:
+        return identity
+    # Store each edge in both directions so the adjacency matrix is symmetric.
+    both = np.vstack([edges, edges[:, ::-1]])
+    weights = np.ones(len(both), dtype=np.float32)
+    adjacency = csr_matrix((weights, (both[:, 0], both[:, 1])), shape=(size, size))
+    n_components, component_of = connected_components(adjacency, directed=False)
+    # Smallest member id per component; `size` is a sentinel above every id.
+    smallest = np.full(n_components, size, dtype=np.int64)
+    np.minimum.at(smallest, component_of, np.arange(size, dtype=np.int64))
+    return smallest[component_of]
+def _create_zarr_label_array(
+    group: zarr.Group, name: str, shape: tuple, chunks: tuple
+) -> zarr.Array:
+    """Create a fresh int32 array called *name* in *group*, replacing any existing one.
+    Uses ``group.create_array`` when ``_ZARR_V3`` is set and ``group.zeros``
+    otherwise.
+    """
+    if name in group:
+        del group[name]
+    spec = dict(shape=shape, chunks=chunks, dtype=np.int32)
+    if not _ZARR_V3:
+        return group.zeros(name, overwrite=True, **spec)
+    return group.create_array(name, **spec)
+def zarr_native_merge(
+    staged_path: str,
+    staged_component: str,
+    out_path: str,
+    out_component: str,
+    n_workers: int = 4,
+    show_progress: bool = False,
+) -> None:
+    """Zarr-native label merge: boundary scan → scipy CC → parallel relabel.
+    Reads *staged_path/staged_component*, merges labels that touch across
+    chunk boundaries and writes the result as an int32 array with the same
+    shape and chunking to *out_path/out_component* (replacing any existing
+    array of that name). Dask is used only to compute the global maximum label.
+    Parameters
+    ----------
+    staged_path, staged_component:
+        Zarr store and array name holding the per-chunk labels.
+    out_path, out_component:
+        Zarr store and array name that receive the merged labels.
+    n_workers:
+        Upper bound on worker processes for the relabel step; clamped to
+        ``[1, n_chunks]``. With one worker everything runs in this process.
+    show_progress:
+        Wrap the relabel loop in a tqdm bar when tqdm is importable.
+    """
+    global _merge_lut
+    import shutil
+    root = zarr.open_group(staged_path, mode="r")
+    arr = root[staged_component]
+    shape, chunk_shape = arr.shape, arr.chunks
+    max_label = int(da.from_zarr(staged_path, component=staged_component).max().compute())
+    logger.info(
+        "zarr_native_merge: shape=%s chunks=%s max_label=%d", shape, chunk_shape, max_label
+    )
+    if max_label > np.iinfo(np.int32).max:
+        # The output array and the worker both use int32, so larger ids wrap.
+        logger.warning(
+            "zarr_native_merge: max_label=%d exceeds the int32 range of the "
+            "output array; label ids will overflow.",
+            max_label,
+        )
+    n_faces = len(_boundary_face_specs(shape, chunk_shape))
+    logger.info("zarr_native_merge: scanning %d boundary faces…", n_faces)
+    pairs = _scan_touching_pairs(staged_path, staged_component, chunk_shape)
+    logger.info("zarr_native_merge: %d touching pairs → building LUT", len(pairs))
+    lut = _build_relabel_lut(pairs, max_label)
+    n_remapped = int((lut != np.arange(len(lut), dtype=np.int64)).sum())
+    logger.info("zarr_native_merge: %d labels remapped across boundaries", n_remapped)
+    out_root = zarr.open_group(out_path, mode="a")
+    _create_zarr_label_array(out_root, out_component, shape, chunk_shape)
+    n_per_dim = [(s + c - 1) // c for s, c in zip(shape, chunk_shape)]
+    chunk_slices = [
+        tuple(
+            slice(i * c, min((i + 1) * c, s))
+            for i, c, s in zip(idx, chunk_shape, shape)
+        )
+        for idx in _iproduct(*[range(n) for n in n_per_dim])
+    ]
+    n_chunks = len(chunk_slices)
+    n_w = max(1, min(n_workers, n_chunks))
+    logger.info("zarr_native_merge: relabeling %d chunks with %d worker(s)…", n_chunks, n_w)
+    # Save LUT to a temp .npy file so workers memory-map it (shared OS page cache).
+    # Pickling the LUT array directly via multiprocessing initargs would
+    # deserialize a full copy per worker — e.g. 4 workers × 800 MB = 3.2 GB.
+    _lut_dir = tempfile.mkdtemp(prefix="bb_lut_")
+    try:
+        # Everything after mkdtemp sits inside the try so a failed save still
+        # cleans the directory up.
+        lut_path = os.path.join(_lut_dir, "lut.npy")
+        np.save(lut_path, lut)
+        del lut  # parent no longer needs it; workers load via mmap
+        if n_w <= 1:
+            _init_worker(lut_path, staged_path, staged_component, out_path, out_component)
+            it: Any = chunk_slices
+            if show_progress and _tqdm is not None:
+                it = _tqdm(it, total=n_chunks, desc="relabel chunks")
+            for sl in it:
+                _relabel_chunk_worker(sl)
+        else:
+            with _Pool(
+                processes=n_w,
+                initializer=_init_worker,
+                initargs=(lut_path, staged_path, staged_component, out_path, out_component),
+            ) as pool:
+                it = pool.imap_unordered(_relabel_chunk_worker, chunk_slices)
+                if show_progress and _tqdm is not None:
+                    it = _tqdm(it, total=n_chunks, desc="relabel chunks")
+                for _ in it:
+                    pass
+    finally:
+        # In the serial path _init_worker left a memmap of lut.npy in this
+        # process; drop it before deleting so the file is no longer mapped
+        # (an open mapping blocks deletion on some platforms).
+        _merge_lut = None
+        shutil.rmtree(_lut_dir, ignore_errors=True)
+# ---------------------------------------------------------------------------
+# Public standalone merge API
+# ---------------------------------------------------------------------------
+def merge_tile_labels(
+    labeled: Union["da.Array", str, Path],
+    write_to: Union[str, Path, None] = None,
+    *,
+    input_component: str = "labels",
+    output_component: str = "labels",
+    overlap: int = 0,
+    sequential_labels: bool = False,
+    n_workers: int | None = None,
+    stage_dir: Union[str, Path, None] = None,
+    keep_stage: bool = False,
+    progress: bool = False,
+) -> "da.Array":
+    """Merge per-tile labels into a globally consistent label array.
+    Standalone merge step — use this when you already have per-tile labels
+    (from your own segmentation pipeline) and just need the boundary stitching.
+    Accepts either:
+    - A **dask array** of per-tile integer labels (e.g. output of
+      ``dask.array.map_blocks`` on your own segmentation function).
+    - A **zarr store path** whose ``input_component`` array already contains
+      per-tile labels written by your own pipeline.
+    Labels that **touch** across tile boundaries are merged into a single ID.
+    The merge is zarr-native (boundary scan → scipy connected components →
+    parallel relabel) and is delegated to ``zarr_native_merge``.
+    Parameters
+    ----------
+    labeled:
+        Per-tile label array. Either a dask array or a path to a zarr store
+        that contains per-tile labels in ``input_component``.
+    write_to:
+        Output zarr store path. When None, a new temporary directory is
+        created for the output; it is not removed by this function because the
+        returned array is backed by it.
+    input_component:
+        Array name inside a zarr *input* store (ignored for dask arrays).
+    output_component:
+        Array name inside ``write_to``. Default ``"labels"``.
+    overlap:
+        If ``labeled`` is a dask array that was computed with ``da.overlap``,
+        pass the same depth here to trim the halos before merging.
+        Set 0 (default) if the array has no overlap halos.
+    sequential_labels:
+        Renumber the merged labels to a contiguous ``1..N`` range via
+        ``relabel_sequential_zarr`` after the merge. Default False.
+    n_workers:
+        Parallel workers for the relabel step. Default ``min(4, cpu_count)``.
+    stage_dir:
+        Directory for the temp stage zarr when *labeled* is a dask array.
+        Default: a freshly created system temp directory.
+    keep_stage:
+        Keep the temp stage zarr after merging. Default False.
+    progress:
+        Show progress bars during staging and the relabel step.
+    Returns
+    -------
+    da.Array
+        Merged label array (int32) backed by the output store.
+    Notes
+    -----
+    A stage store created by this call is removed even when staging or the
+    merge raises, unless ``keep_stage`` is set. A store passed in as a path is
+    never removed.
+    Examples
+    --------
+    **From a dask array of per-tile labels:**
+    >>> import dask.array as da
+    >>> import numpy as np
+    >>> from patchworks import merge_tile_labels
+    >>>
+    >>> # your own tiling + segmentation
+    >>> image = da.from_zarr("image.zarr").rechunk((1, 1024, 1024))
+    >>> labeled = image.map_blocks(my_segment_fn, dtype="int32",
+    ...                            meta=np.empty((0,) * image.ndim, dtype="int32"))
+    >>>
+    >>> merged = merge_tile_labels(labeled, write_to="labels.zarr", progress=True)
+    **From a pre-staged zarr store (your pipeline already wrote labels):**
+    >>> merged = merge_tile_labels(
+    ...     "my_staged_labels.zarr",
+    ...     input_component="raw_labels",
+    ...     write_to="merged_labels.zarr",
+    ...     sequential_labels=True,
+    ... )
+    **Trim overlap halos before merging:**
+    >>> # if labeled was computed with da.overlap.overlap(depth=20)
+    >>> merged = merge_tile_labels(labeled, write_to="labels.zarr", overlap=20)
+    """
+    import shutil
+    import dask.array as da
+    from ._relabel import relabel_sequential_zarr
+    nw = n_workers if n_workers is not None else min(4, os.cpu_count() or 1)
+    # -- Decide where the per-tile labels live --
+    labeled_is_store = isinstance(labeled, (str, Path))
+    auto_stage_root: str | None = None  # temp dir created here, if any
+    if labeled_is_store:
+        stage_path = str(labeled)
+        staged_component = input_component
+    else:
+        staged_component = "staged"
+        if overlap > 0:
+            labeled = da.overlap.trim_overlap(labeled, depth=overlap, boundary="none")
+        if stage_dir is not None:
+            _base = str(stage_dir)
+        else:
+            _base = auto_stage_root = tempfile.mkdtemp(prefix="bb_stage_")
+        stage_path = os.path.join(_base, "_bb_stage.zarr")
+    remove_stage = not labeled_is_store and not keep_stage
+    try:
+        # -- Stage dask array to zarr if needed --
+        if not labeled_is_store:
+            import dask
+            from dask.diagnostics import ProgressBar
+            ctx = ProgressBar() if progress else _nullcontext()
+            logger.info("Staging per-tile labels to %s …", stage_path)
+            with ctx:
+                dask.compute(
+                    labeled.to_zarr(stage_path, component=staged_component, overwrite=True, compute=False)
+                )
+        # -- Resolve output path --
+        if write_to is not None:
+            effective_out = str(write_to)
+        else:
+            effective_out = os.path.join(
+                tempfile.mkdtemp(prefix="bb_merge_"), "merged.zarr"
+            )
+            logger.info("write_to not set — merged labels in auto-temp %s", effective_out)
+        # -- Merge --
+        zarr_native_merge(
+            stage_path, staged_component,
+            effective_out, output_component,
+            n_workers=nw,
+            show_progress=progress,
+        )
+        if sequential_labels:
+            logger.info("Relabelling to contiguous ids…")
+            relabel_sequential_zarr(effective_out, output_component)
+    finally:
+        # -- Cleanup temp stage (only when we created it) --
+        if remove_stage:
+            # When the parent temp dir was created here, remove it as a whole;
+            # a caller-supplied stage_dir keeps everything but the stage store.
+            target = auto_stage_root if auto_stage_root is not None else stage_path
+            shutil.rmtree(target, ignore_errors=True)
+            logger.info("Removed stage store %s", stage_path)
+    return da.from_zarr(effective_out, component=output_component)

patchworks/_relabel.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Linear sequential relabelling (O(voxels), not O(n_chunks²))."""
+from __future__ import annotations
+import logging
+from itertools import product as _iproduct
+import numpy as np
+import zarr
+logger = logging.getLogger(__name__)
+_LUT_WARN_THRESHOLD = 100_000_000  # warn when max_label > 100 M (LUT > 800 MB)
+def relabel_sequential_array(labels: np.ndarray) -> np.ndarray:
+    """Remap *labels* to a contiguous ``0, 1, … N`` range.
+    Background (0) stays 0 and the distinct non-zero ids become ``1..N`` in
+    ascending order of their original value. When the input contains no 0,
+    numbering still starts at 1, so no object is turned into background.
+    Implemented as one ``np.unique`` plus a lookup-table gather.
+    Parameters
+    ----------
+    labels:
+        Array of non-negative integer ids, any shape.
+    Returns
+    -------
+    np.ndarray
+        Array of the same shape; ``uint16`` when N is below the uint16 maximum,
+        otherwise ``uint32``. An empty input gives an empty ``uint16`` array.
+    Raises
+    ------
+    ValueError
+        If *labels* contains a negative id.
+    Examples
+    --------
+    >>> relabel_sequential_array(np.array([0, 500000, 500000, 7]))
+    array([0, 2, 2, 1], dtype=uint16)
+    >>> relabel_sequential_array(np.array([5, 7, 5]))
+    array([1, 2, 1], dtype=uint16)
+    """
+    labels = np.asarray(labels)
+    if labels.size == 0:
+        return np.zeros(labels.shape, dtype=np.uint16)
+    uniq = np.unique(labels)
+    if uniq[0] < 0:
+        # A negative id would index the lookup table from the end.
+        raise ValueError("labels must be non-negative")
+    max_label = int(uniq[-1])
+    if max_label > _LUT_WARN_THRESHOLD:
+        logger.warning(
+            "relabel_sequential_array: max_label=%d → LUT size ~%.0f MB. "
+            "Consider using write_to= so labels never need to be in RAM.",
+            max_label, max_label * 8 / 1024**2,
+        )
+    has_background = bool(uniq[0] == 0)
+    # Without a background entry the smallest id must become 1, not 0.
+    first_id = 0 if has_background else 1
+    lut = np.zeros(max_label + 1, dtype=np.int64)
+    lut[uniq] = np.arange(first_id, first_id + uniq.size)
+    out = lut[labels]
+    n = uniq.size - 1 if has_background else uniq.size
+    dtype = np.uint16 if n < np.iinfo(np.uint16).max else np.uint32
+    return out.astype(dtype)
+def relabel_sequential_zarr(store_path: str, component: str = "labels") -> int:
+    """Relabel a written label zarr to contiguous ids, in place. Returns N.
+    Two-pass streaming algorithm that reads and writes one zarr chunk at a time.
+    Pass 1 gathers the distinct ids of every chunk; pass 2 applies the
+    lookup-table remap chunk by chunk. Background (0) stays 0 and the other
+    ids become ``1..N`` in ascending order, also when no 0 is present.
+    Parameters
+    ----------
+    store_path:
+        Zarr store containing the label array; opened in ``r+`` mode.
+    component:
+        Array name inside the store.
+    Returns
+    -------
+    int
+        Number of distinct non-background ids (0 for an empty array).
+    Raises
+    ------
+    ValueError
+        If the array contains a negative id.
+    """
+    root = zarr.open_group(store_path, mode="r+")
+    z = root[component]
+    z_shape, z_chunks = z.shape, z.chunks
+    # Iterate over actual zarr chunks in ALL dimensions. The z-slab approach
+    # (step = z_chunks[0], slice z[i0:i0+step]) reads the full y/x extent per
+    # step — for chunks like (120, 731, 731) that means (120, 37888, 27392)
+    # = 464 GiB in one allocation (MemoryError).
+    n_per_dim = [(s + c - 1) // c for s, c in zip(z_shape, z_chunks)]
+    chunk_slices = [
+        tuple(slice(i * c, min((i + 1) * c, s)) for i, c, s in zip(idx, z_chunks, z_shape))
+        for idx in _iproduct(*[range(n) for n in n_per_dim])
+    ]
+    # Keep per-chunk uniques as int64 arrays rather than a Python set: a set
+    # boxes every id as a separate int object.
+    per_chunk = [
+        np.unique(np.asarray(z[sl])).astype(np.int64) for sl in chunk_slices
+    ]
+    if per_chunk:
+        sorted_ids = np.unique(np.concatenate(per_chunk))
+    else:
+        sorted_ids = np.empty(0, dtype=np.int64)
+    del per_chunk
+    if sorted_ids.size == 0:
+        # Zero-size array: nothing to renumber.
+        return 0
+    if sorted_ids[0] < 0:
+        # A negative id would index the lookup table from the end.
+        raise ValueError("labels must be non-negative")
+    max_label = int(sorted_ids[-1])
+    if max_label > _LUT_WARN_THRESHOLD:
+        logger.warning(
+            "relabel_sequential_zarr: max_label=%d → LUT size ~%.0f MB.",
+            max_label, max_label * 8 / 1024**2,
+        )
+    has_background = bool(sorted_ids[0] == 0)
+    # Without a background entry the smallest id must become 1, not 0.
+    first_id = 0 if has_background else 1
+    lut = np.zeros(max_label + 1, dtype=np.int64)
+    lut[sorted_ids] = np.arange(first_id, first_id + sorted_ids.size)
+    n = sorted_ids.size - 1 if has_background else sorted_ids.size
+    # Use same dtype logic as relabel_sequential_array so output never overflows.
+    out_dtype = np.uint16 if n < np.iinfo(np.uint16).max else np.uint32
+    for sl in chunk_slices:
+        block = np.asarray(z[sl])
+        z[sl] = lut[block].astype(out_dtype)
+    logger.info("relabel_sequential_zarr: %d objects renumbered to 1..%d", n, n)
+    return int(n)

patchworks/plugins/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # patchworks plugins

patchworks/plugins/cellpose.py ADDED Viewed

@@ -0,0 +1,188 @@
+"""Cellpose plugin for patchworks.
+Requires cellpose >= 3.0 (compatible with v3 and v4).
+Usage
+-----
+>>> from patchworks.plugins.cellpose import cellpose_fn
+>>> from patchworks import tile_process
+>>>
+>>> fn = cellpose_fn("cyto3", gpu=True, diameter=30)
+>>> result = tile_process("image.zarr", fn, tile_shape=(1, 2048, 2048),
+...                       overlap=20, write_to="labels.zarr", progress=True)
+"""
+from __future__ import annotations
+import importlib.metadata
+import logging
+from functools import partial
+from typing import Any, Callable
+import numpy as np
+logger = logging.getLogger(__name__)
+def _leading_int(text: str) -> int:
+    """Return the integer formed by the leading ASCII digits of *text* (0 if none)."""
+    digits = ""
+    for ch in text:
+        if ch not in "0123456789":
+            break
+        digits += ch
+    return int(digits) if digits else 0
+# Optional dependency: when cellpose cannot be imported the module still loads
+# and _require_cellpose raises on first use.
+try:
+    from cellpose import models as _cellpose_models
+    # (major, minor) of the installed cellpose. Components are parsed leniently
+    # so a suffix such as "0rc1" cannot raise ValueError at import time.
+    _CELLPOSE_VERSION: tuple[int, ...] = tuple(
+        _leading_int(x) for x in importlib.metadata.version("cellpose").split(".")[:2]
+    )
+    _CELLPOSE_V4 = _CELLPOSE_VERSION[0] >= 4
+except ImportError:
+    _cellpose_models = None  # type: ignore[assignment]
+    _CELLPOSE_VERSION = (0, 0)
+    _CELLPOSE_V4 = False
+# Per-process model cache keyed by (model_type, gpu)
+_model_cache: dict[tuple, Any] = {}
+def _require_cellpose():
+    """Raise ``ImportError`` with install instructions when cellpose is unavailable."""
+    if _cellpose_models is not None:
+        return
+    raise ImportError(
+        "cellpose is not installed. Install it with:\n"
+        "    pip install cellpose\n"
+        "or:\n"
+        "    pip install patchworks[cellpose]"
+    )
+def cellpose_fn(
+    model: str = "cyto3",
+    *,
+    gpu: bool = False,
+    diameter: float | None = None,
+    do_3D: bool = False,
+    channels: list[int] | None = None,
+    channel_axis: int | None = None,
+    **cellpose_kwargs: Any,
+) -> Callable[[np.ndarray], np.ndarray]:
+    """Build a Cellpose segmentation callable for ``tile_process``.
+    Bundles the settings into a config dict and binds it to ``_run`` with
+    ``functools.partial``, so the result takes a single array argument.
+    Parameters
+    ----------
+    model:
+        Cellpose model type: ``"cyto3"``, ``"cyto2"``, ``"nuclei"``, etc.
+    gpu:
+        Use GPU for inference.
+    diameter:
+        Expected cell diameter in pixels. ``None`` → Cellpose auto-estimates.
+    do_3D:
+        Run in 3-D mode. Each tile must contain the full z-stack — use
+        ``auto_tile_shape_cellpose(do_3D=True)`` for appropriate tile shapes.
+    channels:
+        *Cellpose 3 only.* ``[cytoplasm_channel, nucleus_channel]`` (1-based,
+        0 = greyscale). ``[0, 0]`` → greyscale. ``[1, 2]`` → cyto=ch1, nuc=ch2.
+        ``None`` is stored as ``[0, 0]``.
+    channel_axis:
+        *Cellpose 4 only.* Index of the channel axis in the input array.
+        ``None`` → greyscale input.
+    **cellpose_kwargs:
+        Extra kwargs forwarded to ``model.eval()``
+        (e.g. ``flow_threshold``, ``cellprob_threshold``, ``anisotropy``).
+    Returns
+    -------
+    Callable[[ndarray], ndarray]
+        Picklable function ready for ``tile_process``.
+    Raises
+    ------
+    ImportError
+        If cellpose is not installed.
+    Examples
+    --------
+    Greyscale 2-D:
+    >>> fn = cellpose_fn("cyto3", gpu=True, diameter=30)
+    >>> result = tile_process("image.zarr", fn, tile_shape=(1, 2048, 2048), overlap=20)
+    Nuclear segmentation:
+    >>> fn = cellpose_fn("nuclei", diameter=15)
+    >>> result = tile_process("image.zarr", fn, channel=1)
+    3-D with anisotropy:
+    >>> fn = cellpose_fn("cyto3", gpu=True, do_3D=True, anisotropy=3.0, diameter=20)
+    >>> from functools import partial
+    >>> from patchworks import auto_tile_shape_cellpose, tile_process
+    >>> tile_fn = partial(auto_tile_shape_cellpose, do_3D=True, use_gpu=True, diameter=20)
+    >>> result = tile_process("image.zarr", fn, tile_shape=tile_fn, overlap=10)
+    """
+    _require_cellpose()
+    settings = _make_config(
+        model=model,
+        gpu=gpu,
+        channels=channels,
+        channel_axis=channel_axis,
+        diameter=diameter,
+        do_3D=do_3D,
+        **cellpose_kwargs,
+    )
+    return partial(_run, cellpose_dict=settings)
+def _make_config(
+    model: str = "cyto3",
+    gpu: bool = False,
+    channels: list[int] | None = None,
+    channel_axis: int | None = None,
+    diameter: float | None = None,
+    do_3D: bool = False,
+    **cellpose_kwargs: Any,
+) -> dict[str, Any]:
+    """Pack the Cellpose settings into the plain dict consumed by ``_run``.
+    ``channels=None`` is stored as ``[0, 0]``; any extra keyword arguments are
+    kept together under the ``"cellpose_kwargs"`` key.
+    """
+    cfg: dict[str, Any] = {}
+    cfg["model"] = model
+    cfg["gpu"] = gpu
+    cfg["channels"] = [0, 0] if channels is None else channels
+    cfg["channel_axis"] = channel_axis
+    cfg["diameter"] = diameter
+    cfg["do_3D"] = do_3D
+    cfg["cellpose_kwargs"] = cellpose_kwargs
+    return cfg
+def _get_model(cellpose_dict: dict[str, Any]) -> Any:
+    """Return the Cellpose model for this config, building it on first use.
+    Models are stored in the module-level ``_model_cache`` under
+    ``(model, gpu)``, so repeated calls in one process reuse the instance.
+    ``CellposeModel`` is instantiated when ``_CELLPOSE_V4`` is set and
+    ``Cellpose`` otherwise.
+    """
+    _require_cellpose()
+    model_type = cellpose_dict["model"]
+    use_gpu = cellpose_dict.get("gpu", False)
+    cache_key = (model_type, use_gpu)
+    if cache_key in _model_cache:
+        return _model_cache[cache_key]
+    if _CELLPOSE_V4:
+        factory = _cellpose_models.CellposeModel
+    else:
+        factory = _cellpose_models.Cellpose
+    _model_cache[cache_key] = factory(model_type=model_type, gpu=use_gpu)
+    return _model_cache[cache_key]
+def _run(block: np.ndarray, cellpose_dict: dict[str, Any]) -> np.ndarray:
+    """Run the cached Cellpose model on one tile and return int32 masks.
+    In 3-D mode the tile is passed as is with ``z_axis=0``. Otherwise a 3-D
+    tile whose first axis has length 1 is evaluated as a 2-D image and the
+    leading axis is put back on the result.
+    """
+    net = _get_model(cellpose_dict)
+    three_d = cellpose_dict["do_3D"]
+    extra = cellpose_dict.get("cellpose_kwargs", {})
+    # The two API generations differ only in how channels are described.
+    if not _CELLPOSE_V4:
+        eval_kwargs: dict[str, Any] = dict(
+            channels=cellpose_dict["channels"],
+            diameter=cellpose_dict["diameter"],
+            do_3D=three_d,
+            **extra,
+        )
+    else:
+        eval_kwargs = dict(
+            channel_axis=cellpose_dict.get("channel_axis"),
+            diameter=cellpose_dict["diameter"],
+            do_3D=three_d,
+            **extra,
+        )
+    if three_d:
+        eval_kwargs["z_axis"] = 0
+        return net.eval(block, **eval_kwargs)[0].astype("int32")
+    # Squeeze singleton z so Cellpose gets a clean 2-D image
+    is_single_plane = block.ndim == 3 and block.shape[0] == 1
+    if not is_single_plane:
+        return net.eval(block, **eval_kwargs)[0].astype("int32")
+    plane_masks = net.eval(block[0], **eval_kwargs)[0].astype("int32")
+    return plane_masks[np.newaxis]
+# Keep the lower-level names available for advanced users: each public name
+# below is bound to the same function object as its underscore-prefixed
+# counterpart, not to a wrapper.
+make_cellpose_config = _make_config
+get_cellpose_model = _get_model
+run_cellpose = _run