PyPI - nnInteractive - Versions diffs - 2.3.1__tar.gz → 2.3.3__tar.gz - Mend

nnInteractive 2.3.1 (tar.gz) → 2.3.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90)

{nninteractive-2.3.1 → nninteractive-2.3.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nnInteractive
-Version: 2.3.1
+Version: 2.3.3
 Summary: Inference code for nnInteractive
 Author: Helmholtz Imaging Applied Computer Vision Lab
 Author-email: Fabian Isensee <f.isensee@dkfz-heidelberg.de>

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/inference/inference_session.py RENAMED Viewed

@@ -31,12 +31,20 @@ from nnInteractive.utils.inference_helpers import (
     transform_coordinates_noresampling,
     version_to_tuple,
 )
+from nnInteractive.utils.os_shennanigans import is_linux_kernel_6_11
 from nnInteractive.utils.rounding import round_to_nearest_odd
 class nnInteractiveInferenceSession:
     INFERENCE_SESSION_VERSION = nnInteractive.__version__
     REFINEMENT_CACHE_GPU_HEADROOM_BYTES = 4 * 1024**3
+    # Maximum adaptive zoom-out factor (see _predict). Also bounds the largest interaction crop,
+    # which sizes the reusable blosc2 decompression buffer.
+    MAX_AUTOZOOM_FACTOR = 4
+    # 'auto' interaction storage threshold: images with at most this many spatial voxels
+    # (512*512*1024) use the dense tensor backend; larger ones use blosc2 to bound RAM.
+    AUTO_TENSOR_MAX_VOXELS = 2**28
+    INTERACTIONS_STORAGE_OPTIONS = ("blosc2", "tensor", "auto")
     # Interactions implemented by this inference session.
     SUPPORTED_INTERACTION_KEYS = ("scribble", "lasso", "points", "bbox2d", "bbox3d")
@@ -47,6 +55,7 @@ class nnInteractiveInferenceSession:
         verbose: bool = False,
         torch_n_threads: int = 8,
         do_autozoom: bool = True,
+        interactions_storage: str = "auto",
     ):
         """
         Only intended to work with nnInteractiveTrainerV2 and its derivatives
@@ -57,7 +66,22 @@ class nnInteractiveInferenceSession:
         This is recommended for the persistent inference server, where the
         process is long-lived so the one-time compile cost is paid only once and
         amortized across the whole session lifetime.
+        ``interactions_storage``: storage backend for the interaction tensor, one of
+        ``"blosc2"``, ``"tensor"`` or ``"auto"`` (default).
+        ``"blosc2"`` keeps it as a compact blosc2 in-memory NDArray (low RAM, pays
+        (de)compression on every read/write). ``"tensor"`` stores it as a dense CPU
+        float16 ``torch.Tensor`` (more RAM, far lower per-access overhead; pinned memory
+        by default, skipped when ``device`` is not CUDA or on Linux kernel 6.11 where
+        pinning is buggy). ``"auto"`` decides per image at initialization from the
+        interaction tensor's voxel count: at most ``AUTO_TENSOR_MAX_VOXELS`` (512*512*1024)
+        spatial voxels uses ``"tensor"``, larger uses ``"blosc2"``.
         """
+        if interactions_storage not in self.INTERACTIONS_STORAGE_OPTIONS:
+            raise ValueError(
+                f"interactions_storage must be one of {self.INTERACTIONS_STORAGE_OPTIONS}, "
+                f"got {interactions_storage!r}."
+            )
         print("session initialized")
         self.network = None
@@ -69,6 +93,9 @@ class nnInteractiveInferenceSession:
         self._interactions_shape = None
         self.device = device
         self.use_torch_compile = use_torch_compile
+        self.interactions_storage = interactions_storage
+        # Concrete backend ("blosc2"/"tensor") resolved per image in _initialize_interactions.
+        self._interactions_storage_resolved: Optional[str] = None
         self.interaction_decay = None
         self.current_interaction_intensity: float = 1.0
         self._fp16_max_value = float(torch.finfo(torch.float16).max)
@@ -86,7 +113,10 @@ class nnInteractiveInferenceSession:
         self.license: Optional[str] = None
         # image specific
-        self.interactions = None  # blosc2.NDArray once initialized
+        self.interactions = None  # blosc2.NDArray or dense torch.Tensor (see interactions_storage)
+        # Reusable, pre-faulted float16 buffer to decompress blosc2 interaction crops into (Path B).
+        # Allocated per image in _initialize_interactions; None for the dense-tensor backend.
+        self._interactions_read_buffer = None
         self.preprocessed_image: torch.Tensor = None
         self.preprocessed_props = None
         self.target_buffer: Union[np.ndarray, torch.Tensor] = None
@@ -303,19 +333,38 @@ class nnInteractiveInferenceSession:
     def _interactions_inplace_maximum(self, channel_idx: int, int_slicer, new_values) -> None:
         """In-place element-wise maximum for a subregion of a channel."""
+        full_slicer = (channel_idx, *int_slicer)
+        if isinstance(self.interactions, torch.Tensor):
+            # Dense torch backend: operate in place without a numpy round-trip.
+            if not isinstance(new_values, torch.Tensor):
+                new_values = torch.as_tensor(new_values)
+            view = self.interactions[full_slicer]
+            torch.maximum(view, new_values.to(view.dtype), out=view)
+            return
         if isinstance(new_values, torch.Tensor):
             new_values = new_values.cpu().numpy().astype(np.float16)
-        full_slicer = (channel_idx, *int_slicer)
         current_sub = np.asarray(self.interactions[full_slicer])
         np.maximum(current_sub, new_values, out=current_sub)
         self.interactions[full_slicer] = current_sub
     def _write_interactions_channel(self, channel_idx: int, value) -> None:
         """Write a full channel. Handles torch→numpy for blosc2."""
+        if isinstance(self.interactions, torch.Tensor):
+            if not isinstance(value, torch.Tensor):
+                value = torch.as_tensor(value)
+            self.interactions[channel_idx] = value.to(self.interactions.dtype)
+            return
         if isinstance(value, torch.Tensor):
             value = value.cpu().numpy().astype(np.float16)
         self.interactions[channel_idx] = value
+    def _read_interactions_to_device(self, full_slicer, device) -> torch.Tensor:
+        """Read an interaction subregion as a torch.Tensor on ``device``, regardless of backend."""
+        sub = self.interactions[full_slicer]
+        if isinstance(sub, torch.Tensor):
+            return sub.to(device)
+        return torch.from_numpy(np.asarray(sub)).to(device)
     def _paste_prediction_to_target_buffer(self, prediction: torch.Tensor, bbox: List[List[int]]) -> None:
         target_bbox = self._interaction_bbox_to_target_bbox(bbox)
         if isinstance(self.target_buffer, torch.Tensor):
@@ -556,11 +605,30 @@ class nnInteractiveInferenceSession:
         self.original_image_shape = None
         self._last_paste_bbox = None
-    def _initialize_interactions(self, image_torch: torch.Tensor):
-        shape = (self.num_interaction_channels, *image_torch.shape[1:])
-        if self.verbose:
-            print("Initialize interactions with blosc2 in-memory compression")
-        self.interactions = blosc2.zeros(
+    def _resolve_interactions_storage(self, spatial_shape) -> str:
+        """Resolve the configured storage to a concrete backend ("blosc2" or "tensor").
+        For "auto", pick "tensor" for images with at most AUTO_TENSOR_MAX_VOXELS spatial voxels
+        (lower per-access overhead) and "blosc2" for larger ones (to bound RAM).
+        """
+        if self.interactions_storage != "auto":
+            return self.interactions_storage
+        n_voxels = int(np.prod(spatial_shape, dtype=np.int64))
+        return "blosc2" if n_voxels > self.AUTO_TENSOR_MAX_VOXELS else "tensor"
+    def _new_interactions_array(self, shape, compression_nthreads: int):
+        """Allocate a zeroed interaction array using the resolved backend.
+        "tensor" selects a dense CPU float16 torch.Tensor (more RAM, lower per-access
+        overhead); "blosc2" uses a compact blosc2 in-memory NDArray.
+        """
+        if self._interactions_storage_resolved == "tensor":
+            # Pinning enables faster non-blocking host->device copies, but only helps for a
+            # CUDA target and is buggy on Linux kernel 6.11 (see utils/os_shennanigans).
+            pin = self.device.type == "cuda" and not is_linux_kernel_6_11()
+            tensor = torch.zeros(shape, dtype=torch.float16, device="cpu")
+            return tensor.pin_memory() if pin else tensor
+        return blosc2.zeros(
             shape,
             dtype=np.float16,
             chunks=(1, *[min(64, s) for s in shape[1:]]),
@@ -570,11 +638,49 @@ class nnInteractiveInferenceSession:
                 "codec": blosc2.Codec.LZ4,
                 "clevel": 5,
                 "filters": [blosc2.Filter.NOFILTER],
-                "nthreads": min(self.torch_n_threads, os.cpu_count()),
+                "nthreads": compression_nthreads,
             },
-            dparams={"nthreads": 4},
+            # Decompression of this sparse interaction tensor is fastest single-threaded:
+            # blosc2's per-chunk thread sync costs more than it saves here, badly so on
+            # many-core/many-CCD servers (see benchmarks). Multithreading only hurts.
+            dparams={"nthreads": 1},
         )
+    def _initialize_interactions(self, image_torch: torch.Tensor):
+        shape = (self.num_interaction_channels, *image_torch.shape[1:])
+        self._interactions_storage_resolved = self._resolve_interactions_storage(shape[1:])
+        via_auto = self.interactions_storage == "auto"
+        if self.verbose or via_auto:
+            backend = "dense torch.Tensor" if self._interactions_storage_resolved == "tensor" else "blosc2 in-memory compression"
+            print(f"Initialize interactions with {backend}{' (auto)' if via_auto else ''}")
+        self.interactions = self._new_interactions_array(shape, min(self.torch_n_threads, os.cpu_count()))
         self._interactions_shape = shape
+        self._interactions_read_buffer = self._new_interactions_read_buffer(shape)
+    def _new_interactions_read_buffer(self, shape) -> Optional[np.ndarray]:
+        """Pre-faulted buffer to decompress blosc2 interaction crops into (Path B), or None.
+        Sized to the largest possible crop: the patch size scaled by the maximum autozoom factor,
+        capped to the image size. Only allocated for the blosc2 backend that exposes the
+        decompress-into-buffer method; the dense-tensor backend returns views and needs no buffer.
+        """
+        if self._interactions_storage_resolved != "blosc2":
+            return None
+        if not hasattr(self.interactions, "get_slice_numpy"):
+            print(
+                "WARNING: this blosc2 build has no NDArray.get_slice_numpy; cannot reuse a "
+                "decompression buffer for interaction crops. Falling back to a fresh allocation on "
+                "every read (slower). Consider updating blosc2."
+            )
+            return None
+        max_valid = [
+            min(round(p * self.MAX_AUTOZOOM_FACTOR), s)
+            for p, s in zip(self.configuration_manager.patch_size, shape[1:])
+        ]
+        n = self.num_interaction_channels * int(np.prod(max_valid, dtype=np.int64))
+        buffer = np.empty(n, dtype=np.float16)
+        buffer[:] = 0  # first-touch the pages once, up front
+        return buffer
     @torch.inference_mode()
     def _background_set_image(self, image: np.ndarray, image_properties: dict):
@@ -635,20 +741,7 @@ class nnInteractiveInferenceSession:
         """
         if self.interactions is not None:
             del self.interactions
-            self.interactions = blosc2.zeros(
-                self._interactions_shape,
-                dtype=np.float16,
-                chunks=(1, *[min(64, s) for s in self._interactions_shape[1:]]),
-                blocks=(1, *[min(32, s) for s in self._interactions_shape[1:]]),
-                # Interactions compress better with NOFILTER, which is also faster than SHUFFLE.
-                cparams={
-                    "codec": blosc2.Codec.LZ4,
-                    "clevel": 5,
-                    "filters": [blosc2.Filter.NOFILTER],
-                    "nthreads": os.cpu_count(),
-                },
-                dparams={"nthreads": 4},
-            )
+            self.interactions = self._new_interactions_array(self._interactions_shape, os.cpu_count())
         self.current_interaction_intensity = 1.0
         if self.target_buffer is not None:
@@ -980,7 +1073,9 @@ class nnInteractiveInferenceSession:
         Returns:
         """
-        print("Current cratio", self.interactions.cratio)
+        if not isinstance(self.interactions, torch.Tensor):
+            # cratio is a blosc2-only diagnostic; the dense tensor backend has no compression.
+            print("Current cratio", self.interactions.cratio)
         assert self.pad_mode_data == "constant", "pad modes other than constant are not implemented here"
         assert len(self.new_interaction_centers) == len(self.new_interaction_zoom_out_factors)
@@ -996,7 +1091,7 @@ class nnInteractiveInferenceSession:
                 "!!!WE NO LONGER RUN ONE PREDICTION PER CENTER AND ONLY USE THE LAST ADDED INTERACTION AS CENTER!!!"
             )
         prediction_center, zoom_out_factor = self.new_interaction_centers[-1], self.new_interaction_zoom_out_factors[-1]
-        zoom_out_factor = min(4, zoom_out_factor)
+        zoom_out_factor = min(self.MAX_AUTOZOOM_FACTOR, zoom_out_factor)
         start_predict = time()
         with torch.autocast(self.device.type, enabled=True) if self.device.type == "cuda" else dummy_context():
@@ -1005,7 +1100,9 @@ class nnInteractiveInferenceSession:
             input_for_predict, scaled_patch_size, scaled_bbox, previous_prediction = self._build_network_input(
                 prediction_center, zoom_out_factor
             )
-            pred = self.network(input_for_predict[None])[0].argmax(0).detach()
+            # .contiguous() is required for torch.compile: the input may be a non-contiguous
+            # view (e.g. from the dense-tensor backend), and the compiled graph assumes contiguity.
+            pred = self.network(input_for_predict[None].contiguous())[0].argmax(0).detach()
             del input_for_predict
             # detect changes at border. If there are, we enter autozoom
@@ -1022,17 +1119,19 @@ class nnInteractiveInferenceSession:
             start_zoomout = time()
             while has_change and self.do_autozoom:
                 print(f"AutoZoom zoom out factor {zoom_out_factor}")
-                # we allow a max zoom out of 4
-                if zoom_out_factor >= 4:
+                # we allow a max zoom out of MAX_AUTOZOOM_FACTOR
+                if zoom_out_factor >= self.MAX_AUTOZOOM_FACTOR:
                     break
                 else:
                     zoom_out_factor *= zoom_out_growth_factor
-                    zoom_out_factor = min(4, zoom_out_factor)
+                    zoom_out_factor = min(self.MAX_AUTOZOOM_FACTOR, zoom_out_factor)
                 input_for_predict, scaled_patch_size, scaled_bbox, previous_prediction_resized = (
                     self._build_network_input(prediction_center, zoom_out_factor)
                 )
-                pred = self.network(input_for_predict[None])[0].argmax(0).detach()
+                # .contiguous() is required for torch.compile: the input may be a non-contiguous
+                # view (e.g. from the dense-tensor backend), and the compiled graph assumes contiguity.
+                pred = self.network(input_for_predict[None].contiguous())[0].argmax(0).detach()
                 del input_for_predict
                 empty_cache(self.device)
@@ -1077,7 +1176,9 @@ class nnInteractiveInferenceSession:
         # cropping happens on CPU, padding happens on GPU (later)
         crop_img, pad_image = crop_to_valid(self.preprocessed_image, scaled_bbox)
-        interactions_tensor, pad_interaction = crop_to_valid(self.interactions, scaled_bbox)
+        interactions_tensor, pad_interaction = crop_to_valid(
+            self.interactions, scaled_bbox, out=self._interactions_read_buffer
+        )
         # For blosc2, crop_to_valid returns a numpy array; convert to torch (still on CPU).
         if not isinstance(interactions_tensor, torch.Tensor):
             interactions_tensor = torch.from_numpy(np.asarray(interactions_tensor))
@@ -1174,7 +1275,8 @@ class nnInteractiveInferenceSession:
                     dim=0,
                 )
-            pred = self.network(patch[None])[0].argmax(0).detach()
+            # .contiguous(): see _predict — required for torch.compile with possibly non-contiguous input.
+            pred = self.network(patch[None].contiguous())[0].argmax(0).detach()
             paste_tensor(
                 cache_interactions,
                 pred.to(cache_interactions.device, dtype=cache_interactions.dtype),
@@ -1261,7 +1363,7 @@ class nnInteractiveInferenceSession:
         pred_slicer = tuple(slice(lb, ub) for lb, ub in pred_bbox)
         local_slicer = tuple(slice(lb, ub) for lb, ub in local_seen_bbox)
-        prev_sub = torch.from_numpy(np.asarray(self.interactions[(prev_seg_ch, *seen_slicer)])).to(self.device)
+        prev_sub = self._read_interactions_to_device((prev_seg_ch, *seen_slicer), self.device)
         diff_local[local_slicer] = (pred[pred_slicer] != prev_sub).to(diff_local.dtype)
         del prev_sub
@@ -1280,7 +1382,7 @@ class nnInteractiveInferenceSession:
     def _mark_prev_seg_in_local_diff(self, diff_local: torch.Tensor, planning_bbox: List[List[int]]) -> None:
         prev_seg_ch = self._get_prev_seg_channel()
         planning_slicer = tuple(slice(lb, ub) for lb, ub in planning_bbox)
-        prev_sub = torch.from_numpy(np.asarray(self.interactions[(prev_seg_ch, *planning_slicer)])).to(self.device)
+        prev_sub = self._read_interactions_to_device((prev_seg_ch, *planning_slicer), self.device)
         diff_local[prev_sub > 0.5] = 1
         del prev_sub
@@ -1548,8 +1650,12 @@ class nnInteractiveInferenceSession:
         self.network = self.network.to(self.device)
     def __del__(self):
-        self._finish_preprocessing_and_initialize_interactions()
-        self.executor.shutdown()
+        # Be robust to a partially-constructed instance (e.g. __init__ raised on bad arguments):
+        # these attributes may not exist yet.
+        if hasattr(self, "preprocess_future"):
+            self._finish_preprocessing_and_initialize_interactions()
+        if hasattr(self, "executor"):
+            self.executor.shutdown()
 if __name__ == "__main__":

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/inference/remote/remote_session.py RENAMED Viewed

@@ -132,6 +132,13 @@ class nnInteractiveRemoteInferenceSession:
         the server before the client gives up. Default 60s matches observed
         prediction times (100ms..~10s) with headroom for slow links. On
         expiry: ``httpx.ReadTimeout``.
+    set_image_read_timeout:
+        Read timeout (seconds) used *only* for ``set_image``. After the volume
+        is uploaded, the server decompresses and preprocesses the full image
+        before responding, which can take far longer than a prediction on a
+        large volume. ``set_image`` therefore gets its own generous read
+        timeout instead of the much tighter ``read_timeout`` used for
+        predictions. On expiry: ``httpx.ReadTimeout``.
     write_timeout:
         Seconds to finish uploading the request body. ``set_image`` uploads
         the full 4D volume so this is the longest-running upload. On expiry:
@@ -146,6 +153,7 @@ class nnInteractiveRemoteInferenceSession:
         api_key: Optional[str] = None,
         connect_timeout: float = 10.0,
         read_timeout: float = 60.0,
+        set_image_read_timeout: float = 600.0,
         write_timeout: float = 120.0,
         pool_timeout: float = 10.0,
     ):
@@ -166,6 +174,15 @@ class nnInteractiveRemoteInferenceSession:
             ),
             headers=headers,
         )
+        # Per-request timeout override for set_image: same connect/write/pool as
+        # the client default, but a much longer read budget for server-side
+        # decompression + preprocessing of the full volume.
+        self._set_image_timeout = httpx.Timeout(
+            connect=connect_timeout,
+            read=set_image_read_timeout,
+            write=write_timeout,
+            pool=pool_timeout,
+        )
         self._lease_token: Optional[str] = None
         # Claim a session on the server. The lease token is then attached to
@@ -242,12 +259,21 @@ class nnInteractiveRemoteInferenceSession:
         resp.raise_for_status()
         return resp
-    def _post_binary(self, path: str, meta: dict, array_bytes: bytes) -> httpx.Response:
+    def _post_binary(
+        self,
+        path: str,
+        meta: dict,
+        array_bytes: bytes,
+        timeout: Union[httpx.Timeout, float, None] = None,
+    ) -> httpx.Response:
         headers = {
             META_HEADER: json.dumps(_to_jsonable(meta), separators=(",", ":")),
             "Content-Type": CONTENT_TYPE_OCTET_STREAM,
         }
-        resp = self._http.post(path, content=array_bytes, headers=headers)
+        # httpx treats timeout=None as "no override" only when the arg is
+        # omitted; pass it through explicitly only when a caller supplied one.
+        kwargs = {} if timeout is None else {"timeout": timeout}
+        resp = self._http.post(path, content=array_bytes, headers=headers, **kwargs)
         _raise_for_lease_errors(resp)
         resp.raise_for_status()
         return resp
@@ -300,7 +326,12 @@ class nnInteractiveRemoteInferenceSession:
     def set_image(self, image: np.ndarray, image_properties: Optional[dict] = None) -> None:
         assert image.ndim == 4, f"expected a 4d image as input, got {image.ndim}d. Shape {image.shape}"
         meta = {"image_properties": image_properties or {}}
-        resp = self._post_binary(PATH_SET_IMAGE, meta, pack_array(image, nthreads=_compression_threads()))
+        resp = self._post_binary(
+            PATH_SET_IMAGE,
+            meta,
+            pack_array(image, nthreads=_compression_threads()),
+            timeout=self._set_image_timeout,
+        )
         info = resp.json()
         self.original_image_shape = tuple(info["original_image_shape"])

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/inference/server/app.py RENAMED Viewed

@@ -31,6 +31,15 @@ Concurrency model:
     prediction runs at a time.
   - The acquisition order is always (session lock → gpu lock) so there is no
     deadlock potential.
+  - The endpoints that carry large payloads (``set_image`` and the mask
+    interactions) are ``async`` so they can ``await`` the upload, but their
+    CPU-bound work (blosc2 decompression, image preprocessing, prediction,
+    response compression) is dispatched to a worker thread via
+    ``run_in_threadpool``. This keeps the event loop free during a long
+    ``set_image``/predict so lightweight endpoints — ``/heartbeat``,
+    ``/healthz`` — and the background reaper stay responsive, and so two
+    clients can genuinely preprocess concurrently. Acquiring a session/gpu
+    lock therefore also happens off the loop, never stalling it.
 """
 from __future__ import annotations
@@ -50,6 +59,7 @@ import blosc2
 import numpy as np
 import torch
 from fastapi import Depends, FastAPI, HTTPException, Header, Request, Response, status
+from starlette.concurrency import run_in_threadpool
 from nnInteractive.inference.inference_session import nnInteractiveInferenceSession
 from nnInteractive.inference.remote._protocol import (
@@ -151,6 +161,7 @@ class SessionRegistry:
         torch_n_threads: int,
         do_autozoom: bool,
         use_torch_compile: bool,
+        interactions_storage: str,
         verbose: bool,
     ) -> None:
         self._artifacts = artifacts
@@ -161,6 +172,7 @@ class SessionRegistry:
         self._torch_n_threads = torch_n_threads
         self._do_autozoom = do_autozoom
         self._use_torch_compile = use_torch_compile
+        self._interactions_storage = interactions_storage
         self._verbose = verbose
         self._entries: dict[str, SessionEntry] = {}
         self._mu = threading.Lock()
@@ -189,6 +201,7 @@ class SessionRegistry:
                 verbose=self._verbose,
                 torch_n_threads=self._torch_n_threads,
                 do_autozoom=self._do_autozoom,
+                interactions_storage=self._interactions_storage,
             )
             session.initialize_from_loaded_artifacts(self._artifacts)
             entry = SessionEntry(session)
@@ -290,6 +303,7 @@ def make_app(
     torch_n_threads: int = 8,
     do_autozoom: bool = True,
     use_torch_compile: bool = False,
+    interactions_storage: str = "auto",
     verbose: bool = False,
     api_key: Optional[str] = None,
     sweep_interval_seconds: float = 15.0,
@@ -303,6 +317,7 @@ def make_app(
         torch_n_threads=torch_n_threads,
         do_autozoom=do_autozoom,
         use_torch_compile=use_torch_compile,
+        interactions_storage=interactions_storage,
         verbose=verbose,
     )
     gpu_lock = threading.Lock()
@@ -353,6 +368,7 @@ def make_app(
         verbose=False,
         torch_n_threads=torch_n_threads,
         do_autozoom=do_autozoom,
+        interactions_storage=interactions_storage,
     )
     _capability_session.initialize_from_loaded_artifacts(artifacts)
     _capability_snapshot = _build_capability_snapshot(_capability_session)
@@ -570,17 +586,24 @@ def make_app(
     async def set_image(request: Request, entry: SessionEntry = lease) -> dict:
         meta = _parse_meta_header(request.headers.get(META_HEADER))
         body = await request.body()
-        image = unpack_array(body)
         image_properties = meta.get("image_properties") or {}
-        def _do(session):
-            session.set_image(image, image_properties)
-            # set_image preprocesses in a background thread; force completion so
-            # subsequent calls can safely use original_image_shape.
-            session._finish_preprocessing_and_initialize_interactions()
-            return {"original_image_shape": list(session.original_image_shape)}
+        # Decompression + full-volume preprocessing are CPU-bound and can run
+        # for many seconds on a large image. Run them in a worker thread so the
+        # event loop keeps servicing heartbeats/healthz and the reaper.
+        def _work():
+            image = unpack_array(body)
-        return _under_session_lock(entry, _do)
+            def _do(session):
+                session.set_image(image, image_properties)
+                # set_image preprocesses in a background thread; force completion
+                # so subsequent calls can safely use original_image_shape.
+                session._finish_preprocessing_and_initialize_interactions()
+                return {"original_image_shape": list(session.original_image_shape)}
+            return _under_session_lock(entry, _do)
+        return await run_in_threadpool(_work)
     @app.post(PATH_SET_TARGET_BUFFER, dependencies=[auth])
     def set_target_buffer(payload: dict, entry: SessionEntry = lease) -> dict:
@@ -652,41 +675,53 @@ def make_app(
     async def _handle_mask_interaction(request: Request, entry: SessionEntry, kind: str) -> Response:
         meta = _parse_meta_header(request.headers.get(META_HEADER))
         body = await request.body()
-        mask = unpack_array(body)
         run_prediction = bool(meta.get("run_prediction", True))
         interaction_bbox = meta.get("interaction_bbox")
         if interaction_bbox is not None:
             interaction_bbox = [list(b) for b in interaction_bbox]
-        def _do(session):
-            method = session.add_scribble_interaction if kind == "scribble" else session.add_lasso_interaction
-            method(
-                mask,
-                bool(meta["include_interaction"]),
-                run_prediction=run_prediction,
-                override_capability_checks=bool(meta.get("override_capability_checks", False)),
-                interaction_bbox=interaction_bbox,
-            )
-            return _build_prediction_response(session, run_prediction)
+        # Decompression + prediction + response compression are CPU/GPU-bound;
+        # run them off the event loop (see set_image).
+        def _work():
+            mask = unpack_array(body)
-        return _under_session_and_gpu_lock(entry, _do)
+            def _do(session):
+                method = session.add_scribble_interaction if kind == "scribble" else session.add_lasso_interaction
+                method(
+                    mask,
+                    bool(meta["include_interaction"]),
+                    run_prediction=run_prediction,
+                    override_capability_checks=bool(meta.get("override_capability_checks", False)),
+                    interaction_bbox=interaction_bbox,
+                )
+                return _build_prediction_response(session, run_prediction)
+            return _under_session_and_gpu_lock(entry, _do)
+        return await run_in_threadpool(_work)
     @app.post(PATH_ADD_INITIAL_SEG, dependencies=[auth])
     async def add_initial_seg_interaction(request: Request, entry: SessionEntry = lease) -> Response:
         meta = _parse_meta_header(request.headers.get(META_HEADER))
         body = await request.body()
-        initial_seg = unpack_array(body)
         run_prediction = bool(meta.get("run_prediction", False))
-        def _do(session):
-            session.add_initial_seg_interaction(
-                initial_seg=initial_seg,
-                run_prediction=run_prediction,
-                override_capability_checks=bool(meta.get("override_capability_checks", False)),
-            )
-            return _build_prediction_response(session, run_prediction)
+        # Decompression + (optional) prediction are CPU/GPU-bound; run them off
+        # the event loop (see set_image).
+        def _work():
+            initial_seg = unpack_array(body)
-        return _under_session_and_gpu_lock(entry, _do)
+            def _do(session):
+                session.add_initial_seg_interaction(
+                    initial_seg=initial_seg,
+                    run_prediction=run_prediction,
+                    override_capability_checks=bool(meta.get("override_capability_checks", False)),
+                )
+                return _build_prediction_response(session, run_prediction)
+            return _under_session_and_gpu_lock(entry, _do)
+        return await run_in_threadpool(_work)
     return app

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/inference/server/main.py RENAMED Viewed

@@ -61,6 +61,15 @@ def _build_parser() -> argparse.ArgumentParser:
         "the long-lived process. Pass this flag to skip compilation (e.g. for faster startup or "
         "to work around a compile/backend issue).",
     )
+    p.add_argument(
+        "--interactions-storage",
+        choices=["blosc2", "tensor", "auto"],
+        default="auto",
+        help="Storage backend for the interaction tensor (default: auto). 'blosc2': compact "
+        "in-memory array (low RAM, pays (de)compression per read/write). 'tensor': dense pinned "
+        "CPU float16 torch.Tensor (more RAM, lower per-access overhead). 'auto': per image, use "
+        "'tensor' for images up to 512x512x1024 voxels and 'blosc2' for larger ones.",
+    )
     p.add_argument(
         "--no-autozoom",
         action="store_true",
@@ -222,6 +231,7 @@ def main(argv=None) -> int:
         torch_n_threads=args.torch_n_threads,
         do_autozoom=not args.no_autozoom,
         use_torch_compile=use_torch_compile,
+        interactions_storage=args.interactions_storage,
         verbose=args.verbose,
         api_key=api_key,
     )

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/interaction/point.py RENAMED Viewed

@@ -124,6 +124,11 @@ class PointInteraction_stub:
             )
             target_slices = (channel_idx, *slices)
+            if isinstance(interaction_map, torch.Tensor):
+                # Dense torch backend: in-place maximum, no numpy round-trip.
+                view = interaction_map[target_slices]
+                torch.maximum(view, strel[structuring_slices].to(view.dtype), out=view)
+                return interaction_map
             current_sub = np.asarray(interaction_map[target_slices])
             strel_np = strel[structuring_slices].numpy().astype(current_sub.dtype)
             np.maximum(current_sub, strel_np, out=current_sub)

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive/utils/crop.py RENAMED Viewed

@@ -190,7 +190,7 @@ def paste_tensor(target, source, bbox, channel_idx=None):
     return target
-def crop_to_valid(img, bbox):
+def crop_to_valid(img, bbox, out=None):
     """
     Crops the image to the part of the bounding box that lies within the image.
     Supports a 4D tensor of shape (C, X, Y, Z). The bounding box is specified as
@@ -200,6 +200,12 @@ def crop_to_valid(img, bbox):
         img: Input tensor (or blosc2 NDArray) of shape (C, X, Y, Z).
         bbox (list or tuple): Bounding box as a list of three intervals for spatial dims:
                               [[x1, x2], [y1, y2], [z1, z2]].
+        out (np.ndarray, optional): A flat, pre-faulted float16 buffer to decompress a blosc2
+                              crop into, avoiding a fresh allocation + page-fault on every call
+                              ("Path B"). Only used when ``img`` is a blosc2 NDArray exposing
+                              ``get_slice_numpy`` and the crop fits; otherwise ignored and a fresh
+                              array is returned. When used, the returned crop is a VIEW into ``out``
+                              and is only valid until the next call that reuses the same buffer.
     Returns:
         cropped: Cropped data of shape (C, cropped_x, cropped_y, cropped_z).
@@ -224,6 +230,26 @@ def crop_to_valid(img, bbox):
         pad_right = end - dim_size if end > dim_size else 0
         pad.append((pad_left, pad_right))
+    # Path B: decompress the blosc2 crop straight into a reused, pre-faulted buffer to avoid the
+    # per-call allocation + first-touch page-fault cost. get_slice_numpy is blosc2's internal
+    # decompress-into-buffer method (what __getitem__ calls under the hood); guarded since it is
+    # not a documented public API. Falls back to a fresh allocation if the crop would not fit.
+    if out is not None and not isinstance(img, torch.Tensor) and hasattr(img, "get_slice_numpy"):
+        valid_shape = [ce - cs for cs, ce in crop_indices]
+        output_shape = (img.shape[0], *valid_shape)
+        n = int(np.prod(output_shape, dtype=np.int64))
+        if n <= out.size:
+            view = out[:n].reshape(output_shape)
+            start = (0, *[cs for cs, _ in crop_indices])
+            stop = (img.shape[0], *[ce for _, ce in crop_indices])
+            img.get_slice_numpy(view, (start, stop))
+            return view, pad
+        print(
+            f"WARNING: interaction crop of {n} elements (shape {output_shape}) exceeds the reusable "
+            f"decompression buffer of {out.size} elements; this should never happen. Falling back to "
+            "a fresh allocation."
+        )
     # Crop the image on spatial dimensions, leaving the channel dimension intact.
     cropped = img[
         :,

{nninteractive-2.3.1 → nninteractive-2.3.3}/nnInteractive.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nnInteractive
-Version: 2.3.1
+Version: 2.3.3
 Summary: Inference code for nnInteractive
 Author: Helmholtz Imaging Applied Computer Vision Lab
 Author-email: Fabian Isensee <f.isensee@dkfz-heidelberg.de>

{nninteractive-2.3.1 → nninteractive-2.3.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "nnInteractive"
-version = "2.3.1"
+version = "2.3.3"
 requires-python = ">=3.10"
 description = "Inference code for nnInteractive"
 readme = "readme.md"