@camstack/addon-detection-pipeline 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1088 @@
1
+ #!/usr/bin/env python3
2
+ """Async inference pool — request_id multiplexed, per-runtime concurrency.
3
+
4
+ Architecture mirrors Scrypted's ML plugins (coreml / openvino / onnx):
5
+ - asyncio main loop reads requests from stdin; inference is dispatched
6
+ onto a runtime-specific executor so the reader never blocks.
7
+ - Each request carries a 32-bit id; responses are tagged with the same
8
+ id so the Node side can keep N requests in flight concurrently.
9
+ - Runtime executors: each dispatcher owns a prepare pool and a
+ predict pool, both sized by the `concurrency` setting (see
+ RuntimeDispatcher).
+ CoreML → one predict worker is typically enough; the ANE is
+ single-context, and a single Python thread avoids GIL
+ thrashing.
+ OpenVINO → one predict worker driving the compiled model (the
+ OV runtime manages internal infer-request parallelism).
+ ONNX → benefits from concurrency > 1; InferenceSession.run is
+ thread-safe, so the predict workers share one session.
17
+ - Optional raw-frame path skips JPEG decode when the caller already
18
+ has a decoded RGB/BGR/GRAY buffer (e.g. the stream-broker decoder
19
+ cap output).
20
+
21
+ Startup protocol (Node → Python, v2):
22
+ 1. [4B total_len][4B req_id=0][1B msg_type=0x00][JSON config]
23
+ config = {"runtime": ..., "models": [...], "concurrency": N?}
24
+ 2. Python responds with [4B total_len][4B req_id=0][JSON ready status].
25
+
26
+ Runtime protocol:
27
+ Request: [4B total_len][4B req_id][1B msg_type][payload]
28
+ Response: [4B total_len][4B req_id][JSON payload]
29
+
30
+ msg_type:
31
+ 0x00 — command payload = JSON
32
+ 0x01 — infer_jpeg payload = [1B model_idx][JPEG bytes]
33
+ 0x02 — infer_raw payload = [1B model_idx][4B width][4B height]
34
+ [1B fmt 0=RGB,1=BGR,2=GRAY][pixels]
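+ 0x03 — infer_batch payload = [1B model_idx][1B count][N × item]
+ 0x04 — cache_frame payload = [4B frame_id][4B width][4B height]
+ [1B fmt][pixels]
+ 0x05 — infer_cached payload = [1B model_idx][4B frame_id]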
35
+
36
+ Commands: load, unload, replace, status, uncache_frame.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ import asyncio
41
+ import concurrent.futures
42
+ import io
43
+ import json
44
+ import struct
+ import sys
48
+ import time
49
+ from dataclasses import dataclass, field
50
+ from typing import Any, Callable, Optional
51
+
52
+ import numpy as np
53
+ from PIL import Image
54
+
55
+ from postprocessors import POSTPROCESSORS
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Wire protocol constants
59
+ # ---------------------------------------------------------------------------
60
+
61
+ MSG_COMMAND = 0x00
62
+ MSG_INFER_JPEG = 0x01
63
+ MSG_INFER_RAW = 0x02
64
+ # MSG_INFER_BATCH — N raw items packed into a single IPC frame.
65
+ # Wire format:
66
+ # [1B model_idx][1B count][N × item]
67
+ # Each item:
68
+ # [4B width][4B height][1B fmt][4B size][size bytes raw]
69
+ # Response payload:
70
+ # {"results": [<single-detect dict>, ...]} (same length as count)
71
+ # Each item is dispatched concurrently via asyncio.gather so the
72
+ # predict pool's existing parallelism applies; the saving over N
73
+ # separate calls is that N IPC round-trips collapse into one,
74
+ # matching Scrypted's batch=4 semantics for fair benchmarking.
75
+ MSG_INFER_BATCH = 0x03
76
+ MSG_CACHE_FRAME = 0x04
77
+ MSG_INFER_CACHED = 0x05
78
+
79
+ RAW_FMT_RGB = 0x00
80
+ RAW_FMT_BGR = 0x01
81
+ RAW_FMT_GRAY = 0x02
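+ # Illustrative infer_raw framing (little-endian throughout, matching the
+ # "<II" packing used by ResponseWriter below): a 640×480 BGR frame for
+ # model 0 would be sent as
+ #
+ #     body = struct.pack("<BIIB", 0, 640, 480, RAW_FMT_BGR) + pixels
+ #     frame = struct.pack("<IIB", 4 + 1 + len(body), req_id, MSG_INFER_RAW) + body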
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Preprocessing (unchanged from v1 — same shapes/math, moved into helpers)
86
+ # ---------------------------------------------------------------------------
87
+
88
+
89
+ def letterbox_image(img: Image.Image, size: int) -> tuple[Image.Image, float, tuple[int, int]]:
90
+ """Resize with letterbox padding; returns (PIL canvas, scale, (padX, padY)).
91
+
92
+ Stays in uint8 PIL space — no float conversion. Used by the imageType
93
+ fast path where CoreML's predict() accepts a PIL Image directly and
94
+ handles the BGR/normalize/CVPixelBuffer dance internally.
95
+ """
96
+ w, h = img.size
97
+ scale = size / max(w, h)
98
+ new_w = int(w * scale)
99
+ new_h = int(h * scale)
100
+ pad_x = (size - new_w) // 2
101
+ pad_y = (size - new_h) // 2
102
+ if w == size and h == size:
103
+ # Already at target size — skip resize + paste + alloc entirely.
104
+ return img, 1.0, (0, 0)
105
+ resized = img.resize((new_w, new_h), Image.BILINEAR)
106
+ canvas = Image.new("RGB", (size, size), (114, 114, 114))
107
+ canvas.paste(resized, (pad_x, pad_y))
108
+ return canvas, scale, (pad_x, pad_y)
109
+
110
+
111
+ def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
112
+ """Resize with letterbox padding; returns (array[0..1] HWC float32, scale, (padX, padY)).
113
+
114
+ Used by multiArrayType inputs where the model expects a normalized
115
+ float tensor. Does the uint8→float32/255 conversion exactly once.
116
+ """
117
+ canvas, scale, pad = letterbox_image(img, size)
118
+ arr = np.array(canvas, dtype=np.float32) / 255.0
119
+ return arr, scale, pad
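+ # Worked example of the letterbox math: a 1920×1080 frame at size=640
+ # gives scale = 640/1920 ≈ 0.333, a 640×360 resize, and pad = (0, 140);
+ # the (114, 114, 114) grey fills the 140px bands above and below.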
120
+
121
+
122
+ def resize_image(img: Image.Image, width: int, height: int) -> np.ndarray:
123
+ resized = img.resize((width, height), Image.BILINEAR)
124
+ return np.array(resized, dtype=np.float32) / 255.0
125
+
126
+
127
+ def decode_jpeg(jpeg: bytes) -> Image.Image:
128
+ return Image.open(io.BytesIO(jpeg)).convert("RGB")
129
+
130
+
131
+ def wrap_raw(raw: bytes, width: int, height: int, fmt: int) -> Image.Image:
132
+ """Zero-copy (via PIL frombuffer) wrap of a raw frame buffer."""
133
+ if fmt == RAW_FMT_GRAY:
134
+ img = Image.frombuffer("L", (width, height), raw, "raw", "L", 0, 1)
135
+ return img.convert("RGB")
136
+ mode = "BGR" if fmt == RAW_FMT_BGR else "RGB"
137
+ img = Image.frombuffer("RGB", (width, height), raw, "raw", mode, 0, 1)
138
+ return img
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Model slot + runtime-specific loading
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ @dataclass
147
+ class ModelSlot:
148
+ model: Any = None
149
+ config: dict = field(default_factory=dict)
150
+ loaded: bool = False
151
+ predict_fn: Optional[Callable[[dict], dict]] = None
152
+ # Atomic snapshot of model.spec input names; used by both single
153
+ # and batched predict paths to filter input_dicts against the
154
+ # CURRENTLY-loaded model. Defends against the race where an in-
155
+ # flight preprocess holds a reference to the previously-loaded
156
+ # model's `_input_names` and adds keys (iouThreshold,
157
+ # confidenceThreshold) that the new model rejects.
158
+ input_names: frozenset = field(default_factory=frozenset)
159
+
160
+
161
+ _runtime: str = ""
162
+ _runtime_lib: Any = None
163
+
164
+
165
+ def _init_runtime(runtime: str) -> None:
166
+ global _runtime, _runtime_lib
167
+ _runtime = runtime
168
+ if runtime == "coreml":
169
+ import coremltools as ct
170
+ _runtime_lib = ct
171
+ elif runtime == "openvino":
172
+ from openvino.runtime import Core
173
+ _runtime_lib = Core()
174
+ elif runtime == "onnxruntime":
175
+ import onnxruntime as ort
176
+ _runtime_lib = ort
177
+ else:
178
+ raise ValueError(f"Unknown runtime: {runtime}")
179
+
180
+
181
+ def _load_model(slot: ModelSlot, config: dict) -> None:
182
+ """Load a model into a slot using the active runtime. Thin adapter per backend."""
183
+ slot.config = dict(config)
184
+ path = config["path"]
185
+
186
+ if _runtime == "coreml":
187
+ ct = _runtime_lib
188
+ device = config.get("device", "all")
189
+ compute_map = {
190
+ "cpu": ct.ComputeUnit.CPU_ONLY,
191
+ "gpu": ct.ComputeUnit.CPU_AND_GPU,
192
+ "ane": ct.ComputeUnit.CPU_AND_NE,
193
+ "all": ct.ComputeUnit.ALL,
194
+ }
195
+ model = ct.models.MLModel(path, compute_units=compute_map.get(device, ct.ComputeUnit.ALL))
196
+ spec = model.get_spec()
197
+ input_spec = spec.description.input[0]
198
+ slot.config["_input_name"] = input_spec.name
199
+ slot.config["_input_type"] = input_spec.type.WhichOneof("Type")
200
+ slot.config["_input_names"] = [i.name for i in spec.description.input]
201
+ if slot.config["_input_type"] == "multiArrayType":
202
+ slot.config["_input_shape"] = list(input_spec.type.multiArrayType.shape)
203
+ # Detect batch flexibility — when the model's first axis is
204
+ # declared as a `shapeRange` (RangeDim at export time), we
205
+ # can stack N preprocessed frames into a single (N,3,H,W)
206
+ # tensor and issue ONE predict call instead of iterating
207
+ # the list-of-dicts path. One ANE dispatch per batch.
208
+ mat = input_spec.type.multiArrayType
209
+ if mat.WhichOneof("ShapeFlexibility") == "shapeRange" and len(mat.shapeRange.sizeRanges) > 0:
210
+ batch_axis = mat.shapeRange.sizeRanges[0]
211
+ if batch_axis.upperBound > 1:
212
+ slot.config["_supports_batch"] = True
213
+ slot.config["_max_batch"] = int(batch_axis.upperBound)
214
+ slot.model = model
215
+ slot.input_names = frozenset(i.name for i in spec.description.input)
216
+ # Filter via slot.input_names atomically read from the CURRENT
217
+ # slot. During a model `replace`, in-flight preprocess calls
218
+ # may have added stale keys (iouThreshold, confidenceThreshold)
219
+ # for the prior model that the new one rejects.
220
+ def _predict(inp: dict, _slot: ModelSlot = slot) -> dict:
221
+ names = _slot.input_names
222
+ filtered = {k: v for k, v in inp.items() if k in names} if names else inp
223
+ return _slot.model.predict(filtered)
224
+ slot.predict_fn = _predict
225
+
226
+ elif _runtime == "openvino":
227
+ core = _runtime_lib
228
+ ov_device = config.get("device", "AUTO").upper()
229
+ compiled = core.compile_model(path, device_name=ov_device)
230
+ output_layers = [compiled.output(i) for i in range(len(compiled.outputs))]
231
+ output_names = [o.get_any_name() for o in compiled.outputs]
232
+
233
+ def predict(inp_dict: dict) -> dict:
234
+ inp = list(inp_dict.values())[0]
235
+ result = compiled(inp)
236
+ return {name: result[layer] for name, layer in zip(output_names, output_layers)}
237
+
238
+ slot.model = compiled
239
+ slot.predict_fn = predict
240
+
241
+ elif _runtime == "onnxruntime":
242
+ ort = _runtime_lib
243
+ ort_device = config.get("device", "cpu")
244
+ if ort_device == "cuda":
245
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
246
+ elif ort_device == "coreml":
247
+ providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
248
+ else:
249
+ providers = ["CPUExecutionProvider"]
250
+ session = ort.InferenceSession(path, providers=providers)
251
+ output_names = [o.name for o in session.get_outputs()]
252
+ slot.config["_input_name"] = session.get_inputs()[0].name
253
+
254
+ def predict(inp_dict: dict) -> dict:
255
+ outputs = session.run(output_names, inp_dict)
256
+ return {name: out for name, out in zip(output_names, outputs)}
257
+
258
+ slot.model = session
259
+ slot.predict_fn = predict
260
+
261
+ slot.loaded = True
262
+
263
+
264
+ def _unload_model(slot: ModelSlot) -> None:
265
+ slot.model = None
266
+ slot.predict_fn = None
267
+ slot.input_names = frozenset()
268
+ slot.config = {}
269
+ slot.loaded = False
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Preprocess + run_inference — pure sync helpers called from worker threads
274
+ # ---------------------------------------------------------------------------
275
+
276
+
277
+ # Preprocess cache for bench frames only. Keyed by (_bench_frame_id, inputSize)
278
+ # where _bench_frame_id is a stable integer tag set on PIL Images stored in
279
+ # frame_cache. Camera frames don't have this tag → no cache → correct behavior.
280
+ # Avoids the CPython id() reuse problem that caused stale detections.
281
+ _bench_preprocess_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int]]] = {}
282
+
283
+ def _preprocess(img: Image.Image, config: dict) -> tuple[dict, float, tuple[int, int]]:
284
+ input_size = config.get("inputSize", 640)
285
+ # Bench frames have _bench_frame_id tag → use preprocess cache
286
+ bench_fid = getattr(img, '_bench_frame_id', None)
287
+ if bench_fid is not None:
288
+ cache_key = (bench_fid, input_size)
289
+ cached = _bench_preprocess_cache.get(cache_key)
290
+ if cached is not None:
291
+ return cached
292
+ preprocess_mode = config.get("preprocessMode", "letterbox")
293
+ input_dict: dict = {}
294
+ if _runtime == "coreml":
295
+ input_name = config.get("_input_name", "image")
296
+ input_type = config.get("_input_type", "imageType")
297
+ input_shape = config.get("_input_shape")
298
+ if input_type == "imageType":
299
+ # FAST PATH — stay in uint8 PIL space. CoreML's predict()
300
+ # accepts a PIL Image directly and handles the
301
+ # BGR/normalize/CVPixelBuffer dance internally on its own
302
+ # (zero-copy when the size matches the model input). Going
303
+ # via numpy float32 → /255 → ×255 → astype(uint8) → Image.
304
+ # fromarray() costs ~16ms per 640×640 frame on M3 Pro and is
305
+ # PURE WASTE for imageType inputs.
306
+ if preprocess_mode == "letterbox":
307
+ canvas, scale_val, pad = letterbox_image(img, input_size)
308
+ elif img.size == (input_size, input_size):
309
+ canvas, scale_val, pad = img, 1.0, (0, 0)
310
+ else:
311
+ canvas, scale_val, pad = img.resize((input_size, input_size), Image.BILINEAR), 1.0, (0, 0)
312
+ input_dict[input_name] = canvas
313
+ arr = None # not used downstream for imageType — postprocess runs on predictions
314
+ elif input_shape is not None and len(input_shape) == 4:
315
+ if preprocess_mode == "letterbox":
316
+ arr, scale_val, pad = letterbox(img, input_size)
317
+ else:
318
+ arr = resize_image(img, input_size, input_size)
319
+ scale_val = 1.0
320
+ pad = (0, 0)
321
+ _, _, _, w_or_c = input_shape
322
+ if w_or_c in (1, 3):
323
+ input_arr = arr[np.newaxis].astype(np.float32)
324
+ else:
325
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
326
+ if input_shape[1] == 1 and input_shape[2] != input_shape[3]:
327
+ gray = np.mean(arr, axis=2)
328
+ target_h, target_w = input_shape[2], input_shape[3]
329
+ gray_img = Image.fromarray((gray * 255).astype(np.uint8), mode="L")
330
+ gray_img = gray_img.resize((target_w, target_h), Image.BILINEAR)
331
+ input_arr = np.array(gray_img, dtype=np.float32)[np.newaxis, np.newaxis] / 255.0
332
+ input_dict[input_name] = input_arr
333
+ else:
+ # No usable shape metadata; fall back to a letterboxed float
+ # CHW tensor so arr/scale_val/pad are always bound on this path.
+ arr, scale_val, pad = letterbox(img, input_size)
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
+ input_dict[input_name] = input_arr
336
+
337
+ input_names = config.get("_input_names", [])
338
+ if "iouThreshold" in input_names:
339
+ input_dict["iouThreshold"] = 0.45
340
+ if "confidenceThreshold" in input_names:
341
+ input_dict["confidenceThreshold"] = config.get("confidence", 0.25)
342
+ else:
343
+ # OpenVINO / ONNX — always need float CHW tensor.
344
+ if preprocess_mode == "letterbox":
345
+ arr, scale_val, pad = letterbox(img, input_size)
346
+ else:
347
+ arr = resize_image(img, input_size, input_size)
348
+ scale_val = 1.0
349
+ pad = (0, 0)
350
+ input_name = config.get("_input_name", "images")
351
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
352
+ input_dict[input_name] = input_arr
353
+
354
+ result = (input_dict, scale_val, pad)
355
+ if bench_fid is not None:
356
+ _bench_preprocess_cache[(bench_fid, input_size)] = result
357
+ return result
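+ # Shape sketch (assuming a YOLO-style model): for ONNX with _input_name
+ # "images" and inputSize 640, _preprocess returns
+ # ({"images": float32 ndarray of shape (1, 3, 640, 640)}, scale, (padX, padY));
+ # for a CoreML imageType input the dict instead holds the 640×640 PIL canvas.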
358
+
359
+
360
+ def _postprocess(
361
+ predictions: dict,
362
+ slot_config: dict,
363
+ orig_w: int,
364
+ orig_h: int,
365
+ scale_val: float,
366
+ pad: tuple[int, int],
367
+ elapsed_ms: float,
368
+ preprocess_ms: float = 0.0,
369
+ predict_ms: float = 0.0,
370
+ batch_size: int = 1,
371
+ ) -> dict:
372
+ postprocessor_type = slot_config.get("postprocessor", "yolo")
373
+ postprocessor_fn = POSTPROCESSORS.get(postprocessor_type)
374
+ if postprocessor_fn is None:
375
+ raise ValueError(f"Unknown postprocessor: {postprocessor_type}")
376
+ result = postprocessor_fn(predictions, slot_config, orig_w, orig_h, scale_val, pad)
377
+ result["inferenceMs"] = round(elapsed_ms, 2)
378
+ result["preprocessMs"] = round(preprocess_ms, 2)
379
+ result["predictMs"] = round(predict_ms, 2)
380
+ result["batchSize"] = batch_size
381
+ result["frameSize"] = f"{orig_w}x{orig_h}"
382
+ return result
383
+
384
+
385
+ # ---------------------------------------------------------------------------
386
+ # Runtime dispatcher — pipelined prepare → predict → postprocess
387
+ # ---------------------------------------------------------------------------
388
+
389
+
390
+ class RuntimeDispatcher:
391
+ """Split-stage dispatcher that lets preprocess run ahead of predict.
392
+
393
+ Stages use separate thread pools so an in-flight `predict` doesn't
394
+ block the preprocess for the next frame:
395
+
396
+ camera A: preprocess → predict ─┐
+ camera B: preprocess → predict ─┼─ shared predict_pool
+ camera C: preprocess → predict ─┘
399
+
400
+ While predict for A runs on the predict_pool, preprocess for B can
401
+ run on the prepare_pool; each stage releases the Python event loop
402
+ between calls so the reader stays responsive.
403
+
404
+ `concurrency` sets the prepare pool size AND the predict pool size.
405
+ For CoreML/OpenVINO the runtime itself serialises access to the
406
+ device, so a single predict worker is usually enough. ONNX Runtime
407
+ benefits from >1 (its InferenceSession.run is thread-safe and the
408
+ native thread pool schedules concurrent runs).
409
+ """
410
+
411
+ def __init__(self, runtime: str, concurrency: int) -> None:
412
+ workers = max(1, concurrency)
413
+ self._workers = workers
414
+ self._prepare_pool = concurrent.futures.ThreadPoolExecutor(
415
+ max_workers=workers,
416
+ thread_name_prefix=f"{runtime}-prep",
417
+ )
418
+ self._predict_pool = concurrent.futures.ThreadPoolExecutor(
419
+ max_workers=workers,
420
+ thread_name_prefix=f"{runtime}-predict",
421
+ )
422
+
423
+ @property
424
+ def workers(self) -> int:
425
+ return self._workers
426
+
427
+ async def run(self, slot: ModelSlot, img: Image.Image) -> dict:
428
+ loop = asyncio.get_running_loop()
429
+ t_start = time.perf_counter()
430
+ orig_w, orig_h = img.size
431
+ # Stage 1 — preprocess. Runs on prepare_pool; overlaps with
432
+ # any predict already scheduled for a previous frame.
433
+ input_dict, scale_val, pad = await loop.run_in_executor(
434
+ self._prepare_pool, _preprocess, img, slot.config,
435
+ )
436
+ t_predict_start = time.perf_counter()
437
+ preprocess_ms = (t_predict_start - t_start) * 1000
438
+ # Stage 2 — predict. Dedicated pool so preprocessors for later
439
+ # frames can run while the runtime works on this one.
440
+ predictions = await loop.run_in_executor(
441
+ self._predict_pool, slot.predict_fn, input_dict,
442
+ )
443
+ # Stage 3 — postprocess. Cheap CPU-bound step; reuse the prepare
444
+ # pool to avoid tying up a predict worker on scalar math.
445
+ t_predict_end = time.perf_counter()
446
+ predict_ms = (t_predict_end - t_predict_start) * 1000
447
+ elapsed_ms = (t_predict_end - t_start) * 1000
448
+ return await loop.run_in_executor(
449
+ self._prepare_pool,
450
+ _postprocess,
451
+ predictions, slot.config, orig_w, orig_h, scale_val, pad, elapsed_ms,
452
+ preprocess_ms, predict_ms, 1,
453
+ )
454
+
455
+ async def run_cached(
456
+ self,
457
+ slot: ModelSlot,
458
+ img: Image.Image,
459
+ cache_key: tuple[int, int],
460
+ prep_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int], int, int]],
461
+ ) -> dict:
462
+ """Like run() but caches the preprocessed input dict. Second+
463
+ calls for the same (frame_id, model_idx) skip preprocess entirely
464
+ — goes straight to predict + postprocess. Saves ~10ms/call."""
465
+ loop = asyncio.get_running_loop()
466
+ t_start = time.perf_counter()
467
+ orig_w, orig_h = img.size
468
+
469
+ cached = prep_cache.get(cache_key)
470
+ if cached is not None:
471
+ input_dict, scale_val, pad, _, _ = cached
472
+ preprocess_ms = 0.0
473
+ else:
474
+ input_dict, scale_val, pad = await loop.run_in_executor(
475
+ self._prepare_pool, _preprocess, img, slot.config,
476
+ )
477
+ preprocess_ms = (time.perf_counter() - t_start) * 1000
478
+ prep_cache[cache_key] = (input_dict, scale_val, pad, orig_w, orig_h)
479
+
480
+ t_predict_start = time.perf_counter()
481
+ predictions = await loop.run_in_executor(
482
+ self._predict_pool, slot.predict_fn, input_dict,
483
+ )
484
+ t_predict_end = time.perf_counter()
485
+ predict_ms = (t_predict_end - t_predict_start) * 1000
486
+ elapsed_ms = (t_predict_end - t_start) * 1000
487
+ return await loop.run_in_executor(
488
+ self._prepare_pool,
489
+ _postprocess,
490
+ predictions, slot.config, orig_w, orig_h, scale_val, pad, elapsed_ms,
491
+ preprocess_ms, predict_ms, 1,
492
+ )
493
+
494
+ async def run_list(self, slot: ModelSlot, imgs: list[Image.Image]) -> list[dict]:
495
+ """Batch predict path — single CoreML/OV/ORT predict call processing
496
+ all N items, then split + per-item postprocess.
497
+
498
+ For CoreML: passes a list of input dicts to `model.predict(...)`,
499
+ which CoreML iterates internally with shared session state — saves
500
+ per-call ANE setup overhead vs a thread-pool fan-out.
501
+
502
+ For OpenVINO/ONNX: falls back to a loop on the same predict thread
503
+ (no model-level batching since their predict_fn signatures take one
504
+ array). Still saves the asyncio.gather + ThreadPoolExecutor dispatch
505
+ overhead.
506
+ """
507
+ if not imgs:
508
+ return []
509
+ loop = asyncio.get_running_loop()
510
+ t_start = time.perf_counter()
511
+ orig_sizes = [img.size for img in imgs]
512
+ batch_size = len(imgs)
513
+
514
+ # Stage 1 — preprocess all in parallel on the prepare pool.
515
+ prepared = await asyncio.gather(*[
516
+ loop.run_in_executor(self._prepare_pool, _preprocess, img, slot.config)
517
+ for img in imgs
518
+ ])
519
+ # prepared = list of (input_dict, scale_val, pad)
520
+ t_predict_start = time.perf_counter()
521
+ preprocess_ms = (t_predict_start - t_start) * 1000
522
+
523
+ # Stage 2 — single predict call on the predict thread.
524
+ runtime = _runtime
525
+ supports_batch = bool(slot.config.get("_supports_batch"))
526
+ max_batch = int(slot.config.get("_max_batch", 1))
527
+ if runtime == "coreml" and supports_batch and batch_size <= max_batch:
528
+ # FAST PATH — stack the N preprocessed (1,3,H,W) tensors
529
+ # into a single (N,3,H,W) tensor and issue ONE predict call.
530
+ # The model was exported with a flexible batch axis so this
531
+ # produces a single ANE dispatch processing all N frames in
532
+ # parallel — amortises CoreML session setup AND the per-call
533
+ # compute_units context-switch.
534
+ input_name = slot.config.get("_input_name", "image")
535
+ stacked = np.concatenate([p[0][input_name] for p in prepared], axis=0)
536
+
537
+ def stacked_predict() -> list[dict]:
538
+ names = slot.input_names
539
+ payload: dict = {input_name: stacked}
540
+ # Some models accept side-channel inputs (iouThreshold
541
+ # etc.). Forward only those that the current model
542
+ # accepts; the stacked input is set above.
543
+ first = prepared[0][0]
544
+ for k, v in first.items():
545
+ if k != input_name and (not names or k in names):
546
+ payload[k] = v
547
+ out = slot.model.predict(payload)
548
+ # Split each output along axis 0 back into N items.
549
+ # Output shape (N, ...); slice per index and rebuild
550
+ # per-item dicts so postprocess sees the same shape it
551
+ # would for a single-frame predict.
552
+ results: list[dict] = []
553
+ for i in range(batch_size):
554
+ item: dict = {}
555
+ for k, v in out.items():
556
+ if hasattr(v, "shape") and len(v.shape) > 0 and v.shape[0] == batch_size:
557
+ item[k] = v[i:i + 1]
558
+ else:
559
+ item[k] = v
560
+ results.append(item)
561
+ return results
562
+
563
+ predictions_list = await loop.run_in_executor(self._predict_pool, stacked_predict)
564
+ elif runtime == "coreml":
565
+ # Fallback — model.predict([list of input dicts]) iterates
566
+ # internally on shared session state. Not as fast as stacked
567
+ # but still saves the asyncio.gather + ThreadPoolExecutor
568
+ # dispatch overhead vs N separate calls.
569
+ def batched_predict() -> list[dict]:
570
+ names = slot.input_names
571
+ if names:
572
+ cleaned = [{k: v for k, v in p[0].items() if k in names} for p in prepared]
573
+ else:
574
+ cleaned = [p[0] for p in prepared]
575
+ return slot.model.predict(cleaned)
576
+
577
+ predictions_list = await loop.run_in_executor(self._predict_pool, batched_predict)
578
+ else:
579
+ # OpenVINO / ONNX — loop on the predict thread to keep IPC
580
+ # amortisation (one executor dispatch instead of N) without
581
+ # requiring model-level batched inputs.
582
+ def looped_predict() -> list[dict]:
583
+ return [slot.predict_fn(p[0]) for p in prepared]
584
+
585
+ predictions_list = await loop.run_in_executor(self._predict_pool, looped_predict)
586
+
587
+ # Stage 3 — postprocess each result. Per-item elapsed shares the
588
+ # batch wall (fair attribution) — callers see it as `inferenceMs`.
589
+ t_predict_end = time.perf_counter()
590
+ predict_ms = (t_predict_end - t_predict_start) * 1000
591
+ elapsed_ms = (t_predict_end - t_start) * 1000
592
+ return await asyncio.gather(*[
593
+ loop.run_in_executor(
594
+ self._prepare_pool,
595
+ _postprocess,
596
+ predictions, slot.config,
597
+ orig_sizes[i][0], orig_sizes[i][1],
598
+ prepared[i][1], prepared[i][2],
599
+ elapsed_ms,
600
+ preprocess_ms, predict_ms, batch_size,
601
+ )
602
+ for i, predictions in enumerate(predictions_list)
603
+ ])
604
+
605
+ def close(self) -> None:
606
+ self._prepare_pool.shutdown(wait=False)
607
+ self._predict_pool.shutdown(wait=False)
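+ # Minimal usage sketch (illustrative; the real driver is _run() below,
+ # with `slot` a loaded ModelSlot and `img` a PIL Image):
+ #
+ #     dispatcher = RuntimeDispatcher("onnxruntime", concurrency=2)
+ #     result = await dispatcher.run(slot, img)  # inside a coroutine
+ #     print(result["inferenceMs"], len(result.get("detections", [])))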
608
+
609
+
610
+ # ---------------------------------------------------------------------------
611
+ # IPC — binary framing with request_id multiplexing
612
+ # ---------------------------------------------------------------------------
613
+
614
+
615
+ HEADER_LEN = 4 # total_len prefix
616
+ PREFIX_LEN = HEADER_LEN + 4 + 1 # total_len + req_id + msg_type
617
+
618
+
619
+ async def _read_exact(reader: asyncio.StreamReader, n: int) -> bytes:
620
+ return await reader.readexactly(n)
621
+
622
+
623
+ class ResponseWriter:
624
+ """Serialises writes to the stdout pipe; safe from multiple coroutines."""
625
+
626
+ def __init__(self, writer: asyncio.StreamWriter) -> None:
627
+ self._writer = writer
628
+ self._lock = asyncio.Lock()
629
+
630
+ async def send(self, req_id: int, payload: dict) -> None:
631
+ # numpy scalars (float32 / int64 / bool_) are NOT JSON-serializable
632
+ # under the stdlib encoder. With numpy 2.x the previous lenient
633
+ # path is gone, so we must coerce before encoding. Cheap default
634
+ # callback only kicks in for unhandled types — Python natives skip
635
+ # it entirely.
636
+ data = json.dumps(payload, default=_json_default).encode("utf-8")
637
+ total_len = 4 + len(data) # req_id(4) + json
638
+ header = struct.pack("<II", total_len, req_id)
639
+ async with self._lock:
640
+ self._writer.write(header)
641
+ self._writer.write(data)
642
+ await self._writer.drain()
643
+
644
+
645
+ def _json_default(obj: Any) -> Any:
646
+ """Coerce numpy scalars/arrays to JSON-friendly Python natives."""
647
+ if isinstance(obj, np.generic):
648
+ return obj.item()
649
+ if isinstance(obj, np.ndarray):
650
+ return obj.tolist()
651
+ raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
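+ # e.g. json.dumps({"score": np.float32(0.91)}, default=_json_default)
+ # emits a plain float (the float32 widened via .item(), ≈0.91000002...)
+ # where the stdlib encoder alone would raise TypeError.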
652
+
653
+
654
+ # ---------------------------------------------------------------------------
655
+ # Command handlers
656
+ # ---------------------------------------------------------------------------
657
+
658
+
659
+ def _handle_command(models: list[ModelSlot], cmd: dict) -> dict:
660
+ action = cmd.get("cmd")
661
+ if action == "load":
662
+ index = cmd["index"]
663
+ config = cmd["config"]
664
+ while len(models) <= index:
665
+ models.append(ModelSlot())
666
+ slot = models[index]
667
+ if slot.loaded:
668
+ _unload_model(slot)
669
+ try:
670
+ t0 = time.perf_counter()
671
+ _load_model(slot, config)
672
+ load_ms = round((time.perf_counter() - t0) * 1000)
673
+ sys.stderr.write(f"Model {index} loaded: {config['path']} ({load_ms}ms)\n")
674
+ sys.stderr.flush()
675
+ return {"cmd": "load", "index": index, "status": "ok", "loadMs": load_ms}
676
+ except Exception as exc:
677
+ return {"cmd": "load", "index": index, "status": "error", "error": str(exc)}
678
+
679
+ if action == "unload":
680
+ index = cmd["index"]
681
+ if index < len(models) and models[index].loaded:
682
+ _unload_model(models[index])
683
+ sys.stderr.write(f"Model {index} unloaded\n")
684
+ sys.stderr.flush()
685
+ return {"cmd": "unload", "index": index, "status": "ok"}
686
+
687
+ if action == "replace":
688
+ index = cmd["index"]
689
+ config = cmd["config"]
690
+ while len(models) <= index:
691
+ models.append(ModelSlot())
692
+ slot = models[index]
693
+ if slot.loaded:
694
+ _unload_model(slot)
695
+ try:
696
+ t0 = time.perf_counter()
697
+ _load_model(slot, config)
698
+ load_ms = round((time.perf_counter() - t0) * 1000)
699
+ sys.stderr.write(f"Model {index} replaced: {config['path']} ({load_ms}ms)\n")
700
+ sys.stderr.flush()
701
+ return {"cmd": "replace", "index": index, "status": "ok", "loadMs": load_ms}
702
+ except Exception as exc:
703
+ return {"cmd": "replace", "index": index, "status": "error", "error": str(exc)}
704
+
705
+ if action == "status":
706
+ status = []
707
+ for i, slot in enumerate(models):
708
+ status.append({
709
+ "index": i,
710
+ "path": slot.config.get("path") if slot.loaded else None,
711
+ "loaded": slot.loaded,
712
+ "postprocessor": slot.config.get("postprocessor") if slot.loaded else None,
713
+ })
714
+ return {"cmd": "status", "models": status}
715
+
716
+ return {"cmd": action or "unknown", "status": "error", "error": f"Unknown command: {action}"}
717
+
718
+
719
+ # ---------------------------------------------------------------------------
720
+ # Main async loop
721
+ # ---------------------------------------------------------------------------
722
+
723
+
724
+ async def _run() -> None:
725
+ loop = asyncio.get_running_loop()
726
+ reader = asyncio.StreamReader(loop=loop)
727
+ protocol = asyncio.StreamReaderProtocol(reader, loop=loop)
728
+ await loop.connect_read_pipe(lambda: protocol, sys.stdin.buffer)
729
+
730
+ transport, stdout_protocol = await loop.connect_write_pipe(
731
+ asyncio.streams.FlowControlMixin, sys.stdout.buffer,
732
+ )
733
+ stream_writer = asyncio.StreamWriter(transport, stdout_protocol, None, loop)
734
+ writer = ResponseWriter(stream_writer)
735
+
736
+ t_start = time.perf_counter()
737
+
738
+ # ── Startup — first message is config (req_id=0, msg_type=COMMAND) ──
739
+ header = await _read_exact(reader, PREFIX_LEN)
740
+ total_len, req_id = struct.unpack("<II", header[0:8])
741
+ msg_type = header[8]
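+ # total_len counts req_id(4) + msg_type(1) + payload; req_id and
+ # msg_type were already consumed by the 9-byte PREFIX_LEN read,
+ # hence the -5 below.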
742
+ remaining = total_len - 5
743
+ payload = await _read_exact(reader, remaining)
744
+ if msg_type != MSG_COMMAND:
745
+ sys.stderr.write(f"Expected startup config (cmd), got msg_type={msg_type}\n")
746
+ sys.exit(1)
747
+ config = json.loads(payload)
748
+ runtime = config.get("runtime", "coreml")
749
+ concurrency = int(config.get("concurrency", 1) or 1)
750
+ batch_mode = str(config.get("batch_mode", "none"))
751
+ if batch_mode not in ("none", "list", "window"):
752
+ batch_mode = "none"
753
+ window_ms = max(0, int(config.get("window_ms", 0) or 0))
754
+ max_batch_size = max(1, int(config.get("max_batch_size", 1) or 1))
755
+
756
+ sys.stderr.write(
757
+ f"Initializing runtime: {runtime} (concurrency={concurrency}, "
758
+ f"batch_mode={batch_mode}, window_ms={window_ms}, max_batch={max_batch_size})\n"
759
+ )
760
+ sys.stderr.flush()
761
+ _init_runtime(runtime)
762
+
763
+ models: list[ModelSlot] = []
764
+ for i, mc in enumerate(config.get("models", [])):
765
+ slot = ModelSlot()
766
+ sys.stderr.write(f"Loading model {i}: {mc['path']}\n")
767
+ sys.stderr.flush()
768
+ try:
769
+ _load_model(slot, mc)
770
+ except Exception as exc:
771
+ sys.stderr.write(f"ERROR loading model {i}: {exc}\n")
772
+ sys.stderr.flush()
773
+ models.append(slot)
774
+
775
+ dispatcher = RuntimeDispatcher(runtime, concurrency)
776
+ startup_ms = round((time.perf_counter() - t_start) * 1000)
777
+ loaded_count = sum(1 for s in models if s.loaded)
778
+ sys.stderr.write(
779
+ f"Ready — {loaded_count}/{len(models)} model(s) in {startup_ms}ms "
780
+ f"(runtime={runtime}, workers={dispatcher.workers})\n"
781
+ )
782
+ sys.stderr.flush()
783
+
784
+ await writer.send(req_id, {
785
+ "status": "ready",
786
+ "models": loaded_count,
787
+ "total": len(models),
788
+ "startupMs": startup_ms,
789
+ "runtime": runtime,
790
+ "workers": dispatcher.workers,
791
+ })
792
+
793
+ # ── Window accumulator (per-model) ──────────────────────────────
794
+ #
795
+ # In `window` mode, every MSG_INFER_RAW arrival pushes (req_id, img)
796
+ # into a per-model pending queue and arms a `window_ms` timer. When
797
+ # the timer fires, OR the queue reaches `max_batch_size`, we flush
798
+ # via `dispatcher.run_list(slot, imgs)` — single batched predict.
799
+ # Concurrent inferRaw calls from N cameras coalesce into one CoreML
800
+ # predict call, amortising ANE setup.
801
+
802
+ class WindowAccumulator:
803
+ def __init__(self, model_idx: int) -> None:
804
+ self.model_idx = model_idx
805
+ self.pending: list[tuple[int, Image.Image]] = []
806
+ self.flush_task: Optional[asyncio.Task[None]] = None
807
+
808
+ async def submit(self, req_id: int, img: Image.Image) -> None:
809
+ self.pending.append((req_id, img))
810
+ if len(self.pending) >= max_batch_size:
811
+ # Reached cap — cancel any pending timer (we are flushing
812
+ # immediately from the caller's coroutine, not the timer)
813
+ # and flush now.
814
+ if self.flush_task is not None:
815
+ self.flush_task.cancel()
816
+ self.flush_task = None
817
+ await self._flush()
818
+ return
819
+ if self.flush_task is None:
820
+ self.flush_task = asyncio.create_task(self._delayed_flush())
821
+
822
+ async def _delayed_flush(self) -> None:
823
+ try:
824
+ await asyncio.sleep(window_ms / 1000.0)
825
+ except asyncio.CancelledError:
826
+ return
827
+ # Mark the timer task as complete BEFORE the dispatch await so
828
+ # a cap-triggered submit() does not cancel the running task (which would
829
+ # raise CancelledError at `await dispatcher.run_list` and drop
830
+ # responses on the floor — the symptom the TS side observed as
831
+ # `runPipeline.mutate` hanging forever).
832
+ self.flush_task = None
833
+ await self._flush()
834
+
835
+ async def _flush(self) -> None:
836
+ batch = self.pending
837
+ self.pending = []
838
+ if not batch:
839
+ return
840
+ slot = models[self.model_idx]
841
+ try:
842
+ imgs = [b[1] for b in batch]
843
+ results = await dispatcher.run_list(slot, imgs)
844
+ except Exception as exc:
845
+ sys.stderr.write(f"Window flush error (model {self.model_idx}): {exc}\n")
846
+ sys.stderr.flush()
847
+ err_payload = {
848
+ "error": str(exc),
849
+ "kind": "detections",
850
+ "detections": [],
851
+ "inferenceMs": 0,
852
+ }
853
+ for rid, _ in batch:
854
+ await writer.send(rid, err_payload)
855
+ return
856
+ for (rid, _), result in zip(batch, results):
857
+ await writer.send(rid, result)
858
+
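+ # Timing sketch: with window_ms=8 and max_batch_size=4 (numbers
+ # illustrative), four cameras submitting within the same 8ms window
+ # coalesce into a single run_list() predict; a fifth arrival after the
+ # flush starts a fresh window.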
859
+ accumulators: dict[int, WindowAccumulator] = {}
860
+
861
+ # ── Bench frame cache ──────────────────────────────────────────
862
+ # Stores raw PIL Images keyed by uint32 frame_id. The Node side
863
+ # sends MSG_CACHE_FRAME once with the full 1.2MB payload; all
864
+ # subsequent MSG_INFER_CACHED calls send only a 5-byte header
865
+ # (model_idx + frame_id). Eliminates the 35ms/call pipe transfer
866
+ # that dominates bench throughput.
867
+ frame_cache: dict[int, Image.Image] = {}
868
+
869
+ # Preprocessed-input cache: keyed by (frame_id, model_idx). First
870
+ # inferCached call preprocesses and caches; subsequent calls skip
871
+ # preprocess entirely and go straight to predict + postprocess.
872
+ # Saves ~10ms/call (the PIL resize + numpy + tensor pack cost).
873
+ _preprocess_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int], int, int]] = {}
874
+
875
+ def get_accumulator(model_idx: int) -> WindowAccumulator:
876
+ acc = accumulators.get(model_idx)
877
+ if acc is None:
878
+ acc = WindowAccumulator(model_idx)
879
+ accumulators[model_idx] = acc
880
+ return acc
881
+
882
+ # ── Main loop ───────────────────────────────────────────────────
883
+ async def handle_inference(req_id: int, img: Image.Image, model_idx: int) -> None:
884
+ try:
885
+ if model_idx >= len(models) or not models[model_idx].loaded:
886
+ await writer.send(req_id, {
887
+ "error": f"Model {model_idx} not loaded",
888
+ "kind": "detections",
889
+ "detections": [],
890
+ "inferenceMs": 0,
891
+ })
892
+ return
893
+ if batch_mode == "window":
894
+ await get_accumulator(model_idx).submit(req_id, img)
895
+ return
896
+ result = await dispatcher.run(models[model_idx], img)
897
+ await writer.send(req_id, result)
898
+ except Exception as exc:
899
+ sys.stderr.write(f"Inference error (model {model_idx}, req {req_id}): {exc}\n")
900
+ sys.stderr.flush()
901
+ await writer.send(req_id, {
902
+ "error": str(exc),
903
+ "kind": "detections",
904
+ "detections": [],
905
+ "inferenceMs": 0,
906
+ })
907
+
908
+ async def handle_batch(req_id: int, model_idx: int, items: list[Image.Image]) -> None:
909
+ if model_idx >= len(models) or not models[model_idx].loaded:
910
+ await writer.send(req_id, {
911
+ "error": f"Model {model_idx} not loaded",
912
+ "results": [],
913
+ })
914
+ return
915
+ try:
916
+ slot = models[model_idx]
917
+ if batch_mode in ("list", "window"):
918
+ # `list` mode: send the batch through dispatcher.run_list
919
+ # — single predict call, internal CoreML iteration.
920
+ # `window` mode: same dispatch path (no point queueing into
921
+ # the accumulator when caller already packed N items).
922
+ payload = await dispatcher.run_list(slot, items)
923
+ else:
924
+ tasks = [dispatcher.run(slot, img) for img in items]
925
+ results = await asyncio.gather(*tasks, return_exceptions=True)
926
+ payload = []
927
+ for r in results:
928
+ if isinstance(r, Exception):
929
+ payload.append({
930
+ "error": str(r),
931
+ "kind": "detections",
932
+ "detections": [],
933
+ "inferenceMs": 0,
934
+ })
935
+ else:
936
+ payload.append(r)
937
+ await writer.send(req_id, {"results": payload})
938
+ except Exception as exc:
939
+ sys.stderr.write(f"Batch error (model {model_idx}, req {req_id}): {exc}\n")
940
+ sys.stderr.flush()
941
+ await writer.send(req_id, {"error": str(exc), "results": []})
942
+
943
+ while True:
944
+ try:
945
+ header = await _read_exact(reader, PREFIX_LEN)
946
+ except asyncio.IncompleteReadError:
947
+ return # stdin closed
948
+ total_len, req_id = struct.unpack("<II", header[0:8])
949
+ msg_type = header[8]
950
+ remaining = total_len - 5
951
+ try:
952
+ payload = await _read_exact(reader, remaining) if remaining > 0 else b""
953
+ except asyncio.IncompleteReadError:
954
+ return
955
+
956
+ if msg_type == MSG_COMMAND:
957
+ try:
958
+ cmd = json.loads(payload)
959
+ # Handle uncache_frame inline (needs access to frame_cache)
960
+ if cmd.get("cmd") == "uncache_frame":
961
+ fid = cmd.get("frameId", -1)
962
+ frame_cache.pop(fid, None)
963
+ # Purge preprocessed tensor caches for this frame
964
+ to_del = [k for k in _preprocess_cache if k[0] == fid]
965
+ for k in to_del:
966
+ del _preprocess_cache[k]
967
+ # Also purge the bench preprocess cache
968
+ to_del2 = [k for k in _bench_preprocess_cache if k[0] == fid]
969
+ for k in to_del2:
970
+ del _bench_preprocess_cache[k]
971
+ response = {"cmd": "uncache_frame", "status": "ok", "frameId": fid}
972
+ else:
973
+ response = _handle_command(models, cmd)
974
+ except Exception as exc:
975
+ response = {"cmd": "unknown", "status": "error", "error": str(exc)}
976
+ await writer.send(req_id, response)
977
+
978
+ elif msg_type == MSG_INFER_JPEG:
979
+ if len(payload) < 1:
980
+ await writer.send(req_id, {"error": "empty infer_jpeg payload"})
981
+ continue
982
+ model_idx = payload[0]
983
+ jpeg = payload[1:]
984
+ try:
985
+ img = decode_jpeg(jpeg)
986
+ except Exception as exc:
987
+ await writer.send(req_id, {"error": f"jpeg decode failed: {exc}"})
988
+ continue
989
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
990
+
991
+ elif msg_type == MSG_INFER_RAW:
992
+ if len(payload) < 10:
993
+ await writer.send(req_id, {"error": "truncated infer_raw header"})
994
+ continue
995
+ model_idx = payload[0]
996
+ width, height = struct.unpack("<II", payload[1:9])
997
+ fmt = payload[9]
998
+ raw = payload[10:]
999
+ try:
1000
+ img = wrap_raw(raw, width, height, fmt)
1001
+ except Exception as exc:
1002
+ await writer.send(req_id, {"error": f"raw wrap failed: {exc}"})
1003
+ continue
1004
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
1005
+
1006
+ elif msg_type == MSG_INFER_BATCH:
1007
+ # Header: [1B model_idx][1B count]
1008
+ if len(payload) < 2:
1009
+ await writer.send(req_id, {"error": "truncated infer_batch header"})
1010
+ continue
1011
+ model_idx = payload[0]
1012
+ count = payload[1]
1013
+ offset = 2
1014
+ items: list[Image.Image] = []
1015
+ parse_err: Optional[str] = None
1016
+ for _ in range(count):
1017
+ # Per-item header: [4B w][4B h][1B fmt][4B size]
1018
+ if offset + 13 > len(payload):
1019
+ parse_err = "truncated batch item header"
1020
+ break
1021
+ width, height = struct.unpack("<II", payload[offset:offset + 8])
1022
+ fmt = payload[offset + 8]
1023
+ size = struct.unpack("<I", payload[offset + 9:offset + 13])[0]
1024
+ offset += 13
1025
+ if offset + size > len(payload):
1026
+ parse_err = "truncated batch item payload"
1027
+ break
1028
+ raw = payload[offset:offset + size]
1029
+ offset += size
1030
+ try:
1031
+ items.append(wrap_raw(raw, width, height, fmt))
1032
+ except Exception as exc:
1033
+ parse_err = f"raw wrap failed: {exc}"
1034
+ break
1035
+ if parse_err is not None:
1036
+ await writer.send(req_id, {"error": parse_err, "results": []})
1037
+ continue
1038
+ asyncio.create_task(handle_batch(req_id, model_idx, items))
1039
+
1040
+ elif msg_type == MSG_CACHE_FRAME:
1041
+ # Wire: [4B frame_id][4B width][4B height][1B fmt][raw bytes]
1042
+ if len(payload) < 13:
1043
+ await writer.send(req_id, {"error": "truncated cache_frame header"})
1044
+ continue
1045
+ fid = struct.unpack("<I", payload[0:4])[0]
1046
+ width, height = struct.unpack("<II", payload[4:12])
1047
+ fmt = payload[12]
1048
+ raw = payload[13:]
1049
+ try:
1050
+ img = wrap_raw(raw, width, height, fmt)
1051
+ img._bench_frame_id = fid # Tag for preprocess cache keying
1052
+ frame_cache[fid] = img
1053
+ await writer.send(req_id, {"status": "cached", "frameId": fid, "width": width, "height": height})
1054
+ except Exception as exc:
1055
+ await writer.send(req_id, {"error": f"cache_frame wrap failed: {exc}"})
1056
+
1057
+ elif msg_type == MSG_INFER_CACHED:
1058
+ # Wire: [1B model_idx][4B frame_id]
1059
+ # Routes through handle_inference → window accumulator for
+ # batching. After the first call, preprocess is served from
+ # _bench_preprocess_cache (cached frames carry _bench_frame_id),
+ # and batch=4 predict saves the bulk (8ms vs 4×9ms=36ms).
1062
+ if len(payload) < 5:
1063
+ await writer.send(req_id, {"error": "truncated infer_cached header"})
1064
+ continue
1065
+ model_idx = payload[0]
1066
+ fid = struct.unpack("<I", payload[1:5])[0]
1067
+ img = frame_cache.get(fid)
1068
+ if img is None:
1069
+ await writer.send(req_id, {"error": f"frame {fid} not in cache"})
1070
+ continue
1071
+ if model_idx >= len(models) or not models[model_idx].loaded:
1072
+ await writer.send(req_id, {"error": f"model {model_idx} not loaded"})
1073
+ continue
1074
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
1075
+
1076
+ else:
1077
+ await writer.send(req_id, {"error": f"unknown msg_type: {msg_type}"})
1078
+
1079
+
1080
+ def main() -> None:
1081
+ try:
1082
+ asyncio.run(_run())
1083
+ except KeyboardInterrupt:
1084
+ pass
1085
+
1086
+
1087
+ if __name__ == "__main__":
1088
+ main()