PyPI - SparkRT - Versions diffs - 0.1.0rc1__py3-none-any.whl - Mend

SparkRT 0.1.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

sparkrt/__init__.py +33 -0
sparkrt/adapters/__init__.py +13 -0
sparkrt/adapters/_pi05_kv_cache.py +107 -0
sparkrt/adapters/act_adapter.py +127 -0
sparkrt/adapters/pi05_adapter.py +381 -0
sparkrt/api.py +149 -0
sparkrt/backends/__init__.py +59 -0
sparkrt/backends/cudagraph.py +220 -0
sparkrt/backends/eager.py +38 -0
sparkrt/backends/torchcompile.py +114 -0
sparkrt/config/__init__.py +35 -0
sparkrt/config/loader.py +117 -0
sparkrt/config/presets/default.yaml +18 -0
sparkrt/config/presets/latency.yaml +25 -0
sparkrt/config/presets/memory.yaml +30 -0
sparkrt/config/presets/quality.yaml +21 -0
sparkrt/config/presets/safe.yaml +15 -0
sparkrt/config/runtime.py +205 -0
sparkrt/core/__init__.py +22 -0
sparkrt/core/adapter.py +146 -0
sparkrt/core/backend.py +50 -0
sparkrt/core/region.py +46 -0
sparkrt/core/shape.py +47 -0
sparkrt/eval/__init__.py +18 -0
sparkrt/eval/policy.py +125 -0
sparkrt/io/__init__.py +12 -0
sparkrt/io/checkpoint.py +319 -0
sparkrt/observation.py +210 -0
sparkrt/policy.py +136 -0
sparkrt/processors/__init__.py +6 -0
sparkrt/processors/base.py +30 -0
sparkrt/processors/sparkmind.py +40 -0
sparkrt/session/__init__.py +5 -0
sparkrt/session/session.py +117 -0
sparkrt-0.1.0rc1.dist-info/METADATA +334 -0
sparkrt-0.1.0rc1.dist-info/RECORD +39 -0
sparkrt-0.1.0rc1.dist-info/WHEEL +5 -0
sparkrt-0.1.0rc1.dist-info/licenses/LICENSE +164 -0
sparkrt-0.1.0rc1.dist-info/top_level.txt +1 -0

sparkrt/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""SparkRT - edge-side low-latency inference runtime for SparkMind2 models.
+Public surface:
+    from sparkrt import from_sparkmind_agent, InferenceSession
+The runtime is organised in four decoupled layers (see ``docs``/memory plan):
+    processors  obs -> model-ready tensors -> action   (normalize/tokenize)
+    adapters    *what* a model computes (wraps SparkMind2 nn.Module as regions)
+    backends    *how* it executes (eager, CUDA-graph, torch.compile; native C++ later)
+    session     unified, stateful select_action() loop (queue + ensemble)
+``sparkrt.core`` defines the contracts that tie these together and imports
+without any SparkMind2 / heavy dependency, so the execution seam stays clean.
+"""
+from sparkrt.api import from_sparkmind_agent, load_policy
+from sparkrt.config import BackendConfig, Pi05RuntimeConfig, RuntimeConfig
+from sparkrt.observation import Observation
+from sparkrt.policy import Policy
+from sparkrt.session import InferenceSession
+__all__ = [
+    "from_sparkmind_agent",
+    "load_policy",
+    "Policy",
+    "Observation",
+    "InferenceSession",
+    "RuntimeConfig",
+    "BackendConfig",
+    "Pi05RuntimeConfig",
+]
+__version__ = "0.1.0rc1"

sparkrt/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Model adapters: wrap SparkMind2 nn.Modules as runtime regions."""
+from sparkrt.adapters.act_adapter import ACTAdapter
+from sparkrt.adapters.pi05_adapter import Pi05Adapter
+__all__ = ["ACTAdapter", "Pi05Adapter"]
+#: Maps ``cfg.Model.type`` -> adapter class. Extend this (plus a processor) to
+#: add a new model; no backend or session changes are required.
+ADAPTER_REGISTRY = {
+    "act": ACTAdapter,
+    "pi05": Pi05Adapter,
+}

sparkrt/adapters/_pi05_kv_cache.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Read-only prefix KV-cache shim for Pi0.5 denoise_step.
+``ReadOnlyPrefixCache`` is a lightweight HF-cache-protocol adapter that wraps
+the frozen prefix KV tensors (computed once by ``encode_prefix``) and avoids
+cloning them on every denoising step.  It implements the minimal subset of the
+HF ``DynamicCache`` interface used by ``PaliGemmaWithExpertModel.forward`` when
+``use_cache=False``, i.e. cross-attending to a prefix without updating it.
+When the backend is ``cudagraph`` and no sliding-window layers are present the
+regular ``clone_past_key_values`` copy is replaced by this object.  The saving
+is ~2–3 ms/chunk on A800 (18 layers × 2 tensors cloned × 10 steps avoided).
+For ``torchcompile`` the shim is disabled by default because it introduces a
+small extra numeric drift when combined with Inductor fusion; the
+``readonly_prefix_cache`` setting (``on`` / ``off`` / ``auto``) in
+:class:`~sparkrt.config.Pi05RuntimeConfig` controls this.
+"""
+from __future__ import annotations
+from typing import Any
+__all__ = ["ReadOnlyPrefixCache", "ReadOnlyPrefixCacheLayer"]
+class ReadOnlyPrefixCacheLayer:
+    """Minimal HF cache-layer shim for a single frozen prefix KV pair."""
+    is_sliding = False
+    is_compileable = False
+    is_initialized = True
+    def __init__(self, keys: Any, values: Any) -> None:
+        self.keys = keys
+        self.values = values
+    def update(
+        self, key_states: Any, value_states: Any, *args: Any, **kwargs: Any
+    ) -> tuple[Any, Any]:
+        import torch
+        return (
+            torch.cat([self.keys, key_states], dim=-2),
+            torch.cat([self.values, value_states], dim=-2),
+        )
+    def get_seq_length(self) -> int:
+        return int(self.keys.shape[-2])
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        return self.get_seq_length() + int(query_length), 0
+    def get_max_cache_shape(self) -> int:
+        return -1
+class ReadOnlyPrefixCache:
+    """Cache-like prefix view that avoids cloning immutable prefix KV tensors.
+    Implements the minimal subset of the HF DynamicCache protocol needed by
+    ``PaliGemmaWithExpertModel.forward`` when ``use_cache=False``.
+    :param layers: Tuple of ``(keys, values, sliding_window)`` triples as
+        returned by ``encode_prefix``; ``sliding_window`` is used only to
+        decide whether this optimisation is safe (requires all ``None``).
+    """
+    offloading = False
+    is_compileable = False
+    is_initialized = True
+    def __init__(self, layers: tuple[tuple[Any, Any, Any], ...]) -> None:
+        self.layers = [
+            ReadOnlyPrefixCacheLayer(keys, values) for keys, values, _ in layers
+        ]
+    def update(
+        self,
+        key_states: Any,
+        value_states: Any,
+        layer_idx: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> tuple[Any, Any]:
+        return self.layers[layer_idx].update(key_states, value_states, *args, **kwargs)
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        if layer_idx >= len(self.layers):
+            return 0
+        return self.layers[layer_idx].get_seq_length()
+    def get_mask_sizes(self, query_length: int, layer_idx: int = 0) -> tuple[int, int]:
+        if layer_idx >= len(self.layers):
+            return int(query_length), 0
+        return self.layers[layer_idx].get_mask_sizes(query_length)
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        if layer_idx >= len(self.layers):
+            return -1
+        return self.layers[layer_idx].get_max_cache_shape()
+    @property
+    def is_sliding(self) -> list[bool]:
+        return [False for _ in self.layers]
+    def __len__(self) -> int:
+        return len(self.layers)

sparkrt/adapters/act_adapter.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""ACT adapter - single-shot Action Chunking Transformer.
+ACT is the simple execution shape: one forward pass maps an observation to a
+full action chunk ``[B, chunk_size, action_dim]``; there is no sampling loop and
+no language prompt. This adapter declares a single ``"forward"`` region and
+replicates exactly the image-list assembly that ``ACTAgent.predict_action_chunk``
+does before calling the module.
+At inference the ACT forward is a fixed-shape, purely device-side computation
+(the VAE encoder is skipped in eval, so there is no host-side control flow or
+RNG): a ResNet backbone per camera plus a transformer encoder/decoder. That
+makes it an ideal single-graph capture target. The region therefore takes its
+inputs as *positional CUDA tensors* (``state`` then one tensor per camera) -
+the form a graph backend can record over static buffers - and reconstructs the
+tiny ``{state, images}`` dict the module indexes internally. Capture only kicks
+in for the simple ``state + images`` configuration; exotic feature sets
+(environment-state or DoF features) keep the eager dict path.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List
+from sparkmind.data.constants import OBS_IMAGES, OBS_STATE
+from sparkrt.core.adapter import Capabilities, ModelAdapter
+from sparkrt.core.region import Region
+class ACTAdapter(ModelAdapter):
+    """Adapter for :class:`sparkmind...act_model.ACTModel` via an ``ACTAgent``.
+    :param agent: A constructed SparkMind2 ``ACTAgent`` (provides ``.model``,
+        ``.cfg`` and ``.image_features``).
+    """
+    def __init__(self, agent: Any) -> None:
+        super().__init__()
+        self._agent = agent
+        self._model = agent.model
+        cfg = agent.cfg
+        # image_features is a dict {obs_key: feature}; we only need the keys,
+        # in order, to assemble the OBS_IMAGES list the module expects.
+        self._image_keys = list(getattr(agent, "image_features", {}) or {})
+        model = self._model
+        self._has_state = getattr(model, "robot_state_feature", None) is not None
+        has_env_state = getattr(model, "env_state_feature", None) is not None
+        uses_dof = bool(getattr(model, "use_dof_features", False))
+        # The positional/captured path covers the common ACT shape (robot state
+        # + camera images). Anything that pulls extra batch keys into the module
+        # forward stays on the eager dict path to preserve correctness.
+        self._capturable = (
+            bool(self._image_keys) and not has_env_state and not uses_dof
+        )
+        model_cfg = cfg.Model
+        action_dim = int(model_cfg.output_features["action"].shape[0])
+        state_dim = 0
+        if self._has_state:
+            state_feat = model_cfg.input_features.get(OBS_STATE)
+            if state_feat is not None:
+                state_dim = int(state_feat.shape[0])
+        ensemble_coeff = getattr(cfg.Trainer, "temporal_ensemble_coeff", None)
+        self._ensemble_coeff = ensemble_coeff
+        self._caps = Capabilities(
+            requires_prompt=False,
+            is_iterative=False,
+            num_inference_steps=1,
+            chunk_size=int(model_cfg.chunk_size),
+            n_action_steps=int(model_cfg.n_action_steps),
+            action_dim=action_dim,
+            supports_temporal_ensemble=ensemble_coeff is not None,
+            camera_keys=tuple(self._image_keys),
+            state_dim=state_dim,
+        )
+    @property
+    def capabilities(self) -> Capabilities:
+        return self._caps
+    def build_regions(self) -> Dict[str, Region]:
+        model = self._model
+        if not self._capturable:
+            # Eager fallback: the module indexes whatever keys it needs straight
+            # out of the full batch dict (env-state / DoF configurations).
+            def forward_dict(batch: Dict[str, Any]):
+                return model(batch)[0]
+            return {"forward": Region("forward", forward_dict, capturable=False)}
+        has_state = self._has_state
+        def forward(*tensors: Any):
+            # Positional layout: (state, img0, img1, ...) when a robot-state
+            # feature is present, else (img0, img1, ...). Rebuild the minimal
+            # dict the module forward indexes; ACTModel returns
+            # (actions, (mu, log_sigma)) and we want the actions.
+            if has_state:
+                batch = {OBS_STATE: tensors[0], OBS_IMAGES: list(tensors[1:])}
+            else:
+                batch = {OBS_IMAGES: list(tensors)}
+            return model(batch)[0]
+        # Fixed-shape backbone + transformer over static buffers -> capturable.
+        return {"forward": Region("forward", forward, capturable=True)}
+    def predict_chunk(self, ctx: Any, batch: Dict[str, Any], *, noise: Any = None):
+        if not self._capturable:
+            # Assemble the multi-camera image list exactly as the agent does.
+            batch = dict(batch)
+            batch[OBS_IMAGES] = [batch[key] for key in self._image_keys]
+            return self.region("forward")(batch)
+        # Pass the exact tensors the module needs, positionally, so a graph
+        # backend can capture/replay the region.
+        images = [batch[key] for key in self._image_keys]
+        args: List[Any] = [batch[OBS_STATE], *images] if self._has_state else images
+        return self.region("forward")(*args)
+    def make_ensembler(self) -> Any:
+        from sparkmind.learning.IL.models.act_model import ACTTemporalEnsembler
+        if self._ensemble_coeff is None:
+            raise NotImplementedError("temporal ensemble not enabled for this model")
+        return ACTTemporalEnsembler(self._ensemble_coeff, self._caps.chunk_size)

sparkrt/adapters/pi05_adapter.py ADDED Viewed

@@ -0,0 +1,381 @@
+"""Pi0.5 adapter - VLA with a flow-matching denoising loop.
+Pi0.5 is the latency-critical, *stateful-per-chunk* shape:
+    prepare images + language  ->  embed prefix (SigLIP + PaliGemma)  ->  KV cache
+                                    -> denoise loop (N steps over a 300M expert)
+                                    -> action chunk
+The adapter declares regions that mirror exactly what
+``PI05Pytorch.sample_actions`` does, but split at the natural seams:
+* ``encode_prefix`` (run once per chunk, **not** capturable): embeds the images
+  and language prompt and runs the PaliGemma prefill to build the KV cache. This
+  involves HF control flow / variable content and is not the hot path.
+* ``embed_suffix`` (run ``num_inference_steps`` times, **not** capturable):
+  embeds the noisy actions + timestep into the suffix tokens. It is cheap (a few
+  small projections) but builds its attention-mask constant via
+  ``torch.tensor([...], device=cuda)`` - a host->device copy that is illegal
+  during graph capture - so it stays eager.
+* ``denoise_step`` (run ``num_inference_steps`` times, **capturable**): the
+  expensive expert-transformer forward over the fixed-shape suffix against the
+  cached prefix, plus the action projection. This is the latency-critical inner
+  hot path and the CUDA-graph capture target. It mirrors the body of
+  ``PI05Pytorch.denoise_step`` *after* ``embed_suffix`` and reuses every
+  ``nn.Module`` (the expert, ``action_out_proj``); only the mask-assembly glue
+  is inlined, exactly as ``encode_prefix`` mirrors the prefill glue.
+The KV cache is passed to ``denoise_step`` as a *flat tuple of tensors* (so the
+graph backend can allocate one static buffer per tensor and replay across
+chunks); the per-layer ``sliding_window`` metadata - constant for the model - is
+held on the adapter and used to rebuild the ``DynamicCache`` inside the region.
+The denoise loop body (``x_t += dt * v_t``) stays in Python in ``predict_chunk``,
+identical to ``sample_actions``, which keeps numerical parity exact.
+Image/language preprocessing intentionally stays *outside* the regions (it has
+data-dependent shapes and control flow), so the captured hot path is a pure
+tensor-in/tensor-out graph a native core could reimplement 1:1.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+from sparkrt.adapters._pi05_kv_cache import ReadOnlyPrefixCache
+from sparkrt.config.runtime import Pi05RuntimeConfig
+from sparkrt.core.adapter import Capabilities, ModelAdapter
+from sparkrt.core.region import Region
+@dataclass
+class _Pi05Context:
+    """Cache container handed out by ``Pi05Adapter.new_context``.
+    Both mappings start empty and are filled on first use by
+    ``Pi05Adapter._get_schedule`` and ``Pi05Adapter._get_suffix_masks``.
+    """
+    # (str(device), batch size, step count, schedule name) -> (timesteps, dts)
+    schedules: dict[tuple[str, int, int, str], tuple[list[Any], list[float]]] = field(
+        default_factory=dict
+    )
+    # (str(device), batch size, chunk size, str(dtype)) -> (pad masks, att masks)
+    suffix_masks: dict[tuple[str, int, int, str], tuple[Any, Any]] = field(
+        default_factory=dict
+    )
+class Pi05Adapter(ModelAdapter):
+    """Adapter for ``PI05Pytorch`` via a SparkMind2 ``Pi05Agent``.
+    :param agent: A constructed SparkMind2 ``Pi05Agent`` (provides ``.model``,
+        ``.cfg`` and the ``prepare_images`` / ``_get_language_inputs`` helpers
+        used to build model inputs).
+    """
+    def __init__(self, agent: Any, config: Optional[Pi05RuntimeConfig] = None) -> None:
+        """Bind the adapter to ``agent`` and resolve its static runtime settings.
+        :param agent: Constructed ``Pi05Agent``; read here for ``_core_model()``,
+            ``cfg`` and (optionally) ``image_features``.
+        :param config: Runtime overrides. When ``None`` the configuration comes
+            from ``Pi05RuntimeConfig.from_env()``.
+        """
+        super().__init__()
+        self._agent = agent
+        self._model = agent._core_model()
+        cfg = agent.cfg
+        model_cfg = cfg.Model
+        self._action_dim = int(model_cfg.output_features["action"].shape[0])
+        self._chunk_size = int(model_cfg.chunk_size)
+        # Falls back to the real action dim when the config has no max_action_dim.
+        self._max_action_dim = int(getattr(model_cfg, "max_action_dim", self._action_dim))
+        # Camera keys (full ``observation.images.*`` keys, in order) and robot
+        # state dim, surfaced through Capabilities for the SDK observation layer.
+        input_features = model_cfg.input_features or {}
+        self._image_keys = list(getattr(agent, "image_features", []) or [])
+        from sparkmind.data.constants import OBS_STATE
+        state_feat = input_features.get(OBS_STATE)
+        # 0 means the config lists no robot-state input feature.
+        self._state_dim = int(state_feat.shape[0]) if state_feat is not None else 0
+        # Resolve runtime config: explicit > env vars > defaults.
+        self._rtcfg = config if config is not None else Pi05RuntimeConfig.from_env()
+        # Resolve num_steps: config value wins; None means read from checkpoint.
+        if self._rtcfg.num_steps is not None:
+            self._num_steps = self._rtcfg.num_steps
+        else:
+            # 10 is the fallback when the model config has no num_inference_steps.
+            self._num_steps = int(getattr(model_cfg, "num_inference_steps", 10))
+        # Per-layer KV-cache sliding-window metadata (constant for the model);
+        # learned lazily from the first prefix so denoise_step can rebuild the
+        # DynamicCache from a flat tensor tuple.
+        self._sliding_windows: Optional[List[Any]] = None
+        self._caps = Capabilities(
+            requires_prompt=True,
+            is_iterative=True,
+            num_inference_steps=self._num_steps,
+            chunk_size=self._chunk_size,
+            n_action_steps=int(model_cfg.n_action_steps),
+            action_dim=self._action_dim,
+            supports_temporal_ensemble=False,
+            camera_keys=tuple(self._image_keys),
+            state_dim=self._state_dim,
+        )
+    @property
+    def capabilities(self) -> Capabilities:
+        """Return the ``Capabilities`` record assembled in ``__init__``."""
+        return self._caps
+    def new_context(self) -> _Pi05Context:
+        """Return a fresh ``_Pi05Context`` with empty schedule and mask caches."""
+        return _Pi05Context()
+    def _get_schedule(
+        self,
+        ctx: _Pi05Context,
+        device: Any,
+        batch_size: int,
+    ) -> tuple[list[Any], list[float]]:
+        import torch
+        schedule = self._rtcfg.schedule
+        key = (str(device), int(batch_size), self._num_steps, schedule)
+        cached = ctx.schedules.get(key)
+        if cached is not None:
+            return cached
+        if schedule == "uniform":
+            power = 1.0
+        elif schedule == "power1.5":
+            power = 1.5
+        else:
+            power = 2.0
+        t_nodes = [
+            ((self._num_steps - idx) / self._num_steps) ** power
+            for idx in range(self._num_steps + 1)
+        ]
+        timesteps = [
+            torch.tensor(t_nodes[idx], dtype=torch.float32, device=device).expand(batch_size)
+            for idx in range(self._num_steps)
+        ]
+        dts = [t_nodes[idx + 1] - t_nodes[idx] for idx in range(self._num_steps)]
+        cached = (timesteps, dts)
+        ctx.schedules[key] = cached
+        return cached
+    def _get_suffix_masks(
+        self,
+        ctx: _Pi05Context,
+        device: Any,
+        batch_size: int,
+        dtype: Any,
+    ) -> tuple[Any, Any]:
+        import torch
+        key = (str(device), int(batch_size), self._chunk_size, str(dtype))
+        cached = ctx.suffix_masks.get(key)
+        if cached is not None:
+            return cached
+        suffix_pad_masks = torch.ones(
+            batch_size,
+            self._chunk_size,
+            dtype=torch.bool,
+            device=device,
+        )
+        suffix_att_mask_1d = torch.zeros(self._chunk_size, dtype=dtype, device=device)
+        suffix_att_mask_1d[0] = 1
+        suffix_att_masks = suffix_att_mask_1d[None, :].expand(batch_size, self._chunk_size)
+        cached = (suffix_pad_masks, suffix_att_masks)
+        ctx.suffix_masks[key] = cached
+        return cached
+    def build_regions(self) -> Dict[str, Region]:
+        """Build the three Pi0.5 regions as closures over the wrapped model.
+        ``attn_impl`` and the read-only-prefix-cache decision are resolved once,
+        when this method runs, and are captured by the closures below.
+        :returns: Mapping with ``encode_prefix`` and ``embed_suffix`` (both
+            ``capturable=False``) and ``denoise_step`` (``capturable=True``,
+            ``invariant_from=2``).
+        """
+        import torch
+        import torch.nn.functional as F
+        from sparkmind.learning.VLA.models.pi05_model import (
+            clone_past_key_values,
+            create_sinusoidal_pos_embedding,
+            make_att_2d_masks,
+        )
+        model = self._model
+        chunk_size = self._chunk_size
+        attn_impl = self._rtcfg.attn_impl
+        use_readonly_prefix_cache = self._resolve_readonly_prefix_cache()
+        def encode_prefix(images, img_masks, tokens, masks):
+            # Mirrors the prefix section of PI05Pytorch.sample_actions exactly.
+            prefix_embs, prefix_pad_masks, prefix_att_masks = model.embed_prefix(
+                images, img_masks, tokens, masks
+            )
+            prefix_att_2d_masks = make_att_2d_masks(prefix_pad_masks, prefix_att_masks)
+            # Position of each token = number of non-padded tokens before it.
+            prefix_position_ids = prefix_pad_masks.cumsum(dim=1) - 1
+            prefix_att_2d_masks_4d = model._prepare_attention_masks_4d(prefix_att_2d_masks)
+            language_model = model.paligemma_with_expert.paligemma.model.language_model
+            language_model.config._attn_implementation = attn_impl  # noqa: SLF001
+            if attn_impl == "sdpa":
+                # HF's SDPA path uses the 4D mask as an additive bias and
+                # requires it to match the query dtype (bf16); the eager path
+                # tolerated the float32 mask. Cast to the attention weight dtype.
+                prefix_att_2d_masks_4d = prefix_att_2d_masks_4d.to(
+                    language_model.layers[0].self_attn.q_proj.weight.dtype
+                )
+            # Prefill only the prefix stream (second embeds slot is None) and
+            # keep the resulting KV cache; the forward's first output is unused.
+            _, past_key_values = model.paligemma_with_expert.forward(
+                attention_mask=prefix_att_2d_masks_4d,
+                position_ids=prefix_position_ids,
+                past_key_values=None,
+                inputs_embeds=[prefix_embs, None],
+                use_cache=True,
+            )
+            return prefix_pad_masks, past_key_values
+        def embed_suffix(x_t, timestep):
+            # Mirrors PI05Pytorch.embed_suffix's numeric path, but leaves the
+            # fixed suffix masks to the adapter context so they can be reused
+            # across denoising steps and chunks.
+            time_emb = create_sinusoidal_pos_embedding(
+                timestep,
+                model.action_in_proj.out_features,
+                min_period=model.config.min_period,
+                max_period=model.config.max_period,
+                device=timestep.device,
+            )
+            time_emb = time_emb.type(dtype=timestep.dtype)
+            action_emb = model.action_in_proj(x_t)
+            # Two-layer time MLP with SiLU; the second SiLU output is returned
+            # as the conditioning tensor.
+            time_emb = model.time_mlp_in(time_emb)
+            time_emb = F.silu(time_emb)
+            time_emb = model.time_mlp_out(time_emb)
+            adarms_cond = F.silu(time_emb)
+            return action_emb, adarms_cond
+        def denoise_step(
+            suffix_embs,
+            adarms_cond,
+            prefix_pad_masks,
+            suffix_pad_masks,
+            suffix_att_masks,
+            *kv_tensors,
+        ):
+            # Mirrors PI05Pytorch.denoise_step *after* embed_suffix: assemble the
+            # full attention mask + position ids, run the expert against the
+            # (cloned) cached prefix, and project to a velocity. All ops are
+            # device-side and fixed-shape, so this body is graph-capturable.
+            #
+            # Signature ordering matters for the graph backend: the two inputs
+            # that change every denoising step (``suffix_embs``, ``adarms_cond``)
+            # come first; everything after them - the prefix/suffix masks and the
+            # KV cache - is constant across the loop, so ``invariant_from=2`` lets
+            # the backend skip re-copying the (large) KV tensors on each replay.
+            # ``_sliding_windows`` is read when the region runs; predict_chunk
+            # assigns it before the first denoise call.
+            sliding = self._sliding_windows or []
+            # kv_tensors is laid out as (keys0, values0, keys1, values1, ...).
+            layers = tuple(
+                (kv_tensors[2 * i], kv_tensors[2 * i + 1], sliding[i])
+                for i in range(len(sliding))
+            )
+            can_use_readonly_prefix_cache = use_readonly_prefix_cache and all(
+                sliding_window is None for sliding_window in sliding
+            )
+            if can_use_readonly_prefix_cache:
+                past_key_values = ReadOnlyPrefixCache(layers)
+            else:
+                past_key_values = clone_past_key_values(layers)
+            suffix_len = suffix_pad_masks.shape[1]
+            batch_size = prefix_pad_masks.shape[0]
+            prefix_len = prefix_pad_masks.shape[1]
+            # Broadcast the prefix pad mask to one row per suffix position.
+            prefix_pad_2d_masks = prefix_pad_masks[:, None, :].expand(
+                batch_size, suffix_len, prefix_len
+            )
+            suffix_att_2d_masks = make_att_2d_masks(suffix_pad_masks, suffix_att_masks)
+            full_att_2d_masks = torch.cat([prefix_pad_2d_masks, suffix_att_2d_masks], dim=2)
+            # Suffix positions continue after the count of non-padded prefix tokens.
+            prefix_offsets = torch.sum(prefix_pad_masks, dim=-1)[:, None]
+            position_ids = prefix_offsets + torch.cumsum(suffix_pad_masks, dim=1) - 1
+            full_att_2d_masks_4d = model._prepare_attention_masks_4d(full_att_2d_masks)
+            gemma_expert = model.paligemma_with_expert.gemma_expert.model
+            gemma_expert.config._attn_implementation = attn_impl  # noqa: SLF001
+            if attn_impl == "sdpa":
+                # Same dtype requirement as in encode_prefix: match the mask
+                # to the expert's attention weight dtype.
+                full_att_2d_masks_4d = full_att_2d_masks_4d.to(
+                    gemma_expert.layers[0].self_attn.q_proj.weight.dtype
+                )
+            outputs_embeds, _ = model.paligemma_with_expert.forward(
+                attention_mask=full_att_2d_masks_4d,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=[None, suffix_embs],
+                use_cache=False,
+                adarms_cond=[None, adarms_cond],
+            )
+            # Keep the last chunk_size positions of the expert output and
+            # project them in float32.
+            suffix_out = outputs_embeds[1][:, -chunk_size:].to(dtype=torch.float32)
+            return model.action_out_proj(suffix_out)
+        return {
+            # Prefill: HF control flow + variable content -> eager only.
+            "encode_prefix": Region("encode_prefix", encode_prefix, capturable=False),
+            # Suffix embedding: cheap eager region; fixed suffix masks are
+            # cached separately in the adapter context.
+            "embed_suffix": Region("embed_suffix", embed_suffix, capturable=False),
+            # Expert forward + projection: fixed-shape hot path -> captured.
+            # Inputs 2.. (masks + KV cache) are loop-invariant within a chunk.
+            "denoise_step": Region(
+                "denoise_step", denoise_step, capturable=True, invariant_from=2
+            ),
+        }
+    def _resolve_readonly_prefix_cache(self) -> bool:
+        mode = self._rtcfg.readonly_prefix_cache.lower()
+        if mode in {"1", "true", "yes", "on"}:
+            return True
+        if mode in {"0", "false", "no", "off"}:
+            return False
+        # auto: use read-only cache for cudagraph (where it was validated),
+        # but not for torchcompile (adds numeric drift when combined).
+        backend_name = getattr(self._backend, "name", "")
+        return backend_name == "cudagraph"
+    def predict_chunk(self, ctx: Any, batch: Dict[str, Any], *, noise: Any = None):
+        """Produce one action chunk by running the prefill and the denoise loop.
+        :param ctx: Adapter context; anything that is not a ``_Pi05Context`` is
+            replaced by a fresh one for this call.
+        :param batch: Observation batch handed to the agent's
+            ``prepare_images`` and ``_get_language_inputs`` helpers.
+        :param noise: Optional initial ``x_t``. When ``None`` it is drawn via
+            ``model.sample_noise`` with shape
+            ``(batch_size, chunk_size, max_action_dim)``.
+        :returns: ``x_t`` after the last step, sliced to the first
+            ``action_dim`` entries of the last axis.
+        """
+        import torch
+        model = self._model
+        # Build model inputs using the agent's own (parity-exact) helpers; this
+        # is preprocessing and stays outside the region bodies.
+        images, img_masks = self._agent.prepare_images(batch)
+        # Batch size and device are taken from the first prepared image.
+        batch_size = images[0].shape[0]
+        device = images[0].device
+        tokens, masks = self._agent._get_language_inputs(batch, batch_size, device)
+        # Sample noise *before* the prefill, exactly as sample_actions does, so
+        # seeded parity matches bit-for-bit (the prefill consumes no RNG).
+        if noise is None:
+            actions_shape = (batch_size, self._chunk_size, self._max_action_dim)
+            noise = model.sample_noise(actions_shape, device)
+        prefix_pad_masks, past_key_values = self.region("encode_prefix")(
+            images, img_masks, tokens, masks
+        )
+        # Flatten the KV cache into positional tensors for the capturable region
+        # and refresh the per-layer sliding windows (expected to be constant for
+        # the model); they are reassigned on every call.
+        kv_tensors: List[Any] = []
+        sliding_windows: List[Any] = []
+        for keys, values, sliding_window in past_key_values:
+            kv_tensors.append(keys)
+            kv_tensors.append(values)
+            sliding_windows.append(sliding_window)
+        self._sliding_windows = sliding_windows
+        # Without a reusable context the schedule and masks are rebuilt per call.
+        if not isinstance(ctx, _Pi05Context):
+            ctx = self.new_context()
+        embed_suffix = self.region("embed_suffix")
+        denoise = self.region("denoise_step")
+        x_t = noise
+        timesteps, dts = self._get_schedule(ctx, device, batch_size)
+        for timestep, dt in zip(timesteps, dts):
+            suffix_embs, adarms_cond = embed_suffix(x_t, timestep)
+            # Cached lookup after the first step; keyed on the embedding dtype.
+            suffix_pad_masks, suffix_att_masks = self._get_suffix_masks(
+                ctx,
+                device,
+                batch_size,
+                suffix_embs.dtype,
+            )
+            v_t = denoise(
+                suffix_embs,
+                adarms_cond,
+                prefix_pad_masks,
+                suffix_pad_masks,
+                suffix_att_masks,
+                *kv_tensors,
+            )
+            # Explicit Euler update; dt is the (signed) gap between time nodes.
+            x_t = x_t + dt * v_t
+        # Drop the padded action dimensions beyond the model's real action_dim.
+        return x_t[:, :, : self._action_dim]