PyPI - late-interaction-kernels - Versions diffs - 0.0.1__py3-none-any.whl - Mend

late-interaction-kernels 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

late_interaction_kernels/__init__.py +206 -0
late_interaction_kernels/_autotune.py +129 -0
late_interaction_kernels/_utils.py +50 -0
late_interaction_kernels/autograd.py +244 -0
late_interaction_kernels/backward.py +274 -0
late_interaction_kernels/backward_csr.py +202 -0
late_interaction_kernels/backward_unified.py +277 -0
late_interaction_kernels/experimental/__init__.py +42 -0
late_interaction_kernels/forward.py +273 -0
late_interaction_kernels/fp8.py +415 -0
late_interaction_kernels/fused_head.py +479 -0
late_interaction_kernels/matryoshka.py +253 -0
late_interaction_kernels/plaid.py +979 -0
late_interaction_kernels/py.typed +0 -0
late_interaction_kernels/pylate_compat.py +213 -0
late_interaction_kernels/reference.py +373 -0
late_interaction_kernels/retrieve.py +272 -0
late_interaction_kernels/scatter.py +202 -0
late_interaction_kernels/smooth.py +639 -0
late_interaction_kernels/soft.py +328 -0
late_interaction_kernels/topk.py +91 -0
late_interaction_kernels/varlen.py +490 -0
late_interaction_kernels/xtr.py +87 -0
late_interaction_kernels-0.0.1.dist-info/METADATA +252 -0
late_interaction_kernels-0.0.1.dist-info/RECORD +27 -0
late_interaction_kernels-0.0.1.dist-info/WHEEL +4 -0
late_interaction_kernels-0.0.1.dist-info/licenses/LICENSE +201 -0

late_interaction_kernels/__init__.py ADDED Viewed

@@ -0,0 +1,206 @@
+"""Fused Triton kernels for late-interaction (MaxSim) scoring.
+Common entry points::
+    from late_interaction_kernels import patch_pylate, MaxSimScorer, retrieve
+    patch_pylate()                         # PyLate drop-in
+    scorer = MaxSimScorer(normalize=True)  # nn.Module, autograd-aware
+    scores, idx = retrieve(Q, D, top_k=100)
+See the README for the full API and benchmarks.
+FP8 helpers live in ``late_interaction_kernels.fp8``.
+Research kernels live in ``late_interaction_kernels.experimental``.
+"""
+from importlib.metadata import PackageNotFoundError
+from importlib.metadata import version as _pkg_version
+# Expose the installed distribution's version; fall back to a placeholder
+# string when no package metadata can be found.
+try:
+    __version__ = _pkg_version("late-interaction-kernels")
+except PackageNotFoundError:  # pragma: no cover — running from a source tree without install
+    __version__ = "0.0.0+unknown"
+# The kernels need Triton (Linux + CUDA). On macOS / Windows we still want
+# `import late_interaction_kernels` to succeed so users can develop against
+# the pure-PyTorch reference and `MaxSimScorer` / `retrieve` fallbacks.
+try:
+    import triton  # noqa: F401
+    _HAS_TRITON = True
+except ImportError:  # pragma: no cover
+    _HAS_TRITON = False
+if _HAS_TRITON:
+    # Triton imported fine: re-export the real kernel entry points.
+    from .autograd import (
+        get_backward_method,
+        maxsim,
+        maxsim_inference,
+        set_backward_method,
+    )
+    from .fp8 import maxsim_inference_fp8
+    from .fused_head import maxsim_from_hidden, maxsim_from_hidden_train
+    from .plaid import (
+        maxsim_residual,
+        maxsim_residual_varlen,
+        plaid_approx_score,
+    )
+    from .pylate_compat import patch_pylate, unpatch_pylate
+    from .scatter import maxsim_inference_scatter
+    from .varlen import maxsim_varlen
+else:  # pragma: no cover
+    # One stub is bound to every kernel name below, so the names still import
+    # without Triton; calling any of them raises the RuntimeError.
+    def _needs_triton(*_args, **_kwargs):  # type: ignore[no-redef]
+        raise RuntimeError(
+            "late-interaction-kernels GPU kernels require Triton, which isn't "
+            "installed on this platform. Install a CUDA-enabled Triton (Linux only) "
+            "or use `late_interaction_kernels.reference` for the pure-PyTorch path."
+        )
+    # Same set of names as the Triton branch imports above.
+    maxsim = maxsim_inference = _needs_triton
+    maxsim_from_hidden = maxsim_from_hidden_train = _needs_triton
+    maxsim_inference_fp8 = _needs_triton
+    maxsim_varlen = _needs_triton
+    plaid_approx_score = _needs_triton
+    maxsim_residual = maxsim_residual_varlen = _needs_triton
+    maxsim_inference_scatter = _needs_triton
+    set_backward_method = get_backward_method = _needs_triton
+    patch_pylate = unpatch_pylate = _needs_triton
+# `MaxSimScorer` and `retrieve` are always importable: they fall back to the
+# pure-PyTorch reference on platforms without Triton, so training and
+# retrieval code can be unit-tested locally.
+from . import reference  # noqa: E402,F401
+from .retrieve import MaxSimScorer, retrieve  # noqa: E402
+# Legacy top-level names, mapped to the module that now owns them. They stay
+# reachable through the module-level `__getattr__`, which emits a
+# `DeprecationWarning`. Scheduled for removal in a future release.
+_DEPRECATED_EXPERIMENTAL = dict.fromkeys(
+    (
+        "maxsim_matryoshka",
+        "maxsim_xtr",
+        "soft_maxsim",
+        "smooth_maxsim",
+    ),
+    "late_interaction_kernels.experimental",
+)
+_DEPRECATED_FP8_HELPERS = dict.fromkeys(
+    (
+        "quantize_fp8_per_tensor",
+        "quantize_fp8_per_token",
+        "dequantize_fp8_per_tensor",
+        "dequantize_fp8_per_token",
+    ),
+    "late_interaction_kernels.fp8",
+)
+def __getattr__(name: str):
+    """Lazily resolve legacy top-level names (PEP 562 module ``__getattr__``).
+    A recognised name first emits a ``DeprecationWarning`` attributed to the
+    calling code, then resolves to the real object when Triton is available
+    or to the ``_needs_triton`` stub otherwise. Any other name raises
+    ``AttributeError``.
+    """
+    import warnings
+    # Phase 1: pick the deprecation notice, or reject unknown names outright.
+    if name == "maxsim_forward":
+        notice = (
+            "`late_interaction_kernels.maxsim_forward` is deprecated. Use "
+            "`maxsim_inference` for reranking, `maxsim` for gradients, or "
+            "import the primitive from `late_interaction_kernels.forward`."
+        )
+    elif name == "maxsim_topk":
+        notice = (
+            "`maxsim_topk` is deprecated; use `retrieve(Q, D, top_k=...)` "
+            "(same semantics, transparent CPU fallback). Still importable from "
+            "`late_interaction_kernels.topk`."
+        )
+    elif name == "maxsim_residual_inference":
+        notice = (
+            "`maxsim_residual_inference` is deprecated; `maxsim_residual` "
+            "auto-skips the argmax save when `Q.requires_grad=False`."
+        )
+    elif name == "maxsim_varlen_inference":
+        notice = (
+            "`maxsim_varlen_inference` is deprecated; `maxsim_varlen` "
+            "auto-skips the argmax save when neither input requires grad."
+        )
+    else:
+        owner = _DEPRECATED_EXPERIMENTAL.get(name, _DEPRECATED_FP8_HELPERS.get(name))
+        if owner is None:
+            raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+        notice = f"`late_interaction_kernels.{name}` moved to `{owner}`. Use `from {owner} import {name}`."
+    # Warn from this frame so that stacklevel=2 points at the code doing the lookup.
+    warnings.warn(notice, DeprecationWarning, stacklevel=2)
+    # Phase 2: hand back the stub without Triton, the real object otherwise.
+    if not _HAS_TRITON:
+        return _needs_triton
+    if name == "maxsim_forward":
+        from .forward import maxsim_forward as target
+    elif name == "maxsim_topk":
+        from .topk import maxsim_topk as target
+    elif name == "maxsim_residual_inference":
+        from .plaid import maxsim_residual_inference as target
+    elif name == "maxsim_varlen_inference":
+        from .varlen import maxsim_varlen_inference as target
+    elif name in _DEPRECATED_EXPERIMENTAL:
+        from . import experimental
+        target = getattr(experimental, name)
+    else:
+        from . import fp8
+        target = getattr(fp8, name)
+    return target
+__all__ = [
+    "__version__",
+    # high-level
+    "MaxSimScorer",
+    "retrieve",
+    "patch_pylate",
+    "unpatch_pylate",
+    # core MaxSim
+    "maxsim",
+    "maxsim_inference",
+    "maxsim_varlen",
+    # reranking on packed batches
+    "maxsim_inference_scatter",
+    # fused D-side head
+    "maxsim_from_hidden",
+    "maxsim_from_hidden_train",
+    # PLAID / ColBERTv2
+    "plaid_approx_score",
+    "maxsim_residual",
+    "maxsim_residual_varlen",
+    # FP8 inference
+    "maxsim_inference_fp8",
+    # configuration
+    "set_backward_method",
+    "get_backward_method",
+    "reference",
+]

late_interaction_kernels/_autotune.py ADDED Viewed

@@ -0,0 +1,129 @@
+"""Autotune configs per GPU family.
+Triton autotune runs each candidate once on the first call for a given key,
+caches the winner, and reuses it forever after. We keep lists short — each
+extra config costs one real launch.
+Family rules of thumb (verified on H100 / A100 and conservative on the rest):
+- Small ``d`` (≤ 128): prefer `BLOCK_Q=32-64, BLOCK_D=64-128`.
+- Large ``d`` (≥ 512): shrink blocks so the fp16 `Q`/`D` tiles plus the fp32
+  `S` tile fit in the SM's shared-memory budget.
+- Hopper loves `num_stages ≥ 3` (warp specialization + async copy).
+- Ampere / Ada are happiest with `num_stages=2`.
+Per-family SRAM budgets (KiB of shared memory the kernel can actually use):
+- Hopper (H100 / H200):       228
+- Ampere (A100):              164
+- Ampere consumer (3090, A10): 100
+- Ada (L4, L40, RTX 4090):    100
+- Unknown / older:             48 (safe floor)
+"""
+from __future__ import annotations
+import inspect
+import triton
+from ._utils import detect_gpu
+# Warp specialization (FA-3 style) requires Triton 3.2+. The
+# ``num_consumer_groups`` / ``num_buffers_warp_spec`` kwargs on
+# ``triton.Config`` opt a kernel into producer-consumer warp specialization
+# so loads overlap cleanly with ``tl.dot``. We feature-detect so we still
+# run on older Triton.
+# Collect the parameter names accepted by ``triton.Config``; an empty set is
+# used when the signature cannot be introspected.
+try:
+    _CFG_PARAMS = set(inspect.signature(triton.Config).parameters)
+except (TypeError, ValueError):  # pragma: no cover
+    _CFG_PARAMS = set()
+# True only when *both* warp-specialization kwargs are accepted by ``triton.Config``.
+_HAS_WARP_SPEC = {"num_consumer_groups", "num_buffers_warp_spec"} <= _CFG_PARAMS
+def _cfg(kwargs, *, num_warps, num_stages, warp_spec=False):
+    """Create a ``triton.Config`` for the meta-parameters in ``kwargs``.
+    When ``warp_spec`` is requested *and* the installed Triton accepts the
+    warp-specialization kwargs, the config also carries
+    ``num_consumer_groups=2`` and ``num_buffers_warp_spec=num_stages``;
+    otherwise the request is silently ignored.
+    """
+    options = {"num_warps": num_warps, "num_stages": num_stages}
+    if _HAS_WARP_SPEC and warp_spec:
+        options.update(num_consumer_groups=2, num_buffers_warp_spec=num_stages)
+    return triton.Config(kwargs, **options)
+def _small_d_hopper():
+    """Return the Hopper shortlist: six plain configs, then two warp-specialized ones."""
+    # Each row is (BLOCK_Q, BLOCK_D, num_warps, num_stages).
+    plain_rows = (
+        (32, 64, 4, 3),
+        (32, 128, 8, 3),
+        (64, 64, 4, 3),
+        (64, 128, 8, 3),
+        (128, 64, 8, 2),
+        (128, 128, 8, 2),
+    )
+    # Warp-specialized shortlist: producer warp group streams Q/D tiles into
+    # shared memory while consumer group(s) run back-to-back ``tl.dot`` +
+    # running-max. When the installed Triton lacks the warp-spec kwargs,
+    # `_cfg` builds these as ordinary configs.
+    warp_spec_rows = (
+        (64, 128, 8, 3),
+        (128, 128, 8, 3),
+    )
+    shortlist = [
+        _cfg({"BLOCK_Q": block_q, "BLOCK_D": block_d}, num_warps=warps, num_stages=stages)
+        for block_q, block_d, warps, stages in plain_rows
+    ]
+    shortlist += [
+        _cfg({"BLOCK_Q": block_q, "BLOCK_D": block_d}, num_warps=warps, num_stages=stages, warp_spec=True)
+        for block_q, block_d, warps, stages in warp_spec_rows
+    ]
+    return shortlist
+def _small_d_ampere():
+    """Return the shortlist used for the 'a100', 'ampere' and 'ada' families.
+    Described upstream as working on A100, A10, A40, 3090 and as a safe
+    default for Ada (L4, L40, 4090).
+    """
+    # Each row is (BLOCK_Q, BLOCK_D, num_warps, num_stages).
+    rows = (
+        (32, 64, 4, 2),
+        (32, 128, 8, 2),
+        (64, 64, 4, 2),
+        (64, 128, 8, 2),
+        (128, 64, 8, 1),
+    )
+    return [
+        _cfg({"BLOCK_Q": block_q, "BLOCK_D": block_d}, num_warps=warps, num_stages=stages)
+        for block_q, block_d, warps, stages in rows
+    ]
+def _large_d_configs():
+    """Return the small-block shortlist (intended for d ≥ 512); always part of `forward_configs`."""
+    # Each row is (BLOCK_Q, BLOCK_D, num_warps); every config uses num_stages=2.
+    rows = (
+        (16, 16, 2),
+        (16, 32, 2),
+        (32, 16, 2),
+        (32, 32, 4),
+        (32, 64, 4),
+    )
+    return [
+        _cfg({"BLOCK_Q": block_q, "BLOCK_D": block_d}, num_warps=warps, num_stages=2)
+        for block_q, block_d, warps in rows
+    ]
+# Shared-memory budget in KiB per GPU family. Keys are the family strings
+# returned by `detect_gpu`; `prune_forward` subtracts 8 KiB from the value
+# before comparing it with a config's tile footprint.
+_SRAM_KIB_BY_FAMILY = {
+    "hopper": 228,
+    "a100": 164,
+    "ampere": 100,
+    "ada": 100,
+    "generic": 48,
+}
+def forward_configs():
+    """Return the autotune candidate list for the detected GPU family.
+    The small-block configs always come first; Hopper and the
+    Ampere / Ada families append their own small-``d`` shortlist. Unknown
+    GPUs get only the small-block configs.
+    """
+    family = detect_gpu()
+    candidates = _large_d_configs()
+    if family == "hopper":
+        candidates += _small_d_hopper()
+    elif family in ("a100", "ampere", "ada"):
+        candidates += _small_d_ampere()
+    return candidates
+def prune_forward(configs, named_args, **kwargs):
+    """Filter out configs that overflow shared memory or are oversized for the problem.
+    Args:
+        configs: candidate configs; each exposes ``kwargs["BLOCK_Q"]`` and ``kwargs["BLOCK_D"]``.
+        named_args: mapping read for ``"Lq"`` (default 32) and ``"d"`` (default 128).
+        **kwargs: accepted and ignored.
+    Returns:
+        The surviving configs, or the first two input configs when none survive.
+    """
+    q_len = named_args.get("Lq", 32)
+    dim = named_args.get("d", 128)
+    # Reserve 8 KiB for Triton scratch; the rest is ours.
+    budget_bytes = (_SRAM_KIB_BY_FAMILY.get(detect_gpu(), 48) - 8) * 1024
+    def _acceptable(cfg):
+        block_q = cfg.kwargs["BLOCK_Q"]
+        block_d = cfg.kwargs["BLOCK_D"]
+        # Two-byte (fp16/bf16) Q and D tiles plus a four-byte (fp32) S tile.
+        tile_bytes = (block_q * dim + block_d * dim) * 2 + block_q * block_d * 4
+        return tile_bytes <= budget_bytes and block_q <= 2 * q_len
+    survivors = [cfg for cfg in configs if _acceptable(cfg)]
+    # Never hand autotune an empty list: if everything was pruned, fall back
+    # to the first two candidates. A non-empty result may hold a single config.
+    return survivors if survivors else configs[:2]

late_interaction_kernels/_utils.py ADDED Viewed

@@ -0,0 +1,50 @@
+"""Small helpers shared across kernels."""
+from __future__ import annotations
+import functools
+import torch
+def next_pow2(x: int) -> int:
+    """Return the smallest power of two that is >= ``x``; any ``x <= 1`` yields 1."""
+    # (x - 1).bit_length() is the exponent of the next power of two for x > 1.
+    return 1 if x <= 1 else 1 << (x - 1).bit_length()
+@functools.lru_cache(maxsize=1)
+def detect_gpu() -> str:
+    """Classify the current CUDA device by name.
+    Returns one of 'hopper' | 'a100' | 'ada' | 'ampere' | 'generic'.
+    'generic' covers both "no CUDA available" and unrecognised device names.
+    The result is cached after the first call.
+    """
+    if not torch.cuda.is_available():
+        return "generic"
+    device_name = torch.cuda.get_device_name().lower()
+    # Checked top to bottom; "a100" must come before the Ampere marker "a10",
+    # which is a substring of it.
+    family_markers = (
+        ("hopper", ("h100", "h200")),
+        ("a100", ("a100",)),
+        ("ada", ("l4", "l40", "rtx 40")),
+        ("ampere", ("3090", "a10", "a40")),
+    )
+    for family, markers in family_markers:
+        if any(marker in device_name for marker in markers):
+            return family
+    return "generic"
+def ensure_contiguous_last(x: torch.Tensor) -> torch.Tensor:
+    """Return ``x`` unchanged when its last dim has stride 1, else a contiguous copy."""
+    return x if x.stride(-1) == 1 else x.contiguous()
+def pick_compute_dtype(Q: torch.Tensor, D: torch.Tensor) -> torch.dtype:
+    """Choose the tile dtype used for `tl.dot`.
+    Returns bf16 as soon as either input is bf16 and fp16 in every other
+    case, fp32 inputs included. The original rationale for not keeping
+    fp32: fp32 GEMM doesn't go through tensor cores on H100 anyway.
+    """
+    return torch.bfloat16 if torch.bfloat16 in (Q.dtype, D.dtype) else torch.float16

late_interaction_kernels/autograd.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""User-facing autograd wrapper for the fused MaxSim kernel."""
+from __future__ import annotations
+import os
+import warnings
+import torch
+from .backward import maxsim_backward
+from .backward_unified import maxsim_backward_unified
+from .forward import _run_forward, maxsim_forward
+_BACKWARD_METHOD = "auto"  # module-level toggle, flipped by `set_backward_method`
+# Accepted values for `set_backward_method` and for the `backward=` kwarg of `maxsim`.
+_VALID_METHODS = ("auto", "atomic", "csr", "unified")
+# One-shot flag so we don't spam the user's logs if they happen to pass
+# unnormalized inputs inside a tight training loop.
+_WARNED_UNNORMALIZED = False
+def set_backward_method(method: str) -> None:
+    """Choose the process-wide default backward strategy.
+    The per-call ``backward=`` kwarg on :func:`maxsim` takes precedence over
+    this global, which is kept for back-compat and for pinning one method
+    across a benchmark run.
+    Accepted values:
+    * ``"auto"`` — resolves to ``"csr"`` when ``Nq ≥ 256 ∧ Nd ≥ 256 ∧ Lq ≤ 64``
+      and to ``"unified"`` otherwise.
+    * ``"unified"`` — single-pass fused ``grad_Q + grad_D`` kernel.
+    * ``"csr"`` — scatter-free bucketed reduction (documented upstream as
+      bitwise-deterministic).
+    * ``"atomic"`` — legacy two-pass with fp32 ``tl.atomic_add``.
+    Raises:
+        ValueError: if ``method`` is not one of the accepted values.
+    """
+    global _BACKWARD_METHOD
+    if method in _VALID_METHODS:
+        _BACKWARD_METHOD = method
+        return
+    raise ValueError(f"method must be one of {_VALID_METHODS}, got {method!r}")
+def get_backward_method() -> str:
+    """Return the current process-wide backward method (see `set_backward_method`)."""
+    return _BACKWARD_METHOD
+def _maybe_warn_unnormalized(Q: torch.Tensor) -> None:
+    """Emit a one-time ``UserWarning`` when Q does not look L2-normalized.
+    Called on the ``normalize=False`` paths. The check takes the median L2
+    norm of up to the first 64 token vectors of Q and warns when it falls
+    outside ``[0.9, 1.1]``. Nothing happens once a warning has been issued
+    or when ``LIK_SUPPRESS_NORM_WARN=1`` is set in the environment.
+    """
+    global _WARNED_UNNORMALIZED
+    if _WARNED_UNNORMALIZED:
+        return
+    if os.environ.get("LIK_SUPPRESS_NORM_WARN", "0") == "1":
+        return
+    with torch.no_grad():
+        tokens = Q.detach()
+        # Collapse the leading dims and keep at most the first 64 token rows.
+        head = tokens.reshape(-1, tokens.shape[-1])[:64]
+        if head.numel() == 0:
+            return
+        # `.item()` pulls the scalar back to the host on every call that gets this far.
+        median_norm = head.float().norm(dim=-1).median().item()
+    if 0.9 <= median_norm <= 1.1:
+        return
+    _WARNED_UNNORMALIZED = True
+    warnings.warn(
+        f"late-interaction-kernels: `maxsim(..., normalize=False)` but Q's median L2 norm "
+        f"is {median_norm:.3f} (ColBERT-style models expect ≈1.0). Pass `normalize=True` to fuse "
+        "the L2-norm into the kernel, or pre-normalize with `F.normalize(Q, dim=-1)`. "
+        "Silence with `LIK_SUPPRESS_NORM_WARN=1`.",
+        UserWarning,
+        stacklevel=3,
+    )
+class _MaxSimFn(torch.autograd.Function):
+    """Fused MaxSim with saved argmax, 3-D inputs.
+    ``forward`` runs the fused kernel with ``save_argmax=True`` and stashes
+    Q, D, the argmax and both masks; ``backward`` turns ``grad_scores`` into
+    ``grad_Q`` / ``grad_D`` using the backward method chosen at call time.
+    """
+    @staticmethod
+    def forward(ctx, Q, D, q_mask, d_mask, normalize, backward_method):
+        # The argmax is always saved here because backward needs it.
+        scores, argmax = _run_forward(Q, D, q_mask, d_mask, save_argmax=True, normalize=normalize)
+        ctx.save_for_backward(Q, D, argmax, q_mask, d_mask)
+        # Non-tensor settings travel on ctx as plain attributes.
+        ctx.backward_method = backward_method
+        ctx.normalize = normalize
+        return scores
+    @staticmethod
+    def backward(ctx, grad_scores):
+        Q, D, argmax, q_mask, d_mask = ctx.saved_tensors
+        # The backward kernels are always fed a contiguous fp32 upstream gradient.
+        grad_scores = grad_scores.contiguous().to(torch.float32)
+        # `auto` -> `unified` for typical training shapes; `csr` only when
+        # `grad_D` contention is very high (large square batches, short Lq).
+        method = ctx.backward_method
+        if method == "auto":
+            Nq, Lq, _ = Q.shape
+            Nd = D.shape[0]
+            high_contention = Nq >= 256 and Nd >= 256 and Lq <= 64
+            method = "csr" if high_contention else "unified"
+        # Dispatch helper: returns (grad_Q, grad_D) w.r.t. the tensors it is given.
+        def _bwd(Qt, Dt):
+            if method == "unified":
+                # NOTE(review): this branch hard-codes method="atomic" and does not
+                # forward d_mask, unlike the `maxsim_backward` call below. Confirm
+                # against `maxsim_backward_unified` that both are intended.
+                return maxsim_backward_unified(grad_scores, Qt, Dt, argmax, q_mask=q_mask, method="atomic")
+            return maxsim_backward(
+                grad_scores,
+                Qt,
+                Dt,
+                argmax,
+                q_mask,
+                d_mask,
+                method=method,
+            )
+        if ctx.normalize:
+            # The forward computed scores against Q_hat = Q / ||Q|| and D_hat = D / ||D||.
+            # We need grad w.r.t. the *unnormalized* Q and D. We get that by
+            # (a) running the existing backward against the normalized tensors to get
+            # grad_Q_hat, grad_D_hat, then (b) applying the L2-normalize Jacobian.
+            # Norms are clamped to >= 1e-6 so zero vectors do not divide by zero.
+            q_norm = torch.linalg.vector_norm(Q, dim=-1, keepdim=True).clamp_min(1e-6)
+            d_norm = torch.linalg.vector_norm(D, dim=-1, keepdim=True).clamp_min(1e-6)
+            Q_hat = Q / q_norm
+            D_hat = D / d_norm
+            grad_Qh, grad_Dh = _bwd(Q_hat, D_hat)
+            # d Qhat / d Q = (I - Qhat Qhat^T) / ||Q||
+            grad_Q = (grad_Qh - (grad_Qh * Q_hat).sum(-1, keepdim=True) * Q_hat) / q_norm
+            grad_D = (grad_Dh - (grad_Dh * D_hat).sum(-1, keepdim=True) * D_hat) / d_norm
+        else:
+            grad_Q, grad_D = _bwd(Q, D)
+        # masks, normalize, backward_method receive no gradient
+        return grad_Q, grad_D, None, None, None, None
+def maxsim(
+    Q: torch.Tensor,
+    D: torch.Tensor,
+    q_mask: torch.Tensor | None = None,
+    d_mask: torch.Tensor | None = None,
+    *,
+    normalize: bool = False,
+    backward: str | None = None,
+) -> torch.Tensor:
+    """Differentiable fused MaxSim, intended as a drop-in for PyLate's ``colbert_scores``.
+    Args:
+        Q: ``[Nq, Lq, d]`` or ``[Lq, d]``.
+        D: ``[Nd, Ld, d]`` or ``[Ld, d]``.
+        q_mask, d_mask: optional masks (``True`` = valid token); a 1-D mask
+            gets a leading batch dim. They are handed to the kernel as int8.
+        normalize: forwarded to the kernel to L2-normalize Q and D per token.
+            Use ``True`` for ColBERT / ColPali / LateOn-style scoring. When
+            ``False``, a one-time warning may fire if Q looks unnormalized.
+        backward: per-call backward strategy
+            (``"auto" | "unified" | "csr" | "atomic"``); ``None`` uses the
+            value set through :func:`set_backward_method`.
+    Returns:
+        Scores shaped ``[Nq, Nd]``. A 2-D ``Q`` drops the first axis, a 2-D
+        ``D`` drops the last, and two 2-D inputs give a 0-d tensor.
+    Raises:
+        ValueError: on mismatched embedding dims, mismatched devices, or an
+            unknown ``backward`` value.
+    Gradients flow into Q and D; masks are non-differentiable.
+    """
+    # Remember which inputs were unbatched so the result can be squeezed back.
+    squeeze_q = Q.dim() == 2
+    squeeze_d = D.dim() == 2
+    if squeeze_q:
+        Q = Q.unsqueeze(0)
+    if squeeze_d:
+        D = D.unsqueeze(0)
+    if q_mask is not None and q_mask.dim() == 1:
+        q_mask = q_mask.unsqueeze(0)
+    if d_mask is not None and d_mask.dim() == 1:
+        d_mask = d_mask.unsqueeze(0)
+    # Shape / device contract: fail fast with a clear message before any
+    # kernel is launched.
+    if Q.shape[-1] != D.shape[-1]:
+        raise ValueError(
+            f"Q and D must share the embedding dim; got Q.shape[-1]={Q.shape[-1]} "
+            f"vs D.shape[-1]={D.shape[-1]}."
+        )
+    if Q.device != D.device:
+        raise ValueError(
+            f"Q and D must be on the same device; got Q.device={Q.device} vs D.device={D.device}."
+        )
+    if q_mask is not None and q_mask.device != Q.device:
+        raise ValueError(f"q_mask must be on the same device as Q; got {q_mask.device} vs {Q.device}.")
+    if d_mask is not None and d_mask.device != D.device:
+        raise ValueError(f"d_mask must be on the same device as D; got {d_mask.device} vs {D.device}.")
+    if backward is not None and backward not in _VALID_METHODS:
+        raise ValueError(f"backward= must be one of {_VALID_METHODS} or None, got {backward!r}")
+    method = _BACKWARD_METHOD if backward is None else backward
+    if not normalize:
+        _maybe_warn_unnormalized(Q)
+    def _as_int8(mask):
+        # Masks reach the kernel as contiguous int8; a missing mask stays None.
+        return None if mask is None else mask.contiguous().to(torch.int8)
+    scores = _MaxSimFn.apply(
+        Q.contiguous(),
+        D.contiguous(),
+        _as_int8(q_mask),
+        _as_int8(d_mask),
+        normalize,
+        method,
+    )
+    if squeeze_q and squeeze_d:
+        return scores.reshape(())
+    if squeeze_q or squeeze_d:
+        return scores.squeeze(0 if squeeze_q else -1)
+    return scores
+def maxsim_inference(
+    Q: torch.Tensor,
+    D: torch.Tensor,
+    q_mask: torch.Tensor | None = None,
+    d_mask: torch.Tensor | None = None,
+    *,
+    normalize: bool = False,
+) -> torch.Tensor:
+    """Inference-only MaxSim: same scoring as :func:`maxsim`, without saving the argmax.
+    Q and D must share the embedding dim and the device, otherwise a
+    ``ValueError`` is raised. Masks are passed to ``maxsim_forward`` as given.
+    With ``normalize=False`` the one-time unnormalized-input warning may fire.
+    """
+    q_dim, d_dim = Q.shape[-1], D.shape[-1]
+    if q_dim != d_dim:
+        raise ValueError(
+            f"Q and D must share the embedding dim; got Q.shape[-1]={q_dim} "
+            f"vs D.shape[-1]={d_dim}."
+        )
+    if Q.device != D.device:
+        raise ValueError(
+            f"Q and D must be on the same device; got Q.device={Q.device} vs D.device={D.device}."
+        )
+    if not normalize:
+        _maybe_warn_unnormalized(Q)
+    # The second element of the returned pair is unused on this path.
+    result, _unused = maxsim_forward(
+        Q,
+        D,
+        q_mask=q_mask,
+        d_mask=d_mask,
+        save_argmax=False,
+        normalize=normalize,
+    )
+    return result