PyPI - mlx-recurrence - Versions diffs - 0.3.0__py3-none-any.whl - Mend

mlx-recurrence 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

mlx_recurrence/__init__.py +90 -0
mlx_recurrence/_chassis.py +213 -0
mlx_recurrence/gla.py +346 -0
mlx_recurrence/legacy/__init__.py +41 -0
mlx_recurrence/legacy/_utils.py +61 -0
mlx_recurrence/legacy/gla_scan.py +373 -0
mlx_recurrence/legacy/ssm_scan.py +430 -0
mlx_recurrence/rglru.py +282 -0
mlx_recurrence/rotlru.py +351 -0
mlx_recurrence/ssd.py +394 -0
mlx_recurrence-0.3.0.dist-info/METADATA +299 -0
mlx_recurrence-0.3.0.dist-info/RECORD +15 -0
mlx_recurrence-0.3.0.dist-info/WHEEL +5 -0
mlx_recurrence-0.3.0.dist-info/licenses/LICENSE +21 -0
mlx_recurrence-0.3.0.dist-info/top_level.txt +1 -0

mlx_recurrence/__init__.py ADDED Viewed

@@ -0,0 +1,90 @@
+"""mlx_recurrence — A plug-in framework for linear-recurrence Metal kernels
+on Apple Silicon ("flash-linear-attention for MLX").
+Each kernel is a self-contained plug-in built on a shared chassis
+(:mod:`mlx_recurrence._chassis`) that supplies the segment-checkpoint +
+recompute backward pattern, shape validation, and a parity-test helper. The
+Metal source for each recurrence stays in its own module, readable per-kernel.
+v2 kernels (checkpoint + recompute, fused simd reductions, chunked-prefill
+final-state variants):
+    ssd     — Mamba-2-style head-wise SSD selective scan
+    gla     — Gated Linear Attention recurrence
+    rglru   — RG-LRU diagonal recurrence (Griffin / RecurrentGemma)
+    rotlru  — rotational LRU: complex-diagonal scan over (u, w) pairs
+The original v0.1 token-loop kernels remain available under
+``mlx_recurrence.legacy`` and are re-exported at top level for backwards
+compatibility (``selective_scan_metal``, ``gla_scan_metal``, ...).
+"""
+# --- v2 chassis-based kernels ---------------------------------------------
+from .ssd import (
+    ssd_scan,
+    ssd_scan_with_state,
+    ssd_scan_reference,
+)
+from .gla import (
+    gla_scan,
+    gla_scan_with_state,
+    gla_scan_reference,
+)
+from .rglru import (
+    rglru_scan,
+    rglru_scan_with_state,
+    rglru_scan_reference,
+)
+from .rotlru import (
+    rotlru_scan,
+    rotlru_scan_with_state,
+    rotlru_scan_reference,
+)
+# --- shared chassis (public for building new plug-in kernels) -------------
+from ._chassis import (
+    DEFAULT_SEG,
+    get_or_build_kernel,
+    check_segment_shape,
+    parity_check,
+)
+# --- legacy v0.1 kernels (backwards compatibility) ------------------------
+from . import legacy
+from .legacy import (
+    selective_scan_metal,
+    selective_scan_chunked,
+    gla_scan_metal,
+    gla_scan_chunked,
+)
+__all__ = [
+    # v2 SSD
+    "ssd_scan",
+    "ssd_scan_with_state",
+    "ssd_scan_reference",
+    # v2 GLA
+    "gla_scan",
+    "gla_scan_with_state",
+    "gla_scan_reference",
+    # v2 RG-LRU
+    "rglru_scan",
+    "rglru_scan_with_state",
+    "rglru_scan_reference",
+    # v2 rotational LRU
+    "rotlru_scan",
+    "rotlru_scan_with_state",
+    "rotlru_scan_reference",
+    # chassis
+    "DEFAULT_SEG",
+    "get_or_build_kernel",
+    "check_segment_shape",
+    "parity_check",
+    # legacy subpackage + re-exports
+    "legacy",
+    "selective_scan_metal",
+    "selective_scan_chunked",
+    "gla_scan_metal",
+    "gla_scan_chunked",
+]
+__version__ = "0.3.0"

mlx_recurrence/_chassis.py ADDED Viewed

@@ -0,0 +1,213 @@
+"""_chassis.py — Shared infrastructure for chassis-based recurrence kernels.
+This module factors out the machinery common to every v2 plug-in kernel
+(SSD, GLA, RG-LRU, rotational LRU) so each kernel file only has to supply its own Metal
+source strings and gradient wiring. It deliberately does NOT abstract the
+Metal source itself: every recurrence has a different state shape and
+update rule, and the kernel bodies are meant to stay readable per-kernel.
+What lives here
+---------------
+1. Kernel cache / builder around ``mx.fast.metal_kernel`` so each unique
+   shape configuration is JIT-compiled exactly once per process.
+2. Shape validation shared by the segment-checkpoint + recompute pattern:
+   the sequence length must tile evenly into segments (``L % seg == 0``)
+   and the simd-reduced lane dimension must be a multiple of 32.
+3. A reusable parity-test helper that compares forward output plus every
+   gradient against a pure-MLX reference and reports max abs / rel diffs.
+The segment-checkpoint + recompute pattern (shared design)
+----------------------------------------------------------
+Every kernel here follows the same playbook, tuned to the Apple Silicon
+unified-memory hierarchy:
+  Forward:  run the recurrence once, write only the state at each
+            segment boundary -> ``h_ckpt`` (SEG=32 => ~1/32 the writes of
+            saving every timestep). The last checkpoint doubles as the
+            chunk's final state, enabling chunked prefill.
+  Backward: walk segments newest -> oldest. For each segment, recompute
+            its per-timestep states from the preceding checkpoint into a
+            small scratch buffer (one segment's worth, stays resident in
+            the system-level cache instead of streaming the full state
+            history through DRAM), then run the adjoint sweep over that
+            segment. Cross-lane gradient reductions are fused in-kernel
+            with ``simd_sum`` over 32-lane simdgroups; the remaining
+            sum-over-simdgroups is a single cheap MLX reduction.
+All kernels keep fp32 state and accumulation regardless of input dtype,
+and reproduce the forward states bit-exactly on recompute (same fp32 ops,
+same order, from the same checkpoint).
+"""
+from __future__ import annotations
+import mlx.core as mx
+# Default segment length for the checkpoint+recompute pattern. SEG=32 is a
+# sweet spot on M3 Max: matches the 32-lane simdgroup width used by the
+# fused reductions and keeps the per-segment scratch buffer small enough to
+# stay SLC-resident at training shapes.
+DEFAULT_SEG = 32
+# Simd lane width on Apple GPUs. Lane dimensions reduced with simd_sum must
+# be a multiple of this.
+SIMD_WIDTH = 32
+# ---------------------------------------------------------------------------
+# Kernel cache / builder
+# ---------------------------------------------------------------------------
+_kernel_cache: dict = {}
+def get_or_build_kernel(name, input_names, output_names, source, header=""):
+    """Compile a Metal kernel once per unique ``name`` and cache it.
+    ``name`` should encode every shape/template constant baked into
+    ``source`` (e.g. ``f"ssd_fwd_{B}_{L}_{H}_{Dh}_{N}_{seg}"``) so that
+    distinct shapes get distinct compiled kernels and identical shapes
+    reuse the cached one.
+    ``source`` is the kernel BODY only (no ``kernel void`` signature) — MLX
+    generates the signature from ``input_names`` / ``output_names``. Helper
+    functions / includes go in ``header``.
+    """
+    if name not in _kernel_cache:
+        _kernel_cache[name] = mx.fast.metal_kernel(
+            name=name,
+            input_names=input_names,
+            output_names=output_names,
+            source=source,
+            header=header,
+        )
+    return _kernel_cache[name]
+# ---------------------------------------------------------------------------
+# Shape validation
+# ---------------------------------------------------------------------------
+def check_segment_shape(L, seg, lane_dim, lane_name="lane dimension"):
+    """Validate the constraints of the segment-checkpoint + simd-reduce pattern.
+    Args:
+        L:         sequence length.
+        seg:       segment length for checkpointing.
+        lane_dim:  the dimension mapped to 32-lane simdgroups (must tile by 32).
+        lane_name: human-readable name of ``lane_dim`` for the error message.
+    Raises:
+        ValueError: if either constraint is violated.
+    """
+    if seg <= 0:
+        raise ValueError(f"seg must be positive, got {seg}")
+    if L % seg != 0:
+        raise ValueError(
+            f"sequence length L={L} must be divisible by seg={seg} "
+            f"(segment-checkpoint pattern tiles L into L/seg segments)"
+        )
+    if lane_dim % SIMD_WIDTH != 0:
+        raise ValueError(
+            f"{lane_name}={lane_dim} must be a multiple of {SIMD_WIDTH} "
+            f"(fused gradient reductions use {SIMD_WIDTH}-lane simdgroups)"
+        )
+# ---------------------------------------------------------------------------
+# Parity-test helper
+# ---------------------------------------------------------------------------
+def parity_check(
+    kernel_fn,
+    reference_fn,
+    inputs,
+    arg_names,
+    grad_argnums,
+    *,
+    w_out=None,
+    y_tol=1e-3,
+    grad_rtol=1e-3,
+    label="",
+    verbose=True,
+):
+    """Compare a kernel against a pure-MLX reference: forward + all grads.
+    Both ``kernel_fn`` and ``reference_fn`` take the positional ``inputs``
+    and return the forward output ``y``. This helper builds a scalar loss
+    ``sum(y * w_out)`` and compares ``mx.grad`` of that loss w.r.t. every
+    argument in ``grad_argnums``.
+    Args:
+        kernel_fn:    the kernel under test, ``fn(*inputs) -> y``.
+        reference_fn: pure-MLX reference, ``fn(*inputs) -> y``.
+        inputs:       tuple/list of input arrays (positional).
+        arg_names:    names for each input (for readable output), same length
+                      as ``inputs``.
+        grad_argnums: tuple of argument indices to differentiate.
+        w_out:        output weighting for the scalar loss. Defaults to a
+                      fixed-seed random tensor shaped like ``y``.
+        y_tol:        absolute tolerance on the forward output diff.
+        grad_rtol:    relative tolerance on each gradient diff.
+        label:        prefix printed before results.
+        verbose:      print per-tensor diffs.
+    Returns:
+        (ok: bool, report: dict) where ``report`` maps each compared tensor
+        name to ``{"abs": max_abs_diff, "rel": max_rel_diff}``.
+    """
+    inputs = list(inputs)
+    y_k = kernel_fn(*inputs)
+    y_r = reference_fn(*inputs)
+    mx.eval(y_k, y_r)
+    if w_out is None:
+        mx.random.seed(1234)
+        w_out = mx.random.normal(y_r.shape)
+        mx.eval(w_out)
+    def loss_kernel(*args):
+        return mx.sum(kernel_fn(*args) * w_out)
+    def loss_ref(*args):
+        return mx.sum(reference_fn(*args) * w_out)
+    report = {}
+    ok = True
+    y_abs = float(mx.max(mx.abs(y_k - y_r)))
+    y_scale = float(mx.max(mx.abs(y_r))) + 1e-8
+    y_rel = y_abs / y_scale
+    report["y"] = {"abs": y_abs, "rel": y_rel}
+    ok = ok and (y_abs < y_tol)
+    g_k = mx.grad(loss_kernel, argnums=grad_argnums)(*inputs)
+    g_r = mx.grad(loss_ref, argnums=grad_argnums)(*inputs)
+    mx.eval(g_k, g_r)
+    if not isinstance(g_k, (tuple, list)):
+        g_k = (g_k,)
+        g_r = (g_r,)
+    grad_names = [arg_names[i] for i in grad_argnums]
+    for name, gk, gr in zip(grad_names, g_k, g_r):
+        abs_diff = float(mx.max(mx.abs(gk - gr)))
+        scale = float(mx.max(mx.abs(gr))) + 1e-8
+        rel = abs_diff / scale
+        report[f"grad_{name}"] = {"abs": abs_diff, "rel": rel}
+        ok = ok and (rel < grad_rtol)
+    if verbose:
+        prefix = f"{label}  " if label else ""
+        print(f"{prefix}y          max|diff| = {y_abs:.2e}  (rel {y_rel:.2e})")
+        for name in grad_names:
+            r = report[f"grad_{name}"]
+            print(
+                f"{prefix}grad_{name:<8} max|diff| = {r['abs']:.2e}"
+                f"  (rel {r['rel']:.2e})"
+            )
+        print(f"{prefix}-> {'PASS' if ok else 'FAIL'}")
+    return ok, report

mlx_recurrence/gla.py ADDED Viewed

@@ -0,0 +1,346 @@
+"""gla.py — Gated Linear Attention recurrence (checkpoint + recompute).
+v2 chassis port of the GLA recurrence kernel. Same segment-checkpoint +
+recompute backward pattern as :mod:`mlx_recurrence.ssd`, applied to the
+matrix-valued GLA state.
+Recurrence
+----------
+Per head ``head`` the state is the ``Dh x Dh`` matrix ``h[b, head, i, j]``
+with a single scalar forget gate per token::
+    h[i, j]       = gate[b,t,head] * h[i, j] + k[b,t,head,i] * v[b,t,head,j]
+    o[b,t,head,j] = sum_i q[b,t,head,i] * h[i, j]
+i.e. ``h_t = gate_t * h_{t-1} + k_t (outer) v_t`` and ``o_t = q_t @ h_t``
+(output uses the post-update state). ``gate`` is typically a sigmoid output
+in ``(0, 1)``; ``q`` is assumed pre-scaled / post-RoPE.
+The conceptual state tensor is ``[B, H, Dh, Dh]``. The checkpoint is laid
+out ``[B, nSeg, H, Dh, Dh]`` with ``j`` fastest-moving so the 32 lanes of a
+simdgroup own 32 contiguous ``j`` columns — coalesced reads/writes.
+Numerics: fp32 state and accumulation regardless of input dtype; identical
+update order (output uses post-update ``h``) and gradient formulas to the
+validated D-CSIL-3 training kernel. grad_v is exact per-thread; grad_q,
+grad_k, grad_gates are fused in-kernel via 32-lane ``simd_sum``.
+Constraints:
+    L  % seg == 0
+    Dh % 32  == 0       (Dh is the simd-reduced lane dimension)
+Public API:
+    gla_scan(q, k, v, gates, seg=32)             -> y
+    gla_scan_with_state(q, k, v, gates, seg=32)  -> (y, final_state)
+    gla_scan_reference(q, k, v, gates)           -> y   (pure MLX)
+"""
+from __future__ import annotations
+import mlx.core as mx
+from ._chassis import DEFAULT_SEG, get_or_build_kernel, check_segment_shape
+# ---------------------------------------------------------------------------
+# Metal forward: GLA scan with segment checkpoints (no full state history)
+# ---------------------------------------------------------------------------
+def _gla_forward_kernel(q, k, v, gates, seg):
+    """Forward GLA scan writing only segment-boundary state.
+    Builds (or fetches from the kernel cache) a Metal kernel specialised to
+    the shape constants ``B, L, H, Dh, seg`` and launches it on the flattened
+    inputs. One thread per (batch*head, j) keeps the ``h[:, j]`` state column
+    in a thread-private ``float h[Dh]`` array across all L timesteps,
+    starting from zeros.
+    Args:
+        q, k, v: [B, L, H, Dh]
+        gates:   [B, L, H]
+        seg:     segment length (L % seg == 0). Not validated here; the
+                 ``_impl`` wrapper in ``_make_impl`` runs
+                 ``check_segment_shape`` before calling this function.
+    Returns:
+        y:      [B, L, H, Dh]         float32, computed from the post-update state
+        h_ckpt: [B, nSeg, H, Dh, Dh]  float32, state at the END of each segment
+    """
+    B_batch, L, H, Dh = q.shape
+    n_seg = L // seg
+    # Kernel body only. Every {...} below is a Python-side shape constant that
+    # is formatted into the Metal source, so each shape gets its own kernel.
+    source = f"""
+        uint j  = thread_position_in_grid.x;
+        uint bh = thread_position_in_grid.y;
+        if (j >= {Dh}u || bh >= {B_batch * H}u) return;
+        uint b    = bh / {H}u;
+        uint head = bh % {H}u;
+        float h[{Dh}];
+        for (int i = 0; i < {Dh}; i++) h[i] = 0.0f;
+        for (int t = 0; t < {L}; t++) {{
+            int g_idx   = (b * {L} + t) * {H} + head;
+            int kv_base = ((b * {L} + t) * {H} + head) * {Dh};
+            float gate = (float)gates[g_idx];
+            float v_j  = (float)v[kv_base + j];
+            float o_j  = 0.0f;
+            for (int i = 0; i < {Dh}; i++) {{
+                h[i] = gate * h[i] + (float)k[kv_base + i] * v_j;
+                o_j += (float)q[kv_base + i] * h[i];
+            }}
+            output[kv_base + j] = o_j;
+            // checkpoint at end of each segment (j fastest -> coalesced)
+            if (((t + 1) % {seg}) == 0) {{
+                int s = t / {seg};
+                for (int i = 0; i < {Dh}; i++) {{
+                    int ck_idx = ((((b * {n_seg} + s) * {H} + head) * {Dh} + i) * {Dh} + j);
+                    h_ckpt[ck_idx] = h[i];
+                }}
+            }}
+        }}
+    """
+    # The cache key lists every constant baked into the source above
+    # (n_seg follows from L and seg).
+    kernel = get_or_build_kernel(
+        f"gla_fwd_{B_batch}_{L}_{H}_{Dh}_{seg}",
+        input_names=["q", "k", "v", "gates"],
+        output_names=["output", "h_ckpt"],
+        source=source,
+    )
+    # Inputs go in flattened; the kernel does its own row-major index math.
+    # The grid launches exactly one thread per (j, batch*head) pair.
+    results = kernel(
+        inputs=[q.reshape(-1), k.reshape(-1), v.reshape(-1), gates.reshape(-1)],
+        output_shapes=[
+            (B_batch * L * H * Dh,),
+            (B_batch * n_seg * H * Dh * Dh,),
+        ],
+        output_dtypes=[mx.float32, mx.float32],
+        grid=(Dh, B_batch * H, 1),
+        threadgroup=(min(Dh, 256), 1, 1),
+    )
+    # Restore the logical shapes of the flat kernel outputs.
+    y = results[0].reshape(B_batch, L, H, Dh)
+    h_ckpt = results[1].reshape(B_batch, n_seg, H, Dh, Dh)
+    return y, h_ckpt
+# ---------------------------------------------------------------------------
+# Metal backward: segment recompute + fused simd-reduced gradient partials
+# ---------------------------------------------------------------------------
+def _gla_backward_kernel(grad_y, h_ckpt, q, k, v, gates, seg):
+    """Recompute states from checkpoints, then adjoint sweep with fused reductions.
+    Segments are processed newest -> oldest. For each segment the kernel first
+    replays the forward update from the previous segment's checkpoint (zeros
+    for segment 0), storing every per-timestep state in ``scratch``; it then
+    walks the segment backwards, accumulating the state adjoint ``adj`` that
+    is carried across segments.
+    Args:
+        grad_y:  [B, L, H, Dh]         cotangent of the forward output
+        h_ckpt:  [B, nSeg, H, Dh, Dh]  segment-end states from the forward pass
+        q, k, v: [B, L, H, Dh]
+        gates:   [B, L, H]
+        seg:     segment length used by the forward pass
+    Kernel outputs (all float32):
+        grad_v:   [B, L, H, Dh]       (exact, per-thread)
+        grad_q_p: [B, L, H, nW, Dh]   (sum over nW -> grad_q)
+        grad_k_p: [B, L, H, nW, Dh]   (sum over nW -> grad_k)
+        grad_g_p: [B, L, H, nW]       (sum over nW -> grad_gates)
+        scratch:  [B, H, seg, Dh, Dh] (recompute buffer, discarded)
+    Returns:
+        (grad_q, grad_k, grad_v, grad_gates) with shapes
+        [B, L, H, Dh], [B, L, H, Dh], [B, L, H, Dh], [B, L, H], i.e. in the
+        same order as the primals ``(q, k, v, gates)``.
+    """
+    B_batch, L, H, Dh = q.shape
+    n_seg = L // seg
+    n_w = Dh // 32  # simdgroups (j-lane groups) per head
+    # Kernel body only. Every {...} below is a Python-side shape constant that
+    # is formatted into the Metal source, so each shape gets its own kernel.
+    source = f"""
+        uint j  = thread_position_in_grid.x;
+        uint bh = thread_position_in_grid.y;
+        if (j >= {Dh}u || bh >= {B_batch * H}u) return;
+        uint b    = bh / {H}u;
+        uint head = bh % {H}u;
+        // threadgroup x is a multiple of 32 and x-major, so lanes of a
+        // simdgroup are 32 consecutive j values within one head.
+        uint lane = j % 32u;
+        uint w    = j / 32u;
+        float adj[{Dh}];
+        for (int i = 0; i < {Dh}; i++) adj[i] = 0.0f;
+        for (int s = {n_seg - 1}; s >= 0; s--) {{
+            // ---- phase 1: recompute states for this segment ----
+            float h[{Dh}];
+            if (s > 0) {{
+                for (int i = 0; i < {Dh}; i++) {{
+                    int ck_idx = ((((b * {n_seg} + (s - 1)) * {H} + head) * {Dh} + i) * {Dh} + j);
+                    h[i] = h_ckpt[ck_idx];
+                }}
+            }} else {{
+                for (int i = 0; i < {Dh}; i++) h[i] = 0.0f;
+            }}
+            for (int tl = 0; tl < {seg}; tl++) {{
+                int t = s * {seg} + tl;
+                int g_idx   = (b * {L} + t) * {H} + head;
+                int kv_base = ((b * {L} + t) * {H} + head) * {Dh};
+                float gate = (float)gates[g_idx];
+                float v_j  = (float)v[kv_base + j];
+                for (int i = 0; i < {Dh}; i++) {{
+                    h[i] = gate * h[i] + (float)k[kv_base + i] * v_j;
+                    int sc_idx = ((((b * {H} + head) * {seg} + tl) * {Dh} + i) * {Dh} + j);
+                    scratch[sc_idx] = h[i];
+                }}
+            }}
+            // ---- phase 2: adjoint sweep, newest -> oldest ----
+            for (int tl = {seg - 1}; tl >= 0; tl--) {{
+                int t = s * {seg} + tl;
+                int g_idx   = (b * {L} + t) * {H} + head;
+                int kv_base = ((b * {L} + t) * {H} + head) * {Dh};
+                float gate = (float)gates[g_idx];
+                float v_j  = (float)v[kv_base + j];
+                float go_j = (float)grad_y[kv_base + j];
+                float gv_j = 0.0f;
+                float gg   = 0.0f;
+                for (int i = 0; i < {Dh}; i++) {{
+                    float ki = (float)k[kv_base + i];
+                    int sc_idx = ((((b * {H} + head) * {seg} + tl) * {Dh} + i) * {Dh} + j);
+                    float h_cur = scratch[sc_idx];
+                    float h_prev;
+                    if (tl > 0) {{
+                        h_prev = scratch[sc_idx - {Dh * Dh}];
+                    }} else if (s > 0) {{
+                        int ck_idx = ((((b * {n_seg} + (s - 1)) * {H} + head) * {Dh} + i) * {Dh} + j);
+                        h_prev = h_ckpt[ck_idx];
+                    }} else {{
+                        h_prev = 0.0f;
+                    }}
+                    // driving term (adj sampled after this, before gate multiply)
+                    adj[i] += (float)q[kv_base + i] * go_j;
+                    gv_j += adj[i] * ki;
+                    gg   += adj[i] * h_prev;
+                    // fused reductions over the 32 j-lanes of this simdgroup
+                    float gq_l = simd_sum(go_j * h_cur);
+                    float gk_l = simd_sum(adj[i] * v_j);
+                    if (lane == 0u) {{
+                        int p_idx = ((((b * {L} + t) * {H} + head) * {n_w} + w) * {Dh} + i);
+                        grad_q_p[p_idx] = gq_l;
+                        grad_k_p[p_idx] = gk_l;
+                    }}
+                    adj[i] *= gate;
+                }}
+                grad_v[kv_base + j] = gv_j;
+                float gg_l = simd_sum(gg);
+                if (lane == 0u) {{
+                    grad_g_p[(((b * {L} + t) * {H} + head) * {n_w} + w)] = gg_l;
+                }}
+            }}
+        }}
+    """
+    # The cache key lists every constant baked into the source above
+    # (n_seg and n_w follow from L, seg and Dh).
+    kernel = get_or_build_kernel(
+        f"gla_bwd_{B_batch}_{L}_{H}_{Dh}_{seg}",
+        input_names=["grad_y", "h_ckpt", "q", "k", "v", "gates"],
+        output_names=["grad_v", "grad_q_p", "grad_k_p", "grad_g_p", "scratch"],
+        source=source,
+    )
+    # ``scratch`` is declared as a fifth kernel output so the kernel has a
+    # buffer for the recomputed states; its contents are never read back here.
+    results = kernel(
+        inputs=[grad_y.reshape(-1), h_ckpt.reshape(-1), q.reshape(-1),
+                k.reshape(-1), v.reshape(-1), gates.reshape(-1)],
+        output_shapes=[
+            (B_batch * L * H * Dh,),
+            (B_batch * L * H * n_w * Dh,),
+            (B_batch * L * H * n_w * Dh,),
+            (B_batch * L * H * n_w,),
+            (B_batch * H * seg * Dh * Dh,),   # scratch, discarded
+        ],
+        output_dtypes=[mx.float32] * 5,
+        grid=(Dh, B_batch * H, 1),
+        threadgroup=(min(Dh, 256), 1, 1),
+    )
+    grad_v   = results[0].reshape(B_batch, L, H, Dh)
+    grad_q_p = results[1].reshape(B_batch, L, H, n_w, Dh)
+    grad_k_p = results[2].reshape(B_batch, L, H, n_w, Dh)
+    grad_g_p = results[3].reshape(B_batch, L, H, n_w)
+    # Finish the reductions: the kernel summed within each 32-lane simdgroup,
+    # the remaining sum runs over the n_w simdgroups of a head.
+    grad_q     = mx.sum(grad_q_p, axis=3)   # [B, L, H, Dh]
+    grad_k     = mx.sum(grad_k_p, axis=3)   # [B, L, H, Dh]
+    grad_gates = mx.sum(grad_g_p, axis=3)   # [B, L, H]
+    return grad_q, grad_k, grad_v, grad_gates
+# ---------------------------------------------------------------------------
+# Custom function + VJP (one cached impl per seg, so seg can be a Python arg)
+# ---------------------------------------------------------------------------
+_impl_cache: dict = {}
+def _make_impl(seg):
+    """Build (and cache) an ``mx.custom_function`` GLA impl bound to ``seg``."""
+    if seg in _impl_cache:
+        return _impl_cache[seg]
+    @mx.custom_function
+    def _impl(q, k, v, gates):
+        check_segment_shape(q.shape[1], seg, q.shape[3], "Dh")
+        return _gla_forward_kernel(q, k, v, gates, seg)
+    @_impl.vjp
+    def _vjp(primals, cotangents, outputs):
+        q, k, v, gates = primals
+        grad_y = cotangents[0]
+        _y, h_ckpt = outputs
+        return _gla_backward_kernel(grad_y, h_ckpt, q, k, v, gates, seg)
+    _impl_cache[seg] = _impl
+    return _impl
+def gla_scan(q, k, v, gates, seg=DEFAULT_SEG):
+    """Gated Linear Attention recurrence, fused Metal forward + backward.
+    Args:
+        q:      [B, L, H, Dh]   queries (pre-scaled, post-RoPE)
+        k:      [B, L, H, Dh]   keys
+        v:      [B, L, H, Dh]   values
+        gates:  [B, L, H]       scalar forget gate per head, typically in (0, 1)
+        seg:    segment length for checkpointing (L % seg == 0; default 32)
+    Returns:
+        y:      [B, L, H, Dh]
+    Note: ``Dh`` must be a multiple of 32 (it is the simd-reduced lane dim).
+    fp32 state/accumulation internally; bf16 inputs widen implicitly.
+    """
+    y, _h_ckpt = _make_impl(seg)(q, k, v, gates)
+    return y
+def gla_scan_with_state(q, k, v, gates, seg=DEFAULT_SEG):
+    """GLA scan that also returns the final state for chunked prefill.
+    Returns:
+        y:           [B, L, H, Dh]
+        final_state: [B, H, Dh, Dh]   (matches the GLA conceptual state)
+    """
+    y, h_ckpt = _make_impl(seg)(q, k, v, gates)
+    return y, h_ckpt[:, -1]
+# ---------------------------------------------------------------------------
+# Pure-MLX reference (slow, for parity testing only)
+# ---------------------------------------------------------------------------
+def gla_scan_reference(q, k, v, gates):
+    """Pure-MLX token-loop reference for :func:`gla_scan`. Differentiable."""
+    B_batch, L, H, Dh = q.shape
+    h = mx.zeros((B_batch, H, Dh, Dh))
+    ys = []
+    for t in range(L):
+        g = gates[:, t, :, None, None]                       # [B,H,1,1]
+        kv = k[:, t, :, :, None] * v[:, t, :, None, :]       # [B,H,Dh,Dh]
+        h = g * h + kv
+        ys.append(mx.sum(q[:, t, :, :, None] * h, axis=-2))  # [B,H,Dh]
+    return mx.stack(ys, axis=1)                               # [B,L,H,Dh]

mlx_recurrence/legacy/__init__.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""mlx_recurrence.legacy — v1 token-loop Metal kernels.
+These are the original (v0.1) kernels: a Metal forward that materialises
+the full per-timestep state tensor h_all and a backward that reads it back.
+They are kept for backwards compatibility and as a simple, readable
+reference. New work should prefer the v2 chassis-based kernels
+(``mlx_recurrence.ssd``, ``mlx_recurrence.gla``, ``mlx_recurrence.rglru``,
+``mlx_recurrence.rotlru``),
+which use segment checkpointing + recompute to slash DRAM traffic.
+Public API (unchanged from v0.1):
+    selective_scan_metal, selective_scan_chunked
+    gla_scan_metal, gla_scan_chunked
+"""
+from .ssm_scan import (
+    selective_scan_metal,
+    selective_scan_chunked,
+    _ssm_forward_kernel,
+    _ssm_backward_chunked,
+    _ssm_backward_metal,
+)
+from .gla_scan import (
+    gla_scan_metal,
+    gla_scan_chunked,
+    _gla_forward_kernel,
+    _gla_backward_chunked,
+    _gla_backward_metal,
+)
+__all__ = [
+    "selective_scan_metal",
+    "selective_scan_chunked",
+    "gla_scan_metal",
+    "gla_scan_chunked",
+    "_ssm_forward_kernel",
+    "_ssm_backward_chunked",
+    "_ssm_backward_metal",
+    "_gla_forward_kernel",
+    "_gla_backward_chunked",
+    "_gla_backward_metal",
+]