PyPI - fastvideo-kernel - Versions diffs - 0.2.4__tar.gz → 0.2.6__tar.gz - Mend

fastvideo-kernel: 0.2.4 (tar.gz) → 0.2.6 (tar.gz)

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fastvideo-kernel
-Version: 0.2.4
+Version: 0.2.6
 Summary: Unified CUDA kernels for FastVideo
 Author-Email: Hao AI Lab <contact@haoailab.com>
 License:                                  Apache License
@@ -240,6 +240,17 @@ out = video_sparse_attn(q, k, v, block_sizes, block_sizes, topk=5)
 out = moba_attn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_k, ...)
 ```
+## Benchmark
+### VSA (block-sparse) TFLOPs
+After building/installing `fastvideo-kernel`, run:
+```bash
+cd fastvideo-kernel
+python benchmarks/bench_vsa.py --batch_size 1 --num_heads 16 --head_dim 128 --q_seq_lens 49152 --topk 64
+```
 ### TurboDiffusion Kernels
 This package also includes kernels from [TurboDiffusion](https://github.com/thu-ml/TurboDiffusion), including INT8 GEMM, Quantization, RMSNorm and LayerNorm.

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/README.md RENAMED Viewed

@@ -40,6 +40,17 @@ out = video_sparse_attn(q, k, v, block_sizes, block_sizes, topk=5)
 out = moba_attn_varlen(q, k, v, cu_seqlens_q, cu_seqlens_k, ...)
 ```
+## Benchmark
+### VSA (block-sparse) TFLOPs
+After building/installing `fastvideo-kernel`, run:
+```bash
+cd fastvideo-kernel
+python benchmarks/bench_vsa.py --batch_size 1 --num_heads 16 --head_dim 128 --q_seq_lens 49152 --topk 64
+```
 ### TurboDiffusion Kernels
 This package also includes kernels from [TurboDiffusion](https://github.com/thu-ml/TurboDiffusion), including INT8 GEMM, Quantization, RMSNorm and LayerNorm.

fastvideo_kernel-0.2.6/benchmarks/bench_vsa.py ADDED Viewed

@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Benchmark VSA *wrapper* performance (forward + backward) and report TFLOPs.
+This script benchmarks the autograd-enabled wrapper:
+  - fastvideo_kernel.block_sparse_attn.block_sparse_attn
+So measured time includes wrapper overhead (map->index conversion, dispatch) plus kernel time.
+"""
+from __future__ import annotations
+import argparse
+import os
+import random
+from typing import Tuple, Callable
+import numpy as np
+import torch
+try:
+    from triton.testing import do_bench
+except Exception as e:  # pragma: no cover
+    raise ImportError("This benchmark requires triton (for triton.testing.do_bench).") from e
+BLOCK_M = 64
+BLOCK_N = 64
+def set_seed(seed: int = 42) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+def parse_arguments() -> argparse.Namespace:
+    p = argparse.ArgumentParser(description="Benchmark FastVideo VSA block-sparse attention")
+    p.add_argument("--batch_size", type=int, default=1)
+    p.add_argument("--num_heads", type=int, default=12)
+    p.add_argument("--head_dim", type=int, default=128, choices=[64, 128])
+    p.add_argument("--topk", type=int, default=None, help="KV blocks per Q block (default: ~90%% sparsity)")
+    p.add_argument("--q_seq_lens", type=int, nargs="+", default=[49152], help="Q sequence lengths (must be /64)")
+    p.add_argument("--kv_seq_lens", type=int, nargs="+", default=None, help="KV sequence lengths (defaults to q_seq_len)")
+    p.add_argument("--warmup", type=int, default=5)
+    p.add_argument("--rep", type=int, default=20)
+    p.add_argument("--seed", type=int, default=42)
+    p.add_argument("--dtype", type=str, default="bf16", choices=["bf16", "fp16"])
+    p.add_argument("--force_triton", action="store_true", help="Force wrapper to use Triton path (if supported by shapes).")
+    return p.parse_args()
+def create_qkv(batch: int, heads: int, q_len: int, kv_len: int, d: int, dtype: torch.dtype) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    q = torch.randn(batch, heads, q_len, d, dtype=dtype, device="cuda")
+    k = torch.randn(batch, heads, kv_len, d, dtype=dtype, device="cuda")
+    v = torch.randn(batch, heads, kv_len, d, dtype=dtype, device="cuda")
+    return q, k, v
+def make_block_map(bs: int, h: int, num_q_blocks: int, num_kv_blocks: int, topk: int) -> torch.Tensor:
+    # block_map: [bs, h, num_q_blocks, num_kv_blocks] bool
+    scores = torch.rand(bs, h, num_q_blocks, num_kv_blocks, device="cuda")
+    topk = min(max(1, topk), num_kv_blocks)
+    idx = torch.topk(scores, topk, dim=-1).indices
+    block_map = torch.zeros(bs, h, num_q_blocks, num_kv_blocks, dtype=torch.bool, device="cuda")
+    block_map.scatter_(-1, idx, True)
+    return block_map
+def flops_sparse_attention(bs: int, h: int, d: int, q_len: int, topk_blocks: int, block_n: int) -> float:
+    # Approx: QK^T + PV, each is ~2*bs*h*q_len*(topk_blocks*block_n)*d
+    return 4.0 * bs * h * d * q_len * (topk_blocks * block_n)
+def bench_ms(fn: Callable[[], object], warmup: int, rep: int) -> float:
+    return do_bench(fn, warmup=warmup, rep=rep, quantiles=None)
+def main() -> None:
+    args = parse_arguments()
+    set_seed(args.seed)
+    dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16
+    if args.force_triton:
+        os.environ["FASTVIDEO_KERNEL_VSA_FORCE_TRITON"] = "1"
+    from fastvideo_kernel.block_sparse_attn import block_sparse_attn
+    bs, h, d = args.batch_size, args.num_heads, args.head_dim
+    kv_seq_lens = args.kv_seq_lens
+    if kv_seq_lens is None:
+        kv_seq_lens = args.q_seq_lens
+    if len(kv_seq_lens) != len(args.q_seq_lens):
+        raise ValueError("kv_seq_lens must have the same number of entries as q_seq_lens (or be omitted).")
+    print("VSA Block-Sparse Attention Benchmark (WRAPPER)")
+    print(f"device: {torch.cuda.get_device_name(0)}")
+    print(f"batch={bs}, heads={h}, head_dim={d}, dtype={args.dtype}")
+    print(f"BLOCK_M={BLOCK_M}, BLOCK_N={BLOCK_N}")
+    print("NOTE: timings include wrapper overhead (map->index + dispatch).")
+    if args.force_triton:
+        print("dispatch: forced Triton (FASTVIDEO_KERNEL_VSA_FORCE_TRITON=1)")
+    else:
+        print("dispatch: SM90 if available, else Triton")
+    for q_len, kv_len in zip(args.q_seq_lens, kv_seq_lens):
+        if q_len % BLOCK_M != 0 or kv_len % BLOCK_N != 0:
+            print(f"[skip] q_len={q_len}, kv_len={kv_len} must be divisible by 64")
+            continue
+        num_q_blocks = q_len // BLOCK_M
+        num_kv_blocks = kv_len // BLOCK_N
+        topk = args.topk if args.topk is not None else max(1, num_kv_blocks // 10)
+        topk = min(topk, num_kv_blocks)
+        print("\n" + "=" * 80)
+        print(f"q_len={q_len}, kv_len={kv_len}, num_q_blocks={num_q_blocks}, num_kv_blocks={num_kv_blocks}, topk={topk}")
+        q, k, v = create_qkv(bs, h, q_len, kv_len, d, dtype)
+        block_map = make_block_map(bs, h, num_q_blocks, num_kv_blocks, topk)
+        # Variable block sizes: default full blocks (64 tokens per KV block)
+        variable_block_sizes = torch.full((num_kv_blocks,), BLOCK_N, dtype=torch.int32, device="cuda")
+        def _fwd():
+            return block_sparse_attn(q, k, v, block_map, variable_block_sizes)
+        fwd_ms = bench_ms(_fwd, warmup=args.warmup, rep=args.rep)
+        # Backward benchmark (wrapper autograd). We build the graph once, then repeatedly run backward
+        # on the retained graph so bwd timing excludes the forward compute.
+        q_ = q.detach().requires_grad_(True)
+        k_ = k.detach().requires_grad_(True)
+        v_ = v.detach().requires_grad_(True)
+        o_, _aux_ = block_sparse_attn(q_, k_, v_, block_map, variable_block_sizes)
+        og = torch.randn_like(o_)
+        loss = (o_ * og).sum()
+        for _ in range(max(1, args.warmup // 2)):
+            torch.autograd.grad(loss, (q_, k_, v_), retain_graph=True)
+        torch.cuda.synchronize()
+        bwd_ms = bench_ms(
+            lambda: torch.autograd.grad(loss, (q_, k_, v_), retain_graph=True),
+            warmup=0,
+            rep=max(5, args.rep // 2),
+        )
+        flops = flops_sparse_attention(bs, h, d, q_len, topk, BLOCK_N)
+        fwd_tflops = flops / fwd_ms * 1e-12 * 1e3
+        # Rough backward multiplier (attention backward typically ~2-3x forward)
+        bwd_tflops = (2.5 * flops) / bwd_ms * 1e-12 * 1e3
+        print(f"fwd(wrapper): {fwd_ms:.3f} ms  | {fwd_tflops:.2f} TFLOPs (approx)")
+        print(f"bwd(wrapper): {bwd_ms:.3f} ms  | {bwd_tflops:.2f} TFLOPs (approx)")
+if __name__ == "__main__":
+    if not torch.cuda.is_available():
+        raise RuntimeError("CUDA is required for this benchmark.")
+    main()

fastvideo_kernel-0.2.4/dist/fastvideo_kernel-0.2.4-cp311-cp311-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl → fastvideo_kernel-0.2.6/dist/fastvideo_kernel-0.2.6-cp310-cp310-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl RENAMED Viewed

Binary file

fastvideo_kernel-0.2.4/dist/fastvideo_kernel-0.2.4-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl → fastvideo_kernel-0.2.6/dist/fastvideo_kernel-0.2.6-cp311-cp311-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl RENAMED Viewed

Binary file

fastvideo_kernel-0.2.4/dist/fastvideo_kernel-0.2.4-cp310-cp310-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl → fastvideo_kernel-0.2.6/dist/fastvideo_kernel-0.2.6-cp312-cp312-manylinux_2_34_x86_64.manylinux_2_35_x86_64.whl RENAMED Viewed

Binary file

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/pyproject.toml RENAMED Viewed

@@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build"
 [project]
 name = "fastvideo-kernel"
-version = "0.2.4"
+version = "0.2.6"
 description = "Unified CUDA kernels for FastVideo"
 readme = "README.md"
 requires-python = ">=3.10"

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/python/fastvideo_kernel/block_sparse_attn.py RENAMED Viewed

@@ -30,13 +30,12 @@ def _force_triton() -> bool:
     return os.environ.get("FASTVIDEO_KERNEL_VSA_FORCE_TRITON", "0") == "1"
-def _map_to_index_torch(block_map: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+def _map_to_index(block_map: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
     """
-    Pure-torch (no triton) conversion:
-      block_map: [B, H, Q, KV] bool  (or [H, Q, KV] which will be treated as B=1)
-      returns:
-        index: [B, H, Q, KV] int32 (packed KV indices, -1 padding)
-        num:   [B, H, Q] int32 (#kv blocks per q block)
+    Preferred map->index conversion used by the wrapper.
+    This wrapper **requires** the Triton implementation.
+    If Triton (or the Triton map_to_index module) is not available, it raises.
     """
     if block_map.dim() == 3:
         block_map = block_map.unsqueeze(0)
@@ -45,20 +44,17 @@ def _map_to_index_torch(block_map: torch.Tensor) -> Tuple[torch.Tensor, torch.Te
     if block_map.dtype != torch.bool:
         block_map = block_map.to(torch.bool)
-    B, H, Q, KV = block_map.shape
-    index = torch.full((B, H, Q, KV), -1, dtype=torch.int32, device=block_map.device)
-    num = torch.zeros((B, H, Q), dtype=torch.int32, device=block_map.device)
+    if not block_map.is_cuda:
+        raise RuntimeError("block_map must be a CUDA tensor (Triton map_to_index required).")
-    # Small sizes in practice (B=1, H<=16, Q/KV<=64), so a Python loop is fine.
-    for b in range(B):
-        for h in range(H):
-            for q in range(Q):
-                kv_idx = torch.nonzero(block_map[b, h, q], as_tuple=False).flatten().to(torch.int32)
-                n = int(kv_idx.numel())
-                if n:
-                    index[b, h, q, :n] = kv_idx
-                num[b, h, q] = n
-    return index, num
+    try:
+        from fastvideo_kernel.triton_kernels.index import map_to_index as triton_map_to_index  # local import
+    except Exception as e:
+        raise ImportError(
+            "Triton map_to_index is required but not available. "
+            "Ensure Triton is installed and fastvideo_kernel.triton_kernels.index is importable."
+        ) from e
+    return triton_map_to_index(block_map)
 @torch.library.custom_op(
@@ -77,7 +73,7 @@ def block_sparse_attn_triton(
     k = k.contiguous()
     v = v.contiguous()
     block_map = block_map.to(torch.bool)
-    q2k_idx, q2k_num = _map_to_index_torch(block_map)
+    q2k_idx, q2k_num = _map_to_index(block_map)
     from fastvideo_kernel.triton_kernels.block_sparse_attn_triton import (  # local import
         triton_block_sparse_attn_forward,
@@ -87,6 +83,7 @@ def block_sparse_attn_triton(
     return o, M
 @torch.library.register_fake("fastvideo_kernel::block_sparse_attn_triton")
 def _block_sparse_attn_triton_fake(
     q: torch.Tensor,
@@ -117,8 +114,8 @@ def block_sparse_attn_backward_triton(
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     grad_output = grad_output.contiguous()
     block_map = block_map.to(torch.bool)
-    q2k_idx, q2k_num = _map_to_index_torch(block_map)
-    k2q_idx, k2q_num = _map_to_index_torch(block_map.transpose(-1, -2).contiguous())
+    q2k_idx, q2k_num = _map_to_index(block_map)
+    k2q_idx, k2q_num = _map_to_index(block_map.transpose(-1, -2).contiguous())
     from fastvideo_kernel.triton_kernels.block_sparse_attn_triton import (  # local import
         triton_block_sparse_attn_backward,
@@ -182,7 +179,7 @@ def block_sparse_attn_sm90(
     k_padded = k_padded.contiguous()
     v_padded = v_padded.contiguous()
     block_map = block_map.to(torch.bool)
-    q2k_idx, q2k_num = _map_to_index_torch(block_map)
+    q2k_idx, q2k_num = _map_to_index(block_map)
     o_padded, lse_padded = block_sparse_fwd(
         q_padded, k_padded, v_padded, q2k_idx, q2k_num, variable_block_sizes.int()
@@ -224,7 +221,7 @@ def block_sparse_attn_backward_sm90(
     grad_output_padded = grad_output_padded.contiguous()
     block_map = block_map.to(torch.bool)
-    k2q_idx, k2q_num = _map_to_index_torch(block_map.transpose(-1, -2).contiguous())
+    k2q_idx, k2q_num = _map_to_index(block_map.transpose(-1, -2).contiguous())
     dq, dk, dv = block_sparse_bwd(
         q_padded,
@@ -290,9 +287,8 @@ def block_sparse_attn(
     block_sparse_fwd, block_sparse_bwd = _get_sm90_ops()
     if (not _force_triton()) and _is_sm90() and (block_sparse_fwd is not None) and (block_sparse_bwd is not None):
         return block_sparse_attn_sm90(q, k, v, block_map, variable_block_sizes)
-    # Triton path: generally assumes q/k/v share the same padded length
-    if q.shape[2] != k.shape[2] or q.shape[2] != v.shape[2]:
-        raise RuntimeError("Triton fallback requires q/k/v to have the same padded length.")
+    # Triton path: supports q_seq_len != kv_seq_len as long as both are padded
+    # to a multiple of the block size (64 tokens).
     return block_sparse_attn_triton(q, k, v, block_map, variable_block_sizes)

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/python/fastvideo_kernel/ops.py RENAMED Viewed

@@ -141,12 +141,6 @@ def video_sparse_attn(
         # Use autograd-enabled wrapper so backward works (and still uses SM90 kernel when available)
         out_s = block_sparse_attn(q, k, v, mask, variable_block_sizes)[0]
     else:
-        if q_seq_len != kv_seq_len:
-            raise RuntimeError(
-                "q/k have different lengths, but the compiled CUDA kernel (block_sparse_fwd) "
-                "is not available. The Triton fallback currently requires q and k/v to have "
-                "the same padded length."
-            )
         # Triton-only forward (kept for environments without the wrapper deps)
         out_s, _ = triton_block_sparse_attn_forward(q, k, v, idx, num, variable_block_sizes)

{fastvideo_kernel-0.2.4 → fastvideo_kernel-0.2.6}/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton.py RENAMED Viewed

@@ -29,7 +29,7 @@ configs = [
 # ──────────────────────────── SPARSE ADDITION BEGIN ───────────────────────────
-@triton.autotune(configs, key=["N_CTX", "HEAD_DIM"])
+@triton.autotune(configs, key=["N_CTX_Q", "HEAD_DIM"])
 @triton.jit
 def _attn_fwd_sparse(
         Q,
@@ -60,7 +60,8 @@ def _attn_fwd_sparse(
         stride_on,
         Z,
         H,
-        N_CTX,  #
+        N_CTX_Q,  #
+        N_CTX_KV,  #
         HEAD_DIM: tl.constexpr,  #
         BLOCK_M: tl.constexpr,
         BLOCK_N: tl.constexpr,
@@ -75,24 +76,29 @@ def _attn_fwd_sparse(
     off_hz = tl.program_id(1)  # fused (batch, head)
     b = off_hz // H
     h = off_hz % H
-    q_tiles = N_CTX // BLOCK_M
+    q_tiles = N_CTX_Q // BLOCK_M
     meta_base = ((b * H + h) * q_tiles + q_blk)
     kv_blocks = tl.load(q2k_num + meta_base)  # int32
     kv_ptr = q2k_index + meta_base * max_kv_blks  # ptr to list
     # ----- base pointers -----
-    qvk_off = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
-    Q_ptr = tl.make_block_ptr(base=Q + qvk_off,
-                              shape=(N_CTX, HEAD_DIM),
+    # Note: when q and kv have different sequence lengths, their per-(batch,head)
+    # strides differ, so we must compute separate base offsets.
+    q_off = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
+    k_off = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
+    v_off = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
+    o_off = (b.to(tl.int64) * stride_oz + h.to(tl.int64) * stride_oh)
+    Q_ptr = tl.make_block_ptr(base=Q + q_off,
+                              shape=(N_CTX_Q, HEAD_DIM),
                               strides=(stride_qm, stride_qk),
                               offsets=(q_blk * BLOCK_M, 0),
                               block_shape=(BLOCK_M, HEAD_DIM),
                               order=(1, 0))
-    K_base = tl.make_block_ptr(base=K + qvk_off,
-                               shape=(HEAD_DIM, N_CTX),
+    K_base = tl.make_block_ptr(base=K + k_off,
+                               shape=(HEAD_DIM, N_CTX_KV),
                                strides=(stride_kk, stride_kn),
                                offsets=(0, 0),
                                block_shape=(HEAD_DIM, BLOCK_N),
@@ -100,15 +106,15 @@ def _attn_fwd_sparse(
     v_order: tl.constexpr = (0, 1) if V.dtype.element_ty == tl.float8e5 else (1,
                                                                               0)
-    V_base = tl.make_block_ptr(base=V + qvk_off,
-                               shape=(N_CTX, HEAD_DIM),
+    V_base = tl.make_block_ptr(base=V + v_off,
+                               shape=(N_CTX_KV, HEAD_DIM),
                                strides=(stride_vk, stride_vn),
                                offsets=(0, 0),
                                block_shape=(BLOCK_N, HEAD_DIM),
                                order=v_order)
-    O_ptr = tl.make_block_ptr(base=Out + qvk_off,
-                              shape=(N_CTX, HEAD_DIM),
+    O_ptr = tl.make_block_ptr(base=Out + o_off,
+                              shape=(N_CTX_Q, HEAD_DIM),
                               strides=(stride_om, stride_on),
                               offsets=(q_blk * BLOCK_M, 0),
                               block_shape=(BLOCK_M, HEAD_DIM),
@@ -150,7 +156,7 @@ def _attn_fwd_sparse(
     # ----- epilogue -----
     m_i += tl.math.log2(l_i)
     acc = acc / l_i[:, None]
-    tl.store(M + off_hz * N_CTX + offs_m, m_i)
+    tl.store(M + off_hz * N_CTX_Q + offs_m, m_i)
     tl.store(O_ptr, acc.to(Out.type.element_ty))
@@ -201,7 +207,7 @@ def _attn_bwd_dkdv(
         stride_tok,
         stride_d,  #
         H,
-        N_CTX,
+        N_CTX_KV,
         BLOCK_M1: tl.constexpr,  #
         BLOCK_N1: tl.constexpr,  #
         HEAD_DIM: tl.constexpr,  #
@@ -221,8 +227,8 @@ def _attn_bwd_dkdv(
     off_hz = tl.program_id(2)  # fused (batch, head)
     b = off_hz // H
     h = off_hz % H
-    q_tiles = N_CTX // BLOCK_N1
-    meta_base = ((b * H + h) * q_tiles + kv_blk)
+    kv_tiles = N_CTX_KV // BLOCK_N1
+    meta_base = ((b * H + h) * kv_tiles + kv_blk)
     q_blocks = tl.load(k2q_num + meta_base)  # int32
     q_ptr = k2q_index + meta_base * max_q_blks  # ptr to list
@@ -302,16 +308,21 @@ def _attn_bwd_dq(
     kv_blocks = tl.load(q2k_num + meta_base)  # int32
     kv_ptr = q2k_index + meta_base * max_kv_blks  # ptr to list
-    block_size = tl.load(variable_block_sizes + q_blk)
     for blk_idx in range(kv_blocks * 2):
-        block_sparse_offset = (tl.load(kv_ptr + blk_idx // 2).to(tl.int32) * 2 +
-                               blk_idx % 2) * step_n * stride_tok
+        kv_idx = tl.load(kv_ptr + blk_idx // 2).to(tl.int32)
+        # variable_block_sizes is defined per KV block (tile). Mask must therefore
+        # use kv_idx (not q_blk). Also, because we split each 64-token block into
+        # two 32-token halves, the mask must account for the half-block offset.
+        block_size = tl.load(variable_block_sizes + kv_idx).to(tl.int32)
+        half = (blk_idx % 2).to(tl.int32)
+        block_sparse_offset = (kv_idx * 2 + half) * step_n * stride_tok
         kT = tl.load(kT_ptrs + block_sparse_offset)
         vT = tl.load(vT_ptrs + block_sparse_offset)
         qk = tl.dot(q, kT)
         p = tl.math.exp2(qk - m)
-        mask = tl.arange(0, BLOCK_N2) < block_size.to(tl.int32)
+        offs_in_block = half * step_n + tl.arange(0, BLOCK_N2)
+        mask = offs_in_block < block_size
         p = tl.where(mask[None, :], p, 0.0)
         # Compute dP and dS.
         dp = tl.dot(do, vT).to(tl.float32)
@@ -467,19 +478,235 @@ def _attn_bwd(
     tl.store(dq_ptrs, dq)
+@triton.jit
+def _attn_bwd_dkdv_kernel(
+        Q,
+        K,
+        V,
+        sm_scale,  #
+        DO,  #
+        DK,
+        DV,  #
+        M,
+        D,
+        k2q_index,
+        k2q_num,
+        max_q_blks,
+        variable_block_sizes,
+        # shared token/dim strides (assumed contiguous along token and dim)
+        stride_tok,
+        stride_d,  #
+        # batch/head strides (may differ between Q and KV)
+        stride_qz,
+        stride_qh,
+        stride_kz,
+        stride_kh,
+        stride_vz,
+        stride_vh,
+        stride_doz,
+        stride_doh,
+        stride_dkz,
+        stride_dkh,
+        stride_dvz,
+        stride_dvh,
+        H,
+        N_CTX_Q,
+        N_CTX_KV,
+        BLOCK_M1: tl.constexpr,  #
+        BLOCK_N1: tl.constexpr,  #
+        HEAD_DIM: tl.constexpr):
+    """
+    Backward kernel that computes dK and dV for each KV block (64 tokens).
+    Grid:
+      pid0: kv_blk in [0, N_CTX_KV/BLOCK_N1)
+      pid2: fused (batch, head) in [0, B*H)
+    """
+    bhid = tl.program_id(2)
+    b = bhid // H
+    h = bhid % H
+    kv_blk = tl.program_id(0)
+    q_adj = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
+    kv_adj_k = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
+    kv_adj_v = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
+    do_adj = (b.to(tl.int64) * stride_doz + h.to(tl.int64) * stride_doh)
+    dk_adj = (b.to(tl.int64) * stride_dkz + h.to(tl.int64) * stride_dkh)
+    dv_adj = (b.to(tl.int64) * stride_dvz + h.to(tl.int64) * stride_dvh)
+    Q = Q + q_adj
+    K = K + kv_adj_k
+    V = V + kv_adj_v
+    DO = DO + do_adj
+    DK = DK + dk_adj
+    DV = DV + dv_adj
+    # M and D (delta) are always sized by Q length.
+    M = M + (bhid * N_CTX_Q).to(tl.int64)
+    D = D + (bhid * N_CTX_Q).to(tl.int64)
+    offs_k = tl.arange(0, HEAD_DIM)
+    start_n = kv_blk * BLOCK_N1
+    offs_n = start_n + tl.arange(0, BLOCK_N1)
+    # load K and V: they stay in SRAM throughout the inner loop.
+    k = tl.load(K + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
+    v = tl.load(V + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d)
+    dv_acc = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
+    dk_acc = tl.zeros([BLOCK_N1, HEAD_DIM], dtype=tl.float32)
+    num_steps = N_CTX_Q // BLOCK_M1
+    dk_acc, dv_acc = _attn_bwd_dkdv(
+        dk_acc,
+        dv_acc,
+        Q,
+        k,
+        v,
+        sm_scale,
+        DO,
+        M,
+        D,
+        k2q_index,
+        k2q_num,
+        max_q_blks,
+        variable_block_sizes,
+        stride_tok,
+        stride_d,
+        H,
+        N_CTX_KV,
+        BLOCK_M1=BLOCK_M1,
+        BLOCK_N1=BLOCK_N1,
+        HEAD_DIM=HEAD_DIM,
+        start_n=start_n,
+        start_m=0,
+        num_steps=num_steps,
+    )
+    dv_ptrs = DV + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
+    tl.store(dv_ptrs, dv_acc)
+    dk_acc *= sm_scale
+    dk_ptrs = DK + offs_n[:, None] * stride_tok + offs_k[None, :] * stride_d
+    tl.store(dk_ptrs, dk_acc)
+@triton.jit
+def _attn_bwd_dq_kernel(
+        Q,
+        K,
+        V,
+        DO,  #
+        DQ,
+        M,
+        D,
+        q2k_index,
+        q2k_num,
+        max_kv_blks,
+        variable_block_sizes,
+        # shared token/dim strides (assumed contiguous along token and dim)
+        stride_tok,
+        stride_d,  #
+        # batch/head strides (may differ between Q and KV)
+        stride_qz,
+        stride_qh,
+        stride_kz,
+        stride_kh,
+        stride_vz,
+        stride_vh,
+        stride_doz,
+        stride_doh,
+        stride_dqz,
+        stride_dqh,
+        H,
+        N_CTX_Q,
+        BLOCK_M2: tl.constexpr,  #
+        BLOCK_N2: tl.constexpr,  #
+        HEAD_DIM: tl.constexpr):
+    """
+    Backward kernel that computes dQ for each Q block (64 tokens).
+    Grid:
+      pid0: q_blk in [0, N_CTX_Q/BLOCK_M2)
+      pid2: fused (batch, head) in [0, B*H)
+    """
+    LN2 = 0.6931471824645996  # = ln(2)
+    bhid = tl.program_id(2)
+    b = bhid // H
+    h = bhid % H
+    q_blk = tl.program_id(0)
+    q_adj = (b.to(tl.int64) * stride_qz + h.to(tl.int64) * stride_qh)
+    kv_adj_k = (b.to(tl.int64) * stride_kz + h.to(tl.int64) * stride_kh)
+    kv_adj_v = (b.to(tl.int64) * stride_vz + h.to(tl.int64) * stride_vh)
+    do_adj = (b.to(tl.int64) * stride_doz + h.to(tl.int64) * stride_doh)
+    dq_adj = (b.to(tl.int64) * stride_dqz + h.to(tl.int64) * stride_dqh)
+    Q = Q + q_adj
+    K = K + kv_adj_k
+    V = V + kv_adj_v
+    DO = DO + do_adj
+    DQ = DQ + dq_adj
+    M = M + (bhid * N_CTX_Q).to(tl.int64)
+    D = D + (bhid * N_CTX_Q).to(tl.int64)
+    offs_k = tl.arange(0, HEAD_DIM)
+    start_m = q_blk * BLOCK_M2
+    offs_m = start_m + tl.arange(0, BLOCK_M2)
+    q = tl.load(Q + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
+    do = tl.load(DO + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d)
+    m = tl.load(M + offs_m)[:, None]
+    dq_acc = tl.zeros([BLOCK_M2, HEAD_DIM], dtype=tl.float32)
+    num_steps = 0  # unused in _attn_bwd_dq
+    dq_acc = _attn_bwd_dq(
+        dq_acc,
+        q,
+        K,
+        V,
+        do,
+        m,
+        D,
+        q2k_index,
+        q2k_num,
+        max_kv_blks,
+        variable_block_sizes,
+        stride_tok,
+        stride_d,
+        H,
+        N_CTX_Q,
+        BLOCK_M2=BLOCK_M2,
+        BLOCK_N2=BLOCK_N2,
+        HEAD_DIM=HEAD_DIM,
+        start_m=start_m,
+        start_n=0,
+        num_steps=num_steps,
+    )
+    dq_ptrs = DQ + offs_m[:, None] * stride_tok + offs_k[None, :] * stride_d
+    dq_acc *= LN2
+    tl.store(dq_ptrs, dq_acc)
 # ──────────────────────────── SPARSE ADDITION BEGIN ───────────────────────────
 def triton_block_sparse_attn_forward(q, k, v, q2k_index, q2k_num,
                                      variable_block_sizes):
-    B, H, T, D = q.shape
+    B, H, Tq, D = q.shape
+    Tkv = k.shape[2]
     sm_scale = 1.0 / math.sqrt(D)
     max_kv_blks = q2k_index.shape[-1]
-    assert T % 64 == 0, f"T must be a multiple of 64, but got {T}"
+    assert Tq % 64 == 0, f"q length must be a multiple of 64, but got {Tq}"
+    assert Tkv % 64 == 0, f"kv length must be a multiple of 64, but got {Tkv}"
     assert q2k_num.shape[
-        -1] == T // 64, f"shape mismatch, T // 64 = {T // 64}, q2k_num.shape[-2] = {q2k_num.shape[-2]}"
+        -1] == Tq // 64, f"shape mismatch, Tq // 64 = {Tq // 64}, q2k_num.shape[-2] = {q2k_num.shape[-2]}"
+    assert variable_block_sizes.numel() == Tkv // 64, (
+        f"shape mismatch, variable_block_sizes must have length {Tkv // 64}, "
+        f"got {variable_block_sizes.numel()}"
+    )
     o = torch.empty_like(q)
-    M = torch.empty((B, H, T), dtype=torch.float32, device=q.device)
+    M = torch.empty((B, H, Tq), dtype=torch.float32, device=q.device)
-    grid = lambda _: (triton.cdiv(T, 64), B * H, 1)
+    grid = lambda _: (triton.cdiv(Tq, 64), B * H, 1)
     _attn_fwd_sparse[grid](q,
                            k,
                            v,
@@ -508,7 +735,8 @@ def triton_block_sparse_attn_forward(q, k, v, q2k_index, q2k_num,
                            o.stride(3),
                            B,
                            H,
-                           T,
+                           Tq,
+                           Tkv,
                            HEAD_DIM=D,
                            STAGE=3)
@@ -518,21 +746,21 @@ def triton_block_sparse_attn_forward(q, k, v, q2k_index, q2k_num,
 def triton_block_sparse_attn_backward(do, q, k, v, o, M, q2k_index, q2k_num,
                                       k2q_index, k2q_num, variable_block_sizes):
     assert do.is_contiguous()
-    assert q.stride() == k.stride() == v.stride() == o.stride() == do.stride()
-    B, H, T, D = q.shape
+    B, H, Tq, D = q.shape
+    Tkv = k.shape[2]
     sm_scale = 1.0 / math.sqrt(D)
     dq = torch.empty_like(q)
     dk = torch.empty_like(k)
     dv = torch.empty_like(v)
-    BATCH, N_HEAD, N_CTX = q.shape[:3]
+    BATCH, N_HEAD = q.shape[:2]
     BLOCK_M1, BLOCK_N1, BLOCK_M2, BLOCK_N2 = 32, 64, 64, 32
     RCP_LN2 = 1.4426950408889634  # = 1.0 / ln(2)
     arg_k = k
     arg_k = arg_k * (sm_scale * RCP_LN2)
     PRE_BLOCK = 64
-    assert N_CTX % PRE_BLOCK == 0
-    pre_grid = (N_CTX // PRE_BLOCK, BATCH * N_HEAD)
+    assert Tq % PRE_BLOCK == 0
+    pre_grid = (Tq // PRE_BLOCK, BATCH * N_HEAD)
     delta = torch.empty_like(M)
     _attn_bwd_preprocess[pre_grid](
         o,
@@ -540,7 +768,7 @@ def triton_block_sparse_attn_backward(do, q, k, v, o, M, q2k_index, q2k_num,
         delta,  #
         BATCH,
         N_HEAD,
-        N_CTX,  #
+        Tq,  #
         BLOCK_M=PRE_BLOCK,
         HEAD_DIM=D  #
     )
@@ -548,36 +776,75 @@ def triton_block_sparse_attn_backward(do, q, k, v, o, M, q2k_index, q2k_num,
     max_q_blks = k2q_index.shape[-1]
     max_kv_blks = q2k_index.shape[-1]
-    grid = (N_CTX // BLOCK_N1, 1, BATCH * N_HEAD)
-    _attn_bwd[grid](
+    # dK/dV kernel: grid over KV blocks
+    grid_kv = (Tkv // BLOCK_N1, 1, BATCH * N_HEAD)
+    _attn_bwd_dkdv_kernel[grid_kv](
         q,
         arg_k,
         v,
         sm_scale,
         do,
-        dq,
         dk,
-        dv,  #
+        dv,
         M,
-        delta,  #
-        q2k_index,
-        q2k_num,
-        max_kv_blks,
+        delta,
         k2q_index,
         k2q_num,
         max_q_blks,
         variable_block_sizes,
+        q.stride(2),
+        q.stride(3),
         q.stride(0),
         q.stride(1),
-        q.stride(2),
-        q.stride(3),  #
+        arg_k.stride(0),
+        arg_k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        do.stride(0),
+        do.stride(1),
+        dk.stride(0),
+        dk.stride(1),
+        dv.stride(0),
+        dv.stride(1),
         N_HEAD,
-        N_CTX,  #
+        Tq,
+        Tkv,
         BLOCK_M1=BLOCK_M1,
-        BLOCK_N1=BLOCK_N1,  #
+        BLOCK_N1=BLOCK_N1,
+        HEAD_DIM=D,
+    )
+    # dQ kernel: grid over Q blocks
+    grid_q = (Tq // BLOCK_M2, 1, BATCH * N_HEAD)
+    _attn_bwd_dq_kernel[grid_q](
+        q,
+        arg_k,
+        v,
+        do,
+        dq,
+        M,
+        delta,
+        q2k_index,
+        q2k_num,
+        max_kv_blks,
+        variable_block_sizes,
+        q.stride(2),
+        q.stride(3),
+        q.stride(0),
+        q.stride(1),
+        arg_k.stride(0),
+        arg_k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        do.stride(0),
+        do.stride(1),
+        dq.stride(0),
+        dq.stride(1),
+        N_HEAD,
+        Tq,
         BLOCK_M2=BLOCK_M2,
-        BLOCK_N2=BLOCK_N2,  #
-        HEAD_DIM=D  #
+        BLOCK_N2=BLOCK_N2,
+        HEAD_DIM=D,
     )
     return dq, dk, dv