blksprs 2.1.9__py3-none-any.whl → 2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blksprs/__init__.py +2 -1
- blksprs/ops/distribution.py +3 -3
- blksprs/ops/flash_attention.py +612 -0
- blksprs/utils/autotuning.py +0 -1
- blksprs/utils/tools.py +3 -1
- {blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/METADATA +32 -21
- {blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/RECORD +9 -8
- {blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/WHEEL +1 -1
- {blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/top_level.txt +0 -0
blksprs/__init__.py
CHANGED

@@ -4,7 +4,7 @@ import torch
 # Capture scalar outputs for JIT compilation
 torch._dynamo.config.capture_scalar_outputs = True
 # Set version
-__version__ = "2.1.9"
+__version__ = "2.2"
 
 # Imports
 
@@ -14,6 +14,7 @@ from blksprs.utils.blksprs_tensor import BlksprsTensor
 class ops:
     from blksprs.ops.conversion import to_dense, to_sparse, from_blksprs, to_blksprs, adapt_layout
     from blksprs.ops.distribution import gather, scatter, scatter_reduce
+    from blksprs.ops.flash_attention import flash_attention, flash_attention_build_lut
     from blksprs.ops.matmul import matmul
     from blksprs.ops.softmax import softmax, softmax_fused
     from blksprs.ops.transpose import transpose
blksprs/ops/distribution.py
CHANGED

@@ -174,7 +174,7 @@ def gather_kernel(x,
                   dst_col_x)
     blk_x_msk = (((blk_x_idx >= 0) &
                   (blk_x_idx < x_b * x_b_s)) &
-                 (
+                 (rev_idx_spa_x >= 0))
     blk_x = tl.load(x + blk_x_idx, mask=blk_x_msk)
 
     # Store output
@@ -183,7 +183,7 @@ def gather_kernel(x,
                  ((pid_col * TRITON_BLOCK_SIZE + tl.arange(0, TRITON_BLOCK_SIZE)) * o_c_s)[None, :])
     blk_o_msk = (((blk_o_idx >= 0) &
                   (blk_o_idx < o_b * o_b_s)) &
-                 (
+                 (rev_idx_spa_x >= 0))
     tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
 
 
@@ -426,7 +426,7 @@ def scatter_reduce_kernel(x,
                   dst_col_o)
     blk_o_msk = (((blk_o_idx >= 0) &
                   (blk_o_idx < o_b * o_b_s)) &
-                 (
+                 (rev_idx_spa_o >= 0))
 
     if reduce_op_ind == 0:
         tl.store(o + blk_o_idx, blk_x, mask=blk_o_msk)
blksprs/ops/flash_attention.py
ADDED

@@ -0,0 +1,612 @@
+"""Block-sparse Flash Attention implementation for blksprs.
+
+This module implements Flash Attention 2 algorithm with block-sparse support,
+including cross-attention (seq_q != seq_k) and custom attention masks.
+
+Note: This implementation was developed with AI assistance.
+"""
+
+import math
+from typing import Tuple
+
+import torch
+import triton
+from torch import Tensor
+from triton import language as tl
+
+from blksprs.utils.validation import validate_contiguous, validate_device, validate_dtype_float, ensure_contiguous
+
+
+@torch.amp.custom_fwd(device_type="cuda", cast_inputs=torch.float16)
+def flash_attention(
+        q: Tensor,
+        k: Tensor,
+        v: Tensor,
+        attention_layout: Tensor,
+        sparsity_block_size: int,
+        scale: float = None,
+        attention_mask: Tensor = None,
+        lut: dict = None,
+) -> Tensor:
+    """Block-sparse flash attention with optional attention mask.
+
+    Args:
+        q: Query tensor [batch, seq_q, n_heads, head_dim]
+        k: Key tensor [batch, seq_k, n_heads, head_dim]
+        v: Value tensor [batch, seq_k, n_heads, head_dim]
+        attention_layout: Block attention pattern [batch*heads, n_seq_blocks_q, n_seq_blocks_k]
+        sparsity_block_size: Block size for sparsity pattern
+        scale: Attention scale (default: 1/sqrt(head_dim))
+        attention_mask: Boolean mask [batch*heads, seq_q, seq_k] where True=masked (default None)
+        lut: Optional pre-computed LUT dictionary
+
+    Returns:
+        Output tensor [batch, seq_q, n_heads, head_dim]
+    """
+    q, k, v = ensure_contiguous(q, k, v)
+
+    validate_contiguous(q, k, v)
+    validate_dtype_float(q, k, v)
+    validate_device(q, k, v)
+
+    batch, seq_q, n_heads, head_dim = q.shape
+    _, seq_k, _, _ = k.shape
+
+    if k.shape[0] != batch or k.shape[2] != n_heads or k.shape[3] != head_dim:
+        raise ValueError("K must have compatible shape with Q")
+    if v.shape != k.shape:
+        raise ValueError("V must have same shape as K")
+    if not (sparsity_block_size >= 16 and (sparsity_block_size & (sparsity_block_size - 1)) == 0):
+        raise ValueError(f"sparsity_block_size must be power of 2 >= 16, got {sparsity_block_size}")
+    if seq_q % sparsity_block_size != 0:
+        raise ValueError(f"seq_q ({seq_q}) must be divisible by sparsity_block_size")
+    if seq_k % sparsity_block_size != 0:
+        raise ValueError(f"seq_k ({seq_k}) must be divisible by sparsity_block_size")
+
+    n_batches = batch * n_heads
+    n_seq_blocks_q = seq_q // sparsity_block_size
+    n_seq_blocks_k = seq_k // sparsity_block_size
+
+    expected_layout_shape = (n_batches, n_seq_blocks_q, n_seq_blocks_k)
+    if attention_layout.shape != expected_layout_shape:
+        raise ValueError(f"attention_layout shape {tuple(attention_layout.shape)} doesn't match expected {expected_layout_shape}")
+
+    if scale is None:
+        scale = 1.0 / math.sqrt(head_dim)
+
+    if lut is None:
+        lut = flash_attention_build_lut(attention_layout, n_seq_blocks_q, n_seq_blocks_k)
+
+    has_mask = attention_mask is not None
+    if has_mask:
+        if attention_mask.shape != (n_batches, seq_q, seq_k):
+            raise ValueError(f"attention_mask shape {tuple(attention_mask.shape)} doesn't match expected ({n_batches}, {seq_q}, {seq_k})")
+        attention_mask_additive = torch.where(
+            attention_mask,
+            torch.tensor(float("-inf"), device=attention_mask.device, dtype=q.dtype),
+            torch.tensor(0.0, device=attention_mask.device, dtype=q.dtype)
+        ).contiguous()
+    else:
+        attention_mask_additive = torch.empty(0, device=q.device, dtype=q.dtype)
+
+    return BlockSparseFlashAttention.apply(
+        q, k, v,
+        attention_mask_additive,
+        lut["attn_lut"], lut["attn_offsets"],
+        lut["rev_attn_lut"], lut["rev_attn_offsets"],
+        sparsity_block_size, n_seq_blocks_q, n_seq_blocks_k,
+        lut["max_kv_blocks"], lut["max_q_per_k"],
+        scale, has_mask,
+    )
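On small inputs the wrapper above should agree, up to float16 rounding, with dense scaled-dot-product attention in which every position outside the active blocks of `attention_layout`, and every `True` position of `attention_mask`, is excluded. The following dense reference is a minimal sketch for sanity checks, assuming the shapes documented in the docstring; it is not part of the package:

```python
# Hedged reference sketch (not part of the package): a dense PyTorch equivalent of
# flash_attention(), assuming q/k/v are [batch, seq, heads, head_dim], attention_layout
# is [batch*heads, q_blocks, k_blocks] (head index varying fastest), and attention_mask
# marks excluded positions with True.
import math
import torch


def dense_reference_attention(q, k, v, attention_layout, sparsity_block_size,
                              scale=None, attention_mask=None):
    batch, seq_q, n_heads, head_dim = q.shape
    seq_k = k.shape[1]
    scale = scale if scale is not None else 1.0 / math.sqrt(head_dim)

    # [batch, seq, heads, dim] -> [batch*heads, seq, dim], matching the wrapper's flattening
    qf = q.permute(0, 2, 1, 3).reshape(batch * n_heads, seq_q, head_dim)
    kf = k.permute(0, 2, 1, 3).reshape(batch * n_heads, seq_k, head_dim)
    vf = v.permute(0, 2, 1, 3).reshape(batch * n_heads, seq_k, head_dim)

    # Expand the block-level layout to an element-level keep-mask
    keep = (attention_layout.repeat_interleave(sparsity_block_size, dim=1)
            .repeat_interleave(sparsity_block_size, dim=2).bool())
    if attention_mask is not None:
        keep = keep & ~attention_mask

    scores = torch.einsum("bqd,bkd->bqk", qf.float(), kf.float()) * scale
    scores = scores.masked_fill(~keep, float("-inf"))
    probs = torch.softmax(scores, dim=-1)
    # Rows with no attendable position softmax to NaN; zero them like the kernel does
    probs = torch.nan_to_num(probs, nan=0.0)

    out = torch.einsum("bqk,bkd->bqd", probs, vf.to(probs.dtype))
    return out.reshape(batch, n_heads, seq_q, head_dim).permute(0, 2, 1, 3).to(q.dtype)
```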
+
+
+class BlockSparseFlashAttention(torch.autograd.Function):
+    """Block-sparse Flash Attention with autograd support."""
+
+    @staticmethod
+    def forward(ctx, q, k, v, attention_mask, attn_lut, attn_offsets, rev_attn_lut, rev_attn_offsets,
+                sparsity_block_size, n_seq_blocks_q, n_seq_blocks_k, max_kv_blocks, max_q_per_k, scale, has_mask):
+        batch, seq_q, n_heads, head_dim = q.shape
+        _, seq_k, _, _ = k.shape
+        n_batches = batch * n_heads
+
+        q_flat = q.permute(0, 2, 1, 3).reshape(n_batches, seq_q, head_dim).contiguous()
+        k_flat = k.permute(0, 2, 1, 3).reshape(n_batches, seq_k, head_dim).contiguous()
+        v_flat = v.permute(0, 2, 1, 3).reshape(n_batches, seq_k, head_dim).contiguous()
+
+        o_flat = torch.empty_like(q_flat)
+        lse = torch.empty(n_batches, seq_q, device=q.device, dtype=torch.float32)
+        l = torch.empty(n_batches, seq_q, device=q.device, dtype=torch.float32)
+
+        if head_dim <= 64:
+            BLOCK_M = min(128, sparsity_block_size)
+        elif head_dim <= 128:
+            BLOCK_M = min(64, sparsity_block_size)
+        else:
+            BLOCK_M = min(32, sparsity_block_size)
+        BLOCK_N = sparsity_block_size
+
+        n_m_tiles = seq_q // BLOCK_M
+        grid = (n_m_tiles, n_batches)
+
+        if has_mask:
+            mask_stride_batch = attention_mask.stride(0)
+            mask_stride_row = attention_mask.stride(1)
+            mask_stride_col = attention_mask.stride(2)
+        else:
+            mask_stride_batch = 0
+            mask_stride_row = 0
+            mask_stride_col = 0
+
+        flash_attention_fwd_kernel[grid](
+            q_flat, k_flat, v_flat, o_flat,
+            attention_mask if has_mask else q_flat,
+            attn_lut, attn_offsets,
+            lse, l,
+            q_flat.stride(0), q_flat.stride(1), q_flat.stride(2),
+            k_flat.stride(0), k_flat.stride(1), k_flat.stride(2),
+            mask_stride_batch, mask_stride_row, mask_stride_col,
+            n_batches, seq_q, seq_k, head_dim, sparsity_block_size, n_seq_blocks_q, max_kv_blocks,
+            scale,
+            has_mask,
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+            num_stages=4, num_warps=4,
+        )
+
+        o = o_flat.reshape(batch, n_heads, seq_q, head_dim).permute(0, 2, 1, 3).contiguous()
+
+        ctx.save_for_backward(q_flat, k_flat, v_flat, o_flat, lse,
+                              attn_lut, attn_offsets, rev_attn_lut, rev_attn_offsets,
+                              attention_mask if has_mask else torch.empty(0, device=q.device))
+        ctx.sparsity_block_size = sparsity_block_size
+        ctx.n_seq_blocks_q = n_seq_blocks_q
+        ctx.n_seq_blocks_k = n_seq_blocks_k
+        ctx.max_kv_blocks = max_kv_blocks
+        ctx.max_q_per_k = max_q_per_k
+        ctx.scale = scale
+        ctx.has_mask = has_mask
+        ctx.batch = batch
+        ctx.n_heads = n_heads
+        ctx.seq_q = seq_q
+        ctx.seq_k = seq_k
+        ctx.head_dim = head_dim
+        ctx.BLOCK_M = BLOCK_M
+        ctx.BLOCK_N = BLOCK_N
+
+        return o
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (q_flat, k_flat, v_flat, o_flat, lse,
+         attn_lut, attn_offsets, rev_attn_lut, rev_attn_offsets, attention_mask) = ctx.saved_tensors
+
+        batch = ctx.batch
+        n_heads = ctx.n_heads
+        seq_q = ctx.seq_q
+        seq_k = ctx.seq_k
+        head_dim = ctx.head_dim
+        n_batches = batch * n_heads
+        sparsity_block_size = ctx.sparsity_block_size
+        BLOCK_M = ctx.BLOCK_M
+        BLOCK_N = ctx.BLOCK_N
+        has_mask = ctx.has_mask
+
+        do_flat = grad_output.permute(0, 2, 1, 3).reshape(n_batches, seq_q, head_dim).contiguous()
+
+        dq_flat = torch.zeros_like(q_flat)
+        dk_flat = torch.zeros_like(k_flat)
+        dv_flat = torch.zeros_like(v_flat)
+        delta = torch.empty(n_batches, seq_q, device=q_flat.device, dtype=torch.float32)
+
+        if has_mask:
+            mask_stride_batch = attention_mask.stride(0)
+            mask_stride_row = attention_mask.stride(1)
+            mask_stride_col = attention_mask.stride(2)
+        else:
+            mask_stride_batch = 0
+            mask_stride_row = 0
+            mask_stride_col = 0
+
+        n_m_tiles_q = seq_q // BLOCK_M
+        flash_attention_bwd_preprocess_kernel[(n_m_tiles_q, n_batches)](
+            o_flat, do_flat, delta,
+            o_flat.stride(0), o_flat.stride(1), o_flat.stride(2),
+            seq_q, head_dim,
+            BLOCK_M=BLOCK_M,
+        )
+
+        n_n_tiles_k = seq_k // BLOCK_N
+        flash_attention_bwd_dkdv_kernel[(n_n_tiles_k, n_batches)](
+            q_flat, k_flat, v_flat, do_flat,
+            dk_flat, dv_flat,
+            lse, delta,
+            attention_mask if has_mask else q_flat,
+            rev_attn_lut, rev_attn_offsets,
+            q_flat.stride(0), q_flat.stride(1),
+            k_flat.stride(0), k_flat.stride(1),
+            q_flat.stride(2),
+            mask_stride_batch, mask_stride_row, mask_stride_col,
+            n_batches, seq_q, seq_k, head_dim, sparsity_block_size, ctx.n_seq_blocks_k, ctx.max_q_per_k,
+            ctx.scale,
+            has_mask,
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+        )
+
+        flash_attention_bwd_dq_kernel[(n_m_tiles_q, n_batches)](
+            q_flat, k_flat, v_flat, do_flat,
+            dq_flat,
+            lse, delta,
+            attention_mask if has_mask else q_flat,
+            attn_lut, attn_offsets,
+            q_flat.stride(0), q_flat.stride(1),
+            k_flat.stride(0), k_flat.stride(1),
+            q_flat.stride(2),
+            mask_stride_batch, mask_stride_row, mask_stride_col,
+            n_batches, seq_q, seq_k, head_dim, sparsity_block_size, ctx.n_seq_blocks_q, ctx.max_kv_blocks,
+            ctx.scale,
+            has_mask,
+            BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,
+        )
+
+        dq = dq_flat.reshape(batch, n_heads, seq_q, head_dim).permute(0, 2, 1, 3).contiguous()
+        dk = dk_flat.reshape(batch, n_heads, seq_k, head_dim).permute(0, 2, 1, 3).contiguous()
+        dv = dv_flat.reshape(batch, n_heads, seq_k, head_dim).permute(0, 2, 1, 3).contiguous()
+
+        return dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None
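Both `forward` and `backward` fold the head dimension into the batch dimension before launching the kernels, so head `h` of batch `b` ends up at flat index `b * n_heads + h`; `attention_layout` and `attention_mask` must use the same ordering on their first dimension. A short sketch of that convention (an illustration under that assumption, not package code):

```python
# Flattening convention used above: [batch, seq, heads, dim] -> [batch*heads, seq, dim],
# with the head index varying fastest. Per-batch masks or layouts can therefore be
# expanded with repeat_interleave over the first dimension.
import torch

batch, n_heads, seq = 2, 3, 8
x = torch.randn(batch, seq, n_heads, 4)
x_flat = x.permute(0, 2, 1, 3).reshape(batch * n_heads, seq, 4)
assert torch.equal(x_flat[1 * n_heads + 2], x[1, :, 2])    # flat index = b * n_heads + h

per_batch_mask = torch.zeros(batch, seq, seq, dtype=torch.bool)
mask_for_flash = per_batch_mask.repeat_interleave(n_heads, dim=0)   # [batch*heads, seq, seq]
assert mask_for_flash.shape == (batch * n_heads, seq, seq)
```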
+
+
+@triton.jit
+def flash_attention_fwd_kernel(
+        q_ptr, k_ptr, v_ptr, o_ptr,
+        mask_ptr,
+        attn_lut_ptr, attn_offsets_ptr,
+        m_ptr, l_ptr,
+        stride_q_batch, stride_q_seq, stride_q_dim,
+        stride_kv_batch, stride_kv_seq, stride_kv_dim,
+        stride_mask_batch, stride_mask_row, stride_mask_col,
+        n_batches: tl.constexpr,
+        seq_q: tl.constexpr,
+        seq_k: tl.constexpr,
+        head_dim: tl.constexpr,
+        sparsity_block_size: tl.constexpr,
+        n_seq_blocks_q: tl.constexpr,
+        max_kv_blocks: tl.constexpr,
+        scale,
+        has_mask: tl.constexpr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+):
+    """Flash attention forward kernel with block-sparse mask support."""
+    pid_m = tl.program_id(0)
+    pid_batch = tl.program_id(1)
+
+    n_m_tiles: tl.constexpr = sparsity_block_size // BLOCK_M
+    n_n_tiles: tl.constexpr = sparsity_block_size // BLOCK_N
+
+    q_seq_block = pid_m // n_m_tiles
+    m_tile_idx = pid_m % n_m_tiles
+
+    q_row_start = q_seq_block * sparsity_block_size + m_tile_idx * BLOCK_M
+    offs_m = q_row_start + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, head_dim)
+
+    q_ptrs = q_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+    q_mask = offs_m[:, None] < seq_q
+    q = tl.load(q_ptrs, mask=q_mask, other=0.0)
+
+    m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32)
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, head_dim], dtype=tl.float32)
+
+    qk_scale = scale * 1.44269504
+
+    attn_offset_idx = pid_batch * n_seq_blocks_q + q_seq_block
+    attn_start = tl.load(attn_offsets_ptr + attn_offset_idx)
+    attn_end = tl.load(attn_offsets_ptr + attn_offset_idx + 1)
+    n_kv_blocks = attn_end - attn_start
+
+    for kv_idx in range(max_kv_blocks):
+        if kv_idx < n_kv_blocks:
+            k_seq_block = tl.load(attn_lut_ptr + attn_start + kv_idx)
+
+            k_row_start = k_seq_block * sparsity_block_size
+            offs_n = k_row_start + tl.arange(0, BLOCK_N)
+
+            k_ptrs = k_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+            k_mask = offs_n[:, None] < seq_k
+            k = tl.load(k_ptrs, mask=k_mask, other=0.0)
+
+            qk = tl.dot(q, tl.trans(k)) * qk_scale
+
+            if has_mask:
+                mask_ptrs = mask_ptr + pid_batch * stride_mask_batch + offs_m[:, None] * stride_mask_row + offs_n[None, :] * stride_mask_col
+                mask_vals = tl.load(mask_ptrs, mask=(offs_m[:, None] < seq_q) & (offs_n[None, :] < seq_k), other=0.0)
+                qk = qk + mask_vals
+
+            m_ij = tl.maximum(m_i, tl.max(qk, axis=1))
+            alpha = tl.math.exp2(m_i - m_ij)
+            p = tl.math.exp2(qk - m_ij[:, None])
+            l_i = l_i * alpha + tl.sum(p, axis=1)
+            acc = acc * alpha[:, None]
+
+            v_ptrs = v_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+            v = tl.load(v_ptrs, mask=k_mask, other=0.0)
+            acc = tl.dot(p.to(v.dtype), v, acc)
+
+            m_i = m_ij
+
+    has_attention = l_i > 0
+    l_safe = tl.where(has_attention, l_i, 1.0)
+    acc = acc / l_safe[:, None]
+    acc = tl.where(has_attention[:, None], acc, 0.0)
+
+    o_ptrs = o_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+    tl.store(o_ptrs, acc.to(o_ptr.dtype.element_ty), mask=offs_m[:, None] < seq_q)
+
+    lse = tl.where(has_attention, m_i + tl.math.log2(l_safe), float("-inf"))
+    tl.store(m_ptr + pid_batch * seq_q + offs_m, lse, mask=offs_m < seq_q)
+    tl.store(l_ptr + pid_batch * seq_q + offs_m, l_i, mask=offs_m < seq_q)
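The forward kernel above uses the Flash Attention streaming-softmax update in the log2 domain: `qk_scale = scale * 1.44269504` converts natural-base exponentials to `exp2`, `m_i` tracks the running row maximum, `l_i` the running denominator, `alpha` rescales the accumulator whenever the maximum grows, and `m_ptr` finally receives the base-2 log-sum-exp. A minimal sketch of that update (not package code), which can be checked against dense softmax:

```python
# Streaming-softmax sketch: fold key/value blocks in one at a time while keeping a
# running row max (m) and denominator (l). exp2 with scale * log2(e) equals exp(scale * s).
import torch

def streaming_attention_row(q, k_blocks, v_blocks, scale):
    log2e = 1.44269504
    m = torch.full((q.shape[0],), float("-inf"))
    l = torch.zeros(q.shape[0])
    acc = torch.zeros(q.shape[0], v_blocks[0].shape[1])
    for k_blk, v_blk in zip(k_blocks, v_blocks):
        qk = (q @ k_blk.T) * scale * log2e          # scores in the log2 domain
        m_new = torch.maximum(m, qk.max(dim=1).values)
        alpha = torch.exp2(m - m_new)               # rescale previous contributions
        p = torch.exp2(qk - m_new[:, None])
        l = l * alpha + p.sum(dim=1)
        acc = acc * alpha[:, None] + p @ v_blk
        m = m_new
    return acc / l[:, None]

q = torch.randn(4, 16)
k = torch.randn(32, 16)
v = torch.randn(32, 16)
scale = 16 ** -0.5
out_stream = streaming_attention_row(q, list(k.split(8)), list(v.split(8)), scale)
out_dense = torch.softmax(q @ k.T * scale, dim=-1) @ v
assert torch.allclose(out_stream, out_dense, atol=1e-5)
```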
+
+
+@triton.jit
+def flash_attention_bwd_preprocess_kernel(
+        o_ptr, do_ptr, delta_ptr,
+        stride_batch, stride_seq, stride_dim,
+        seq_len: tl.constexpr,
+        head_dim: tl.constexpr,
+        BLOCK_M: tl.constexpr,
+):
+    """Compute delta = (O * dO).sum(dim=-1)."""
+    pid_m = tl.program_id(0)
+    pid_batch = tl.program_id(1)
+
+    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, head_dim)
+
+    o_ptrs = o_ptr + pid_batch * stride_batch + offs_m[:, None] * stride_seq + offs_d[None, :]
+    do_ptrs = do_ptr + pid_batch * stride_batch + offs_m[:, None] * stride_seq + offs_d[None, :]
+    mask = offs_m[:, None] < seq_len
+
+    o = tl.load(o_ptrs, mask=mask, other=0.0).to(tl.float32)
+    do = tl.load(do_ptrs, mask=mask, other=0.0).to(tl.float32)
+    delta = tl.sum(o * do, axis=1)
+
+    tl.store(delta_ptr + pid_batch * seq_len + offs_m, delta, mask=offs_m < seq_len)
+
+
+@triton.jit
+def flash_attention_bwd_dkdv_kernel(
+        q_ptr, k_ptr, v_ptr, do_ptr,
+        dk_ptr, dv_ptr,
+        lse_ptr, delta_ptr,
+        mask_ptr,
+        rev_attn_lut_ptr, rev_attn_offsets_ptr,
+        stride_q_batch, stride_q_seq,
+        stride_kv_batch, stride_kv_seq,
+        stride_dim,
+        stride_mask_batch, stride_mask_row, stride_mask_col,
+        n_batches: tl.constexpr,
+        seq_q: tl.constexpr,
+        seq_k: tl.constexpr,
+        head_dim: tl.constexpr,
+        sparsity_block_size: tl.constexpr,
+        n_seq_blocks_k: tl.constexpr,
+        max_q_per_k: tl.constexpr,
+        scale,
+        has_mask: tl.constexpr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+):
+    """Compute dK and dV gradients."""
+    pid_n = tl.program_id(0)
+    pid_batch = tl.program_id(1)
+
+    n_n_tiles = sparsity_block_size // BLOCK_N
+    n_m_tiles = sparsity_block_size // BLOCK_M
+
+    k_seq_block = pid_n // n_n_tiles
+    n_tile_idx = pid_n % n_n_tiles
+
+    k_row_start = k_seq_block * sparsity_block_size + n_tile_idx * BLOCK_N
+    offs_n = k_row_start + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, head_dim)
+
+    qk_scale = scale * 1.44269504
+
+    k_ptrs = k_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+    v_ptrs = v_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+    k_mask = offs_n[:, None] < seq_k
+    k = tl.load(k_ptrs, mask=k_mask, other=0.0)
+    v = tl.load(v_ptrs, mask=k_mask, other=0.0)
+
+    dk = tl.zeros([BLOCK_N, head_dim], dtype=tl.float32)
+    dv = tl.zeros([BLOCK_N, head_dim], dtype=tl.float32)
+
+    rev_offset_idx = pid_batch * n_seq_blocks_k + k_seq_block
+    rev_start = tl.load(rev_attn_offsets_ptr + rev_offset_idx)
+    rev_end = tl.load(rev_attn_offsets_ptr + rev_offset_idx + 1)
+    n_q_blocks = rev_end - rev_start
+
+    for q_idx in range(max_q_per_k):
+        if q_idx < n_q_blocks:
+            q_seq_block = tl.load(rev_attn_lut_ptr + rev_start + q_idx)
+
+            for m_tile_idx in range(n_m_tiles):
+                q_row_start = q_seq_block * sparsity_block_size + m_tile_idx * BLOCK_M
+                offs_m = q_row_start + tl.arange(0, BLOCK_M)
+
+                q_ptrs = q_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+                do_ptrs = do_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+                q_mask = offs_m[:, None] < seq_q
+                q = tl.load(q_ptrs, mask=q_mask, other=0.0)
+                do = tl.load(do_ptrs, mask=q_mask, other=0.0)
+
+                m = tl.load(lse_ptr + pid_batch * seq_q + offs_m, mask=offs_m < seq_q, other=0.0)
+                Di = tl.load(delta_ptr + pid_batch * seq_q + offs_m, mask=offs_m < seq_q, other=0.0)
+
+                qk = tl.dot(q, tl.trans(k)) * qk_scale
+
+                if has_mask:
+                    mask_ptrs = mask_ptr + pid_batch * stride_mask_batch + offs_m[:, None] * stride_mask_row + offs_n[None, :] * stride_mask_col
+                    mask_vals = tl.load(mask_ptrs, mask=(offs_m[:, None] < seq_q) & (offs_n[None, :] < seq_k), other=0.0)
+                    qk = qk + mask_vals
+
+                valid_lse = m > float("-inf")
+                safe_m = tl.where(valid_lse, m, 0.0)
+                p = tl.math.exp2(qk - safe_m[:, None])
+                p = tl.where(valid_lse[:, None], p, 0.0)
+
+                dv += tl.dot(tl.trans(p.to(do.dtype)), do)
+                dp = tl.dot(do, tl.trans(v))
+                ds = p * (dp - Di[:, None])
+                dk += tl.dot(tl.trans(ds.to(q.dtype)), q)
+
+    dk = dk * scale
+    tl.store(dk_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :], dk.to(dk_ptr.dtype.element_ty), mask=k_mask)
+    tl.store(dv_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :], dv.to(dv_ptr.dtype.element_ty), mask=k_mask)
+
+
+@triton.jit
+def flash_attention_bwd_dq_kernel(
+        q_ptr, k_ptr, v_ptr, do_ptr,
+        dq_ptr,
+        lse_ptr, delta_ptr,
+        mask_ptr,
+        attn_lut_ptr, attn_offsets_ptr,
+        stride_q_batch, stride_q_seq,
+        stride_kv_batch, stride_kv_seq,
+        stride_dim,
+        stride_mask_batch, stride_mask_row, stride_mask_col,
+        n_batches: tl.constexpr,
+        seq_q: tl.constexpr,
+        seq_k: tl.constexpr,
+        head_dim: tl.constexpr,
+        sparsity_block_size: tl.constexpr,
+        n_seq_blocks_q: tl.constexpr,
+        max_kv_blocks: tl.constexpr,
+        scale,
+        has_mask: tl.constexpr,
+        BLOCK_M: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+):
+    """Compute dQ gradients."""
+    pid_m = tl.program_id(0)
+    pid_batch = tl.program_id(1)
+
+    n_m_tiles = sparsity_block_size // BLOCK_M
+    n_n_tiles = sparsity_block_size // BLOCK_N
+
+    q_seq_block = pid_m // n_m_tiles
+    m_tile_idx = pid_m % n_m_tiles
+
+    q_row_start = q_seq_block * sparsity_block_size + m_tile_idx * BLOCK_M
+    offs_m = q_row_start + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, head_dim)
+
+    qk_scale = scale * 1.44269504
+
+    q_ptrs = q_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+    do_ptrs = do_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :]
+    q_mask = offs_m[:, None] < seq_q
+    q = tl.load(q_ptrs, mask=q_mask, other=0.0)
+    do = tl.load(do_ptrs, mask=q_mask, other=0.0)
+
+    m = tl.load(lse_ptr + pid_batch * seq_q + offs_m, mask=offs_m < seq_q, other=0.0)
+    Di = tl.load(delta_ptr + pid_batch * seq_q + offs_m, mask=offs_m < seq_q, other=0.0)
+
+    dq = tl.zeros([BLOCK_M, head_dim], dtype=tl.float32)
+
+    attn_offset_idx = pid_batch * n_seq_blocks_q + q_seq_block
+    attn_start = tl.load(attn_offsets_ptr + attn_offset_idx)
+    attn_end = tl.load(attn_offsets_ptr + attn_offset_idx + 1)
+    n_kv_blocks = attn_end - attn_start
+
+    for kv_idx in range(max_kv_blocks):
+        if kv_idx < n_kv_blocks:
+            k_seq_block = tl.load(attn_lut_ptr + attn_start + kv_idx)
+
+            for n_tile_idx in range(n_n_tiles):
+                k_row_start = k_seq_block * sparsity_block_size + n_tile_idx * BLOCK_N
+                offs_n = k_row_start + tl.arange(0, BLOCK_N)
+
+                k_ptrs = k_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+                v_ptrs = v_ptr + pid_batch * stride_kv_batch + offs_n[:, None] * stride_kv_seq + offs_d[None, :]
+                k_mask = offs_n[:, None] < seq_k
+                k = tl.load(k_ptrs, mask=k_mask, other=0.0)
+                v = tl.load(v_ptrs, mask=k_mask, other=0.0)
+
+                qk = tl.dot(q, tl.trans(k)) * qk_scale
+
+                if has_mask:
+                    mask_ptrs = mask_ptr + pid_batch * stride_mask_batch + offs_m[:, None] * stride_mask_row + offs_n[None, :] * stride_mask_col
+                    mask_vals = tl.load(mask_ptrs, mask=(offs_m[:, None] < seq_q) & (offs_n[None, :] < seq_k), other=0.0)
+                    qk = qk + mask_vals
+
+                valid_lse = m > float("-inf")
+                safe_m = tl.where(valid_lse, m, 0.0)
+                p = tl.math.exp2(qk - safe_m[:, None])
+                p = tl.where(valid_lse[:, None], p, 0.0)
+
+                dp = tl.dot(do, tl.trans(v))
+                ds = p * (dp - Di[:, None])
+                dq += tl.dot(ds.to(k.dtype), k)
+
+    dq = dq * scale
+    tl.store(dq_ptr + pid_batch * stride_q_batch + offs_m[:, None] * stride_q_seq + offs_d[None, :], dq.to(dq_ptr.dtype.element_ty), mask=q_mask)
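The three backward kernels above implement the standard Flash Attention 2 gradients, restricted to the active blocks: `delta` is the row-wise sum of `O * dO` from the preprocess kernel, `P` is recovered from the stored log-sum-exp, and then `dV = Pᵀ dO`, `dS = P ∘ (dP − delta)`, `dK = scale · dSᵀ Q`, `dQ = scale · dS K`. A dense check of these formulas against autograd (a sketch, not package code):

```python
# Dense check of the gradient formulas used by the backward kernels above.
import torch

def dense_attention_grads(q, k, v, do, scale, neg_inf_mask=None):
    s = q @ k.transpose(-2, -1) * scale
    if neg_inf_mask is not None:
        s = s + neg_inf_mask                       # 0 where allowed, -inf where masked
    p = torch.softmax(s, dim=-1)
    o = p @ v
    delta = (o * do).sum(dim=-1, keepdim=True)     # matches the preprocess kernel
    dv = p.transpose(-2, -1) @ do
    dp = do @ v.transpose(-2, -1)
    ds = p * (dp - delta)
    dq = ds @ k * scale
    dk = ds.transpose(-2, -1) @ q * scale
    return o, dq, dk, dv

# Compare against autograd on a single [seq_q, seq_k] head
q = torch.randn(32, 16, dtype=torch.float64, requires_grad=True)
k = torch.randn(48, 16, dtype=torch.float64, requires_grad=True)
v = torch.randn(48, 16, dtype=torch.float64, requires_grad=True)
do = torch.randn(32, 16, dtype=torch.float64)
scale = 16 ** -0.5

o = torch.softmax(q @ k.T * scale, dim=-1) @ v
o.backward(do)
_, dq, dk, dv = dense_attention_grads(q.detach(), k.detach(), v.detach(), do, scale)
assert torch.allclose(dq, q.grad) and torch.allclose(dk, k.grad) and torch.allclose(dv, v.grad)
```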
+
+
+def flash_attention_build_lut(
+        attention_layout: Tensor,
+        n_seq_blocks_q: int = None,
+        n_seq_blocks_k: int = None,
+) -> dict:
+    """Build attention LUTs for reuse across multiple calls."""
+    n_batches = attention_layout.shape[0]
+    if n_seq_blocks_q is None:
+        n_seq_blocks_q = attention_layout.shape[1]
+    if n_seq_blocks_k is None:
+        n_seq_blocks_k = attention_layout.shape[2]
+
+    attn_lut, attn_offsets, max_kv_blocks = _build_attention_lut_fast(
+        attention_layout, n_batches, n_seq_blocks_q, n_seq_blocks_k
+    )
+
+    attention_layout_t = attention_layout.transpose(1, 2).contiguous()
+    rev_attn_lut, rev_attn_offsets, max_q_per_k = _build_attention_lut_fast(
+        attention_layout_t, n_batches, n_seq_blocks_k, n_seq_blocks_q
+    )
+
+    return {
+        "attn_lut": attn_lut,
+        "attn_offsets": attn_offsets,
+        "max_kv_blocks": max_kv_blocks,
+        "rev_attn_lut": rev_attn_lut,
+        "rev_attn_offsets": rev_attn_offsets,
+        "max_q_per_k": max_q_per_k,
+    }
+
+
+def _build_attention_lut_fast(
+        attention_layout: Tensor,
+        n_batches: int,
+        n_blocks_row: int,
+        n_blocks_col: int,
+) -> Tuple[Tensor, Tensor, int]:
+    """Build attention LUT efficiently."""
+    device = attention_layout.device
+
+    counts = attention_layout.sum(dim=2).flatten()
+    max_blocks_per_row = int(counts.max().item())
+
+    if max_blocks_per_row == 0:
+        offsets = torch.zeros(n_batches * n_blocks_row + 1, dtype=torch.int32, device=device)
+        lut = torch.empty(0, dtype=torch.int32, device=device)
+        return lut, offsets, 1
+
+    offsets = torch.zeros(n_batches * n_blocks_row + 1, dtype=torch.int32, device=device)
+    offsets[1:] = counts.cumsum(0).to(torch.int32)
+
+    indices = attention_layout.reshape(n_batches * n_blocks_row, n_blocks_col).nonzero(as_tuple=False)
+    lut = indices[:, 1].to(torch.int32)
+
+    return lut, offsets, max_blocks_per_row
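`flash_attention_build_lut` stores the block layout in a CSR-like form: for each `(batch·head, query block)` row, `attn_offsets[r]:attn_offsets[r + 1]` delimits the slice of `attn_lut` holding the key-block indices to visit, and the transposed layout yields the reverse tables used by the dK/dV kernel. A small worked example (the values follow from the code above; it is not taken from the package tests):

```python
# One head, two query blocks, two key blocks, block-lower-triangular layout.
import torch
import blksprs as bs

attention_layout = torch.tensor([[[1, 0],
                                  [1, 1]]], dtype=torch.bool)   # [batch*heads=1, q_blocks=2, k_blocks=2]
lut = bs.ops.flash_attention_build_lut(attention_layout)

# Query block 0 attends to key block 0; query block 1 attends to key blocks 0 and 1.
assert lut["attn_offsets"].tolist() == [0, 1, 3]   # row r uses attn_lut[offsets[r]:offsets[r + 1]]
assert lut["attn_lut"].tolist() == [0, 0, 1]
assert lut["max_kv_blocks"] == 2                   # upper bound for the kernel's inner loop

# Reverse tables, indexed by key block: key block 0 is needed by query blocks 0 and 1,
# key block 1 only by query block 1.
assert lut["rev_attn_offsets"].tolist() == [0, 2, 3]
assert lut["rev_attn_lut"].tolist() == [0, 1, 1]
assert lut["max_q_per_k"] == 2
```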
blksprs/utils/autotuning.py
CHANGED
blksprs/utils/tools.py
CHANGED

@@ -16,7 +16,9 @@ def undo_shape_blocksparse(x: Tensor, shape: Size | tuple[int, ...]) -> Tensor:
 
 
 def stride(x: Tensor):
-    if x.dim() == 2:
+    if x.dim() == 1:
+        return 1
+    elif x.dim() == 2:
         return x.size(1), 1
     elif x.dim() == 3:
         return x.size(1) * x.size(2), x.size(2), 1
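The extended `stride` helper now also covers 1-D tensors; for contiguous inputs it mirrors `Tensor.stride()`, except that the 1-D case returns a bare `1` rather than a one-element tuple. A quick illustration (not from the package tests):

```python
# stride() results for contiguous tensors, per the diff above.
import torch
from blksprs.utils.tools import stride

assert stride(torch.empty(5)) == 1                      # new 1-D branch
assert stride(torch.empty(3, 4)) == (4, 1)              # same values as torch.empty(3, 4).stride()
assert stride(torch.empty(2, 3, 4)) == (12, 4, 1)
```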
{blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blksprs
-Version: 2.1.9
+Version: 2.2
 Summary: A lightweight library for operations on block-sparse matrices in PyTorch.
 Author-email: Felix Schön <schoen@kr.tuwien.ac.at>
 Project-URL: Homepage, https://github.com/FelixSchoen/blksprs
@@ -17,20 +17,13 @@ Requires-Dist: coverage; extra == "test"
 Requires-Dist: build; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 
-# blksprs
+# 🧊 blksprs
 
 [](https://github.com/FelixSchoen/blksprs/releases)
 [](https://www.python.org/downloads/release/python-3119/)
 [](https://www.python.org/downloads/release/python-31210/)
 
-## Overview
-
-### News
-
-🎉 ***Version 2.0 released***. blksprs now supports kernel auto-tuning, JIT compilation, specification of pre-calculated
-LUTs, autocasting, and makes use of `torch.library.triton_op()`!
-
----
+## 📖 Overview
 
 A lightweight and efficient library for operations on block-sparse matrices in PyTorch using Triton.
 
@@ -46,6 +39,7 @@ Currently supported operations (includes gradient calculation):
 - Splitting and merging of matrices (_currently* only supports splitting and merging along the last dimension_)
 - Conversion to and from sparse form
 - Conversion to different sparsity layouts and different sparsity block sizes
+- Flash Attention (_supports custom masks and cross-attention_)
 
 As with this library sparse matrices are represented using a tuple of `(matrix, sparsity_layout, sparsity_block_size)`,
 any element-wise operations can be applied in regular torch-like fashion.
@@ -74,7 +68,7 @@ Furthermore, the library provides a set of utility functions
 
 _* see the [Roadmap](#roadmap) section for more information_
 
-## Installation
+## 🛠️ Installation
 
 Note that due to the dependency on [Triton](https://github.com/triton-lang/triton) this library is **only compatible with the Linux platform**.
 Keep track of this [issue](https://github.com/triton-lang/triton/issues/1640) for updates.
@@ -89,11 +83,11 @@ We recommend installing blksprs from [PyPI](https://pypi.org/project/blksprs/) u
 - _[NumPy](https://numpy.org/) (to get rid of warnings, built with v2.3.1)_
 - _[Triton](https://github.com/triton-lang/triton) (included with PyTorch)_
 
-## Changelog
+## 📝 Changelog
 
 See [`CHANGELOG.md`](https://github.com/FelixSchoen/blksprs/blob/main/CHANGELOG.md) for a detailed changelog.
 
-## Roadmap
+## 🗺️ Roadmap
 
 Note that since this library covers all our current needs it is in a **bugfix-only** state.
 This means that there are no plans to add new features, e.g., support for dimension specification of the ``split`` and
@@ -105,17 +99,15 @@ We also encourage [pull requests](https://github.com/FelixSchoen/blksprs/pulls).
 It might be that this changes with future projects, but as of August 2025, we are content with the current state of the
 library.
 
-## Known Limitations and Issues
+## ⚠️ Known Limitations and Issues
 
-- Triton has a bug with `tl.atomix_max()` used for the row-wise max operation.
-  In order to work around this bug a manual conversion of some values is needed, (slightly) negatively impacting
-  performance.
-  Watch the [issue](https://github.com/triton-lang/triton/issues/6376) on Triton's issue tracker for more information.
 - There will be some slight numerical differences between vanilla and blksprs operations.
   These instabilities are due to Triton and thus cannot be fixed by this library alone.
   However, for all intents and purposes, these very minor differences should not matter and can safely be ignored.
 
-
+- Flash Attention is a recent addition. While it has been tested and appears stable, please report any issues you encounter.
+
+## 💻 Usage
 
 We provide an example below to demonstrate the usage of the library.
 For more detailed examples, please refer to
@@ -128,7 +120,6 @@ the [test cases](https://github.com/FelixSchoen/blksprs/blob/main/test/cases/tes
 import torch
 import blksprs as bs
 
-
 def test_readme():
     # Set up parameters (batch size, number of heads, dimensions for matrices (m, k) and (n, k))
     b, h, m, n, k = 2, 4, 64, 64, 16
@@ -193,10 +184,30 @@ def test_readme():
     # Other available functions
     bs.ops.transpose(o_sparse, sparsity_layout_o, sparsity_block_size)
     bs.ops.softmax(o_sparse, sparsity_layout_o, sparsity_block_size, flag_fused=False)
-    bs.ops.softmax_fused(o_sparse, sparsity_layout_o,
+    bs.ops.softmax_fused(o_sparse, sparsity_layout_o,
+                         sparsity_block_size)  # Significantly faster version that requires that rows of matrix fit into memory (default if flag is not set)
     bs.ops.misc.row_wise_sum(o_sparse, sparsity_layout_o, sparsity_block_size)
     bs.ops.misc.row_wise_max(o_sparse, sparsity_layout_o, sparsity_block_size)
 
+    # Flash Attention
+    seq_len, head_dim = 512, 64
+    sparsity_block_size_attn = 128
+
+    q = torch.randn(b, seq_len, h, head_dim, device="cuda")
+    k = torch.randn(b, seq_len, h, head_dim, device="cuda")
+    v = torch.randn(b, seq_len, h, head_dim, device="cuda")
+
+    n_batches_attn = b * h
+    n_seq_blocks = seq_len // sparsity_block_size_attn
+    attention_layout = torch.tril(torch.ones(n_batches_attn, n_seq_blocks, n_seq_blocks, device="cuda", dtype=torch.bool))
+
+    lut = bs.ops.flash_attention_build_lut(attention_layout, n_seq_blocks, n_seq_blocks)
+
+    attn_out = bs.ops.flash_attention(q, k, v, attention_layout, sparsity_block_size_attn, lut=lut)
+
+    assert attn_out.shape == (b, seq_len, h, head_dim)
+
+
 
 def _get_random_sparsity_layout(b, m, n, sparsity_block_size, sparsity_percentage):
     """Helper function, creates a random sparsity layout for a given shape with a given percentage of blocks marked as sparse.
{blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/RECORD
CHANGED

@@ -1,8 +1,9 @@
-blksprs/__init__.py,sha256=
+blksprs/__init__.py,sha256=x6jBdOoukS032NnaO5zR-rJjdnQavBv8bA1E9C0wv7Y,1862
 blksprs/layouting/distribution_layout.py,sha256=a2C3DG3pYhAaPpOEgSMCRqDK1RYuFenSHqp0JdWMWmQ,5934
 blksprs/layouting/sparsity_layout.py,sha256=nl4qAJxtteZ6cx4td8FktbPiIfNEZl6zWUmMahv9Wac,11320
 blksprs/ops/conversion.py,sha256=PEgXwN-UZilr7OUBlOI1NzT8902Baxa3ie9f6K1mGQc,21543
-blksprs/ops/distribution.py,sha256=
+blksprs/ops/distribution.py,sha256=na_bBldK8MXuu8u7MMMoZwl2css7cplhjkqgA3e1NPg,20221
+blksprs/ops/flash_attention.py,sha256=ktdwdyUxgqlmTGvo-sB5hdx1yy9m4SDoYrKlAc3lkG8,24571
 blksprs/ops/flow.py,sha256=e1SKZUNMWTRgG16aK7BjYNdxWuDnLl2s0ozSkUYDBYs,7818
 blksprs/ops/matmul.py,sha256=Q_mcSfHpziZYrasB1_TbH8FmFtaf-lfoigg8H0POK64,11677
 blksprs/ops/partitioning.py,sha256=88TU77uDbvZTcYdTah9oChJrbgqZdkj4tNPylf9IS1c,9995
@@ -11,13 +12,13 @@ blksprs/ops/softmax.py,sha256=SrWZaLxk0rGbyKCxH4np97mL7k10Oqg2VP2-qZFQ8ec,23679
 blksprs/ops/transpose.py,sha256=IaNdqWDZ2rNSaO8kwpQyoSUpVpsoxMREgEXzhVBTsaY,4112
 blksprs/ops/misc/broadcast_ops.py,sha256=RmLSFFugRcRn70CU5ahrTRTplk8_At-5XkaF0UFiCQs,5703
 blksprs/ops/misc/row_wise.py,sha256=UYrgteIDp7NFqbV85hEmdzXxiJ-wQPuFGJV88rnEjdg,19344
-blksprs/utils/autotuning.py,sha256=
+blksprs/utils/autotuning.py,sha256=dWFYY_xoGCFxmX9qIyul37f62Bra1R9MY_turMHxYS8,2038
 blksprs/utils/benchmarking.py,sha256=dLabDscTFn5NkmOI1g7DnKeTneUYW3RIVv9MDF-8BKc,1271
 blksprs/utils/blksprs_tensor.py,sha256=Y8YnsFPifvdCf5Khsm8bDVv-589U0N8IsCFlnDETfzE,476
 blksprs/utils/processing.py,sha256=GcsUl54DDrEoZ0iuWZV5Q0BR2ZML3jWOhypOMxDCsrs,3759
-blksprs/utils/tools.py,sha256=
+blksprs/utils/tools.py,sha256=3puJ7S-Pfb1ILnzco09pz7RQOt7Vrkj-LpPpnj3zZHY,791
 blksprs/utils/validation.py,sha256=P98sCk6PZCQB0wO3scGTJIXfkv5EpHFM_uNHBXr42n4,4844
-blksprs-2.
-blksprs-2.
-blksprs-2.
-blksprs-2.
+blksprs-2.2.dist-info/METADATA,sha256=9OCvQ0g7nMoNmazKUJUe7izBYc3d335rmO02zmY7iqc,10050
+blksprs-2.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+blksprs-2.2.dist-info/top_level.txt,sha256=qyp0IHeY3H2GQA97i4hk_To5rRBS2YcE1HRPSLy04fk,8
+blksprs-2.2.dist-info/RECORD,,

{blksprs-2.1.9.dist-info → blksprs-2.2.dist-info}/top_level.txt
File without changes