mps-flash-attn 0.2.1.tar.gz → 0.2.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mps-flash-attn might be problematic.

Files changed (40)
  1. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/PKG-INFO +8 -1
  2. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/README.md +7 -0
  3. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/__init__.py +241 -1
  4. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/csrc/mps_flash_attn.mm +246 -7
  5. mps_flash_attn-0.2.5/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  6. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/PKG-INFO +8 -1
  7. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/pyproject.toml +1 -1
  8. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/tests/test_mfa_v2.py +91 -0
  9. mps_flash_attn-0.2.1/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  10. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/LICENSE +0 -0
  11. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/benchmark.py +0 -0
  12. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  13. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  14. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  15. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  16. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  17. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  18. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  19. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  20. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  21. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  22. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  23. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  24. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  25. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  26. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  27. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  28. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  29. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  30. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  31. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  32. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  33. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  34. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/kernels/manifest.json +0 -0
  35. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/SOURCES.txt +0 -0
  36. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
  37. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/requires.txt +0 -0
  38. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/top_level.txt +0 -0
  39. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/setup.cfg +0 -0
  40. {mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/setup.py +0 -0
{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.1
+Version: 0.2.5
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -201,6 +201,13 @@ Python API (mps_flash_attn)
 - Python 3.10+
 - PyTorch 2.0+
 
+## TODO / Future Optimizations
+
+- [ ] **Batched kernel dispatch** - Currently dispatches B×H separate kernels per attention call. Should use 3D grid to handle all batch/heads in one dispatch (major perf win for small sequences like Swin Transformer windows)
+- [ ] **Fused QKV projection + attention** - Single kernel from input to output, avoid intermediate buffers
+- [ ] **Pre-scaled bias option** - Allow passing pre-scaled bias to avoid per-call scaling overhead
+- [ ] **LoRA fusion** - Fuse adapter weights into attention computation
+
 ## Credits
 
 - [metal-flash-attention](https://github.com/philipturner/metal-flash-attention) by Philip Turner

{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/README.md

@@ -176,6 +176,13 @@ Python API (mps_flash_attn)
 - Python 3.10+
 - PyTorch 2.0+
 
+## TODO / Future Optimizations
+
+- [ ] **Batched kernel dispatch** - Currently dispatches B×H separate kernels per attention call. Should use 3D grid to handle all batch/heads in one dispatch (major perf win for small sequences like Swin Transformer windows)
+- [ ] **Fused QKV projection + attention** - Single kernel from input to output, avoid intermediate buffers
+- [ ] **Pre-scaled bias option** - Allow passing pre-scaled bias to avoid per-call scaling overhead
+- [ ] **LoRA fusion** - Fuse adapter weights into attention computation
+
 ## Credits
 
 - [metal-flash-attention](https://github.com/philipturner/metal-flash-attention) by Philip Turner

{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/__init__.py

@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.2.1"
+__version__ = "0.2.5"
 
 import torch
 from typing import Optional
@@ -296,6 +296,86 @@ def precompile():
     print("\nPre-compilation complete! Kernels cached to disk.")
 
 
+def flash_attention_with_bias(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attn_bias: torch.Tensor,
+    is_causal: bool = False,
+    window_size: int = 0,
+    bias_repeat_count: int = 0,
+) -> torch.Tensor:
+    """
+    Compute scaled dot-product attention with additive attention bias.
+
+    This function supports additive attention bias (like relative position encodings
+    or ALiBi) which is added to the attention scores before softmax:
+
+        Attention(Q, K, V) = softmax((Q @ K.T) / sqrt(d) + bias) @ V
+
+    IMPORTANT: MFA adds bias to UNSCALED scores internally and scales during softmax.
+    If your bias was computed for scaled scores (like PyTorch SDPA), you need to
+    pre-scale it by multiplying by sqrt(head_dim).
+
+    Args:
+        query: Query tensor of shape (B, H, N_q, D)
+        key: Key tensor of shape (B, H, N_kv, D)
+        value: Value tensor of shape (B, H, N_kv, D)
+        attn_bias: Additive attention bias of shape:
+            - (B, H, N_q, N_kv): Full bias for each batch/head
+            - (1, H, N_q, N_kv): Broadcast across batch
+            - (H, N_q, N_kv): Broadcast across batch (3D)
+        is_causal: If True, applies causal masking
+        window_size: Sliding window attention size (0 = full attention)
+        bias_repeat_count: If > 0, the bias tensor repeats every N batches.
+            Useful for window attention where multiple windows share the same
+            position bias pattern. E.g., for Swin Transformer with 4 windows,
+            set bias_repeat_count=num_windows so bias[batch_idx % num_windows]
+            is used.
+
+    Returns:
+        Output tensor of shape (B, H, N_q, D)
+
+    Example:
+        >>> # Relative position bias (Swin Transformer style)
+        >>> q = torch.randn(4, 8, 64, 64, device='mps', dtype=torch.float16)
+        >>> k = torch.randn(4, 8, 64, 64, device='mps', dtype=torch.float16)
+        >>> v = torch.randn(4, 8, 64, 64, device='mps', dtype=torch.float16)
+        >>> # Position bias: (1, num_heads, seq_len, seq_len)
+        >>> bias = torch.randn(1, 8, 64, 64, device='mps', dtype=torch.float16)
+        >>> # Pre-scale bias since MFA uses unscaled scores
+        >>> scaled_bias = bias * math.sqrt(64)  # sqrt(head_dim)
+        >>> out = flash_attention_with_bias(q, k, v, scaled_bias)
+
+        >>> # Window attention with repeating bias pattern
+        >>> n_windows = 16
+        >>> q = torch.randn(n_windows * 4, 8, 49, 64, device='mps', dtype=torch.float16)
+        >>> bias = torch.randn(n_windows, 8, 49, 49, device='mps', dtype=torch.float16)
+        >>> scaled_bias = bias * math.sqrt(64)
+        >>> out = flash_attention_with_bias(q, k, v, scaled_bias, bias_repeat_count=n_windows)
+    """
+    if not _HAS_MFA:
+        raise RuntimeError(
+            f"MPS Flash Attention C++ extension not available: {_IMPORT_ERROR}\n"
+            "Please rebuild with: pip install -e ."
+        )
+
+    if not torch.backends.mps.is_available():
+        raise RuntimeError("MPS not available")
+
+    # Validate device
+    if query.device.type != 'mps':
+        raise ValueError("query must be on MPS device")
+    if key.device.type != 'mps':
+        raise ValueError("key must be on MPS device")
+    if value.device.type != 'mps':
+        raise ValueError("value must be on MPS device")
+    if attn_bias.device.type != 'mps':
+        raise ValueError("attn_bias must be on MPS device")
+
+    return _C.forward_with_bias(query, key, value, attn_bias, is_causal, window_size, bias_repeat_count)
+
+
 def flash_attention_chunked(
     query: torch.Tensor,
     key: torch.Tensor,
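Note on the bias convention documented above: MFA adds `attn_bias` to the *unscaled* Q·Kᵀ scores and applies 1/sqrt(D) during softmax, while `torch.nn.functional.scaled_dot_product_attention` adds its `attn_mask` to already-scaled scores. A minimal caller-side adapter is sketched below; the wrapper name `sdpa_style_attention_with_bias` and the tolerance expectation are ours, not part of the package.

```python
import math
import torch
import torch.nn.functional as F
import mps_flash_attn as mfa

def sdpa_style_attention_with_bias(q, k, v, bias, **kwargs):
    """Accept a bias in the SDPA convention (added after 1/sqrt(D) scaling)
    and rescale it to the convention flash_attention_with_bias expects."""
    head_dim = q.shape[-1]
    return mfa.flash_attention_with_bias(q, k, v, bias * math.sqrt(head_dim), **kwargs)

# Sanity check against the PyTorch reference in float32.
B, H, N, D = 2, 4, 64, 64
q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
bias = torch.randn(1, H, N, N, device="mps", dtype=torch.float16)

out = sdpa_style_attention_with_bias(q, k, v, bias)
ref = F.scaled_dot_product_attention(q.float(), k.float(), v.float(), attn_mask=bias.float())
print((out.float() - ref).abs().max())  # expect fp16-level error
```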
@@ -541,6 +621,80 @@ def quantize_kv_int8(
     return k_quant, v_quant, k_scale, v_scale
 
 
+# NF4 codebook (must match Metal shader's NF4_CODEBOOK exactly)
+_NF4_CODEBOOK = torch.tensor([
+    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
+    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
+    0.07958029955625534, 0.16093020141124725, 0.24611230194568634, 0.33791524171829224,
+    0.44070982933044434, 0.5626170039176941, 0.7229568362236023, 1.0
+], dtype=torch.float32)
+
+
+def quantize_kv_nf4(
+    key: torch.Tensor,
+    value: torch.Tensor,
+) -> tuple:
+    """
+    Quantize Key and Value tensors to NF4 (NormalFloat 4-bit) format.
+
+    NF4 quantization provides 4x memory reduction using a 16-value codebook
+    optimized for normally distributed weights. Two values are packed per byte.
+
+    Args:
+        key: Key tensor of shape (B, H, N, D) where D must be even
+        value: Value tensor of shape (B, H, N, D) where D must be even
+
+    Returns:
+        Tuple of (key_quant, value_quant, k_scale, v_scale) where:
+        - key_quant, value_quant: uint8 tensors of shape (B, H, N, D//2) with packed values
+        - k_scale, v_scale: float32 tensors with per-head scale factors (B, H)
+
+    Example:
+        >>> k_q, v_q, k_s, v_s = quantize_kv_nf4(key, value)
+        >>> out = flash_attention_nf4(query, k_q, v_q, k_s, v_s)
+    """
+    if not _HAS_MFA:
+        raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
+
+    def _quantize_nf4(tensor: torch.Tensor) -> tuple:
+        """Quantize a single tensor to NF4 format."""
+        B, H, N, D = tensor.shape
+        if D % 2 != 0:
+            raise ValueError(f"Head dimension D must be even for NF4 quantization, got D={D}")
+
+        # Convert to float32 for quantization
+        t = tensor.float()
+
+        # Compute per-head absmax for scale
+        abs_max = t.abs().amax(dim=(2, 3))  # (B, H)
+        scale = abs_max.clamp_min(1e-12)
+
+        # Normalize to [-1, 1] range
+        scale_expanded = scale.unsqueeze(-1).unsqueeze(-1)  # (B, H, 1, 1)
+        normalized = t / scale_expanded
+
+        # Find nearest NF4 codebook entry for each value
+        codebook = _NF4_CODEBOOK.to(tensor.device)  # (16,)
+        # Reshape for broadcasting: normalized is (B, H, N, D), codebook is (16,)
+        # Compute distances to all codebook entries
+        flat = normalized.reshape(-1, 1)  # (B*H*N*D, 1)
+        distances = (flat - codebook.unsqueeze(0)).abs()  # (B*H*N*D, 16)
+        indices = distances.argmin(dim=1)  # (B*H*N*D,)
+        indices = indices.reshape(B, H, N, D)  # (B, H, N, D)
+
+        # Pack two 4-bit indices per byte
+        # Even indices go to low nibble, odd indices go to high nibble
+        indices_even = indices[:, :, :, 0::2]  # (B, H, N, D//2)
+        indices_odd = indices[:, :, :, 1::2]  # (B, H, N, D//2)
+        packed = (indices_even | (indices_odd << 4)).to(torch.uint8)  # (B, H, N, D//2)
+
+        return packed, scale
+
+    k_quant, k_scale = _quantize_nf4(key)
+    v_quant, v_scale = _quantize_nf4(value)
+    return k_quant, v_quant, k_scale, v_scale
+
+
 def flash_attention_fp8(
     query: torch.Tensor,
     key: torch.Tensor,
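The packing layout above (even positions in the low nibble, odd positions in the high nibble, one absmax scale per head) can be inverted in Python to inspect the quantization error. The dequantizer below is an illustrative sketch written against that layout and assumes the module-level `_NF4_CODEBOOK` stays importable; the package itself decodes NF4 inside the Metal kernel.

```python
import torch
import mps_flash_attn as mfa

def dequantize_nf4(packed: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Inverse of quantize_kv_nf4 (for inspection only)."""
    low = (packed & 0x0F).long()           # even positions along D
    high = ((packed >> 4) & 0x0F).long()   # odd positions along D
    codebook = mfa._NF4_CODEBOOK.to(packed.device)
    vals = torch.stack((codebook[low], codebook[high]), dim=-1)  # (B, H, N, D//2, 2)
    B, H, N, half_d = packed.shape
    vals = vals.reshape(B, H, N, half_d * 2)      # re-interleave to the original D order
    return vals * scale[..., None, None]          # undo per-head absmax normalization

k = torch.randn(1, 4, 128, 64, device="mps", dtype=torch.float16)
v = torch.randn(1, 4, 128, 64, device="mps", dtype=torch.float16)
k_q, v_q, k_s, v_s = mfa.quantize_kv_nf4(k, v)
print((k.float() - dequantize_nf4(k_q, k_s)).abs().mean())  # NF4 round-trip error
```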
@@ -551,6 +705,7 @@ def flash_attention_fp8(
     attn_mask: Optional[torch.Tensor] = None,
     window_size: int = 0,
     use_e5m2: bool = False,
+    scale: Optional[float] = None,
 ) -> torch.Tensor:
     """
     Compute attention with FP8 quantized Key/Value tensors.
@@ -568,6 +723,7 @@ def flash_attention_fp8(
         attn_mask: Optional boolean attention mask
         window_size: Sliding window size (0 = full attention)
        use_e5m2: If True, use E5M2 format. Default: False (E4M3)
+        scale: Softmax scale factor. If None, uses 1/sqrt(head_dim)
 
     Returns:
         Output tensor of shape (B, H, N, D)
@@ -581,6 +737,14 @@ def flash_attention_fp8(
     if not _HAS_MFA:
         raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
 
+    # Apply custom scale by pre-scaling Q
+    if scale is not None:
+        head_dim = query.shape[-1]
+        default_scale = 1.0 / math.sqrt(head_dim)
+        if abs(scale - default_scale) > 1e-9:
+            scale_factor = scale / default_scale
+            query = query * scale_factor
+
     quant_type = QUANT_FP8_E5M2 if use_e5m2 else QUANT_FP8_E4M3
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
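All of the quantized wrappers implement the new `scale` argument the same way: the kernel always applies the default 1/sqrt(D), so the wrapper folds the requested scale into Q via `scale_factor = scale / default_scale`. The identity relied on is softmax((Q·s/s_d)Kᵀ·s_d) = softmax(QKᵀ·s) with s_d = 1/sqrt(D). A standalone check of that identity (plain PyTorch, not package code):

```python
import math
import torch

B, H, N, D = 1, 2, 16, 8
q = torch.randn(B, H, N, D)
k = torch.randn(B, H, N, D)
custom_scale = 0.05                 # anything other than 1/sqrt(D)
default_scale = 1.0 / math.sqrt(D)

# What the caller asks for: scores scaled by custom_scale.
want = torch.softmax(q @ k.transpose(-1, -2) * custom_scale, dim=-1)

# What the kernel computes once Q is pre-multiplied by custom_scale / default_scale.
q_pre = q * (custom_scale / default_scale)
got = torch.softmax(q_pre @ k.transpose(-1, -2) * default_scale, dim=-1)

print(torch.allclose(want, got, atol=1e-6))  # True
```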
@@ -597,6 +761,7 @@ def flash_attention_int8(
     is_causal: bool = False,
     attn_mask: Optional[torch.Tensor] = None,
     window_size: int = 0,
+    scale: Optional[float] = None,
 ) -> torch.Tensor:
     """
     Compute attention with INT8 quantized Key/Value tensors.
@@ -613,6 +778,7 @@ def flash_attention_int8(
         is_causal: If True, applies causal masking
         attn_mask: Optional boolean attention mask
         window_size: Sliding window size (0 = full attention)
+        scale: Softmax scale factor. If None, uses 1/sqrt(head_dim)
 
     Returns:
         Output tensor of shape (B, H, N, D)
@@ -624,12 +790,76 @@ def flash_attention_int8(
     if not _HAS_MFA:
         raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
 
+    # Apply custom scale by pre-scaling Q
+    if scale is not None:
+        head_dim = query.shape[-1]
+        default_scale = 1.0 / math.sqrt(head_dim)
+        if abs(scale - default_scale) > 1e-9:
+            scale_factor = scale / default_scale
+            query = query * scale_factor
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         QUANT_INT8, is_causal, attn_mask, window_size
     )
 
 
+def flash_attention_nf4(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    is_causal: bool = False,
+    attn_mask: Optional[torch.Tensor] = None,
+    window_size: int = 0,
+    scale: Optional[float] = None,
+) -> torch.Tensor:
+    """
+    Compute attention with NF4 (NormalFloat 4-bit) quantized Key/Value tensors.
+
+    This function provides 4x memory reduction for K/V cache using NF4 quantization
+    with a 16-value codebook optimized for normally distributed weights.
+
+    NF4 packs two 4-bit values per byte, so the key/value tensors have shape
+    (B, H, N, D//2) where D is the original head dimension.
+
+    Args:
+        query: Query tensor (B, H, N, D) in FP16/BF16/FP32
+        key: Quantized Key tensor (B, H, N, D//2) as uint8 (packed NF4)
+        value: Quantized Value tensor (B, H, N, D//2) as uint8 (packed NF4)
+        k_scale: Per-head scale for K (B, H) or (H,)
+        v_scale: Per-head scale for V (B, H) or (H,)
+        is_causal: If True, applies causal masking
+        attn_mask: Optional boolean attention mask
+        window_size: Sliding window size (0 = full attention)
+        scale: Softmax scale factor. If None, uses 1/sqrt(head_dim)
+
+    Returns:
+        Output tensor of shape (B, H, N, D)
+
+    Example:
+        >>> k_q, v_q, k_s, v_s = quantize_kv_nf4(key, value)
+        >>> out = flash_attention_nf4(query, k_q, v_q, k_s, v_s)
+    """
+    if not _HAS_MFA:
+        raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
+
+    # Apply custom scale by pre-scaling Q
+    # Kernel uses 1/sqrt(D), so we adjust Q to achieve desired scale
+    if scale is not None:
+        head_dim = query.shape[-1]
+        default_scale = 1.0 / math.sqrt(head_dim)
+        if abs(scale - default_scale) > 1e-9:
+            scale_factor = scale / default_scale
+            query = query * scale_factor
+
+    return _C.forward_quantized(
+        query, key, value, k_scale, v_scale,
+        QUANT_NF4, is_causal, attn_mask, window_size
+    )
+
+
 def flash_attention_quantized(
     query: torch.Tensor,
     key: torch.Tensor,
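The "4x memory reduction" claimed in the NF4 docstrings follows directly from the packing: two 4-bit codes per byte plus a handful of float32 per-head scales. Illustrative arithmetic with made-up shapes (not a measurement from the package):

```python
# K/V cache for one layer, B=1, H=32, N=8192, D=128
B, H, N, D = 1, 32, 8192, 128

fp16_bytes = 2 * (B * H * N * D) * 2       # K and V, 2 bytes per element
nf4_bytes = 2 * (B * H * N * D // 2)       # packed uint8, two values per byte
nf4_bytes += 2 * (B * H) * 4               # per-head float32 scales

print(fp16_bytes / 2**20)  # 128.0 MiB
print(nf4_bytes / 2**20)   # ~32.0 MiB, i.e. ~4x smaller
```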
@@ -640,6 +870,7 @@ def flash_attention_quantized(
     is_causal: bool = False,
     attn_mask: Optional[torch.Tensor] = None,
     window_size: int = 0,
+    scale: Optional[float] = None,
 ) -> torch.Tensor:
     """
     Generic quantized attention with configurable quantization type.
@@ -661,6 +892,7 @@ def flash_attention_quantized(
         is_causal: If True, applies causal masking
         attn_mask: Optional boolean attention mask
         window_size: Sliding window size (0 = full attention)
+        scale: Softmax scale factor. If None, uses 1/sqrt(head_dim)
 
     Returns:
         Output tensor of shape (B, H, N, D)
@@ -668,6 +900,14 @@ def flash_attention_quantized(
     if not _HAS_MFA:
         raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
 
+    # Apply custom scale by pre-scaling Q
+    if scale is not None:
+        head_dim = query.shape[-1]
+        default_scale = 1.0 / math.sqrt(head_dim)
+        if abs(scale - default_scale) > 1e-9:
+            scale_factor = scale / default_scale
+            query = query * scale_factor
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         quant_type, is_causal, attn_mask, window_size

{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn/csrc/mps_flash_attn.mm

@@ -25,11 +25,16 @@ typedef void* (*mfa_create_kernel_v2_fn)(int32_t, int32_t, int32_t, bool, bool,
 typedef void* (*mfa_create_kernel_v3_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool, uint32_t);
 typedef void* (*mfa_create_kernel_v4_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool, uint32_t, uint16_t);
 typedef void* (*mfa_create_kernel_v5_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool, uint32_t, uint16_t, bool);
+typedef void* (*mfa_create_kernel_v6_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool, uint32_t, uint16_t, bool, bool, uint32_t, uint32_t);
+typedef void* (*mfa_create_kernel_v7_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool, uint32_t, uint16_t, bool, bool, uint32_t, uint32_t, uint32_t);
 // New zero-sync encode functions that take PyTorch's command encoder
 // Added mask_ptr and mask_offset parameters
 typedef bool (*mfa_forward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*,
                                       int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                       int32_t, int32_t);
+typedef bool (*mfa_forward_encode_bias_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                           int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                           int32_t, int32_t);
 typedef bool (*mfa_forward_encode_quantized_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
                                                 int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                                 int32_t, int32_t);
@@ -52,7 +57,10 @@ static mfa_create_kernel_v2_fn g_mfa_create_kernel_v2 = nullptr;
 static mfa_create_kernel_v3_fn g_mfa_create_kernel_v3 = nullptr;
 static mfa_create_kernel_v4_fn g_mfa_create_kernel_v4 = nullptr;
 static mfa_create_kernel_v5_fn g_mfa_create_kernel_v5 = nullptr;
+static mfa_create_kernel_v6_fn g_mfa_create_kernel_v6 = nullptr;
+static mfa_create_kernel_v7_fn g_mfa_create_kernel_v7 = nullptr;
 static mfa_forward_encode_fn g_mfa_forward_encode = nullptr;
+static mfa_forward_encode_bias_fn g_mfa_forward_encode_bias = nullptr;
 static mfa_forward_encode_quantized_fn g_mfa_forward_encode_quantized = nullptr;
 static mfa_backward_encode_fn g_mfa_backward_encode = nullptr;
 static mfa_forward_fn g_mfa_forward = nullptr;
@@ -115,8 +123,11 @@ static bool load_mfa_bridge() {
     g_mfa_create_kernel_v3 = (mfa_create_kernel_v3_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v3");
     g_mfa_create_kernel_v4 = (mfa_create_kernel_v4_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v4");
     g_mfa_create_kernel_v5 = (mfa_create_kernel_v5_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v5");
+    g_mfa_create_kernel_v6 = (mfa_create_kernel_v6_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v6");
+    g_mfa_create_kernel_v7 = (mfa_create_kernel_v7_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v7");
     g_mfa_forward_encode_quantized = (mfa_forward_encode_quantized_fn)dlsym(g_dylib_handle, "mfa_forward_encode_quantized");
     g_mfa_forward_encode = (mfa_forward_encode_fn)dlsym(g_dylib_handle, "mfa_forward_encode");
+    g_mfa_forward_encode_bias = (mfa_forward_encode_bias_fn)dlsym(g_dylib_handle, "mfa_forward_encode_bias");
     g_mfa_backward_encode = (mfa_backward_encode_fn)dlsym(g_dylib_handle, "mfa_backward_encode");
     g_mfa_forward = (mfa_forward_fn)dlsym(g_dylib_handle, "mfa_forward");
     g_mfa_backward = (mfa_backward_fn)dlsym(g_dylib_handle, "mfa_backward");
@@ -169,6 +180,10 @@ struct KernelCacheKey {
     uint32_t window_size;
     uint16_t quantized_kv;  // 0 = none, 3 = FP8_E4M3, 4 = FP8_E5M2, 5 = INT8, 6 = NF4
     bool bf16_backward;     // true = use BF16 for backward intermediates (faster)
+    bool has_attn_bias;     // true = additive attention bias
+    uint32_t bias_batch_stride;
+    uint32_t bias_head_stride;
+    uint32_t bias_repeat_count;
 
     bool operator==(const KernelCacheKey& other) const {
         return seq_len_q == other.seq_len_q &&
@@ -181,7 +196,11 @@ struct KernelCacheKey {
                use_bf16 == other.use_bf16 &&
                window_size == other.window_size &&
                quantized_kv == other.quantized_kv &&
-               bf16_backward == other.bf16_backward;
+               bf16_backward == other.bf16_backward &&
+               has_attn_bias == other.has_attn_bias &&
+               bias_batch_stride == other.bias_batch_stride &&
+               bias_head_stride == other.bias_head_stride &&
+               bias_repeat_count == other.bias_repeat_count;
     }
 };
 
@@ -197,14 +216,18 @@ struct KernelCacheKeyHash {
                (std::hash<bool>()(k.use_bf16) << 7) ^
                (std::hash<uint32_t>()(k.window_size) << 8) ^
                (std::hash<uint16_t>()(k.quantized_kv) << 9) ^
-               (std::hash<bool>()(k.bf16_backward) << 10);
+               (std::hash<bool>()(k.bf16_backward) << 10) ^
+               (std::hash<bool>()(k.has_attn_bias) << 11) ^
+               (std::hash<uint32_t>()(k.bias_batch_stride) << 12) ^
+               (std::hash<uint32_t>()(k.bias_head_stride) << 13) ^
+               (std::hash<uint32_t>()(k.bias_repeat_count) << 14);
     }
 };
 
 static std::unordered_map<KernelCacheKey, void*, KernelCacheKeyHash> g_kernel_cache;
 
-static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask, bool use_bf16 = false, uint32_t window_size = 0, uint16_t quantized_kv = 0, bool bf16_backward = false) {
-    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask, use_bf16, window_size, quantized_kv, bf16_backward};
+static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask, bool use_bf16 = false, uint32_t window_size = 0, uint16_t quantized_kv = 0, bool bf16_backward = false, bool has_attn_bias = false, uint32_t bias_batch_stride = 0, uint32_t bias_head_stride = 0, uint32_t bias_repeat_count = 0) {
+    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask, use_bf16, window_size, quantized_kv, bf16_backward, has_attn_bias, bias_batch_stride, bias_head_stride, bias_repeat_count};
 
     auto it = g_kernel_cache.find(key);
     if (it != g_kernel_cache.end()) {
@@ -212,7 +235,44 @@ static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_di
     }
 
     void* kernel = nullptr;
-    if (g_mfa_create_kernel_v5) {
+    if (has_attn_bias && g_mfa_create_kernel_v7) {
+        // Use v7 API with additive attention bias and repeat support
+        kernel = g_mfa_create_kernel_v7(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask,
+            use_bf16,
+            window_size,
+            quantized_kv,
+            bf16_backward,
+            has_attn_bias,
+            bias_batch_stride,
+            bias_head_stride,
+            bias_repeat_count
+        );
+    } else if (has_attn_bias && g_mfa_create_kernel_v6) {
+        // Use v6 API with additive attention bias (no repeat)
+        kernel = g_mfa_create_kernel_v6(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask,
+            use_bf16,
+            window_size,
+            quantized_kv,
+            bf16_backward,
+            has_attn_bias,
+            bias_batch_stride,
+            bias_head_stride
+        );
+    } else if (g_mfa_create_kernel_v5) {
         // Use v5 API with all features including mixed-precision backward
         kernel = g_mfa_create_kernel_v5(
             static_cast<int32_t>(seq_q),
@@ -489,6 +549,168 @@ at::Tensor mps_flash_attention_forward(
     return output;
 }
 
+// ============================================================================
+// Flash Attention Forward with Additive Bias
+// ============================================================================
+
+at::Tensor mps_flash_attention_forward_with_bias(
+    const at::Tensor& query,      // (B, H, N, D)
+    const at::Tensor& key,        // (B, H, N, D)
+    const at::Tensor& value,      // (B, H, N, D)
+    const at::Tensor& attn_bias,  // (B, H, N_q, N_kv) or (1, H, N_q, N_kv) or (H, N_q, N_kv)
+    bool is_causal,
+    int64_t window_size,
+    int64_t bias_repeat_count     // >0 means bias repeats every N batches (for window attention)
+) {
+    // Initialize MFA on first call
+    if (!g_initialized) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized = true;
+    }
+
+    // Check that v6/v7 API is available
+    TORCH_CHECK(g_mfa_create_kernel_v6 || g_mfa_create_kernel_v7,
+                "Attention bias requires MFA v6+ API (update libMFABridge.dylib)");
+    TORCH_CHECK(g_mfa_forward_encode_bias,
+                "Attention bias requires mfa_forward_encode_bias");
+
+    // Validate inputs
+    TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
+    TORCH_CHECK(key.dim() == 4, "Key must be 4D (B, H, N, D)");
+    TORCH_CHECK(value.dim() == 4, "Value must be 4D (B, H, N, D)");
+    TORCH_CHECK(query.device().is_mps(), "Query must be on MPS device");
+    TORCH_CHECK(key.device().is_mps(), "Key must be on MPS device");
+    TORCH_CHECK(value.device().is_mps(), "Value must be on MPS device");
+    TORCH_CHECK(attn_bias.device().is_mps(), "Attention bias must be on MPS device");
+
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t seq_len_q = query.size(2);
+    const int64_t head_dim = query.size(3);
+    const int64_t seq_len_kv = key.size(2);
+
+    // Determine bias strides for broadcasting
+    // Bias can be: (B, H, N_q, N_kv), (1, H, N_q, N_kv), or (H, N_q, N_kv)
+    uint32_t bias_batch_stride = 0;
+    uint32_t bias_head_stride = static_cast<uint32_t>(seq_len_q * seq_len_kv);
+
+    if (attn_bias.dim() == 4) {
+        if (attn_bias.size(0) > 1) {
+            bias_batch_stride = static_cast<uint32_t>(attn_bias.size(1) * seq_len_q * seq_len_kv);
+        }
+        if (attn_bias.size(1) == 1) {
+            bias_head_stride = 0;  // Broadcast across heads
+        }
+    } else if (attn_bias.dim() == 3) {
+        // (H, N_q, N_kv) - broadcast across batch
+        bias_batch_stride = 0;
+        if (attn_bias.size(0) == 1) {
+            bias_head_stride = 0;
+        }
+    }
+
+    // Determine precision
+    bool is_bfloat16 = (query.scalar_type() == at::kBFloat16);
+    bool is_fp16 = (query.scalar_type() == at::kHalf);
+    bool use_bf16_kernel = is_bfloat16 && g_mfa_create_kernel_v2;
+    bool low_precision = is_fp16;
+    bool low_precision_outputs = is_fp16 || use_bf16_kernel;
+
+    // Make inputs contiguous
+    auto q = query.contiguous();
+    auto k = key.contiguous();
+    auto v = value.contiguous();
+    auto bias = attn_bias.contiguous();
+
+    // For BF16 without native kernel, convert to FP32
+    if (is_bfloat16 && !use_bf16_kernel) {
+        q = q.to(at::kFloat);
+        k = k.to(at::kFloat);
+        v = v.to(at::kFloat);
+        bias = bias.to(at::kFloat);
+    }
+
+    // Allocate output
+    at::Tensor output;
+    if (use_bf16_kernel) {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kBFloat16));
+    } else if (low_precision_outputs) {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kHalf));
+    } else {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kFloat));
+    }
+
+    // Allocate logsumexp (always fp32)
+    auto logsumexp = at::empty({batch_size, num_heads, seq_len_q},
+                               query.options().dtype(at::kFloat));
+
+    // Get or create kernel with bias support
+    void* kernel = get_or_create_kernel(
+        seq_len_q, seq_len_kv, head_dim,
+        low_precision, low_precision_outputs, is_causal, false,  // has_mask = false
+        use_bf16_kernel,
+        static_cast<uint32_t>(window_size > 0 ? window_size : 0),
+        0,      // no quantization
+        false,  // bf16_backward
+        true,   // has_attn_bias
+        bias_batch_stride,
+        bias_head_stride,
+        static_cast<uint32_t>(bias_repeat_count > 0 ? bias_repeat_count : 0)
+    );
+
+    // Get Metal buffers
+    auto q_info = getBufferInfo(q);
+    auto k_info = getBufferInfo(k);
+    auto v_info = getBufferInfo(v);
+    auto o_info = getBufferInfo(output);
+    auto l_info = getBufferInfo(logsumexp);
+    auto bias_info = getBufferInfo(bias);
+
+    // Execute using bias forward encode
+    @autoreleasepool {
+        auto stream = at::mps::getCurrentMPSStream();
+        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();
+
+        bool success = g_mfa_forward_encode_bias(
+            kernel,
+            (__bridge void*)encoder,
+            (__bridge void*)q_info.buffer,
+            (__bridge void*)k_info.buffer,
+            (__bridge void*)v_info.buffer,
+            (__bridge void*)o_info.buffer,
+            (__bridge void*)l_info.buffer,
+            nullptr,  // no boolean mask
+            (__bridge void*)bias_info.buffer,
+            q_info.byte_offset,
+            k_info.byte_offset,
+            v_info.byte_offset,
+            o_info.byte_offset,
+            l_info.byte_offset,
+            0,  // mask_offset
+            bias_info.byte_offset,
+            static_cast<int32_t>(batch_size),
+            static_cast<int32_t>(num_heads)
+        );
+
+        if (!success) {
+            throw std::runtime_error("MFA forward with bias failed");
+        }
+    }
+
+    // Convert output back to BF16 if needed
+    if (is_bfloat16 && !use_bf16_kernel) {
+        output = output.to(at::kBFloat16);
+    }
+
+    return output;
+}
+
 // ============================================================================
 // Quantized Flash Attention Forward (FP8, INT8, NF4)
 // ============================================================================
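The stride computation in this hunk reduces the supported bias shapes to two element strides plus an optional repeat count that get baked into the kernel. As a reading aid only, the Python restatement below spells out which bias slice those parameters are meant to select for a given (batch, head) pair; the actual lookup happens inside the Metal kernel via `mfa_forward_encode_bias`, so treat this as documentation of the intent, not package code.

```python
def bias_slice_for(batch_idx, head_idx, bias, bias_repeat_count=0):
    """Illustrative: the (N_q, N_kv) bias slice addressed for one batch/head."""
    if bias.dim() == 3:                        # (H, N_q, N_kv): shared across the batch
        return bias[0 if bias.shape[0] == 1 else head_idx]
    # 4D: (B, H, N_q, N_kv) or (1, H, N_q, N_kv)
    if bias.shape[0] == 1:                     # bias_batch_stride == 0
        b = 0
    elif bias_repeat_count > 0:                # repeating pattern, e.g. Swin windows
        b = batch_idx % bias_repeat_count
    else:                                      # full per-batch bias
        b = batch_idx
    h = 0 if bias.shape[1] == 1 else head_idx  # bias_head_stride == 0 broadcasts over heads
    return bias[b, h]
```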
@@ -550,6 +772,9 @@ at::Tensor mps_flash_attention_forward_quantized(
     TORCH_CHECK(v_scale.scalar_type() == at::kFloat,
                 "V scale must be float32");
 
+    // For NF4, K/V have packed head dimension (D//2) since 2 values per byte
+    bool is_nf4 = (quant_type == static_cast<int64_t>(QuantizationType::NF4));
+
     const int64_t batch_size = query.size(0);
     const int64_t num_heads_q = query.size(1);
     const int64_t num_heads_kv = key.size(1);
@@ -557,10 +782,13 @@ at::Tensor mps_flash_attention_forward_quantized(
     const int64_t head_dim = query.size(3);
     const int64_t seq_len_kv = key.size(2);
 
+    // For NF4, expected K/V head dim is D//2 (packed)
+    int64_t expected_kv_head_dim = is_nf4 ? (head_dim / 2) : head_dim;
+
     TORCH_CHECK(key.size(0) == batch_size && value.size(0) == batch_size,
                 "Batch size mismatch");
-    TORCH_CHECK(key.size(3) == head_dim && value.size(3) == head_dim,
-                "Head dimension mismatch");
+    TORCH_CHECK(key.size(3) == expected_kv_head_dim && value.size(3) == expected_kv_head_dim,
+                is_nf4 ? "Head dimension mismatch for NF4 (expected D//2)" : "Head dimension mismatch");
     TORCH_CHECK(key.size(1) == value.size(1),
                 "K and V must have same number of heads");
 
@@ -970,6 +1198,17 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           py::arg("window_size") = 0,
           py::arg("bf16_backward") = false);
 
+    // Forward with additive attention bias (e.g., relative position encoding)
+    m.def("forward_with_bias", &mps_flash_attention_forward_with_bias,
+          "Flash Attention forward with additive attention bias",
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("attn_bias"),
+          py::arg("is_causal") = false,
+          py::arg("window_size") = 0,
+          py::arg("bias_repeat_count") = 0);
+
     // Quantized attention (forward only - no gradients through quantized weights)
     m.def("forward_quantized", &mps_flash_attention_forward_quantized,
           "Quantized Flash Attention forward (FP8/INT8/NF4 K/V)",

{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/mps_flash_attn.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.1
+Version: 0.2.5
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -201,6 +201,13 @@ Python API (mps_flash_attn)
 - Python 3.10+
 - PyTorch 2.0+
 
+## TODO / Future Optimizations
+
+- [ ] **Batched kernel dispatch** - Currently dispatches B×H separate kernels per attention call. Should use 3D grid to handle all batch/heads in one dispatch (major perf win for small sequences like Swin Transformer windows)
+- [ ] **Fused QKV projection + attention** - Single kernel from input to output, avoid intermediate buffers
+- [ ] **Pre-scaled bias option** - Allow passing pre-scaled bias to avoid per-call scaling overhead
+- [ ] **LoRA fusion** - Fuse adapter weights into attention computation
+
 ## Credits
 
 - [metal-flash-attention](https://github.com/philipturner/metal-flash-attention) by Philip Turner

{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mps-flash-attn"
-version = "0.2.1"
+version = "0.2.5"
 description = "Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)"
 readme = "README.md"
 license = "MIT"
{mps_flash_attn-0.2.1 → mps_flash_attn-0.2.5}/tests/test_mfa_v2.py

@@ -342,6 +342,97 @@ class TestQuantized:
         assert output.shape == (B, H, N, D)
         assert not torch.isnan(output).any()
 
+    def test_quantize_nf4(self, mfa):
+        """Test NF4 quantization helper.
+
+        NF4 packs 2 values per byte along head dimension, so output shape is (B,H,N,D//2).
+        """
+        B, H, N, D = 1, 4, 128, 64
+        dtype = torch.float16
+
+        k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+        k_q, v_q, k_s, v_s = mfa.quantize_kv_nf4(k, v)
+
+        assert k_q.dtype == torch.uint8
+        assert v_q.dtype == torch.uint8
+        assert k_s.dtype == torch.float32
+        assert v_s.dtype == torch.float32
+        # NF4 packs 2 values per byte, so D dimension is halved
+        assert k_q.shape == (B, H, N, D // 2)
+        assert v_q.shape == (B, H, N, D // 2)
+
+    def test_flash_attention_nf4(self, mfa):
+        """Test NF4 quantized attention forward.
+
+        NF4 uses a 16-value codebook for 4-bit quantization, packing 2 values per byte.
+        This provides 4x memory reduction for K/V cache with acceptable accuracy loss.
+        """
+        B, H, N, D = 2, 8, 128, 64
+        dtype = torch.float16
+
+        torch.manual_seed(42)
+        q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+        k_q, v_q, k_s, v_s = mfa.quantize_kv_nf4(k, v)
+        output = mfa.flash_attention_nf4(q, k_q, v_q, k_s, v_s)
+
+        assert output.shape == (B, H, N, D)
+        assert not torch.isnan(output).any()
+        assert not torch.isinf(output).any()
+
+    def test_flash_attention_nf4_correctness(self, mfa):
+        """Test NF4 attention correctness against reference.
+
+        NF4 is 4-bit so we expect larger error than FP8/INT8, but output
+        should still be in a reasonable range (max diff < 0.5).
+        """
+        B, H, N, D = 1, 4, 64, 64
+        dtype = torch.float16
+
+        torch.manual_seed(42)
+        q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+        # Reference
+        ref = reference_attention(q, k, v)
+
+        # NF4
+        k_q, v_q, k_s, v_s = mfa.quantize_kv_nf4(k, v)
+        output = mfa.flash_attention_nf4(q, k_q, v_q, k_s, v_s)
+
+        max_diff = (ref - output).abs().max().item()
+        mean_diff = (ref - output).abs().mean().item()
+
+        # 4-bit quantization has larger error, but should be bounded
+        assert max_diff < 0.5, f"NF4 max diff {max_diff} exceeds threshold 0.5"
+        assert mean_diff < 0.1, f"NF4 mean diff {mean_diff} exceeds threshold 0.1"
+
+    @pytest.mark.parametrize("config", [
+        (1, 1, 32, 32),     # Small
+        (1, 8, 128, 64),    # Medium
+        (2, 16, 256, 64),   # Large
+        (1, 8, 512, 128),   # Large head dim
+    ])
+    def test_flash_attention_nf4_various_sizes(self, mfa, config):
+        """Test NF4 attention with various tensor sizes."""
+        B, H, N, D = config
+        dtype = torch.float16
+
+        q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+        v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+        k_q, v_q, k_s, v_s = mfa.quantize_kv_nf4(k, v)
+        output = mfa.flash_attention_nf4(q, k_q, v_q, k_s, v_s)
+
+        assert output.shape == (B, H, N, D)
+        assert not torch.isnan(output).any()
+
 
 # =============================================================================
 # Chunked/Streaming Attention Tests
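The new tests exercise the NF4 path but not the new bias path. A companion test in the same style is sketched below; it is not part of the released test file, and the `mfa` fixture plus the fp16 tolerance are assumptions. It compares `flash_attention_with_bias` against PyTorch SDPA using the sqrt(D) pre-scaling rule from the docstring.

```python
import math
import torch
import torch.nn.functional as F

class TestAttentionBias:
    def test_flash_attention_with_bias_matches_sdpa(self, mfa):
        B, H, N, D = 2, 4, 64, 64
        torch.manual_seed(0)
        q = torch.randn(B, H, N, D, device='mps', dtype=torch.float16)
        k = torch.randn(B, H, N, D, device='mps', dtype=torch.float16)
        v = torch.randn(B, H, N, D, device='mps', dtype=torch.float16)
        bias = torch.randn(1, H, N, N, device='mps', dtype=torch.float16)

        # SDPA adds attn_mask to already-scaled scores; MFA adds the bias
        # before scaling, so express the same bias by multiplying by sqrt(D).
        out = mfa.flash_attention_with_bias(q, k, v, bias * math.sqrt(D))
        ref = F.scaled_dot_product_attention(
            q.float(), k.float(), v.float(), attn_mask=bias.float()
        )

        assert out.shape == (B, H, N, D)
        max_diff = (out.float() - ref).abs().max().item()
        assert max_diff < 2e-2, f"bias attention max diff {max_diff}"  # tolerance is a guess
```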