mps-flash-attn 0.2.6.tar.gz → 0.2.8.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in a supported public registry. It is provided for informational purposes only.

Note: this release of mps-flash-attn has been flagged as potentially problematic.
Files changed (39)
  1. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/PKG-INFO +1 -1
  2. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/__init__.py +17 -8
  3. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn.egg-info/PKG-INFO +1 -1
  4. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/pyproject.toml +1 -1
  5. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/LICENSE +0 -0
  6. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/README.md +0 -0
  7. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/benchmark.py +0 -0
  8. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/csrc/mps_flash_attn.mm +0 -0
  9. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  10. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  11. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  12. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  13. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  14. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  15. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  16. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  17. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  18. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  19. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  20. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  21. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  22. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  23. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  24. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  25. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  26. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  27. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  28. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  29. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  30. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  31. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/kernels/manifest.json +0 -0
  32. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  33. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn.egg-info/SOURCES.txt +0 -0
  34. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
  35. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn.egg-info/requires.txt +0 -0
  36. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/mps_flash_attn.egg-info/top_level.txt +0 -0
  37. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/setup.cfg +0 -0
  38. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/setup.py +0 -0
  39. {mps_flash_attn-0.2.6 → mps_flash_attn-0.2.8}/tests/test_mfa_v2.py +0 -0
--- mps_flash_attn-0.2.6/PKG-INFO
+++ mps_flash_attn-0.2.8/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.6
+Version: 0.2.8
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
--- mps_flash_attn-0.2.6/mps_flash_attn/__init__.py
+++ mps_flash_attn-0.2.8/mps_flash_attn/__init__.py
@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.2.5"
+__version__ = "0.2.8"
 
 import torch
 from typing import Optional
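
Worth noting: in 0.2.6 the package metadata said 0.2.6 while __init__.py still carried __version__ = "0.2.5"; 0.2.8 brings the two back in sync. A minimal sanity check after upgrading (it assumes only that the package is installed):

    import importlib.metadata

    import mps_flash_attn

    # Both should print 0.2.8; in 0.2.6 they disagreed (0.2.6 vs 0.2.5).
    print(mps_flash_attn.__version__)
    print(importlib.metadata.version("mps-flash-attn"))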
--- mps_flash_attn-0.2.6/mps_flash_attn/__init__.py
+++ mps_flash_attn-0.2.8/mps_flash_attn/__init__.py
@@ -217,18 +217,27 @@ def replace_sdpa():
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None, enable_gqa=False, **kwargs):
         # Use MFA for MPS tensors without dropout
-        # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
+        # Only use MFA for seq_len >= 512 where it outperforms PyTorch's math backend
         # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
         # Benchmark results (B=1-4, H=8, D=64-128, fp16/bf16):
-        #   seq=512:  0.3-0.5x (MFA slower)
-        #   seq=1024: 1.1-2.0x (MFA faster)
-        #   seq=2048: 1.7-3.7x (MFA much faster)
-        #   seq=4096: 2.0-3.9x (MFA much faster)
+        #   seq=512:  1.2-1.6x (MFA faster)
+        #   seq=1024: 2.3-3.7x (MFA much faster)
+        #   seq=2048: 2.2-3.9x (MFA much faster)
+        #   seq=4096: 2.1-3.7x (MFA much faster)
         if (query.device.type == 'mps' and
                 dropout_p == 0.0 and
                 _HAS_MFA and
-                query.shape[2] >= 1024):
+                query.shape[2] >= 512):
             try:
+                # Handle GQA (Grouped Query Attention) - expand K/V heads to match Q heads
+                # Common in Llama 2/3, Mistral, Qwen, etc.
+                k, v = key, value
+                if enable_gqa and query.shape[1] != key.shape[1]:
+                    # Expand KV heads: (B, kv_heads, S, D) -> (B, q_heads, S, D)
+                    n_rep = query.shape[1] // key.shape[1]
+                    k = key.repeat_interleave(n_rep, dim=1)
+                    v = value.repeat_interleave(n_rep, dim=1)
+
                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
                 # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
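
The new GQA branch replicates each key/value head until the head count matches the query tensor, so grouped-query models work without any kernel changes. A minimal sketch of the same expansion on hypothetical Llama-style shapes (32 query heads sharing 8 KV heads; the shapes are illustrative, not taken from the package):

    import torch

    B, q_heads, kv_heads, S, D = 1, 32, 8, 1024, 64  # hypothetical shapes
    q = torch.randn(B, q_heads, S, D)
    k = torch.randn(B, kv_heads, S, D)
    v = torch.randn(B, kv_heads, S, D)

    # Same expansion as the patch: (B, kv_heads, S, D) -> (B, q_heads, S, D)
    n_rep = q.shape[1] // k.shape[1]  # 32 // 8 = 4
    k_expanded = k.repeat_interleave(n_rep, dim=1)
    v_expanded = v.repeat_interleave(n_rep, dim=1)

    assert k_expanded.shape == (B, q_heads, S, D)
    assert v_expanded.shape == (B, q_heads, S, D)

Note that repeat_interleave materializes the copies, so peak K/V memory grows by the repetition factor; that is the trade-off for supporting GQA without touching the Metal kernels.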
--- mps_flash_attn-0.2.6/mps_flash_attn/__init__.py
+++ mps_flash_attn-0.2.8/mps_flash_attn/__init__.py
@@ -242,7 +251,7 @@ def replace_sdpa():
                 # Convert: positions with large negative values -> True (masked)
                 # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
                 mfa_mask = attn_mask <= -1e3
-                return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
+                return flash_attention(query, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
             except Exception:
                 # Fall back to original on any error
                 pass
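
The mask conversion that precedes this call can be checked in isolation. A small self-contained sketch (shapes hypothetical) that builds a causal additive mask in the convention PyTorch's SDPA expects, then applies the same -1e3 threshold as the patch:

    import torch

    S = 4
    # Additive mask in SDPA convention: 0.0 = attend, -inf = masked.
    causal = torch.triu(torch.ones(S, S, dtype=torch.bool), diagonal=1)
    additive = torch.zeros(S, S).masked_fill(causal, float("-inf"))

    # Same conversion as the patch: large negative values -> True (masked).
    # The -1e3 threshold also catches finite stand-ins like -1000 or -10000.
    mfa_mask = additive <= -1e3
    assert torch.equal(mfa_mask, causal)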
--- mps_flash_attn-0.2.6/mps_flash_attn.egg-info/PKG-INFO
+++ mps_flash_attn-0.2.8/mps_flash_attn.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.6
+Version: 0.2.8
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
--- mps_flash_attn-0.2.6/pyproject.toml
+++ mps_flash_attn-0.2.8/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mps-flash-attn"
-version = "0.2.6"
+version = "0.2.8"
 description = "Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)"
 readme = "README.md"
 license = "MIT"
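
For downstream projects that want the lower 512-token threshold and the GQA handling, the natural move is to raise the minimum version. The constraint below is illustrative, not taken from the package's own docs:

    # requirements.txt
    mps-flash-attn>=0.2.8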