mps-flash-attn 0.1.13.tar.gz → 0.1.15.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mps-flash-attn has been flagged as potentially problematic.

Files changed (39)
  1. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/PKG-INFO +1 -1
  2. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/__init__.py +4 -3
  3. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn.egg-info/PKG-INFO +1 -1
  4. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn.egg-info/SOURCES.txt +2 -1
  5. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/pyproject.toml +1 -1
  6. mps_flash_attn-0.1.15/tests/test_flash_attn.py +255 -0
  7. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/LICENSE +0 -0
  8. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/README.md +0 -0
  9. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/csrc/mps_flash_attn.mm +0 -0
  10. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  11. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  12. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  13. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  14. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  15. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  16. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  17. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  18. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  19. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  20. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  21. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  22. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  23. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  24. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  25. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  26. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  27. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  28. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  29. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  30. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  31. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  32. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/kernels/manifest.json +0 -0
  33. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  34. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
  35. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn.egg-info/requires.txt +0 -0
  36. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/mps_flash_attn.egg-info/top_level.txt +0 -0
  37. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/setup.cfg +0 -0
  38. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/setup.py +0 -0
  39. {mps_flash_attn-0.1.13 → mps_flash_attn-0.1.15}/tests/test_attention.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.13
+Version: 0.1.15
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
mps_flash_attn/__init__.py
@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.1.13"
+__version__ = "0.1.14"
 
 import torch
 from typing import Optional
@@ -200,12 +200,13 @@ def replace_sdpa():
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None):
         # Use MFA for MPS tensors without dropout
-        # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
+        # Only use MFA for seq_len >= 1536 where it outperforms PyTorch's math backend
         # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
+        # Benchmark (BF16, heads=30, head_dim=128): crossover is ~1200-1500
         if (query.device.type == 'mps' and
             dropout_p == 0.0 and
             _HAS_MFA and
-            query.shape[2] >= 1024):
+            query.shape[2] >= 1536):
             try:
                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
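For readers skimming the diff: the hunk above only moves the sequence-length cutoff at which the patched SDPA hands work to the Metal Flash Attention kernel (1024 → 1536). A minimal sketch of that dispatch pattern follows; it is illustrative only, and the `mfa_attention` parameter is a hypothetical stand-in for the package's internal kernel entry point, which in the real code lives inside `replace_sdpa()` as shown in the hunk.

import torch
import torch.nn.functional as F

# Crossover point vs. PyTorch's math backend, per the comment in the hunk above.
_MFA_MIN_SEQ_LEN = 1536

def dispatch_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                  is_causal=False, mfa_attention=None):
    """Illustrative router: use the flash kernel only where it is expected to win."""
    # query is laid out as (batch, heads, seq_len, head_dim), so shape[2] is the sequence length.
    if (mfa_attention is not None
            and query.device.type == 'mps'
            and dropout_p == 0.0
            and query.shape[2] >= _MFA_MIN_SEQ_LEN):
        try:
            return mfa_attention(query, key, value, attn_mask=attn_mask, is_causal=is_causal)
        except Exception:
            pass  # fall back to the built-in implementation on any kernel failure
    # Short sequences (or unsupported cases) stay on PyTorch's own SDPA.
    return F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask,
                                          dropout_p=dropout_p, is_causal=is_causal)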
mps_flash_attn.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.13
+Version: 0.1.15
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
mps_flash_attn.egg-info/SOURCES.txt
@@ -33,4 +33,5 @@ mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e
 mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib
 mps_flash_attn/kernels/manifest.json
 mps_flash_attn/lib/libMFABridge.dylib
-tests/test_attention.py
+tests/test_attention.py
+tests/test_flash_attn.py
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mps-flash-attn"
-version = "0.1.13"
+version = "0.1.15"
 description = "Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)"
 readme = "README.md"
 license = "MIT"
tests/test_flash_attn.py (new file)
@@ -0,0 +1,255 @@
+"""Test mps-flash-attn: FP32, FP16, BF16 support with benchmarks"""
+
+import torch
+import torch.nn.functional as F
+import math
+import time
+
+# Build and load the extension
+print("Loading mps_flash_attn...")
+from mps_flash_attn import flash_attention, is_available
+
+print(f"MPS available: {is_available()}")
+
+def reference_attention(q, k, v, is_causal=False, attn_mask=None):
+    """Reference implementation using PyTorch ops."""
+    scale = 1.0 / math.sqrt(q.size(-1))
+    attn = torch.matmul(q.float(), k.float().transpose(-2, -1)) * scale
+
+    if is_causal:
+        seq_len = q.size(-2)
+        causal_mask = torch.triu(torch.ones(seq_len, seq_len, device=q.device, dtype=torch.bool), diagonal=1)
+        attn = attn.masked_fill(causal_mask, float('-inf'))
+
+    if attn_mask is not None:
+        attn = attn.masked_fill(attn_mask.bool(), float('-inf'))
+
+    attn = F.softmax(attn, dim=-1)
+    out = torch.matmul(attn, v.float())
+    return out.to(q.dtype)
+
+def test_forward(dtype, name):
+    """Test forward pass."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 2, 8, 128, 64
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+    output = flash_attention(q, k, v)
+
+    has_nan = torch.isnan(output).any().item()
+    has_inf = torch.isinf(output).any().item()
+    ok = not has_nan and not has_inf
+
+    shape_ok = output.shape == (B, H, N, D)
+
+    print(f" {name} forward: shape={output.shape}, dtype={output.dtype}, ok={ok and shape_ok}")
+    return ok and shape_ok
+
+def test_backward(dtype, name):
+    """Test backward pass."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 2, 4, 64, 32
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype, requires_grad=True)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype, requires_grad=True)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype, requires_grad=True)
+
+    output = flash_attention(q, k, v)
+    loss = output.sum()
+    loss.backward()
+
+    grad_q_ok = q.grad is not None and not torch.isnan(q.grad).any()
+    grad_k_ok = k.grad is not None and not torch.isnan(k.grad).any()
+    grad_v_ok = v.grad is not None and not torch.isnan(v.grad).any()
+    ok = grad_q_ok and grad_k_ok and grad_v_ok
+
+    print(f" {name} backward: q={grad_q_ok}, k={grad_k_ok}, v={grad_v_ok}")
+    return ok
+
+def test_causal(dtype, name):
+    """Test causal attention."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 2, 8, 128, 64
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+    output = flash_attention(q, k, v, is_causal=True)
+
+    ok = not torch.isnan(output).any() and not torch.isinf(output).any()
+    print(f" {name} causal: shape={output.shape}, ok={ok}")
+    return ok
+
+def test_gqa(dtype, name):
+    """Test Grouped Query Attention (GQA)."""
+    torch.manual_seed(42)
+
+    B, H_q, H_kv, N, D = 2, 8, 2, 128, 64  # 8 query heads, 2 KV heads
+
+    q = torch.randn(B, H_q, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H_kv, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H_kv, N, D, device='mps', dtype=dtype)
+
+    output = flash_attention(q, k, v)
+
+    ok = not torch.isnan(output).any() and not torch.isinf(output).any()
+    shape_ok = output.shape == (B, H_q, N, D)
+    print(f" {name} GQA ({H_q}q/{H_kv}kv): shape={output.shape}, ok={ok and shape_ok}")
+    return ok and shape_ok
+
+def test_correctness(dtype, name):
+    """Test correctness against reference implementation."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 1, 4, 64, 32
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+    output_flash = flash_attention(q, k, v)
+    output_ref = reference_attention(q, k, v)
+
+    diff = (output_flash.float() - output_ref.float()).abs()
+    max_diff = diff.max().item()
+    rel_diff = (diff / (output_ref.float().abs() + 1e-6)).mean().item() * 100
+
+    # Tolerance depends on dtype
+    if dtype == torch.float32:
+        ok = rel_diff < 1.0
+    elif dtype == torch.float16:
+        ok = rel_diff < 2.0
+    else:  # BF16
+        ok = rel_diff < 5.0
+
+    print(f" {name} correctness: max_diff={max_diff:.6f}, rel_diff={rel_diff:.2f}%, ok={ok}")
+    return ok
+
+def compare_fp32_bf16():
+    """Compare FP32 vs BF16 outputs."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 1, 4, 64, 32
+
+    q_fp32 = torch.randn(B, H, N, D, device='mps', dtype=torch.float32)
+    k_fp32 = torch.randn(B, H, N, D, device='mps', dtype=torch.float32)
+    v_fp32 = torch.randn(B, H, N, D, device='mps', dtype=torch.float32)
+
+    output_fp32 = flash_attention(q_fp32, k_fp32, v_fp32)
+
+    # BF16
+    q_bf16 = q_fp32.to(torch.bfloat16)
+    k_bf16 = k_fp32.to(torch.bfloat16)
+    v_bf16 = v_fp32.to(torch.bfloat16)
+
+    output_bf16 = flash_attention(q_bf16, k_bf16, v_bf16)
+
+    diff = (output_fp32 - output_bf16.to(torch.float32)).abs()
+    max_diff = diff.max().item()
+    rel_diff = (diff / (output_fp32.abs() + 1e-6)).mean().item() * 100
+
+    # Attention has softmax which amplifies small differences, 10% tolerance is acceptable
+    ok = rel_diff < 10.0
+    print(f" FP32 vs BF16: max_diff={max_diff:.6f}, rel_diff={rel_diff:.2f}%, ok={ok}")
+    return ok
+
+def test_large_sequence():
+    """Test with large sequence length (memory efficiency test)."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 1, 8, 4096, 64  # Large sequence
+    dtype = torch.float16
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+    # This should NOT OOM with flash attention
+    output = flash_attention(q, k, v)
+
+    ok = not torch.isnan(output).any() and not torch.isinf(output).any()
+    shape_ok = output.shape == (B, H, N, D)
+
+    print(f" Large seq (N={N}): shape={output.shape}, ok={ok and shape_ok}")
+    return ok and shape_ok
+
+def benchmark(dtype, name, warmup=5, runs=20):
+    """Benchmark forward pass."""
+    torch.manual_seed(42)
+
+    B, H, N, D = 4, 8, 512, 64
+
+    q = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    k = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+    v = torch.randn(B, H, N, D, device='mps', dtype=dtype)
+
+    for _ in range(warmup):
+        _ = flash_attention(q, k, v)
+    torch.mps.synchronize()
+
+    start = time.time()
+    for _ in range(runs):
+        _ = flash_attention(q, k, v)
+    torch.mps.synchronize()
+    elapsed = time.time() - start
+
+    ms = (elapsed / runs) * 1000
+    print(f" {name}: {ms:.2f} ms")
+    return ms
+
+if __name__ == "__main__":
+    print("\n" + "=" * 50)
+    print("Testing mps-flash-attn")
+    print("=" * 50)
+
+    all_ok = True
+
+    print("\n1. Forward pass:")
+    all_ok &= test_forward(torch.float32, "FP32")
+    all_ok &= test_forward(torch.float16, "FP16")
+    all_ok &= test_forward(torch.bfloat16, "BF16")
+
+    print("\n2. Backward pass:")
+    all_ok &= test_backward(torch.float32, "FP32")
+    all_ok &= test_backward(torch.float16, "FP16")
+    all_ok &= test_backward(torch.bfloat16, "BF16")
+
+    print("\n3. Causal attention:")
+    all_ok &= test_causal(torch.float32, "FP32")
+    all_ok &= test_causal(torch.float16, "FP16")
+    all_ok &= test_causal(torch.bfloat16, "BF16")
+
+    print("\n4. GQA (Grouped Query Attention):")
+    all_ok &= test_gqa(torch.float32, "FP32")
+    all_ok &= test_gqa(torch.float16, "FP16")
+    all_ok &= test_gqa(torch.bfloat16, "BF16")
+
+    print("\n5. Correctness vs reference:")
+    all_ok &= test_correctness(torch.float32, "FP32")
+    all_ok &= test_correctness(torch.float16, "FP16")
+    all_ok &= test_correctness(torch.bfloat16, "BF16")
+
+    print("\n6. FP32 vs BF16 comparison:")
+    all_ok &= compare_fp32_bf16()
+
+    print("\n7. Large sequence (memory efficiency):")
+    all_ok &= test_large_sequence()
+
+    print("\n8. Benchmarks (B=4, H=8, N=512, D=64):")
+    benchmark(torch.float32, "FP32")
+    benchmark(torch.float16, "FP16")
+    benchmark(torch.bfloat16, "BF16")
+
+    print("\n" + "=" * 50)
+    if all_ok:
+        print("ALL TESTS PASSED!")
+    else:
+        print("SOME TESTS FAILED!")
+    print("=" * 50)