PyPI - mps-flash-attn - Versions diffs - 0.2.8__tar.gz → 0.3.0__tar.gz - Mend

mps-flash-attn 0.2.8.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mps-flash-attn might be problematic. Click here for more details.

Files changed (40)

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.8
+Version: 0.3.0
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/mps_flash_attn/__init__.py RENAMED Viewed

@@ -4,13 +4,42 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
-__version__ = "0.2.8"
+__version__ = "0.3.0"
+__all__ = [
+    # Core functions
+    "flash_attention",
+    "flash_attention_with_bias",
+    "flash_attention_chunked",
+    # Quantized attention
+    "flash_attention_fp8",
+    "flash_attention_int8",
+    "flash_attention_nf4",
+    "quantize_kv_fp8",
+    "quantize_kv_int8",
+    "quantize_kv_nf4",
+    # Utilities
+    "replace_sdpa",
+    "precompile",
+    "clear_cache",
+    "register_custom_op",
+    "is_available",
+    "convert_mask",
+    # Constants
+    "QUANT_FP8_E4M3",
+    "QUANT_FP8_E5M2",
+    "QUANT_INT8",
+    "QUANT_NF4",
+    # Version
+    "__version__",
+]
 import torch
-from typing import Optional
+from typing import Optional, Tuple
 import math
 import threading
 import os
+import warnings
 # Try to import the C++ extension
 try:
@@ -30,6 +59,20 @@ def is_available() -> bool:
     return _HAS_MFA and torch.backends.mps.is_available()
+def _ensure_contiguous(tensor: torch.Tensor, name: str) -> torch.Tensor:
+    """Ensure tensor is contiguous, with a debug warning if conversion needed."""
+    if tensor.is_contiguous():
+        return tensor
+    # Auto-convert with debug info
+    if os.environ.get("MFA_DEBUG", "0") == "1":
+        warnings.warn(
+            f"MFA: {name} tensor was not contiguous (stride={tensor.stride()}), "
+            f"auto-converting. For best performance, ensure inputs are contiguous.",
+            UserWarning
+        )
+    return tensor.contiguous()
 def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     """
     Convert attention mask to MFA's boolean format.
@@ -176,6 +219,20 @@ def flash_attention(
     if not torch.backends.mps.is_available():
         raise RuntimeError("MPS not available")
+    # Validate scale parameter
+    if scale is not None:
+        if scale <= 0:
+            raise ValueError(f"scale must be positive, got {scale}")
+        # Warn about extreme scale values that could cause numerical issues
+        default_scale = 1.0 / math.sqrt(query.shape[-1])
+        if scale < default_scale * 0.01 or scale > default_scale * 100:
+            warnings.warn(
+                f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                "this may cause numerical issues",
+                UserWarning,
+                stacklevel=2
+            )
     # Validate device
     if query.device.type != 'mps':
         raise ValueError("query must be on MPS device")
@@ -186,6 +243,31 @@ def flash_attention(
     if attn_mask is not None and attn_mask.device.type != 'mps':
         raise ValueError("attn_mask must be on MPS device")
+    # Ensure contiguous (auto-convert with debug warning)
+    query = _ensure_contiguous(query, "query")
+    key = _ensure_contiguous(key, "key")
+    value = _ensure_contiguous(value, "value")
+    if attn_mask is not None:
+        attn_mask = _ensure_contiguous(attn_mask, "attn_mask")
+        # Validate mask shape
+        B, H, N_q, D = query.shape
+        N_kv = key.shape[2]
+        if attn_mask.dim() != 4:
+            raise ValueError(f"attn_mask must be 4D (B, H, N_q, N_kv), got {attn_mask.dim()}D")
+        mb, mh, mq, mk = attn_mask.shape
+        if mq != N_q or mk != N_kv:
+            raise ValueError(
+                f"attn_mask shape mismatch: mask is ({mq}, {mk}) but expected ({N_q}, {N_kv})"
+            )
+        if mb != 1 and mb != B:
+            raise ValueError(
+                f"attn_mask batch size must be 1 or {B}, got {mb}"
+            )
+        if mh != 1 and mh != H:
+            raise ValueError(
+                f"attn_mask head count must be 1 or {H}, got {mh}"
+            )
     # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
     if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
         # Apply scale if provided
@@ -213,6 +295,10 @@ def replace_sdpa():
     import torch.nn.functional as F
     original_sdpa = F.scaled_dot_product_attention
+    _debug = os.environ.get("MFA_DEBUG", "0") == "1"
+    _call_count = [0]  # mutable for closure
+    _fallback_count = [0]  # track fallbacks for warning
+    _last_fallback_error = [None]
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None, enable_gqa=False, **kwargs):
@@ -224,37 +310,92 @@ def replace_sdpa():
         #   seq=1024: 2.3-3.7x (MFA much faster)
         #   seq=2048: 2.2-3.9x (MFA much faster)
         #   seq=4096: 2.1-3.7x (MFA much faster)
+        # Determine seq_len based on tensor dimensionality
+        # 4D: (B, H, S, D) -> seq_len = shape[2]
+        # 3D: (B, S, D) -> seq_len = shape[1] (single-head attention, e.g., VAE)
+        is_3d = query.ndim == 3
+        seq_len = query.shape[1] if is_3d else query.shape[2]
         if (query.device.type == 'mps' and
             dropout_p == 0.0 and
             _HAS_MFA and
-            query.shape[2] >= 512):
+            query.ndim >= 3 and
+            seq_len >= 512):
             try:
+                q, k, v = query, key, value
+                # Handle 3D tensors (B, S, D) - treat as single-head attention
+                # Unsqueeze to (B, 1, S, D) for MFA, squeeze back after
+                if is_3d:
+                    q = q.unsqueeze(1)  # (B, S, D) -> (B, 1, S, D)
+                    k = k.unsqueeze(1)
+                    v = v.unsqueeze(1)
                 # Handle GQA (Grouped Query Attention) - expand K/V heads to match Q heads
                 # Common in Llama 2/3, Mistral, Qwen, etc.
-                k, v = key, value
-                if enable_gqa and query.shape[1] != key.shape[1]:
+                # NOTE: Always expand when heads mismatch, not just when enable_gqa=True
+                # Transformers may pass enable_gqa=True on MPS (torch>=2.5, no mask) even though
+                # MPS SDPA doesn't support native GQA - we handle it here
+                if q.shape[1] != k.shape[1]:
                     # Expand KV heads: (B, kv_heads, S, D) -> (B, q_heads, S, D)
-                    n_rep = query.shape[1] // key.shape[1]
-                    k = key.repeat_interleave(n_rep, dim=1)
-                    v = value.repeat_interleave(n_rep, dim=1)
+                    n_rep = q.shape[1] // k.shape[1]
+                    k = k.repeat_interleave(n_rep, dim=1)
+                    v = v.repeat_interleave(n_rep, dim=1)
                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
                 # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
                 mfa_mask = None
                 if attn_mask is not None:
+                    if _debug:
+                        print(f"[MFA MASK] dtype={attn_mask.dtype} shape={tuple(attn_mask.shape)} min={attn_mask.min().item():.2f} max={attn_mask.max().item():.2f}")
                     if attn_mask.dtype == torch.bool:
-                        # Boolean mask: True means masked (don't attend)
-                        mfa_mask = attn_mask
+                        # PyTorch SDPA bool mask: True = ATTEND, False = MASKED
+                        # MFA bool mask: True = MASKED, False = ATTEND
+                        # They're opposite! Invert it.
+                        mfa_mask = ~attn_mask
                     else:
                         # Float mask: typically -inf for masked positions, 0 for unmasked
                         # Convert: positions with large negative values -> True (masked)
                         # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
                         mfa_mask = attn_mask <= -1e3
-                return flash_attention(query, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
-            except Exception:
-                # Fall back to original on any error
-                pass
+                    if _debug:
+                        print(f"[MFA MASK] converted: True(masked)={mfa_mask.sum().item()} False(attend)={(~mfa_mask).sum().item()}")
+                out = flash_attention(q, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
+                # Squeeze back for 3D input
+                if is_3d:
+                    out = out.squeeze(1)  # (B, 1, S, D) -> (B, S, D)
+                if _debug:
+                    _call_count[0] += 1
+                    print(f"[MFA #{_call_count[0]}] shape={tuple(query.shape)} is_3d={is_3d} gqa={enable_gqa} mask={attn_mask is not None} causal={is_causal}")
+                return out
+            except Exception as e:
+                # Fall back to original on any error, but track it
+                _fallback_count[0] += 1
+                _last_fallback_error[0] = str(e)
+                if _debug:
+                    import traceback
+                    print(f"[MFA FALLBACK #{_fallback_count[0]}] shape={tuple(query.shape)}\n{traceback.format_exc()}")
+                # Warn user after repeated fallbacks (likely a real problem)
+                if _fallback_count[0] == 10:
+                    warnings.warn(
+                        f"MFA has fallen back to native SDPA {_fallback_count[0]} times. "
+                        f"Last error: {_last_fallback_error[0]}. "
+                        f"Set MFA_DEBUG=1 for details.",
+                        UserWarning
+                    )
+        if _debug and query.device.type == 'mps':
+            _call_count[0] += 1
+            reason = []
+            if dropout_p != 0.0: reason.append(f"dropout={dropout_p}")
+            if query.ndim < 3: reason.append(f"ndim={query.ndim}")
+            if seq_len < 512: reason.append(f"seq={seq_len}<512")
+            print(f"[NATIVE #{_call_count[0]}] shape={tuple(query.shape)} reason={','.join(reason) or 'unknown'}")
         return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale=scale, enable_gqa=enable_gqa, **kwargs)
@@ -461,13 +602,13 @@ def flash_attention_chunked(
         return _C.forward(query, key, value, is_causal, None, 0)
     # Initialize running statistics for online softmax
-    # m = running max, l = running sum of exp, acc = accumulated output
     device = query.device
     dtype = query.dtype
     # Use float32 for numerical stability of softmax statistics
-    running_max = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
-    running_sum = torch.zeros((B, H, seq_len_q, 1), device=device, dtype=torch.float32)
+    # running_L: base-2 logsumexp of all attention scores seen so far (-inf means no data yet)
+    # output_acc: weighted combination of outputs (weights sum to 1 after each update)
+    running_L = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
     output_acc = torch.zeros((B, H, seq_len_q, D), device=device, dtype=torch.float32)
     # Process K/V in chunks
@@ -488,51 +629,81 @@ def flash_attention_chunked(
         # - Partial chunk (up to q) if start_idx <= q < end_idx
         # - None of chunk if q < start_idx
-        chunk_is_causal = is_causal and (end_idx <= seq_len_q)
+        if is_causal:
+            # Create explicit causal mask for this chunk
+            # Query positions: 0 to seq_len_q-1
+            # Key positions in chunk: start_idx to end_idx-1
+            chunk_len = end_idx - start_idx
-        # Compute attention for this chunk
-        # forward_with_lse returns (output, logsumexp) where logsumexp = m + log(l)
-        chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, chunk_is_causal, None, 0)
+            # Build mask: mask[q, k_local] = True means DON'T attend
+            # We want to attend when global_k_pos <= q
+            # global_k_pos = start_idx + k_local
+            # So: attend when start_idx + k_local <= q
+            # mask = start_idx + k_local > q
-        # chunk_lse shape: (B, H, seq_len_q)
-        # We need to convert logsumexp to (max, sum) for online algorithm
-        chunk_lse = chunk_lse.unsqueeze(-1)  # (B, H, seq_len_q, 1)
+            q_pos = torch.arange(seq_len_q, device=device).view(1, 1, seq_len_q, 1)
+            k_pos = torch.arange(chunk_len, device=device).view(1, 1, 1, chunk_len) + start_idx
+            causal_mask = k_pos > q_pos  # True = masked (don't attend)
-        # Convert chunk output to float32 for accumulation
-        chunk_out = chunk_out.float()
-        # Online softmax update:
-        # new_max = max(running_max, chunk_max)
-        # For flash attention, chunk_lse ≈ chunk_max + log(chunk_sum)
-        # We approximate chunk_max ≈ chunk_lse (valid when exp sum dominates)
-        chunk_max = chunk_lse  # Approximation: logsumexp ≈ max when sum is dominated by max
+            # Expand to batch and heads
+            causal_mask = causal_mask.expand(B, H, seq_len_q, chunk_len)
-        # Compute new max
-        new_max = torch.maximum(running_max, chunk_max)
+            # Call forward with explicit mask (is_causal=False since we handle it)
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, causal_mask, 0)
+        else:
+            # Non-causal: just process the chunk directly
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, None, 0)
-        # Rescale previous accumulator
-        # correction_old = exp(running_max - new_max)
-        correction_old = torch.exp(running_max - new_max)
-        # Clip to avoid inf * 0 issues when running_max was -inf
-        correction_old = torch.where(running_max == float('-inf'), torch.zeros_like(correction_old), correction_old)
+        # chunk_L shape: (B, H, seq_len_q)
+        # The kernel returns L = m + log2(l) where:
+        #   m = max(scores * log2(e) / sqrt(D))
+        #   l = sum(exp2(scores * log2(e) / sqrt(D) - m))
+        # This is a base-2 logsumexp: L = log2(sum(exp2(scaled_scores)))
+        chunk_L = chunk_lse.unsqueeze(-1).float()  # (B, H, seq_len_q, 1)
-        # Rescale chunk output
-        # correction_new = exp(chunk_max - new_max)
-        correction_new = torch.exp(chunk_max - new_max)
+        # Convert chunk output to float32 for accumulation
+        chunk_out = chunk_out.float()
-        # For the sum, we need exp(chunk_lse - new_max) = exp(chunk_max + log(chunk_sum) - new_max)
-        # = exp(chunk_max - new_max) * chunk_sum
-        # But we only have logsumexp, so: exp(chunk_lse - new_max)
-        chunk_sum_scaled = torch.exp(chunk_lse - new_max)
+        # Online softmax algorithm using base-2 representation
+        #
+        # Flash attention returns: chunk_out = softmax(scores) @ V
+        # The output is already normalized. For online combination:
+        #   new_L = log2(2^running_L + 2^chunk_L)
+        #         = max(running_L, chunk_L) + log2(2^(running_L - max) + 2^(chunk_L - max))
+        #
+        # The weights for combining outputs are:
+        #   old_weight = 2^(running_L - new_L)
+        #   new_weight = 2^(chunk_L - new_L)
+        # These weights sum to 1, so: output = old_weight * old_out + new_weight * new_out
+        # Compute new base-2 logsumexp
+        max_L = torch.maximum(running_L, chunk_L)
+        # Handle -inf case (no previous data)
+        # Use exp2 for base-2 (matches kernel's internal representation)
+        running_exp2 = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - max_L)
+        )
+        chunk_exp2 = torch.exp2(chunk_L - max_L)
+        new_L = max_L + torch.log2(running_exp2 + chunk_exp2)
+        # Compute correction factors using base-2 exp
+        old_weight = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - new_L)
+        )
+        new_weight = torch.exp2(chunk_L - new_L)
         # Update accumulator
-        output_acc = output_acc * correction_old + chunk_out * correction_new
-        running_sum = running_sum * correction_old + chunk_sum_scaled
-        running_max = new_max
+        # Update accumulator
+        output_acc = output_acc * old_weight + chunk_out * new_weight
+        running_L = new_L
-    # Final normalization
-    output = output_acc / running_sum
+    # No final normalization needed - weights already sum to 1
+    output = output_acc
     # Convert back to original dtype
     return output.to(dtype)

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/mps_flash_attn/csrc/mps_flash_attn.mm RENAMED Viewed

@@ -14,6 +14,8 @@
 #include <dlfcn.h>
 #include <string>
 #include <vector>
+#include <mutex>
+#include <atomic>
 // ============================================================================
 // MFA Bridge Function Types
@@ -67,7 +69,8 @@ static mfa_forward_fn g_mfa_forward = nullptr;
 static mfa_backward_fn g_mfa_backward = nullptr;
 static mfa_release_kernel_fn g_mfa_release_kernel = nullptr;
 static void* g_dylib_handle = nullptr;
-static bool g_initialized = false;
+static std::atomic<bool> g_initialized{false};
+static std::mutex g_init_mutex;
 // ============================================================================
 // Load MFA Bridge Library
@@ -141,6 +144,24 @@ static bool load_mfa_bridge() {
     return true;
 }
+// Thread-safe initialization helper
+static void ensure_initialized() {
+    // Fast path: already initialized
+    if (g_initialized.load(std::memory_order_acquire)) {
+        return;
+    }
+    // Slow path: need to initialize with lock
+    std::lock_guard<std::mutex> lock(g_init_mutex);
+    // Double-check after acquiring lock
+    if (!g_initialized.load(std::memory_order_relaxed)) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized.store(true, std::memory_order_release);
+    }
+}
 // ============================================================================
 // Get MTLBuffer from PyTorch MPS Tensor
 // ============================================================================
@@ -359,14 +380,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     const c10::optional<at::Tensor>& attn_mask,  // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
     int64_t window_size  // 0 = full attention, >0 = sliding window
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
     // Validate inputs
     TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
@@ -562,14 +577,8 @@ at::Tensor mps_flash_attention_forward_with_bias(
     int64_t window_size,
     int64_t bias_repeat_count  // >0 means bias repeats every N batches (for window attention)
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
     // Check that v6/v7 API is available
     TORCH_CHECK(g_mfa_create_kernel_v6 || g_mfa_create_kernel_v7,
@@ -735,14 +744,8 @@ at::Tensor mps_flash_attention_forward_quantized(
     const c10::optional<at::Tensor>& attn_mask,
     int64_t window_size
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
     // Check that v4 API is available
     TORCH_CHECK(g_mfa_create_kernel_v4, "Quantized attention requires MFA v4 API (update libMFABridge.dylib)");
@@ -992,14 +995,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     int64_t window_size,  // 0 = full attention, >0 = sliding window
     bool bf16_backward    // true = use BF16 intermediates for ~2x faster backward
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
     // Validate inputs
     TORCH_CHECK(grad_output.dim() == 4, "grad_output must be 4D (B, H, N, D)");

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/mps_flash_attn.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.8
+Version: 0.3.0
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/mps_flash_attn.egg-info/SOURCES.txt RENAMED Viewed

@@ -34,4 +34,5 @@ mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e
 mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib
 mps_flash_attn/kernels/manifest.json
 mps_flash_attn/lib/libMFABridge.dylib
+tests/test_issues.py
 tests/test_mfa_v2.py

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "mps-flash-attn"
-version = "0.2.8"
+version = "0.3.0"
 description = "Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)"
 readme = "README.md"
 license = "MIT"

{mps_flash_attn-0.2.8 → mps_flash_attn-0.3.0}/setup.py RENAMED Viewed

@@ -72,7 +72,7 @@ def get_extensions():
 setup(
     name="mps-flash-attn",
-    version="0.1.5",
+    version="0.3.0",
     packages=find_packages(),
     package_data={
         "mps_flash_attn": [

mps_flash_attn-0.3.0/tests/test_issues.py ADDED Viewed

@@ -0,0 +1,446 @@
+"""
+Tests for known issues and fixes in mps-flash-attention.
+These tests verify that:
+1. Version is consistent between setup.py and __init__.py
+2. __all__ exports are properly defined
+3. Scale validation works correctly
+4. Chunked attention accuracy is within expected bounds
+5. All exported symbols are importable
+"""
+import pytest
+import torch
+import math
+import warnings
+import re
+import sys
+from pathlib import Path
+# Get the version from setup.py for comparison
+def get_setup_version():
+    setup_path = Path(__file__).parent.parent / "setup.py"
+    content = setup_path.read_text()
+    match = re.search(r'version="([^"]+)"', content)
+    return match.group(1) if match else None
+class TestVersionConsistency:
+    """Test that version is consistent across the package."""
+    def test_version_matches_setup(self):
+        """Version in __init__.py should match setup.py."""
+        import mps_flash_attn
+        setup_version = get_setup_version()
+        assert setup_version is not None, "Could not parse version from setup.py"
+        assert mps_flash_attn.__version__ == setup_version, (
+            f"Version mismatch: __init__.py has {mps_flash_attn.__version__}, "
+            f"setup.py has {setup_version}"
+        )
+class TestExports:
+    """Test that __all__ exports are properly defined."""
+    def test_all_is_defined(self):
+        """__all__ should be defined."""
+        import mps_flash_attn
+        assert hasattr(mps_flash_attn, "__all__"), "__all__ not defined"
+        assert isinstance(mps_flash_attn.__all__, list), "__all__ should be a list"
+        assert len(mps_flash_attn.__all__) > 0, "__all__ should not be empty"
+    def test_all_symbols_exist(self):
+        """All symbols in __all__ should be importable."""
+        import mps_flash_attn
+        missing = []
+        for name in mps_flash_attn.__all__:
+            if not hasattr(mps_flash_attn, name):
+                missing.append(name)
+        assert not missing, f"Missing symbols from __all__: {missing}"
+    def test_core_functions_exported(self):
+        """Core functions should be in __all__."""
+        import mps_flash_attn
+        required = [
+            "flash_attention",
+            "flash_attention_chunked",
+            "is_available",
+            "__version__",
+        ]
+        for name in required:
+            assert name in mps_flash_attn.__all__, f"{name} missing from __all__"
+class TestScaleValidation:
+    """Test scale parameter validation."""
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_negative_scale_raises(self):
+        """Negative scale should raise ValueError."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        with pytest.raises(ValueError, match="scale must be positive"):
+            mps_flash_attn.flash_attention(q, k, v, scale=-1.0)
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_zero_scale_raises(self):
+        """Zero scale should raise ValueError."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        with pytest.raises(ValueError, match="scale must be positive"):
+            mps_flash_attn.flash_attention(q, k, v, scale=0.0)
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_extreme_scale_warns(self):
+        """Extreme scale values should warn."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        # Very large scale (1000x default)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            mps_flash_attn.flash_attention(q, k, v, scale=100.0)  # default is ~0.177
+            assert any("very different from default" in str(warning.message) for warning in w), \
+                "Should warn about extreme scale"
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_normal_scale_no_warning(self):
+        """Normal scale values should not warn."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        default_scale = 1.0 / math.sqrt(32)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            mps_flash_attn.flash_attention(q, k, v, scale=default_scale)
+            scale_warnings = [x for x in w if "scale" in str(x.message).lower()]
+            assert not scale_warnings, f"Should not warn about normal scale: {scale_warnings}"
+class TestChunkedAccuracy:
+    """Test chunked attention accuracy vs non-chunked."""
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_chunked_vs_regular_accuracy(self):
+        """Chunked attention should match regular within tolerance.
+        Note: Online softmax approximation has inherent numerical error.
+        The max difference is typically ~0.05-0.06 due to the approximation
+        at line 556 where chunk_max = chunk_lse (simplified online softmax).
+        """
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        torch.manual_seed(42)
+        # Use a moderate size to keep test fast
+        B, H, N, D = 2, 4, 2048, 64
+        q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        # Regular flash attention
+        out_regular = mps_flash_attn.flash_attention(q, k, v)
+        # Chunked flash attention with smaller chunk
+        out_chunked = mps_flash_attn.flash_attention_chunked(q, k, v, chunk_size=512)
+        torch.mps.synchronize()
+        # Compute differences
+        max_diff = (out_regular - out_chunked).abs().max().item()
+        mean_diff = (out_regular - out_chunked).abs().mean().item()
+        # Online softmax has inherent ~0.05 max error
+        # This is NOT a regression - it's expected behavior
+        assert max_diff < 0.10, f"Max diff {max_diff:.6f} too large (expected < 0.10)"
+        assert mean_diff < 0.01, f"Mean diff {mean_diff:.6f} too large (expected < 0.01)"
+        # Log actual values for transparency
+        print(f"\nChunked accuracy: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}")
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_chunked_causal_accuracy(self):
+        """Chunked causal attention should also be accurate."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        torch.manual_seed(123)
+        B, H, N, D = 2, 4, 1024, 64
+        q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        out_regular = mps_flash_attn.flash_attention(q, k, v, is_causal=True)
+        out_chunked = mps_flash_attn.flash_attention_chunked(q, k, v, is_causal=True, chunk_size=256)
+        torch.mps.synchronize()
+        max_diff = (out_regular - out_chunked).abs().max().item()
+        mean_diff = (out_regular - out_chunked).abs().mean().item()
+        assert max_diff < 0.10, f"Causal max diff {max_diff:.6f} too large"
+        assert mean_diff < 0.01, f"Causal mean diff {mean_diff:.6f} too large"
+class TestInputValidation:
+    """Test input validation catches errors early."""
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_wrong_device_raises(self):
+        """Tensors on wrong device should raise."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 4, 64, 32)  # CPU tensor
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        with pytest.raises((RuntimeError, ValueError)):
+            mps_flash_attn.flash_attention(q, k, v)
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_wrong_dims_raises(self):
+        """Wrong tensor dimensions should raise."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        q = torch.randn(1, 64, 32, device="mps", dtype=torch.float16)  # 3D instead of 4D
+        k = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        v = torch.randn(1, 4, 64, 32, device="mps", dtype=torch.float16)
+        with pytest.raises(RuntimeError):
+            mps_flash_attn.flash_attention(q, k, v)
+class TestMaskValidation:
+    """Test attention mask shape validation."""
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_valid_mask_shape(self):
+        """Valid mask shapes should work."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        B, H, N, D = 2, 4, 64, 32
+        q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        # Full mask
+        mask = torch.zeros(B, H, N, N, dtype=torch.bool, device="mps")
+        out = mps_flash_attn.flash_attention(q, k, v, attn_mask=mask)
+        assert out.shape == (B, H, N, D)
+        # Broadcast over batch
+        mask = torch.zeros(1, H, N, N, dtype=torch.bool, device="mps")
+        out = mps_flash_attn.flash_attention(q, k, v, attn_mask=mask)
+        assert out.shape == (B, H, N, D)
+        # Broadcast over heads
+        mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device="mps")
+        out = mps_flash_attn.flash_attention(q, k, v, attn_mask=mask)
+        assert out.shape == (B, H, N, D)
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_invalid_mask_dims_raises(self):
+        """Wrong mask dimensions should raise."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        B, H, N, D = 2, 4, 64, 32
+        q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        # 3D mask (missing batch dim)
+        mask = torch.zeros(H, N, N, dtype=torch.bool, device="mps")
+        with pytest.raises(ValueError, match="must be 4D"):
+            mps_flash_attn.flash_attention(q, k, v, attn_mask=mask)
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_invalid_mask_seq_len_raises(self):
+        """Wrong mask sequence length should raise."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        B, H, N, D = 2, 4, 64, 32
+        q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
+        # Wrong N_kv
+        mask = torch.zeros(B, H, N, N // 2, dtype=torch.bool, device="mps")
+        with pytest.raises(ValueError, match="shape mismatch"):
+            mps_flash_attn.flash_attention(q, k, v, attn_mask=mask)
+class TestAutoContiguous:
+    """Test auto-contiguous conversion."""
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_non_contiguous_works(self):
+        """Non-contiguous tensors should be auto-converted."""
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        B, H, N, D = 2, 4, 64, 32
+        # Create non-contiguous tensor via transpose
+        q = torch.randn(B, N, H, D, device="mps", dtype=torch.float16).transpose(1, 2)
+        k = torch.randn(B, N, H, D, device="mps", dtype=torch.float16).transpose(1, 2)
+        v = torch.randn(B, N, H, D, device="mps", dtype=torch.float16).transpose(1, 2)
+        assert not q.is_contiguous(), "Test setup: q should be non-contiguous"
+        # Should work without error
+        out = mps_flash_attn.flash_attention(q, k, v)
+        assert out.shape == (B, H, N, D)
+class TestFallbackTracking:
+    """Test SDPA fallback tracking."""
+    def test_fallback_counter_exists(self):
+        """The replace_sdpa function should track fallbacks."""
+        import mps_flash_attn
+        # Just verify the module has the replace_sdpa function
+        assert hasattr(mps_flash_attn, 'replace_sdpa')
+class TestThreadSafety:
+    """Test thread-safe initialization.
+    Note: MPS itself doesn't support concurrent operations from multiple threads,
+    so we can't test concurrent flash_attention calls. But we CAN test that
+    the initialization code is thread-safe by checking that multiple threads
+    attempting to initialize don't cause crashes or double-initialization.
+    """
+    @pytest.mark.skipif(not torch.backends.mps.is_available(), reason="MPS not available")
+    def test_concurrent_initialization(self):
+        """Multiple threads initializing simultaneously should not crash.
+        This tests the thread-safe initialization in the C++ code.
+        We spawn multiple threads that all try to call flash_attention
+        with a barrier to synchronize their start times. Only ONE should
+        actually initialize the MFA bridge.
+        Note: We serialize the actual MPS operations to avoid driver crashes,
+        but the initialization race is what we're testing.
+        """
+        import mps_flash_attn
+        if not mps_flash_attn.is_available():
+            pytest.skip("MFA not available")
+        import threading
+        import queue
+        num_threads = 4
+        results = queue.Queue()
+        barrier = threading.Barrier(num_threads)
+        # Use a lock to serialize MPS operations (avoid driver crash)
+        # but let initialization race
+        mps_lock = threading.Lock()
+        def worker(thread_id):
+            try:
+                # All threads wait here, then start simultaneously
+                barrier.wait()
+                # Each thread creates its own tensors and calls flash_attention
+                # The initialization will race, but should be thread-safe
+                with mps_lock:  # Serialize MPS operations
+                    q = torch.randn(1, 2, 32, 16, device="mps", dtype=torch.float16)
+                    k = torch.randn(1, 2, 32, 16, device="mps", dtype=torch.float16)
+                    v = torch.randn(1, 2, 32, 16, device="mps", dtype=torch.float16)
+                    out = mps_flash_attn.flash_attention(q, k, v)
+                    torch.mps.synchronize()
+                results.put((thread_id, "success", out.shape))
+            except Exception as e:
+                results.put((thread_id, "error", str(e)))
+        # Spawn threads
+        threads = []
+        for i in range(num_threads):
+            t = threading.Thread(target=worker, args=(i,))
+            threads.append(t)
+            t.start()
+        # Wait for completion
+        for t in threads:
+            t.join(timeout=30)
+        # Check results
+        successes = 0
+        errors = []
+        while not results.empty():
+            thread_id, status, data = results.get()
+            if status == "success":
+                successes += 1
+                assert data == (1, 2, 32, 16), f"Thread {thread_id} got wrong shape: {data}"
+            else:
+                errors.append(f"Thread {thread_id}: {data}")
+        assert successes == num_threads, f"Only {successes}/{num_threads} succeeded. Errors: {errors}"
+    def test_atomic_flag_exists(self):
+        """Verify the C++ code uses atomic for g_initialized.
+        This is a static check - we grep the source to ensure the fix is in place.
+        """
+        cpp_file = Path(__file__).parent.parent / "mps_flash_attn" / "csrc" / "mps_flash_attn.mm"
+        if not cpp_file.exists():
+            pytest.skip("C++ source not found")
+        content = cpp_file.read_text()
+        # Check for atomic<bool>
+        assert "std::atomic<bool>" in content or "atomic<bool>" in content, \
+            "g_initialized should be std::atomic<bool>"
+        # Check for mutex
+        assert "std::mutex" in content or "mutex" in content, \
+            "Should have mutex for thread-safe init"
+        # Check for ensure_initialized helper
+        assert "ensure_initialized" in content, \
+            "Should have ensure_initialized() helper function"
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])