mps-flash-attn 0.2.8__cp314-cp314-macosx_15_0_arm64.whl → 0.3.2__cp314-cp314-macosx_15_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of mps-flash-attn might be problematic.
- mps_flash_attn/_C.cpython-314-darwin.so +0 -0
- mps_flash_attn/__init__.py +286 -52
- mps_flash_attn/csrc/mps_flash_attn.mm +30 -33
- {mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/METADATA +1 -1
- {mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/RECORD +8 -8
- {mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/WHEEL +0 -0
- {mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/top_level.txt +0 -0
mps_flash_attn/_C.cpython-314-darwin.so
Binary file

mps_flash_attn/__init__.py
CHANGED

@@ -4,13 +4,42 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """

-__version__ = "0.2.8"
+__version__ = "0.3.2"
+
+__all__ = [
+    # Core functions
+    "flash_attention",
+    "flash_attention_with_bias",
+    "flash_attention_chunked",
+    # Quantized attention
+    "flash_attention_fp8",
+    "flash_attention_int8",
+    "flash_attention_nf4",
+    "quantize_kv_fp8",
+    "quantize_kv_int8",
+    "quantize_kv_nf4",
+    # Utilities
+    "replace_sdpa",
+    "precompile",
+    "clear_cache",
+    "register_custom_op",
+    "is_available",
+    "convert_mask",
+    # Constants
+    "QUANT_FP8_E4M3",
+    "QUANT_FP8_E5M2",
+    "QUANT_INT8",
+    "QUANT_NF4",
+    # Version
+    "__version__",
+]

 import torch
-from typing import Optional
+from typing import Optional, Tuple
 import math
 import threading
 import os
+import warnings

 # Try to import the C++ extension
 try:
@@ -30,6 +59,20 @@ def is_available() -> bool:
     return _HAS_MFA and torch.backends.mps.is_available()


+def _ensure_contiguous(tensor: torch.Tensor, name: str) -> torch.Tensor:
+    """Ensure tensor is contiguous, with a debug warning if conversion needed."""
+    if tensor.is_contiguous():
+        return tensor
+    # Auto-convert with debug info
+    if os.environ.get("MFA_DEBUG", "0") == "1":
+        warnings.warn(
+            f"MFA: {name} tensor was not contiguous (stride={tensor.stride()}), "
+            f"auto-converting. For best performance, ensure inputs are contiguous.",
+            UserWarning
+        )
+    return tensor.contiguous()
+
+
 def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     """
     Convert attention mask to MFA's boolean format.
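Note (illustration, not part of the diff): the contiguity guard above follows standard PyTorch semantics. A minimal CPU sketch, with illustrative shapes and no MFA calls, of the situation the new helper handles:

import torch

x = torch.randn(2, 8, 1024, 64)
kt = x.transpose(1, 2)                  # a view with permuted strides
print(kt.is_contiguous())               # False - this is what triggers the MFA_DEBUG warning
print(kt.contiguous().is_contiguous())  # True  - the copy that _ensure_contiguous returns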
@@ -54,6 +97,56 @@ def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     return attn_mask <= -1e3


+def _validate_and_expand_mask(
+    attn_mask: Optional[torch.Tensor],
+    B: int,
+    H: int,
+    N_q: int,
+    N_kv: int,
+) -> Optional[torch.Tensor]:
+    """
+    Validate attention mask shape and expand broadcast dimensions.
+
+    Args:
+        attn_mask: Optional mask of shape (B, H, N_q, N_kv) or broadcastable
+        B: Batch size
+        H: Number of heads
+        N_q: Query sequence length
+        N_kv: Key/Value sequence length
+
+    Returns:
+        Expanded mask of shape (mb, mh, N_q, N_kv) or None
+    """
+    if attn_mask is None:
+        return None
+
+    attn_mask = _ensure_contiguous(attn_mask, "attn_mask")
+
+    if attn_mask.dim() != 4:
+        raise ValueError(f"attn_mask must be 4D (B, H, N_q, N_kv), got {attn_mask.dim()}D")
+
+    mb, mh, mq, mk = attn_mask.shape
+
+    # Allow broadcast: mq can be 1 (applies same mask to all query positions) or N_q
+    if (mq != 1 and mq != N_q) or (mk != 1 and mk != N_kv):
+        raise ValueError(
+            f"attn_mask shape mismatch: mask is ({mq}, {mk}) but expected ({N_q}, {N_kv}) or broadcastable (1, {N_kv})"
+        )
+
+    # Expand broadcast mask to full shape for Metal kernel
+    if mq == 1 and N_q > 1:
+        attn_mask = attn_mask.expand(mb, mh, N_q, mk)
+    if mk == 1 and N_kv > 1:
+        attn_mask = attn_mask.expand(mb, mh, mq if mq > 1 else N_q, N_kv)
+
+    if mb != 1 and mb != B:
+        raise ValueError(f"attn_mask batch size must be 1 or {B}, got {mb}")
+    if mh != 1 and mh != H:
+        raise ValueError(f"attn_mask head count must be 1 or {H}, got {mh}")
+
+    return attn_mask
+
+
 class FlashAttentionFunction(torch.autograd.Function):
     """Autograd function for Flash Attention with backward pass support."""

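Note (illustration, not part of the diff): the broadcast expansion above uses plain torch semantics. A small CPU sketch, with hypothetical shapes and no MFA calls, of how an additive per-batch padding mask maps to the boolean format this package expects:

import torch

B, H, N_q, N_kv = 2, 4, 6, 6
# SDPA-style additive mask: 0 = attend, -inf = masked; broadcast over heads and query positions
float_mask = torch.zeros(B, 1, 1, N_kv)
float_mask[:, :, :, -2:] = float('-inf')        # last two keys are padding

bool_mask = float_mask <= -1e3                  # what convert_mask() does: True = masked
expanded = bool_mask.expand(B, 1, N_q, N_kv)    # what _validate_and_expand_mask() does when mq == 1
print(expanded.shape)                           # torch.Size([2, 1, 6, 6])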
@@ -176,6 +269,20 @@ def flash_attention(
     if not torch.backends.mps.is_available():
         raise RuntimeError("MPS not available")

+    # Validate scale parameter
+    if scale is not None:
+        if scale <= 0:
+            raise ValueError(f"scale must be positive, got {scale}")
+        # Warn about extreme scale values that could cause numerical issues
+        default_scale = 1.0 / math.sqrt(query.shape[-1])
+        if scale < default_scale * 0.01 or scale > default_scale * 100:
+            warnings.warn(
+                f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                "this may cause numerical issues",
+                UserWarning,
+                stacklevel=2
+            )
+
     # Validate device
     if query.device.type != 'mps':
         raise ValueError("query must be on MPS device")
@@ -186,6 +293,24 @@ def flash_attention(
     if attn_mask is not None and attn_mask.device.type != 'mps':
         raise ValueError("attn_mask must be on MPS device")

+    # Ensure contiguous (auto-convert with debug warning)
+    query = _ensure_contiguous(query, "query")
+    key = _ensure_contiguous(key, "key")
+    value = _ensure_contiguous(value, "value")
+
+    # Validate tensor dimensions
+    if query.dim() != 4:
+        raise RuntimeError(f"query must be 4D (B, H, N, D), got {query.dim()}D")
+    if key.dim() != 4:
+        raise RuntimeError(f"key must be 4D (B, H, N, D), got {key.dim()}D")
+    if value.dim() != 4:
+        raise RuntimeError(f"value must be 4D (B, H, N, D), got {value.dim()}D")
+
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
     if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
         # Apply scale if provided
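Note (illustration, not part of the diff): taken together, these validation hunks imply a calling convention roughly like the sketch below. It assumes an Apple Silicon machine with the wheel installed, so treat it as a hedged sketch rather than a tested snippet; shapes are hypothetical.

import torch
from mps_flash_attn import flash_attention

# 4D (B, H, N, D) tensors on the MPS device, contiguous, no dropout
q = torch.randn(1, 8, 2048, 64, device="mps", dtype=torch.float16)
k = torch.randn(1, 8, 2048, 64, device="mps", dtype=torch.float16)
v = torch.randn(1, 8, 2048, 64, device="mps", dtype=torch.float16)

out = flash_attention(q, k, v, is_causal=True)   # default scale is 1/sqrt(D), per the hunk above
print(out.shape)                                 # torch.Size([1, 8, 2048, 64])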
@@ -213,6 +338,10 @@ def replace_sdpa():
     import torch.nn.functional as F

     original_sdpa = F.scaled_dot_product_attention
+    _debug = os.environ.get("MFA_DEBUG", "0") == "1"
+    _call_count = [0]  # mutable for closure
+    _fallback_count = [0]  # track fallbacks for warning
+    _last_fallback_error = [None]

     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None, enable_gqa=False, **kwargs):
@@ -224,37 +353,92 @@ def replace_sdpa():
         # seq=1024: 2.3-3.7x (MFA much faster)
         # seq=2048: 2.2-3.9x (MFA much faster)
         # seq=4096: 2.1-3.7x (MFA much faster)
+        # Determine seq_len based on tensor dimensionality
+        # 4D: (B, H, S, D) -> seq_len = shape[2]
+        # 3D: (B, S, D) -> seq_len = shape[1] (single-head attention, e.g., VAE)
+        is_3d = query.ndim == 3
+        seq_len = query.shape[1] if is_3d else query.shape[2]
+
         if (query.device.type == 'mps' and
             dropout_p == 0.0 and
             _HAS_MFA and
-            query.
+            query.ndim >= 3 and
+            seq_len >= 512):
             try:
+                q, k, v = query, key, value
+
+                # Handle 3D tensors (B, S, D) - treat as single-head attention
+                # Unsqueeze to (B, 1, S, D) for MFA, squeeze back after
+                if is_3d:
+                    q = q.unsqueeze(1)  # (B, S, D) -> (B, 1, S, D)
+                    k = k.unsqueeze(1)
+                    v = v.unsqueeze(1)
+
                 # Handle GQA (Grouped Query Attention) - expand K/V heads to match Q heads
                 # Common in Llama 2/3, Mistral, Qwen, etc.
-
-
+                # NOTE: Always expand when heads mismatch, not just when enable_gqa=True
+                # Transformers may pass enable_gqa=True on MPS (torch>=2.5, no mask) even though
+                # MPS SDPA doesn't support native GQA - we handle it here
+                if q.shape[1] != k.shape[1]:
                     # Expand KV heads: (B, kv_heads, S, D) -> (B, q_heads, S, D)
-                    n_rep =
-                    k =
-                    v =
+                    n_rep = q.shape[1] // k.shape[1]
+                    k = k.repeat_interleave(n_rep, dim=1)
+                    v = v.repeat_interleave(n_rep, dim=1)

                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
                 # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
                 mfa_mask = None
                 if attn_mask is not None:
+                    if _debug:
+                        print(f"[MFA MASK] dtype={attn_mask.dtype} shape={tuple(attn_mask.shape)} min={attn_mask.min().item():.2f} max={attn_mask.max().item():.2f}")
                     if attn_mask.dtype == torch.bool:
-                        #
-
+                        # PyTorch SDPA bool mask: True = ATTEND, False = MASKED
+                        # MFA bool mask: True = MASKED, False = ATTEND
+                        # They're opposite! Invert it.
+                        mfa_mask = ~attn_mask
                     else:
                         # Float mask: typically -inf for masked positions, 0 for unmasked
                         # Convert: positions with large negative values -> True (masked)
                         # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
                         mfa_mask = attn_mask <= -1e3
-
-
-
-
+                    if _debug:
+                        print(f"[MFA MASK] converted: True(masked)={mfa_mask.sum().item()} False(attend)={(~mfa_mask).sum().item()}")
+
+                out = flash_attention(q, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
+
+                # Squeeze back for 3D input
+                if is_3d:
+                    out = out.squeeze(1)  # (B, 1, S, D) -> (B, S, D)
+
+                if _debug:
+                    _call_count[0] += 1
+                    print(f"[MFA #{_call_count[0]}] shape={tuple(query.shape)} is_3d={is_3d} gqa={enable_gqa} mask={attn_mask is not None} causal={is_causal}")
+
+                return out
+            except Exception as e:
+                # Fall back to original on any error, but track it
+                _fallback_count[0] += 1
+                _last_fallback_error[0] = str(e)
+                if _debug:
+                    import traceback
+                    print(f"[MFA FALLBACK #{_fallback_count[0]}] shape={tuple(query.shape)}\n{traceback.format_exc()}")
+                # Warn user after repeated fallbacks (likely a real problem)
+                if _fallback_count[0] == 10:
+                    warnings.warn(
+                        f"MFA has fallen back to native SDPA {_fallback_count[0]} times. "
+                        f"Last error: {_last_fallback_error[0]}. "
+                        f"Set MFA_DEBUG=1 for details.",
+                        UserWarning
+                    )
+
+        if _debug and query.device.type == 'mps':
+            _call_count[0] += 1
+            reason = []
+            if dropout_p != 0.0: reason.append(f"dropout={dropout_p}")
+            if query.ndim < 3: reason.append(f"ndim={query.ndim}")
+            if seq_len < 512: reason.append(f"seq={seq_len}<512")
+            print(f"[NATIVE #{_call_count[0]}] shape={tuple(query.shape)} reason={','.join(reason) or 'unknown'}")

         return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale=scale, enable_gqa=enable_gqa, **kwargs)

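Note (illustration, not part of the diff): the path exercised by this hunk is the replace_sdpa() monkey-patch. The sketch below assumes Apple Silicon and that, as the function name and the patched_sdpa closure suggest, replace_sdpa() installs the patched function over F.scaled_dot_product_attention; treat it as a hedged usage sketch.

import os
os.environ["MFA_DEBUG"] = "1"        # print [MFA]/[NATIVE]/[MFA FALLBACK] routing decisions

import torch
import torch.nn.functional as F
import mps_flash_attn

mps_flash_attn.replace_sdpa()

q = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
k = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
v = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)

# MPS device, no dropout, seq_len >= 512, ndim >= 3 -> routed through the MFA kernel;
# anything else (or any kernel error) falls back to the original SDPA.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)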
@@ -461,13 +645,13 @@ def flash_attention_chunked(
         return _C.forward(query, key, value, is_causal, None, 0)

     # Initialize running statistics for online softmax
-    # m = running max, l = running sum of exp, acc = accumulated output
     device = query.device
     dtype = query.dtype

     # Use float32 for numerical stability of softmax statistics
-
-
+    # running_L: base-2 logsumexp of all attention scores seen so far (-inf means no data yet)
+    # output_acc: weighted combination of outputs (weights sum to 1 after each update)
+    running_L = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
     output_acc = torch.zeros((B, H, seq_len_q, D), device=device, dtype=torch.float32)

     # Process K/V in chunks
@@ -488,51 +672,81 @@
         # - Partial chunk (up to q) if start_idx <= q < end_idx
         # - None of chunk if q < start_idx

-
-
-
-
-
-
-        # chunk_lse shape: (B, H, seq_len_q)
-        # We need to convert logsumexp to (max, sum) for online algorithm
-        chunk_lse = chunk_lse.unsqueeze(-1)  # (B, H, seq_len_q, 1)
+        if is_causal:
+            # Create explicit causal mask for this chunk
+            # Query positions: 0 to seq_len_q-1
+            # Key positions in chunk: start_idx to end_idx-1
+            chunk_len = end_idx - start_idx

-
-
+            # Build mask: mask[q, k_local] = True means DON'T attend
+            # We want to attend when global_k_pos <= q
+            # global_k_pos = start_idx + k_local
+            # So: attend when start_idx + k_local <= q
+            # mask = start_idx + k_local > q

-
-
-
-        # We approximate chunk_max ≈ chunk_lse (valid when exp sum dominates)
+            q_pos = torch.arange(seq_len_q, device=device).view(1, 1, seq_len_q, 1)
+            k_pos = torch.arange(chunk_len, device=device).view(1, 1, 1, chunk_len) + start_idx
+            causal_mask = k_pos > q_pos  # True = masked (don't attend)

-
+            # Expand to batch and heads
+            causal_mask = causal_mask.expand(B, H, seq_len_q, chunk_len)

-
-
+            # Call forward with explicit mask (is_causal=False since we handle it)
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, causal_mask, 0)
+        else:
+            # Non-causal: just process the chunk directly
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, None, 0)

-        #
-        #
-
-        #
-
+        # chunk_L shape: (B, H, seq_len_q)
+        # The kernel returns L = m + log2(l) where:
+        #   m = max(scores * log2(e) / sqrt(D))
+        #   l = sum(exp2(scores * log2(e) / sqrt(D) - m))
+        # This is a base-2 logsumexp: L = log2(sum(exp2(scaled_scores)))
+        chunk_L = chunk_lse.unsqueeze(-1).float()  # (B, H, seq_len_q, 1)

-        #
-
-        correction_new = torch.exp(chunk_max - new_max)
+        # Convert chunk output to float32 for accumulation
+        chunk_out = chunk_out.float()

-        #
-        #
-        #
-
+        # Online softmax algorithm using base-2 representation
+        #
+        # Flash attention returns: chunk_out = softmax(scores) @ V
+        # The output is already normalized. For online combination:
+        #   new_L = log2(2^running_L + 2^chunk_L)
+        #         = max(running_L, chunk_L) + log2(2^(running_L - max) + 2^(chunk_L - max))
+        #
+        # The weights for combining outputs are:
+        #   old_weight = 2^(running_L - new_L)
+        #   new_weight = 2^(chunk_L - new_L)
+        # These weights sum to 1, so: output = old_weight * old_out + new_weight * new_out
+
+        # Compute new base-2 logsumexp
+        max_L = torch.maximum(running_L, chunk_L)
+
+        # Handle -inf case (no previous data)
+        # Use exp2 for base-2 (matches kernel's internal representation)
+        running_exp2 = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - max_L)
+        )
+        chunk_exp2 = torch.exp2(chunk_L - max_L)
+        new_L = max_L + torch.log2(running_exp2 + chunk_exp2)
+
+        # Compute correction factors using base-2 exp
+        old_weight = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - new_L)
+        )
+        new_weight = torch.exp2(chunk_L - new_L)

         # Update accumulator
-
-
-
+        # Update accumulator
+        output_acc = output_acc * old_weight + chunk_out * new_weight
+        running_L = new_L

-    #
-    output = output_acc
+    # No final normalization needed - weights already sum to 1
+    output = output_acc

     # Convert back to original dtype
     return output.to(dtype)
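Note (illustration, not part of the diff): the base-2 online-softmax update above can be checked on CPU with plain torch. In this sketch chunk_L is computed directly with torch.logsumexp (converted to base 2) instead of coming from _C.forward_with_lse, and the shapes are illustrative.

import math
import torch

torch.manual_seed(0)
B, H, N_q, N_kv, D = 1, 2, 4, 8, 16
q = torch.randn(B, H, N_q, D)
k = torch.randn(B, H, N_kv, D)
v = torch.randn(B, H, N_kv, D)

scale = 1.0 / math.sqrt(D)
scores = (q @ k.transpose(-2, -1)) * scale               # (B, H, N_q, N_kv)
reference = torch.softmax(scores, dim=-1) @ v            # full (unchunked) attention

running_L = torch.full((B, H, N_q, 1), float("-inf"))
output_acc = torch.zeros(B, H, N_q, D)

for start in range(0, N_kv, 4):                          # process K/V in chunks of 4
    s = scores[..., start:start + 4]
    chunk_out = torch.softmax(s, dim=-1) @ v[..., start:start + 4, :]
    chunk_L = torch.logsumexp(s, dim=-1, keepdim=True) / math.log(2)   # base-2 logsumexp

    max_L = torch.maximum(running_L, chunk_L)
    running_exp2 = torch.where(running_L == float("-inf"),
                               torch.zeros_like(running_L),
                               torch.exp2(running_L - max_L))
    new_L = max_L + torch.log2(running_exp2 + torch.exp2(chunk_L - max_L))

    old_weight = torch.where(running_L == float("-inf"),
                             torch.zeros_like(running_L),
                             torch.exp2(running_L - new_L))
    new_weight = torch.exp2(chunk_L - new_L)
    output_acc = output_acc * old_weight + chunk_out * new_weight
    running_L = new_L

print(torch.allclose(output_acc, reference, atol=1e-5))  # True: chunked result matches full attention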
@@ -754,6 +968,11 @@ def flash_attention_fp8(
         scale_factor = scale / default_scale
         query = query * scale_factor

+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     quant_type = QUANT_FP8_E5M2 if use_e5m2 else QUANT_FP8_E4M3
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
@@ -807,6 +1026,11 @@ def flash_attention_int8(
         scale_factor = scale / default_scale
         query = query * scale_factor

+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         QUANT_INT8, is_causal, attn_mask, window_size
@@ -863,6 +1087,11 @@ def flash_attention_nf4(
         scale_factor = scale / default_scale
         query = query * scale_factor

+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         QUANT_NF4, is_causal, attn_mask, window_size
@@ -917,6 +1146,11 @@ def flash_attention_quantized(
         scale_factor = scale / default_scale
         query = query * scale_factor

+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         quant_type, is_causal, attn_mask, window_size
mps_flash_attn/csrc/mps_flash_attn.mm
CHANGED

@@ -14,6 +14,8 @@
 #include <dlfcn.h>
 #include <string>
 #include <vector>
+#include <mutex>
+#include <atomic>

 // ============================================================================
 // MFA Bridge Function Types
@@ -67,7 +69,8 @@ static mfa_forward_fn g_mfa_forward = nullptr;
 static mfa_backward_fn g_mfa_backward = nullptr;
 static mfa_release_kernel_fn g_mfa_release_kernel = nullptr;
 static void* g_dylib_handle = nullptr;
-static bool g_initialized
+static std::atomic<bool> g_initialized{false};
+static std::mutex g_init_mutex;

 // ============================================================================
 // Load MFA Bridge Library
@@ -141,6 +144,24 @@ static bool load_mfa_bridge() {
     return true;
 }

+// Thread-safe initialization helper
+static void ensure_initialized() {
+    // Fast path: already initialized
+    if (g_initialized.load(std::memory_order_acquire)) {
+        return;
+    }
+    // Slow path: need to initialize with lock
+    std::lock_guard<std::mutex> lock(g_init_mutex);
+    // Double-check after acquiring lock
+    if (!g_initialized.load(std::memory_order_relaxed)) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized.store(true, std::memory_order_release);
+    }
+}
+
 // ============================================================================
 // Get MTLBuffer from PyTorch MPS Tensor
 // ============================================================================
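Note (illustration, not part of the diff): the new ensure_initialized() is the standard double-checked locking pattern. For readers more at home in Python, the same idea looks roughly like this sketch; the setup function is a hypothetical stand-in for load_mfa_bridge() plus g_mfa_init().

import threading

_initialized = False
_init_lock = threading.Lock()

def _expensive_one_time_setup():
    # hypothetical stand-in for loading the bridge dylib and initializing MFA
    pass

def ensure_initialized():
    global _initialized
    if _initialized:              # fast path: no lock once set up
        return
    with _init_lock:              # slow path: serialize first-time initialization
        if not _initialized:      # double-check after acquiring the lock
            _expensive_one_time_setup()
            _initialized = True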
@@ -359,14 +380,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     const c10::optional<at::Tensor>& attn_mask, // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
     int64_t window_size // 0 = full attention, >0 = sliding window
 ) {
-    //
-
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();

     // Validate inputs
     TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
@@ -562,14 +577,8 @@ at::Tensor mps_flash_attention_forward_with_bias(
     int64_t window_size,
     int64_t bias_repeat_count // >0 means bias repeats every N batches (for window attention)
 ) {
-    //
-
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();

     // Check that v6/v7 API is available
     TORCH_CHECK(g_mfa_create_kernel_v6 || g_mfa_create_kernel_v7,
@@ -735,14 +744,8 @@ at::Tensor mps_flash_attention_forward_quantized(
     const c10::optional<at::Tensor>& attn_mask,
     int64_t window_size
 ) {
-    //
-
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();

     // Check that v4 API is available
     TORCH_CHECK(g_mfa_create_kernel_v4, "Quantized attention requires MFA v4 API (update libMFABridge.dylib)");
@@ -992,14 +995,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     int64_t window_size, // 0 = full attention, >0 = sliding window
     bool bf16_backward // true = use BF16 intermediates for ~2x faster backward
 ) {
-    //
-
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();

     // Validate inputs
     TORCH_CHECK(grad_output.dim() == 4, "grad_output must be 4D (B, H, N, D)");
{mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/RECORD
CHANGED

@@ -1,7 +1,7 @@
-mps_flash_attn/_C.cpython-314-darwin.so,sha256=
-mps_flash_attn/__init__.py,sha256=
+mps_flash_attn/_C.cpython-314-darwin.so,sha256=GtWa4KIcynqjbCQYw-uTBpkX5NTcxyKuK0APoGnRlQM,313448
+mps_flash_attn/__init__.py,sha256=u6B_WenZOTk1WMe9u4-PbyPy7pt8NValR5i8Oz6bI-U,49252
 mps_flash_attn/benchmark.py,sha256=qHhvb8Dmh07OEa_iXuPuJSEnRJlrjVF5nKzVwbWypWE,24141
-mps_flash_attn/csrc/mps_flash_attn.mm,sha256=
+mps_flash_attn/csrc/mps_flash_attn.mm,sha256=mR4S8SHLtRiksrmoFH6s2118q662SMNlFU8HmxAE3YY,51204
 mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib,sha256=_oig6f2I6ZxBCKWbJF3ofmZMySm8gB399_M-lD2NOfM,13747
 mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib,sha256=1fsmVvB5EubhN-y6s5CB-eVk_wuO2tfrabiQTwXvJJc,13171
 mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib,sha256=5WKo_yAU-PgmulBUQhnzvt0DZRteVmo4-nc4U-T6G2g,17507

@@ -26,8 +26,8 @@ mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e
 mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib,sha256=qyOaQtRVwL_Wc6GGdu6z-ftf0iX84XexuY09-lNLl5o,13747
 mps_flash_attn/kernels/manifest.json,sha256=d5MkE_BjqDQuMNm1jZiwWkQKfB-yfFml3lLSeR-wCLo,1867
 mps_flash_attn/lib/libMFABridge.dylib,sha256=iKgfYISSKMSNt_iXnljjUr_hZZHyCAg2tdS3_ZjmLkc,605696
-mps_flash_attn-0.2.
-mps_flash_attn-0.2.
-mps_flash_attn-0.2.
-mps_flash_attn-0.2.
-mps_flash_attn-0.2.
+mps_flash_attn-0.3.2.dist-info/licenses/LICENSE,sha256=F_XmXSab2O-hHcqLpYJWeFaqB6GA_qiTEN23p2VfZWU,1237
+mps_flash_attn-0.3.2.dist-info/METADATA,sha256=vhcu8d8NdzmuQbOqVUpzacXJF__Eu-BW1C7Em_CNoyg,5834
+mps_flash_attn-0.3.2.dist-info/WHEEL,sha256=uAzMRtb2noxPlbYLbRgeD25pPgKOo3k59IS71Dg5Qjs,110
+mps_flash_attn-0.3.2.dist-info/top_level.txt,sha256=zbArDcWhJDnJfMUKnOUhs5TjsMgSxa2GzOlscTRfobE,15
+mps_flash_attn-0.3.2.dist-info/RECORD,,

{mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/WHEEL: File without changes
{mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/licenses/LICENSE: File without changes
{mps_flash_attn-0.2.8.dist-info → mps_flash_attn-0.3.2.dist-info}/top_level.txt: File without changes