mps-flash-attn 0.2.6-cp314-cp314-macosx_15_0_arm64.whl → 0.3.7-cp314-cp314-macosx_15_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
mps_flash_attn/__init__.py
@@ -4,13 +4,43 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.2.5"
+__version__ = "0.3.7"
+
+__all__ = [
+    # Core functions
+    "flash_attention",
+    "flash_attention_with_bias",
+    "flash_attention_chunked",
+    # Quantized attention
+    "flash_attention_fp8",
+    "flash_attention_int8",
+    "flash_attention_nf4",
+    "quantize_kv_fp8",
+    "quantize_kv_int8",
+    "quantize_kv_nf4",
+    # Utilities
+    "replace_sdpa",
+    "precompile",
+    "clear_cache",
+    "register_custom_op",
+    "is_available",
+    "convert_mask",
+    # Constants
+    "QUANT_FP8_E4M3",
+    "QUANT_FP8_E5M2",
+    "QUANT_INT8",
+    "QUANT_NF4",
+    # Version
+    "__version__",
+]
 
 import torch
-from typing import Optional
+import torch.nn.functional as F
+from typing import Optional, Tuple
 import math
 import threading
 import os
+import warnings
 
 # Try to import the C++ extension
 try:
@@ -30,6 +60,20 @@ def is_available() -> bool:
     return _HAS_MFA and torch.backends.mps.is_available()
 
 
+def _ensure_contiguous(tensor: torch.Tensor, name: str) -> torch.Tensor:
+    """Ensure tensor is contiguous, with a debug warning if conversion needed."""
+    if tensor.is_contiguous():
+        return tensor
+    # Auto-convert with debug info
+    if os.environ.get("MFA_DEBUG", "0") == "1":
+        warnings.warn(
+            f"MFA: {name} tensor was not contiguous (stride={tensor.stride()}), "
+            f"auto-converting. For best performance, ensure inputs are contiguous.",
+            UserWarning
+        )
+    return tensor.contiguous()
+
+
 def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     """
     Convert attention mask to MFA's boolean format.
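Note: the `_ensure_contiguous` helper added above only warns (under `MFA_DEBUG=1`) and then falls back to `.contiguous()`. A minimal standalone sketch, using plain PyTorch on CPU and not part of the package, of how a non-contiguous input typically arises and what that fallback does:

    import torch

    x = torch.randn(2, 8, 128, 64)          # (B, H, N, D), contiguous
    t = x.transpose(1, 2)                   # a strided view, no longer contiguous
    print(t.is_contiguous())                # False
    print(t.contiguous().is_contiguous())   # True - the same conversion the helper performs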
@@ -54,6 +98,98 @@ def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
     return attn_mask <= -1e3
 
 
+def _validate_and_expand_mask(
+    attn_mask: Optional[torch.Tensor],
+    B: int,
+    H: int,
+    N_q: int,
+    N_kv: int,
+) -> Optional[torch.Tensor]:
+    """
+    Validate attention mask shape and expand broadcast dimensions.
+
+    Args:
+        attn_mask: Optional mask of shape (B, H, N_q, N_kv) or broadcastable
+        B: Batch size
+        H: Number of heads
+        N_q: Query sequence length
+        N_kv: Key/Value sequence length
+
+    Returns:
+        Expanded mask of shape (mb, mh, N_q, N_kv) or None
+    """
+    if attn_mask is None:
+        return None
+
+    attn_mask = _ensure_contiguous(attn_mask, "attn_mask")
+
+    if attn_mask.dim() != 4:
+        raise ValueError(f"attn_mask must be 4D (B, H, N_q, N_kv), got {attn_mask.dim()}D")
+
+    mb, mh, mq, mk = attn_mask.shape
+
+    # Allow broadcast: mq can be 1 (applies same mask to all query positions) or N_q
+    if (mq != 1 and mq != N_q) or (mk != 1 and mk != N_kv):
+        raise ValueError(
+            f"attn_mask shape mismatch: mask is ({mq}, {mk}) but expected ({N_q}, {N_kv}) or broadcastable (1, {N_kv})"
+        )
+
+    # Expand broadcast mask to full shape for Metal kernel
+    if mq == 1 and N_q > 1:
+        attn_mask = attn_mask.expand(mb, mh, N_q, mk)
+    if mk == 1 and N_kv > 1:
+        attn_mask = attn_mask.expand(mb, mh, mq if mq > 1 else N_q, N_kv)
+
+    if mb != 1 and mb != B:
+        raise ValueError(f"attn_mask batch size must be 1 or {B}, got {mb}")
+    if mh != 1 and mh != H:
+        raise ValueError(f"attn_mask head count must be 1 or {H}, got {mh}")
+
+    return attn_mask
+
+
+class FlashAttentionWithBiasFunction(torch.autograd.Function):
+    """Autograd function for Flash Attention with bias - native C++ backward."""
+
+    @staticmethod
+    def forward(ctx, query, key, value, attn_bias, is_causal, scale, window_size, bias_repeat_count):
+        # Apply scale if provided
+        scale_factor = 1.0
+        if scale is not None:
+            default_scale = 1.0 / math.sqrt(query.shape[-1])
+            if abs(scale - default_scale) > 1e-6:
+                scale_factor = scale / default_scale
+                query = query * scale_factor
+
+        # Call C++ forward with bias (returns output and logsumexp)
+        output, logsumexp = _C.forward_with_bias_lse(query, key, value, attn_bias, is_causal, window_size, bias_repeat_count)
+
+        # Save for backward
+        ctx.save_for_backward(query, key, value, output, logsumexp, attn_bias)
+        ctx.is_causal = is_causal
+        ctx.scale_factor = scale_factor
+        ctx.window_size = window_size
+        ctx.bias_repeat_count = bias_repeat_count
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        query, key, value, output, logsumexp, attn_bias = ctx.saved_tensors
+
+        # Call native C++ backward with bias
+        dQ, dK, dV = _C.backward_with_bias(
+            grad_output, query, key, value, output, logsumexp, attn_bias,
+            ctx.is_causal, ctx.window_size, ctx.bias_repeat_count
+        )
+
+        # Scale dQ back if we scaled query
+        if ctx.scale_factor != 1.0:
+            dQ = dQ * ctx.scale_factor
+
+        return dQ, dK, dV, None, None, None, None, None
+
+
 class FlashAttentionFunction(torch.autograd.Function):
     """Autograd function for Flash Attention with backward pass support."""
 
@@ -176,6 +312,20 @@ def flash_attention(
     if not torch.backends.mps.is_available():
         raise RuntimeError("MPS not available")
 
+    # Validate scale parameter
+    if scale is not None:
+        if scale <= 0:
+            raise ValueError(f"scale must be positive, got {scale}")
+        # Warn about extreme scale values that could cause numerical issues
+        default_scale = 1.0 / math.sqrt(query.shape[-1])
+        if scale < default_scale * 0.01 or scale > default_scale * 100:
+            warnings.warn(
+                f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                "this may cause numerical issues",
+                UserWarning,
+                stacklevel=2
+            )
+
     # Validate device
     if query.device.type != 'mps':
         raise ValueError("query must be on MPS device")
@@ -186,6 +336,24 @@ def flash_attention(
     if attn_mask is not None and attn_mask.device.type != 'mps':
         raise ValueError("attn_mask must be on MPS device")
 
+    # Ensure contiguous (auto-convert with debug warning)
+    query = _ensure_contiguous(query, "query")
+    key = _ensure_contiguous(key, "key")
+    value = _ensure_contiguous(value, "value")
+
+    # Validate tensor dimensions
+    if query.dim() != 4:
+        raise RuntimeError(f"query must be 4D (B, H, N, D), got {query.dim()}D")
+    if key.dim() != 4:
+        raise RuntimeError(f"key must be 4D (B, H, N, D), got {key.dim()}D")
+    if value.dim() != 4:
+        raise RuntimeError(f"value must be 4D (B, H, N, D), got {value.dim()}D")
+
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
     if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
         # Apply scale if provided
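The validation added above expects 4D `(B, H, N, D)` inputs and accepts masks that broadcast over batch, heads, or query positions. A minimal sketch, plain PyTorch and not part of the package, of preparing inputs in that layout and a key-padding mask in the broadcastable form that `_validate_and_expand_mask` expands for the kernel (True = masked, per the MFA convention):

    import torch

    B, H, N_q, N_kv, D = 2, 8, 16, 16, 64
    q = torch.randn(B, N_q, H, D).transpose(1, 2).contiguous()   # to (B, H, N_q, D)
    key_padding = torch.zeros(B, 1, 1, N_kv, dtype=torch.bool)   # broadcastable mask
    key_padding[:, :, :, -4:] = True                             # mask the last 4 key positions
    full = key_padding.expand(B, H, N_q, N_kv)                   # what the kernel ultimately sees
    print(q.shape, full.shape)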
@@ -213,39 +381,107 @@ def replace_sdpa():
     import torch.nn.functional as F
 
     original_sdpa = F.scaled_dot_product_attention
+    _debug = os.environ.get("MFA_DEBUG", "0") == "1"
+    _call_count = [0]  # mutable for closure
+    _fallback_count = [0]  # track fallbacks for warning
+    _last_fallback_error = [None]
 
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None, enable_gqa=False, **kwargs):
         # Use MFA for MPS tensors without dropout
-        # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
+        # Only use MFA for seq_len >= 512 where it outperforms PyTorch's math backend
         # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
         # Benchmark results (B=1-4, H=8, D=64-128, fp16/bf16):
-        # seq=512: 0.3-0.5x (MFA slower)
-        # seq=1024: 1.1-2.0x (MFA faster)
-        # seq=2048: 1.7-3.7x (MFA much faster)
-        # seq=4096: 2.0-3.9x (MFA much faster)
+        # seq=512: 1.2-1.6x (MFA faster)
+        # seq=1024: 2.3-3.7x (MFA much faster)
+        # seq=2048: 2.2-3.9x (MFA much faster)
+        # seq=4096: 2.1-3.7x (MFA much faster)
+        # Determine seq_len based on tensor dimensionality
+        # 4D: (B, H, S, D) -> seq_len = shape[2]
+        # 3D: (B, S, D) -> seq_len = shape[1] (single-head attention, e.g., VAE)
+        is_3d = query.ndim == 3
+        seq_len = query.shape[1] if is_3d else query.shape[2]
+
         if (query.device.type == 'mps' and
             dropout_p == 0.0 and
             _HAS_MFA and
-            query.shape[2] >= 1024):
+            query.ndim >= 3 and
+            seq_len >= 512):
             try:
+                q, k, v = query, key, value
+
+                # Handle 3D tensors (B, S, D) - treat as single-head attention
+                # Unsqueeze to (B, 1, S, D) for MFA, squeeze back after
+                if is_3d:
+                    q = q.unsqueeze(1)  # (B, S, D) -> (B, 1, S, D)
+                    k = k.unsqueeze(1)
+                    v = v.unsqueeze(1)
+
+                # Handle GQA (Grouped Query Attention) - expand K/V heads to match Q heads
+                # Common in Llama 2/3, Mistral, Qwen, etc.
+                # NOTE: Always expand when heads mismatch, not just when enable_gqa=True
+                # Transformers may pass enable_gqa=True on MPS (torch>=2.5, no mask) even though
+                # MPS SDPA doesn't support native GQA - we handle it here
+                if q.shape[1] != k.shape[1]:
+                    # Expand KV heads: (B, kv_heads, S, D) -> (B, q_heads, S, D)
+                    n_rep = q.shape[1] // k.shape[1]
+                    k = k.repeat_interleave(n_rep, dim=1)
+                    v = v.repeat_interleave(n_rep, dim=1)
+
                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
                 # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
                 mfa_mask = None
                 if attn_mask is not None:
+                    if _debug:
+                        print(f"[MFA MASK] dtype={attn_mask.dtype} shape={tuple(attn_mask.shape)} min={attn_mask.min().item():.2f} max={attn_mask.max().item():.2f}")
                     if attn_mask.dtype == torch.bool:
-                        # Boolean mask: True means masked (don't attend)
-                        mfa_mask = attn_mask
+                        # PyTorch SDPA bool mask: True = ATTEND, False = MASKED
+                        # MFA bool mask: True = MASKED, False = ATTEND
+                        # They're opposite! Invert it.
+                        mfa_mask = ~attn_mask
                     else:
                         # Float mask: typically -inf for masked positions, 0 for unmasked
                        # Convert: positions with large negative values -> True (masked)
                         # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
                         mfa_mask = attn_mask <= -1e3
-                return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
-            except Exception:
-                # Fall back to original on any error
-                pass
+                    if _debug:
+                        print(f"[MFA MASK] converted: True(masked)={mfa_mask.sum().item()} False(attend)={(~mfa_mask).sum().item()}")
+
+                out = flash_attention(q, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
+
+                # Squeeze back for 3D input
+                if is_3d:
+                    out = out.squeeze(1)  # (B, 1, S, D) -> (B, S, D)
+
+                if _debug:
+                    _call_count[0] += 1
+                    print(f"[MFA #{_call_count[0]}] shape={tuple(query.shape)} is_3d={is_3d} gqa={enable_gqa} mask={attn_mask is not None} causal={is_causal}")
+
+                return out
+            except Exception as e:
+                # Fall back to original on any error, but track it
+                _fallback_count[0] += 1
+                _last_fallback_error[0] = str(e)
+                if _debug:
+                    import traceback
+                    print(f"[MFA FALLBACK #{_fallback_count[0]}] shape={tuple(query.shape)}\n{traceback.format_exc()}")
+                # Warn user after repeated fallbacks (likely a real problem)
+                if _fallback_count[0] == 10:
+                    warnings.warn(
+                        f"MFA has fallen back to native SDPA {_fallback_count[0]} times. "
+                        f"Last error: {_last_fallback_error[0]}. "
+                        f"Set MFA_DEBUG=1 for details.",
+                        UserWarning
+                    )
+
+        if _debug and query.device.type == 'mps':
+            _call_count[0] += 1
+            reason = []
+            if dropout_p != 0.0: reason.append(f"dropout={dropout_p}")
+            if query.ndim < 3: reason.append(f"ndim={query.ndim}")
+            if seq_len < 512: reason.append(f"seq={seq_len}<512")
+            print(f"[NATIVE #{_call_count[0]}] shape={tuple(query.shape)} reason={','.join(reason) or 'unknown'}")
 
         return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale=scale, enable_gqa=enable_gqa, **kwargs)
 
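Two conversions inside `patched_sdpa` are easy to get wrong, so here is a small self-contained sketch (plain PyTorch, no MFA required, not part of the package) of both: inverting an SDPA-style boolean mask (True = attend) into the MFA convention (True = masked), and expanding grouped K/V heads so they match the query head count:

    import torch

    # 1) Boolean mask convention flip
    sdpa_bool_mask = torch.tensor([[True, True, False]])   # SDPA: attend, attend, mask
    mfa_mask = ~sdpa_bool_mask                              # MFA:  False, False, True

    # 2) GQA expansion: 2 KV heads serving 8 query heads
    B, S, D = 1, 16, 64
    q = torch.randn(B, 8, S, D)
    k = torch.randn(B, 2, S, D)
    n_rep = q.shape[1] // k.shape[1]
    k_expanded = k.repeat_interleave(n_rep, dim=1)          # (B, 8, S, D)
    print(mfa_mask.tolist(), k_expanded.shape)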
@@ -302,6 +538,7 @@ def flash_attention_with_bias(
     value: torch.Tensor,
     attn_bias: torch.Tensor,
     is_causal: bool = False,
+    scale: Optional[float] = None,
     window_size: int = 0,
     bias_repeat_count: int = 0,
 ) -> torch.Tensor:
@@ -309,13 +546,15 @@ def flash_attention_with_bias(
     Compute scaled dot-product attention with additive attention bias.
 
     This function supports additive attention bias (like relative position encodings
-    or ALiBi) which is added to the attention scores before softmax:
+    or ALiBi) which is added to the attention scores:
+
+        Attention(Q, K, V) = softmax((Q @ K.T + bias) * scale) @ V
 
-        Attention(Q, K, V) = softmax((Q @ K.T) / sqrt(d) + bias) @ V
+    IMPORTANT: The bias is added to UNSCALED scores, then the sum is scaled.
+    This differs from PyTorch SDPA which does: softmax((Q @ K.T) * scale + bias).
 
-    IMPORTANT: MFA adds bias to UNSCALED scores internally and scales during softmax.
-    If your bias was computed for scaled scores (like PyTorch SDPA), you need to
-    pre-scale it by multiplying by sqrt(head_dim).
+    To convert from SDPA-style bias to MFA-style:
+        bias_mfa = bias_sdpa * sqrt(head_dim)  # when using default scale
 
     Args:
         query: Query tensor of shape (B, H, N_q, D)
@@ -326,6 +565,7 @@ def flash_attention_with_bias(
             - (1, H, N_q, N_kv): Broadcast across batch
            - (H, N_q, N_kv): Broadcast across batch (3D)
         is_causal: If True, applies causal masking
+        scale: Scaling factor for attention scores. Default: 1/sqrt(head_dim)
         window_size: Sliding window attention size (0 = full attention)
         bias_repeat_count: If > 0, the bias tensor repeats every N batches.
             Useful for window attention where multiple windows share the same
@@ -343,10 +583,13 @@ def flash_attention_with_bias(
         >>> v = torch.randn(4, 8, 64, 64, device='mps', dtype=torch.float16)
         >>> # Position bias: (1, num_heads, seq_len, seq_len)
         >>> bias = torch.randn(1, 8, 64, 64, device='mps', dtype=torch.float16)
-        >>> # Pre-scale bias since MFA uses unscaled scores
+        >>> # Pre-scale bias since default scale is 1/sqrt(head_dim)
         >>> scaled_bias = bias * math.sqrt(64)  # sqrt(head_dim)
         >>> out = flash_attention_with_bias(q, k, v, scaled_bias)
 
+        >>> # With custom scale
+        >>> out = flash_attention_with_bias(q, k, v, bias, scale=0.1)
+
         >>> # Window attention with repeating bias pattern
         >>> n_windows = 16
         >>> q = torch.randn(n_windows * 4, 8, 49, 64, device='mps', dtype=torch.float16)
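The pre-scaling rule documented above follows from softmax algebra: with the default scale s = 1/sqrt(D), adding a bias after scaling is the same as adding bias·sqrt(D) before scaling. A short numerical check, plain PyTorch and independent of the kernel (not part of the package):

    import math
    import torch

    D = 64
    s = 1.0 / math.sqrt(D)
    scores = torch.randn(4, 4)                       # stand-in for Q @ K.T
    bias_sdpa = torch.randn(4, 4)
    lhs = torch.softmax(scores * s + bias_sdpa, dim=-1)                    # SDPA convention
    rhs = torch.softmax((scores + bias_sdpa * math.sqrt(D)) * s, dim=-1)   # MFA convention
    print(torch.allclose(lhs, rhs, atol=1e-6))       # True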
@@ -363,6 +606,20 @@ def flash_attention_with_bias(
     if not torch.backends.mps.is_available():
         raise RuntimeError("MPS not available")
 
+    # Validate scale parameter
+    if scale is not None:
+        if scale <= 0:
+            raise ValueError(f"scale must be positive, got {scale}")
+        # Warn about extreme scale values
+        default_scale = 1.0 / math.sqrt(query.shape[-1])
+        if scale < default_scale * 0.01 or scale > default_scale * 100:
+            warnings.warn(
+                f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                "this may cause numerical issues",
+                UserWarning,
+                stacklevel=2
+            )
+
     # Validate device
     if query.device.type != 'mps':
         raise ValueError("query must be on MPS device")
@@ -373,7 +630,10 @@ def flash_attention_with_bias(
     if attn_bias.device.type != 'mps':
         raise ValueError("attn_bias must be on MPS device")
 
-    return _C.forward_with_bias(query, key, value, attn_bias, is_causal, window_size, bias_repeat_count)
+    # Use autograd Function for backward support
+    return FlashAttentionWithBiasFunction.apply(
+        query, key, value, attn_bias, is_causal, scale, window_size, bias_repeat_count
+    )
 
 
 def flash_attention_chunked(
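With the autograd wiring above, gradients now flow through the biased attention path. A usage sketch, assuming an Apple Silicon machine with the extension importable as `mps_flash_attn` (illustrative, not part of the diff):

    import math
    import torch
    from mps_flash_attn import flash_attention_with_bias

    q = torch.randn(1, 8, 64, 64, device="mps", dtype=torch.float16, requires_grad=True)
    k = torch.randn(1, 8, 64, 64, device="mps", dtype=torch.float16, requires_grad=True)
    v = torch.randn(1, 8, 64, 64, device="mps", dtype=torch.float16, requires_grad=True)
    bias = torch.randn(1, 8, 64, 64, device="mps", dtype=torch.float16) * math.sqrt(64)

    out = flash_attention_with_bias(q, k, v, bias)
    out.sum().backward()
    print(q.grad.shape)   # torch.Size([1, 8, 64, 64]); note the bias itself receives no gradient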
@@ -452,13 +712,13 @@ def flash_attention_chunked(
         return _C.forward(query, key, value, is_causal, None, 0)
 
     # Initialize running statistics for online softmax
-    # m = running max, l = running sum of exp, acc = accumulated output
     device = query.device
     dtype = query.dtype
 
     # Use float32 for numerical stability of softmax statistics
-    running_max = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
-    running_sum = torch.zeros((B, H, seq_len_q, 1), device=device, dtype=torch.float32)
+    # running_L: base-2 logsumexp of all attention scores seen so far (-inf means no data yet)
+    # output_acc: weighted combination of outputs (weights sum to 1 after each update)
+    running_L = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
    output_acc = torch.zeros((B, H, seq_len_q, D), device=device, dtype=torch.float32)
 
    # Process K/V in chunks
@@ -479,51 +739,81 @@ def flash_attention_chunked(
         # - Partial chunk (up to q) if start_idx <= q < end_idx
         # - None of chunk if q < start_idx
 
-        chunk_is_causal = is_causal and (end_idx <= seq_len_q)
+        if is_causal:
+            # Create explicit causal mask for this chunk
+            # Query positions: 0 to seq_len_q-1
+            # Key positions in chunk: start_idx to end_idx-1
+            chunk_len = end_idx - start_idx
 
-        # Compute attention for this chunk
-        # forward_with_lse returns (output, logsumexp) where logsumexp = m + log(l)
-        chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, chunk_is_causal, None, 0)
+            # Build mask: mask[q, k_local] = True means DON'T attend
+            # We want to attend when global_k_pos <= q
+            # global_k_pos = start_idx + k_local
+            # So: attend when start_idx + k_local <= q
+            # mask = start_idx + k_local > q
 
-        # chunk_lse shape: (B, H, seq_len_q)
-        # We need to convert logsumexp to (max, sum) for online algorithm
-        chunk_lse = chunk_lse.unsqueeze(-1)  # (B, H, seq_len_q, 1)
+            q_pos = torch.arange(seq_len_q, device=device).view(1, 1, seq_len_q, 1)
+            k_pos = torch.arange(chunk_len, device=device).view(1, 1, 1, chunk_len) + start_idx
+            causal_mask = k_pos > q_pos  # True = masked (don't attend)
 
-        # Convert chunk output to float32 for accumulation
-        chunk_out = chunk_out.float()
-
-        # Online softmax update:
-        # new_max = max(running_max, chunk_max)
-        # For flash attention, chunk_lse ≈ chunk_max + log(chunk_sum)
-        # We approximate chunk_max ≈ chunk_lse (valid when exp sum dominates)
+            # Expand to batch and heads
+            causal_mask = causal_mask.expand(B, H, seq_len_q, chunk_len)
 
-        chunk_max = chunk_lse  # Approximation: logsumexp max when sum is dominated by max
-
-        # Compute new max
-        new_max = torch.maximum(running_max, chunk_max)
+            # Call forward with explicit mask (is_causal=False since we handle it)
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, causal_mask, 0)
+        else:
+            # Non-causal: just process the chunk directly
+            chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, None, 0)
 
-        # Rescale previous accumulator
-        # correction_old = exp(running_max - new_max)
-        correction_old = torch.exp(running_max - new_max)
-        # Clip to avoid inf * 0 issues when running_max was -inf
-        correction_old = torch.where(running_max == float('-inf'), torch.zeros_like(correction_old), correction_old)
+        # chunk_L shape: (B, H, seq_len_q)
+        # The kernel returns L = m + log2(l) where:
+        # m = max(scores * log2(e) / sqrt(D))
+        # l = sum(exp2(scores * log2(e) / sqrt(D) - m))
+        # This is a base-2 logsumexp: L = log2(sum(exp2(scaled_scores)))
+        chunk_L = chunk_lse.unsqueeze(-1).float()  # (B, H, seq_len_q, 1)
 
-        # Rescale chunk output
-        # correction_new = exp(chunk_max - new_max)
-        correction_new = torch.exp(chunk_max - new_max)
+        # Convert chunk output to float32 for accumulation
+        chunk_out = chunk_out.float()
 
-        # For the sum, we need exp(chunk_lse - new_max) = exp(chunk_max + log(chunk_sum) - new_max)
-        #                                               = exp(chunk_max - new_max) * chunk_sum
-        # But we only have logsumexp, so: exp(chunk_lse - new_max)
-        chunk_sum_scaled = torch.exp(chunk_lse - new_max)
+        # Online softmax algorithm using base-2 representation
+        #
+        # Flash attention returns: chunk_out = softmax(scores) @ V
+        # The output is already normalized. For online combination:
+        # new_L = log2(2^running_L + 2^chunk_L)
+        #       = max(running_L, chunk_L) + log2(2^(running_L - max) + 2^(chunk_L - max))
+        #
+        # The weights for combining outputs are:
+        # old_weight = 2^(running_L - new_L)
+        # new_weight = 2^(chunk_L - new_L)
+        # These weights sum to 1, so: output = old_weight * old_out + new_weight * new_out
+
+        # Compute new base-2 logsumexp
+        max_L = torch.maximum(running_L, chunk_L)
+
+        # Handle -inf case (no previous data)
+        # Use exp2 for base-2 (matches kernel's internal representation)
+        running_exp2 = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - max_L)
+        )
+        chunk_exp2 = torch.exp2(chunk_L - max_L)
+        new_L = max_L + torch.log2(running_exp2 + chunk_exp2)
+
+        # Compute correction factors using base-2 exp
+        old_weight = torch.where(
+            running_L == float('-inf'),
+            torch.zeros_like(running_L),
+            torch.exp2(running_L - new_L)
+        )
+        new_weight = torch.exp2(chunk_L - new_L)
 
         # Update accumulator
-        output_acc = output_acc * correction_old + chunk_out * correction_new
-        running_sum = running_sum * correction_old + chunk_sum_scaled
-        running_max = new_max
+        output_acc = output_acc * old_weight + chunk_out * new_weight
+        running_L = new_L
 
-    # Final normalization
-    output = output_acc / running_sum
+    # No final normalization needed - weights already sum to 1
+    output = output_acc
 
     # Convert back to original dtype
     return output.to(dtype)
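The base-2 merge above can be checked in isolation: each chunk's output is already a normalized softmax average, and the base-2 logsumexps determine mixing weights that sum to 1. A small sketch (plain PyTorch, ignoring the kernel's 1/sqrt(D) scaling, not part of the package) comparing the two-chunk merge against a full softmax:

    import math
    import torch

    torch.manual_seed(0)
    scores = torch.randn(1, 6)                            # one query row, 6 key positions
    values = torch.randn(6, 4)
    s1, s2 = scores[:, :3], scores[:, 3:]
    v1, v2 = values[:3], values[3:]

    out1 = torch.softmax(s1, dim=-1) @ v1                 # normalized chunk outputs
    out2 = torch.softmax(s2, dim=-1) @ v2
    L1 = torch.logsumexp(s1, dim=-1, keepdim=True) / math.log(2)   # base-2 logsumexp
    L2 = torch.logsumexp(s2, dim=-1, keepdim=True) / math.log(2)

    max_L = torch.maximum(L1, L2)
    new_L = max_L + torch.log2(torch.exp2(L1 - max_L) + torch.exp2(L2 - max_L))
    merged = torch.exp2(L1 - new_L) * out1 + torch.exp2(L2 - new_L) * out2

    print(torch.allclose(merged, torch.softmax(scores, dim=-1) @ values, atol=1e-6))   # True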
@@ -745,6 +1035,11 @@ def flash_attention_fp8(
             scale_factor = scale / default_scale
             query = query * scale_factor
 
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     quant_type = QUANT_FP8_E5M2 if use_e5m2 else QUANT_FP8_E4M3
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
@@ -798,6 +1093,11 @@ def flash_attention_int8(
             scale_factor = scale / default_scale
             query = query * scale_factor
 
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         QUANT_INT8, is_causal, attn_mask, window_size
@@ -854,6 +1154,11 @@ def flash_attention_nf4(
             scale_factor = scale / default_scale
             query = query * scale_factor
 
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         QUANT_NF4, is_causal, attn_mask, window_size
@@ -908,6 +1213,11 @@ def flash_attention_quantized(
             scale_factor = scale / default_scale
             query = query * scale_factor
 
+    # Validate and expand broadcast mask
+    B, H, N_q, D = query.shape
+    N_kv = key.shape[2]
+    attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
     return _C.forward_quantized(
         query, key, value, k_scale, v_scale,
         quant_type, is_causal, attn_mask, window_size
mps_flash_attn/csrc/mps_flash_attn.mm
@@ -14,6 +14,8 @@
 #include <dlfcn.h>
 #include <string>
 #include <vector>
+#include <mutex>
+#include <atomic>
 
 // ============================================================================
 // MFA Bridge Function Types
@@ -41,6 +43,9 @@ typedef bool (*mfa_forward_encode_quantized_fn)(void*, void*, void*, void*, void
 typedef bool (*mfa_backward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
                                        int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                        int32_t, int32_t);
+typedef bool (*mfa_backward_encode_bias_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                            int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                            int32_t, int32_t);
 // Legacy sync functions (fallback)
 typedef bool (*mfa_forward_fn)(void*, void*, void*, void*, void*, void*, void*,
                                int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
@@ -63,11 +68,13 @@ static mfa_forward_encode_fn g_mfa_forward_encode = nullptr;
 static mfa_forward_encode_bias_fn g_mfa_forward_encode_bias = nullptr;
 static mfa_forward_encode_quantized_fn g_mfa_forward_encode_quantized = nullptr;
 static mfa_backward_encode_fn g_mfa_backward_encode = nullptr;
+static mfa_backward_encode_bias_fn g_mfa_backward_encode_bias = nullptr;
 static mfa_forward_fn g_mfa_forward = nullptr;
 static mfa_backward_fn g_mfa_backward = nullptr;
 static mfa_release_kernel_fn g_mfa_release_kernel = nullptr;
 static void* g_dylib_handle = nullptr;
-static bool g_initialized = false;
+static std::atomic<bool> g_initialized{false};
+static std::mutex g_init_mutex;
 
 // ============================================================================
 // Load MFA Bridge Library
@@ -129,6 +136,7 @@ static bool load_mfa_bridge() {
     g_mfa_forward_encode = (mfa_forward_encode_fn)dlsym(g_dylib_handle, "mfa_forward_encode");
     g_mfa_forward_encode_bias = (mfa_forward_encode_bias_fn)dlsym(g_dylib_handle, "mfa_forward_encode_bias");
     g_mfa_backward_encode = (mfa_backward_encode_fn)dlsym(g_dylib_handle, "mfa_backward_encode");
+    g_mfa_backward_encode_bias = (mfa_backward_encode_bias_fn)dlsym(g_dylib_handle, "mfa_backward_encode_bias");
     g_mfa_forward = (mfa_forward_fn)dlsym(g_dylib_handle, "mfa_forward");
     g_mfa_backward = (mfa_backward_fn)dlsym(g_dylib_handle, "mfa_backward");
     g_mfa_release_kernel = (mfa_release_kernel_fn)dlsym(g_dylib_handle, "mfa_release_kernel");
@@ -141,6 +149,24 @@ static bool load_mfa_bridge() {
     return true;
 }
 
+// Thread-safe initialization helper
+static void ensure_initialized() {
+    // Fast path: already initialized
+    if (g_initialized.load(std::memory_order_acquire)) {
+        return;
+    }
+    // Slow path: need to initialize with lock
+    std::lock_guard<std::mutex> lock(g_init_mutex);
+    // Double-check after acquiring lock
+    if (!g_initialized.load(std::memory_order_relaxed)) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized.store(true, std::memory_order_release);
+    }
+}
+
 // ============================================================================
 // Get MTLBuffer from PyTorch MPS Tensor
 // ============================================================================
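The `ensure_initialized()` helper above is the classic double-checked locking pattern: a cheap atomic flag check on the fast path, a mutex plus a re-check on the slow path. A Python analogue of the same idea (illustrative only and not part of the package; the C++ version additionally relies on acquire/release ordering):

    import threading

    _initialized = False
    _init_lock = threading.Lock()

    def _expensive_init():
        print("initializing once")

    def ensure_initialized():
        global _initialized
        if _initialized:              # fast path: no lock
            return
        with _init_lock:              # slow path: serialize initializers
            if not _initialized:      # re-check under the lock
                _expensive_init()
                _initialized = True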
@@ -359,14 +385,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     const c10::optional<at::Tensor>& attn_mask, // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
     int64_t window_size // 0 = full attention, >0 = sliding window
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
 
     // Validate inputs
     TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
@@ -562,14 +582,8 @@ at::Tensor mps_flash_attention_forward_with_bias(
     int64_t window_size,
     int64_t bias_repeat_count // >0 means bias repeats every N batches (for window attention)
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
 
     // Check that v6/v7 API is available
     TORCH_CHECK(g_mfa_create_kernel_v6 || g_mfa_create_kernel_v7,
@@ -630,9 +644,13 @@ at::Tensor mps_flash_attention_forward_with_bias(
         q = q.to(at::kFloat);
         k = k.to(at::kFloat);
         v = v.to(at::kFloat);
-        bias = bias.to(at::kFloat);
     }
 
+    // IMPORTANT: Bias is always FP32 in the Metal kernel (registerPrecisions[.S] = .FP32
+    // when lowPrecisionIntermediates = false, which is the common case)
+    // Always convert bias to FP32 to match the kernel's expected type
+    bias = bias.to(at::kFloat);
+
     // Allocate output
     at::Tensor output;
     if (use_bf16_kernel) {
@@ -711,6 +729,156 @@ at::Tensor mps_flash_attention_forward_with_bias(
     return output;
 }
 
+// Forward with bias returning both output and logsumexp (for backward pass)
+std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_bias_lse(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& attn_bias,
+    bool is_causal,
+    int64_t window_size,
+    int64_t bias_repeat_count
+) {
+    // Thread-safe initialization
+    ensure_initialized();
+
+    // Check that v6/v7 API is available
+    TORCH_CHECK(g_mfa_create_kernel_v6 || g_mfa_create_kernel_v7,
+                "Attention bias requires MFA v6+ API (update libMFABridge.dylib)");
+    TORCH_CHECK(g_mfa_forward_encode_bias,
+                "Attention bias requires mfa_forward_encode_bias");
+
+    // Validate inputs
+    TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
+    TORCH_CHECK(key.dim() == 4, "Key must be 4D (B, H, N, D)");
+    TORCH_CHECK(value.dim() == 4, "Value must be 4D (B, H, N, D)");
+    TORCH_CHECK(query.device().is_mps(), "Query must be on MPS device");
+    TORCH_CHECK(key.device().is_mps(), "Key must be on MPS device");
+    TORCH_CHECK(value.device().is_mps(), "Value must be on MPS device");
+    TORCH_CHECK(attn_bias.device().is_mps(), "Attention bias must be on MPS device");
+
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t seq_len_q = query.size(2);
+    const int64_t head_dim = query.size(3);
+    const int64_t seq_len_kv = key.size(2);
+
+    // Determine bias strides for broadcasting
+    uint32_t bias_batch_stride = 0;
+    uint32_t bias_head_stride = static_cast<uint32_t>(seq_len_q * seq_len_kv);
+
+    if (attn_bias.dim() == 4) {
+        if (attn_bias.size(0) > 1) {
+            bias_batch_stride = static_cast<uint32_t>(attn_bias.size(1) * seq_len_q * seq_len_kv);
+        }
+        if (attn_bias.size(1) == 1) {
+            bias_head_stride = 0;
+        }
+    } else if (attn_bias.dim() == 3) {
+        bias_batch_stride = 0;
+        if (attn_bias.size(0) == 1) {
+            bias_head_stride = 0;
+        }
+    }
+
+    // Determine precision
+    bool is_bfloat16 = (query.scalar_type() == at::kBFloat16);
+    bool is_fp16 = (query.scalar_type() == at::kHalf);
+    bool use_bf16_kernel = is_bfloat16 && g_mfa_create_kernel_v2;
+    bool low_precision = is_fp16;
+    bool low_precision_outputs = is_fp16 || use_bf16_kernel;
+
+    // Make inputs contiguous
+    auto q = query.contiguous();
+    auto k = key.contiguous();
+    auto v = value.contiguous();
+    auto bias = attn_bias.contiguous();
+
+    // For BF16 without native kernel, convert to FP32
+    if (is_bfloat16 && !use_bf16_kernel) {
+        q = q.to(at::kFloat);
+        k = k.to(at::kFloat);
+        v = v.to(at::kFloat);
+    }
+
+    // Bias is always FP32 in the Metal kernel
+    bias = bias.to(at::kFloat);
+
+    // Allocate output
+    at::Tensor output;
+    if (use_bf16_kernel) {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kBFloat16));
+    } else if (low_precision_outputs) {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kHalf));
+    } else {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kFloat));
+    }
+
+    // Allocate logsumexp (always fp32)
+    auto logsumexp = at::empty({batch_size, num_heads, seq_len_q},
+                               query.options().dtype(at::kFloat));
+
+    // Get or create kernel with bias support
+    void* kernel = get_or_create_kernel(
+        seq_len_q, seq_len_kv, head_dim,
+        low_precision, low_precision_outputs, is_causal, false,
+        use_bf16_kernel,
+        static_cast<uint32_t>(window_size > 0 ? window_size : 0),
+        0, false, true,
+        bias_batch_stride, bias_head_stride,
+        static_cast<uint32_t>(bias_repeat_count > 0 ? bias_repeat_count : 0)
+    );
+
+    // Get Metal buffers
+    auto q_info = getBufferInfo(q);
+    auto k_info = getBufferInfo(k);
+    auto v_info = getBufferInfo(v);
+    auto o_info = getBufferInfo(output);
+    auto l_info = getBufferInfo(logsumexp);
+    auto bias_info = getBufferInfo(bias);
+
+    // Execute
+    @autoreleasepool {
+        auto stream = at::mps::getCurrentMPSStream();
+        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();
+
+        bool success = g_mfa_forward_encode_bias(
+            kernel,
+            (__bridge void*)encoder,
+            (__bridge void*)q_info.buffer,
+            (__bridge void*)k_info.buffer,
+            (__bridge void*)v_info.buffer,
+            (__bridge void*)o_info.buffer,
+            (__bridge void*)l_info.buffer,
+            nullptr,
+            (__bridge void*)bias_info.buffer,
+            q_info.byte_offset,
+            k_info.byte_offset,
+            v_info.byte_offset,
+            o_info.byte_offset,
+            l_info.byte_offset,
+            0,
+            bias_info.byte_offset,
+            static_cast<int32_t>(batch_size),
+            static_cast<int32_t>(num_heads)
+        );
+
+        if (!success) {
+            throw std::runtime_error("MFA forward with bias failed");
+        }
+    }
+
+    // Convert output back to BF16 if needed
+    if (is_bfloat16 && !use_bf16_kernel) {
+        output = output.to(at::kBFloat16);
+    }
+
+    return std::make_tuple(output, logsumexp);
+}
+
 // ============================================================================
 // Quantized Flash Attention Forward (FP8, INT8, NF4)
 // ============================================================================
@@ -735,14 +903,8 @@ at::Tensor mps_flash_attention_forward_quantized(
     const c10::optional<at::Tensor>& attn_mask,
     int64_t window_size
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
 
     // Check that v4 API is available
     TORCH_CHECK(g_mfa_create_kernel_v4, "Quantized attention requires MFA v4 API (update libMFABridge.dylib)");
@@ -992,14 +1154,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     int64_t window_size, // 0 = full attention, >0 = sliding window
     bool bf16_backward // true = use BF16 intermediates for ~2x faster backward
 ) {
-    // Initialize MFA on first call
-    if (!g_initialized) {
-        load_mfa_bridge();
-        if (!g_mfa_init()) {
-            throw std::runtime_error("Failed to initialize MFA");
-        }
-        g_initialized = true;
-    }
+    // Thread-safe initialization
+    ensure_initialized();
 
     // Validate inputs
     TORCH_CHECK(grad_output.dim() == 4, "grad_output must be 4D (B, H, N, D)");
@@ -1160,6 +1316,146 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     return std::make_tuple(dQ, dK, dV);
 }
 
+// Backward with bias support
+std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward_with_bias(
+    const at::Tensor& grad_output,
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    const at::Tensor& output,
+    const at::Tensor& logsumexp,
+    const at::Tensor& attn_bias,
+    bool is_causal,
+    int64_t window_size,
+    int64_t bias_repeat_count
+) {
+    ensure_initialized();
+
+    TORCH_CHECK(g_mfa_backward_encode_bias,
+                "Backward with bias requires mfa_backward_encode_bias (update libMFABridge.dylib)");
+
+    TORCH_CHECK(grad_output.dim() == 4, "grad_output must be 4D (B, H, N, D)");
+    TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
+    TORCH_CHECK(key.dim() == 4, "Key must be 4D (B, H, N, D)");
+    TORCH_CHECK(value.dim() == 4, "Value must be 4D (B, H, N, D)");
+    TORCH_CHECK(output.dim() == 4, "Output must be 4D (B, H, N, D)");
+    TORCH_CHECK(logsumexp.dim() == 3, "Logsumexp must be 3D (B, H, N)");
+
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t seq_len_q = query.size(2);
+    const int64_t head_dim = query.size(3);
+    const int64_t seq_len_kv = key.size(2);
+
+    // Determine bias strides
+    uint32_t bias_batch_stride = 0;
+    uint32_t bias_head_stride = static_cast<uint32_t>(seq_len_q * seq_len_kv);
+
+    if (attn_bias.dim() == 4) {
+        if (attn_bias.size(0) > 1) {
+            bias_batch_stride = static_cast<uint32_t>(attn_bias.size(1) * seq_len_q * seq_len_kv);
+        }
+        if (attn_bias.size(1) == 1) {
+            bias_head_stride = 0;
+        }
+    } else if (attn_bias.dim() == 3) {
+        bias_batch_stride = 0;
+        if (attn_bias.size(0) == 1) {
+            bias_head_stride = 0;
+        }
+    }
+
+    bool low_precision = (query.scalar_type() == at::kHalf || query.scalar_type() == at::kBFloat16);
+
+    // Standard backward: upcast to FP32
+    auto q = query.contiguous().to(at::kFloat);
+    auto k = key.contiguous().to(at::kFloat);
+    auto v = value.contiguous().to(at::kFloat);
+    auto o = output.contiguous().to(at::kFloat);
+    auto dO = grad_output.contiguous().to(at::kFloat);
+    auto lse_tensor = logsumexp.contiguous();
+    auto bias = attn_bias.contiguous().to(at::kFloat);
+
+    auto D = at::empty({batch_size, num_heads, seq_len_q}, query.options().dtype(at::kFloat));
+
+    // Get kernel with bias support
+    void* kernel = get_or_create_kernel(
+        seq_len_q, seq_len_kv, head_dim,
+        false, false, is_causal, false, false,
+        static_cast<uint32_t>(window_size > 0 ? window_size : 0),
+        0, false, true,
+        bias_batch_stride, bias_head_stride,
+        static_cast<uint32_t>(bias_repeat_count > 0 ? bias_repeat_count : 0)
+    );
+
+    // Allocate gradients
+    auto dQ = at::zeros({batch_size, num_heads, seq_len_q, head_dim}, query.options().dtype(at::kFloat));
+    auto dK = at::zeros({batch_size, num_heads, seq_len_kv, head_dim}, query.options().dtype(at::kFloat));
+    auto dV = at::zeros({batch_size, num_heads, seq_len_kv, head_dim}, query.options().dtype(at::kFloat));
+
+    // Get Metal buffers
+    auto q_info = getBufferInfo(q);
+    auto k_info = getBufferInfo(k);
+    auto v_info = getBufferInfo(v);
+    auto o_info = getBufferInfo(o);
+    auto do_info = getBufferInfo(dO);
+    auto l_info = getBufferInfo(lse_tensor);
+    auto d_info = getBufferInfo(D);
+    auto dq_info = getBufferInfo(dQ);
+    auto dk_info = getBufferInfo(dK);
+    auto dv_info = getBufferInfo(dV);
+    auto bias_info = getBufferInfo(bias);
+
+    @autoreleasepool {
+        auto stream = at::mps::getCurrentMPSStream();
+        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();
+
+        bool success = g_mfa_backward_encode_bias(
+            kernel,
+            (__bridge void*)encoder,
+            (__bridge void*)q_info.buffer,
+            (__bridge void*)k_info.buffer,
+            (__bridge void*)v_info.buffer,
+            (__bridge void*)o_info.buffer,
+            (__bridge void*)do_info.buffer,
+            (__bridge void*)l_info.buffer,
+            (__bridge void*)d_info.buffer,
+            (__bridge void*)dq_info.buffer,
+            (__bridge void*)dk_info.buffer,
+            (__bridge void*)dv_info.buffer,
+            nullptr,  // no mask
+            (__bridge void*)bias_info.buffer,
+            q_info.byte_offset,
+            k_info.byte_offset,
+            v_info.byte_offset,
+            o_info.byte_offset,
+            do_info.byte_offset,
+            l_info.byte_offset,
+            d_info.byte_offset,
+            dq_info.byte_offset,
+            dk_info.byte_offset,
+            dv_info.byte_offset,
+            0,  // mask_offset
+            bias_info.byte_offset,
+            static_cast<int32_t>(batch_size),
+            static_cast<int32_t>(num_heads)
+        );
+
+        if (!success) {
+            throw std::runtime_error("MFA backward with bias failed");
+        }
+    }
+
+    // Convert gradients back to input dtype
+    if (low_precision) {
+        dQ = dQ.to(query.scalar_type());
+        dK = dK.to(query.scalar_type());
+        dV = dV.to(query.scalar_type());
+    }
+
+    return std::make_tuple(dQ, dK, dV);
+}
+
 // ============================================================================
 // Python Bindings
 // ============================================================================
@@ -1200,10 +1496,35 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 
     // Forward with additive attention bias (e.g., relative position encoding)
     m.def("forward_with_bias", &mps_flash_attention_forward_with_bias,
-          "Flash Attention forward with additive attention bias",
+          "Flash Attention forward with additive attention bias (returns output only)",
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("attn_bias"),
+          py::arg("is_causal") = false,
+          py::arg("window_size") = 0,
+          py::arg("bias_repeat_count") = 0);
+
+    // Forward with bias returning logsumexp (for backward)
+    m.def("forward_with_bias_lse", &mps_flash_attention_forward_with_bias_lse,
+          "Flash Attention forward with bias (returns output and logsumexp)",
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("attn_bias"),
+          py::arg("is_causal") = false,
+          py::arg("window_size") = 0,
+          py::arg("bias_repeat_count") = 0);
+
+    // Backward with bias
+    m.def("backward_with_bias", &mps_flash_attention_backward_with_bias,
+          "Flash Attention backward with bias",
+          py::arg("grad_output"),
           py::arg("query"),
           py::arg("key"),
           py::arg("value"),
+          py::arg("output"),
+          py::arg("logsumexp"),
          py::arg("attn_bias"),
          py::arg("is_causal") = false,
          py::arg("window_size") = 0,
Binary file
mps_flash_attn-0.3.7.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.2.6
+Version: 0.3.7
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
mps_flash_attn-0.3.7.dist-info/RECORD
@@ -1,7 +1,7 @@
-mps_flash_attn/_C.cpython-314-darwin.so,sha256=-KfSzahwyV6JcqoT9TstAFmTQiFwS8Bl6NhiphCUzdA,313160
-mps_flash_attn/__init__.py,sha256=tNSb4nu1MhLqstuFAEA8ezWYKuhpwDya8TVxGmA9VMw,39711
+mps_flash_attn/_C.cpython-314-darwin.so,sha256=MzbWTzTvd0siau5-Vk7TaovGYq3nBursRoRxnMzVpxo,335544
+mps_flash_attn/__init__.py,sha256=tocFoOTiCauMObdVdvaD_f1sV2ioUW2wZr8vV5RZIaY,51807
 mps_flash_attn/benchmark.py,sha256=qHhvb8Dmh07OEa_iXuPuJSEnRJlrjVF5nKzVwbWypWE,24141
-mps_flash_attn/csrc/mps_flash_attn.mm,sha256=d7Bjcm2VOTNANmdqUevN-mqa5aOEVMMxAuTYINAeSr0,51215
+mps_flash_attn/csrc/mps_flash_attn.mm,sha256=4RVXZZHceOXPX_XxKVlHjjedtXMOsEFLkQDIgwJBwtQ,63498
 mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib,sha256=_oig6f2I6ZxBCKWbJF3ofmZMySm8gB399_M-lD2NOfM,13747
 mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib,sha256=1fsmVvB5EubhN-y6s5CB-eVk_wuO2tfrabiQTwXvJJc,13171
 mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib,sha256=5WKo_yAU-PgmulBUQhnzvt0DZRteVmo4-nc4U-T6G2g,17507
@@ -25,9 +25,9 @@ mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e
 mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin,sha256=4BjRprnMycnhZql9829R6FS3HW30jejuDJM9p9vzVPs,34112
 mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib,sha256=qyOaQtRVwL_Wc6GGdu6z-ftf0iX84XexuY09-lNLl5o,13747
 mps_flash_attn/kernels/manifest.json,sha256=d5MkE_BjqDQuMNm1jZiwWkQKfB-yfFml3lLSeR-wCLo,1867
-mps_flash_attn/lib/libMFABridge.dylib,sha256=iKgfYISSKMSNt_iXnljjUr_hZZHyCAg2tdS3_ZjmLkc,605696
-mps_flash_attn-0.2.6.dist-info/licenses/LICENSE,sha256=F_XmXSab2O-hHcqLpYJWeFaqB6GA_qiTEN23p2VfZWU,1237
-mps_flash_attn-0.2.6.dist-info/METADATA,sha256=uxBPVD-lDaQrg9cAKjtS65ze_NiyHi37x5_LsABU9Cc,5834
-mps_flash_attn-0.2.6.dist-info/WHEEL,sha256=uAzMRtb2noxPlbYLbRgeD25pPgKOo3k59IS71Dg5Qjs,110
-mps_flash_attn-0.2.6.dist-info/top_level.txt,sha256=zbArDcWhJDnJfMUKnOUhs5TjsMgSxa2GzOlscTRfobE,15
-mps_flash_attn-0.2.6.dist-info/RECORD,,
+mps_flash_attn/lib/libMFABridge.dylib,sha256=k9a015iToIw2DcEaDmMAxPdb4FpW4sJ9IqBS4jJms0g,608768
+mps_flash_attn-0.3.7.dist-info/licenses/LICENSE,sha256=F_XmXSab2O-hHcqLpYJWeFaqB6GA_qiTEN23p2VfZWU,1237
+mps_flash_attn-0.3.7.dist-info/METADATA,sha256=chjYcevEKiWpsZJFSKkZ5xm5k5oD9_LqUAy_23JyIq4,5834
+mps_flash_attn-0.3.7.dist-info/WHEEL,sha256=uAzMRtb2noxPlbYLbRgeD25pPgKOo3k59IS71Dg5Qjs,110
+mps_flash_attn-0.3.7.dist-info/top_level.txt,sha256=zbArDcWhJDnJfMUKnOUhs5TjsMgSxa2GzOlscTRfobE,15
+mps_flash_attn-0.3.7.dist-info/RECORD,,