mps-flash-attn 0.2.7__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/PKG-INFO +1 -1
  2. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/__init__.py +364 -54
  3. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/csrc/mps_flash_attn.mm +356 -35
  4. mps_flash_attn-0.3.7/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  5. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn.egg-info/PKG-INFO +1 -1
  6. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn.egg-info/SOURCES.txt +1 -0
  7. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/pyproject.toml +1 -1
  8. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/setup.py +1 -1
  9. mps_flash_attn-0.3.7/tests/test_issues.py +446 -0
  10. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/tests/test_mfa_v2.py +296 -0
  11. mps_flash_attn-0.2.7/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  12. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/LICENSE +0 -0
  13. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/README.md +0 -0
  14. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/benchmark.py +0 -0
  15. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  16. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  17. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  18. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  19. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  20. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  21. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  22. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  23. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  24. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  25. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  26. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  27. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  28. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  29. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  30. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  31. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  32. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  33. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  34. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  35. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  36. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  37. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn/kernels/manifest.json +0 -0
  38. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
  39. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn.egg-info/requires.txt +0 -0
  40. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/mps_flash_attn.egg-info/top_level.txt +0 -0
  41. {mps_flash_attn-0.2.7 → mps_flash_attn-0.3.7}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: mps-flash-attn
- Version: 0.2.7
+ Version: 0.3.7
  Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
  Author: imperatormk
  License-Expression: MIT
@@ -4,13 +4,43 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
  This package provides memory-efficient attention using Metal Flash Attention kernels.
  """

- __version__ = "0.2.7"
+ __version__ = "0.3.7"
+
+ __all__ = [
+     # Core functions
+     "flash_attention",
+     "flash_attention_with_bias",
+     "flash_attention_chunked",
+     # Quantized attention
+     "flash_attention_fp8",
+     "flash_attention_int8",
+     "flash_attention_nf4",
+     "quantize_kv_fp8",
+     "quantize_kv_int8",
+     "quantize_kv_nf4",
+     # Utilities
+     "replace_sdpa",
+     "precompile",
+     "clear_cache",
+     "register_custom_op",
+     "is_available",
+     "convert_mask",
+     # Constants
+     "QUANT_FP8_E4M3",
+     "QUANT_FP8_E5M2",
+     "QUANT_INT8",
+     "QUANT_NF4",
+     # Version
+     "__version__",
+ ]

  import torch
- from typing import Optional
+ import torch.nn.functional as F
+ from typing import Optional, Tuple
  import math
  import threading
  import os
+ import warnings

  # Try to import the C++ extension
  try:
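
Note: the new `__all__` above defines the public surface of the package. For orientation, a minimal usage sketch of the exported entry points (illustrative only, not part of the diff; assumes an Apple Silicon machine with a working MPS backend):

```python
# Minimal sketch of the public API exported by __all__ above (not from the package).
# Shapes follow the (B, H, N, D) layout that flash_attention requires later in this diff.
import torch
import mps_flash_attn as mfa

if mfa.is_available():
    q = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
    k = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
    v = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
    out = mfa.flash_attention(q, k, v, is_causal=True)   # (1, 8, 1024, 64)
```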
@@ -30,6 +60,20 @@ def is_available() -> bool:
      return _HAS_MFA and torch.backends.mps.is_available()


+ def _ensure_contiguous(tensor: torch.Tensor, name: str) -> torch.Tensor:
+     """Ensure tensor is contiguous, with a debug warning if conversion needed."""
+     if tensor.is_contiguous():
+         return tensor
+     # Auto-convert with debug info
+     if os.environ.get("MFA_DEBUG", "0") == "1":
+         warnings.warn(
+             f"MFA: {name} tensor was not contiguous (stride={tensor.stride()}), "
+             f"auto-converting. For best performance, ensure inputs are contiguous.",
+             UserWarning
+         )
+     return tensor.contiguous()
+
+
  def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
      """
      Convert attention mask to MFA's boolean format.
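
Note: `_ensure_contiguous` silently copies non-contiguous inputs and only warns when `MFA_DEBUG=1`. A small sketch of the situation it guards against (illustrative, device-agnostic):

```python
# A transposed view has permuted strides and is not contiguous; the helper above would
# copy it with .contiguous() and, under MFA_DEBUG=1, emit a UserWarning.
import torch

x = torch.randn(2, 8, 512, 64)        # in the package these would live on the MPS device
x_t = x.transpose(1, 2)               # (2, 512, 8, 64) view, not contiguous
assert not x_t.is_contiguous()
x_fixed = x_t.contiguous()            # what _ensure_contiguous does before calling the kernel
assert x_fixed.is_contiguous()
```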
@@ -54,6 +98,98 @@ def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
      return attn_mask <= -1e3


+ def _validate_and_expand_mask(
+     attn_mask: Optional[torch.Tensor],
+     B: int,
+     H: int,
+     N_q: int,
+     N_kv: int,
+ ) -> Optional[torch.Tensor]:
+     """
+     Validate attention mask shape and expand broadcast dimensions.
+
+     Args:
+         attn_mask: Optional mask of shape (B, H, N_q, N_kv) or broadcastable
+         B: Batch size
+         H: Number of heads
+         N_q: Query sequence length
+         N_kv: Key/Value sequence length
+
+     Returns:
+         Expanded mask of shape (mb, mh, N_q, N_kv) or None
+     """
+     if attn_mask is None:
+         return None
+
+     attn_mask = _ensure_contiguous(attn_mask, "attn_mask")
+
+     if attn_mask.dim() != 4:
+         raise ValueError(f"attn_mask must be 4D (B, H, N_q, N_kv), got {attn_mask.dim()}D")
+
+     mb, mh, mq, mk = attn_mask.shape
+
+     # Allow broadcast: mq can be 1 (applies same mask to all query positions) or N_q
+     if (mq != 1 and mq != N_q) or (mk != 1 and mk != N_kv):
+         raise ValueError(
+             f"attn_mask shape mismatch: mask is ({mq}, {mk}) but expected ({N_q}, {N_kv}) or broadcastable (1, {N_kv})"
+         )
+
+     # Expand broadcast mask to full shape for Metal kernel
+     if mq == 1 and N_q > 1:
+         attn_mask = attn_mask.expand(mb, mh, N_q, mk)
+     if mk == 1 and N_kv > 1:
+         attn_mask = attn_mask.expand(mb, mh, mq if mq > 1 else N_q, N_kv)
+
+     if mb != 1 and mb != B:
+         raise ValueError(f"attn_mask batch size must be 1 or {B}, got {mb}")
+     if mh != 1 and mh != H:
+         raise ValueError(f"attn_mask head count must be 1 or {H}, got {mh}")
+
+     return attn_mask
+
+
+ class FlashAttentionWithBiasFunction(torch.autograd.Function):
+     """Autograd function for Flash Attention with bias - native C++ backward."""
+
+     @staticmethod
+     def forward(ctx, query, key, value, attn_bias, is_causal, scale, window_size, bias_repeat_count):
+         # Apply scale if provided
+         scale_factor = 1.0
+         if scale is not None:
+             default_scale = 1.0 / math.sqrt(query.shape[-1])
+             if abs(scale - default_scale) > 1e-6:
+                 scale_factor = scale / default_scale
+                 query = query * scale_factor
+
+         # Call C++ forward with bias (returns output and logsumexp)
+         output, logsumexp = _C.forward_with_bias_lse(query, key, value, attn_bias, is_causal, window_size, bias_repeat_count)
+
+         # Save for backward
+         ctx.save_for_backward(query, key, value, output, logsumexp, attn_bias)
+         ctx.is_causal = is_causal
+         ctx.scale_factor = scale_factor
+         ctx.window_size = window_size
+         ctx.bias_repeat_count = bias_repeat_count
+
+         return output
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         query, key, value, output, logsumexp, attn_bias = ctx.saved_tensors
+
+         # Call native C++ backward with bias
+         dQ, dK, dV = _C.backward_with_bias(
+             grad_output, query, key, value, output, logsumexp, attn_bias,
+             ctx.is_causal, ctx.window_size, ctx.bias_repeat_count
+         )
+
+         # Scale dQ back if we scaled query
+         if ctx.scale_factor != 1.0:
+             dQ = dQ * ctx.scale_factor
+
+         return dQ, dK, dV, None, None, None, None, None
+
+
  class FlashAttentionFunction(torch.autograd.Function):
      """Autograd function for Flash Attention with backward pass support."""

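Note: `_validate_and_expand_mask` accepts 4D masks with broadcastable batch, head, and query dimensions. A sketch of the common key-padding case (illustrative; shapes are made up):

```python
# Building a (B, 1, 1, N_kv) key-padding mask that the validator above accepts and expands.
# Convention: True = do not attend (MFA-style boolean mask).
import torch

B, H, N_q, N_kv = 2, 8, 1024, 1024
lengths = torch.tensor([1024, 700])                          # valid key tokens per batch row
key_pad = torch.arange(N_kv)[None, :] >= lengths[:, None]    # (B, N_kv), True past the end
attn_mask = key_pad[:, None, None, :]                        # (B, 1, 1, N_kv)
# Inside flash_attention this is expanded along the query axis to (B, 1, N_q, N_kv)
# before being handed to the Metal kernel.
expanded = attn_mask.expand(B, 1, N_q, N_kv)
```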
@@ -176,6 +312,20 @@ def flash_attention(
      if not torch.backends.mps.is_available():
          raise RuntimeError("MPS not available")

+     # Validate scale parameter
+     if scale is not None:
+         if scale <= 0:
+             raise ValueError(f"scale must be positive, got {scale}")
+         # Warn about extreme scale values that could cause numerical issues
+         default_scale = 1.0 / math.sqrt(query.shape[-1])
+         if scale < default_scale * 0.01 or scale > default_scale * 100:
+             warnings.warn(
+                 f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                 "this may cause numerical issues",
+                 UserWarning,
+                 stacklevel=2
+             )
+
      # Validate device
      if query.device.type != 'mps':
          raise ValueError("query must be on MPS device")
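
Note: throughout this release a custom `scale` is folded into the query rather than passed to the kernel, which always applies the default `1/sqrt(D)`. A quick check of why that is equivalent (sketch, plain PyTorch, not from the package):

```python
# (Q * s/d) @ K^T * d == (Q @ K^T) * s, where d = 1/sqrt(D) is the kernel's built-in scale.
import math
import torch

D = 64
d = 1.0 / math.sqrt(D)
s = 0.05                                   # caller-supplied custom scale
q, k = torch.randn(4, D), torch.randn(4, D)

scores_requested = (q @ k.T) * s
scores_folded = ((q * (s / d)) @ k.T) * d  # what the scale_factor trick computes
assert torch.allclose(scores_requested, scores_folded, atol=1e-5)
```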
@@ -186,6 +336,24 @@ def flash_attention(
      if attn_mask is not None and attn_mask.device.type != 'mps':
          raise ValueError("attn_mask must be on MPS device")

+     # Ensure contiguous (auto-convert with debug warning)
+     query = _ensure_contiguous(query, "query")
+     key = _ensure_contiguous(key, "key")
+     value = _ensure_contiguous(value, "value")
+
+     # Validate tensor dimensions
+     if query.dim() != 4:
+         raise RuntimeError(f"query must be 4D (B, H, N, D), got {query.dim()}D")
+     if key.dim() != 4:
+         raise RuntimeError(f"key must be 4D (B, H, N, D), got {key.dim()}D")
+     if value.dim() != 4:
+         raise RuntimeError(f"value must be 4D (B, H, N, D), got {value.dim()}D")
+
+     # Validate and expand broadcast mask
+     B, H, N_q, D = query.shape
+     N_kv = key.shape[2]
+     attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
      # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
      if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
          # Apply scale if provided
@@ -213,6 +381,10 @@ def replace_sdpa():
      import torch.nn.functional as F

      original_sdpa = F.scaled_dot_product_attention
+     _debug = os.environ.get("MFA_DEBUG", "0") == "1"
+     _call_count = [0] # mutable for closure
+     _fallback_count = [0] # track fallbacks for warning
+     _last_fallback_error = [None]

      def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                       is_causal=False, scale=None, enable_gqa=False, **kwargs):
@@ -224,28 +396,92 @@ def replace_sdpa():
          # seq=1024: 2.3-3.7x (MFA much faster)
          # seq=2048: 2.2-3.9x (MFA much faster)
          # seq=4096: 2.1-3.7x (MFA much faster)
+         # Determine seq_len based on tensor dimensionality
+         # 4D: (B, H, S, D) -> seq_len = shape[2]
+         # 3D: (B, S, D) -> seq_len = shape[1] (single-head attention, e.g., VAE)
+         is_3d = query.ndim == 3
+         seq_len = query.shape[1] if is_3d else query.shape[2]
+
          if (query.device.type == 'mps' and
              dropout_p == 0.0 and
              _HAS_MFA and
-             query.shape[2] >= 512):
+             query.ndim >= 3 and
+             seq_len >= 512):
              try:
+                 q, k, v = query, key, value
+
+                 # Handle 3D tensors (B, S, D) - treat as single-head attention
+                 # Unsqueeze to (B, 1, S, D) for MFA, squeeze back after
+                 if is_3d:
+                     q = q.unsqueeze(1) # (B, S, D) -> (B, 1, S, D)
+                     k = k.unsqueeze(1)
+                     v = v.unsqueeze(1)
+
+                 # Handle GQA (Grouped Query Attention) - expand K/V heads to match Q heads
+                 # Common in Llama 2/3, Mistral, Qwen, etc.
+                 # NOTE: Always expand when heads mismatch, not just when enable_gqa=True
+                 # Transformers may pass enable_gqa=True on MPS (torch>=2.5, no mask) even though
+                 # MPS SDPA doesn't support native GQA - we handle it here
+                 if q.shape[1] != k.shape[1]:
+                     # Expand KV heads: (B, kv_heads, S, D) -> (B, q_heads, S, D)
+                     n_rep = q.shape[1] // k.shape[1]
+                     k = k.repeat_interleave(n_rep, dim=1)
+                     v = v.repeat_interleave(n_rep, dim=1)
+
                  # Convert float mask to bool mask if needed
                  # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
                  # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
                  mfa_mask = None
                  if attn_mask is not None:
+                     if _debug:
+                         print(f"[MFA MASK] dtype={attn_mask.dtype} shape={tuple(attn_mask.shape)} min={attn_mask.min().item():.2f} max={attn_mask.max().item():.2f}")
                      if attn_mask.dtype == torch.bool:
-                         # Boolean mask: True means masked (don't attend)
-                         mfa_mask = attn_mask
+                         # PyTorch SDPA bool mask: True = ATTEND, False = MASKED
+                         # MFA bool mask: True = MASKED, False = ATTEND
+                         # They're opposite! Invert it.
+                         mfa_mask = ~attn_mask
                      else:
                          # Float mask: typically -inf for masked positions, 0 for unmasked
                          # Convert: positions with large negative values -> True (masked)
                          # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
                          mfa_mask = attn_mask <= -1e3
-                 return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
-             except Exception:
-                 # Fall back to original on any error
-                 pass
+                     if _debug:
+                         print(f"[MFA MASK] converted: True(masked)={mfa_mask.sum().item()} False(attend)={(~mfa_mask).sum().item()}")
+
+                 out = flash_attention(q, k, v, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
+
+                 # Squeeze back for 3D input
+                 if is_3d:
+                     out = out.squeeze(1) # (B, 1, S, D) -> (B, S, D)
+
+                 if _debug:
+                     _call_count[0] += 1
+                     print(f"[MFA #{_call_count[0]}] shape={tuple(query.shape)} is_3d={is_3d} gqa={enable_gqa} mask={attn_mask is not None} causal={is_causal}")
+
+                 return out
+             except Exception as e:
+                 # Fall back to original on any error, but track it
+                 _fallback_count[0] += 1
+                 _last_fallback_error[0] = str(e)
+                 if _debug:
+                     import traceback
+                     print(f"[MFA FALLBACK #{_fallback_count[0]}] shape={tuple(query.shape)}\n{traceback.format_exc()}")
+                 # Warn user after repeated fallbacks (likely a real problem)
+                 if _fallback_count[0] == 10:
+                     warnings.warn(
+                         f"MFA has fallen back to native SDPA {_fallback_count[0]} times. "
+                         f"Last error: {_last_fallback_error[0]}. "
+                         f"Set MFA_DEBUG=1 for details.",
+                         UserWarning
+                     )
+
+         if _debug and query.device.type == 'mps':
+             _call_count[0] += 1
+             reason = []
+             if dropout_p != 0.0: reason.append(f"dropout={dropout_p}")
+             if query.ndim < 3: reason.append(f"ndim={query.ndim}")
+             if seq_len < 512: reason.append(f"seq={seq_len}<512")
+             print(f"[NATIVE #{_call_count[0]}] shape={tuple(query.shape)} reason={','.join(reason) or 'unknown'}")

          return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale=scale, enable_gqa=enable_gqa, **kwargs)

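Note: the patched SDPA above performs two conversions before calling MFA: it flips the boolean mask convention and expands grouped K/V heads. A standalone sketch of both (illustrative; model shapes are made up, not from the package):

```python
import torch

# 1) Mask convention. PyTorch SDPA bool masks mean True = attend; MFA means True = masked.
sdpa_bool_mask = torch.tensor([[True, True, False]])     # attend, attend, masked (SDPA view)
mfa_mask = ~sdpa_bool_mask                                # -> False, False, True (MFA view)

# Additive float masks (0 = attend, large negative = masked) use the -1e3 threshold.
sdpa_float_mask = torch.tensor([[0.0, 0.0, float("-inf")]])
mfa_mask_from_float = sdpa_float_mask <= -1e3             # same boolean result as above

# 2) GQA. Expand K/V heads so they match the number of query heads before calling MFA.
q = torch.randn(1, 32, 2048, 128)                         # 32 query heads
k = torch.randn(1, 8, 2048, 128)                          # 8 KV heads (Llama-style grouping)
n_rep = q.shape[1] // k.shape[1]
k_expanded = k.repeat_interleave(n_rep, dim=1)            # (1, 32, 2048, 128)
```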
@@ -302,6 +538,7 @@ def flash_attention_with_bias(
      value: torch.Tensor,
      attn_bias: torch.Tensor,
      is_causal: bool = False,
+     scale: Optional[float] = None,
      window_size: int = 0,
      bias_repeat_count: int = 0,
  ) -> torch.Tensor:
@@ -309,13 +546,15 @@ def flash_attention_with_bias(
      Compute scaled dot-product attention with additive attention bias.

      This function supports additive attention bias (like relative position encodings
-     or ALiBi) which is added to the attention scores before softmax:
+     or ALiBi) which is added to the attention scores:
+
+         Attention(Q, K, V) = softmax((Q @ K.T + bias) * scale) @ V

-     Attention(Q, K, V) = softmax((Q @ K.T) / sqrt(d) + bias) @ V
+     IMPORTANT: The bias is added to UNSCALED scores, then the sum is scaled.
+     This differs from PyTorch SDPA which does: softmax((Q @ K.T) * scale + bias).

-     IMPORTANT: MFA adds bias to UNSCALED scores internally and scales during softmax.
-     If your bias was computed for scaled scores (like PyTorch SDPA), you need to
-     pre-scale it by multiplying by sqrt(head_dim).
+     To convert from SDPA-style bias to MFA-style:
+         bias_mfa = bias_sdpa * sqrt(head_dim) # when using default scale

      Args:
          query: Query tensor of shape (B, H, N_q, D)
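
Note: the docstring change above pins down the bias convention: MFA adds the bias to unscaled scores and then scales the sum, while SDPA scales first and adds the bias afterwards. A CPU-only equivalence check of the stated conversion (sketch, not from the package):

```python
# softmax((Q @ K^T) * d + bias_sdpa) == softmax((Q @ K^T + bias_sdpa * sqrt(D)) * d), d = 1/sqrt(D)
import math
import torch

B, H, N, D = 1, 2, 16, 64
d = 1.0 / math.sqrt(D)
q, k = torch.randn(B, H, N, D), torch.randn(B, H, N, D)
bias_sdpa = torch.randn(B, H, N, N)

scores_sdpa = (q @ k.transpose(-1, -2)) * d + bias_sdpa
scores_mfa = ((q @ k.transpose(-1, -2)) + bias_sdpa * math.sqrt(D)) * d
assert torch.allclose(scores_sdpa, scores_mfa, atol=1e-5)
```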
@@ -326,6 +565,7 @@ def flash_attention_with_bias(
              - (1, H, N_q, N_kv): Broadcast across batch
              - (H, N_q, N_kv): Broadcast across batch (3D)
          is_causal: If True, applies causal masking
+         scale: Scaling factor for attention scores. Default: 1/sqrt(head_dim)
          window_size: Sliding window attention size (0 = full attention)
          bias_repeat_count: If > 0, the bias tensor repeats every N batches.
              Useful for window attention where multiple windows share the same
@@ -343,10 +583,13 @@ def flash_attention_with_bias(
          >>> v = torch.randn(4, 8, 64, 64, device='mps', dtype=torch.float16)
          >>> # Position bias: (1, num_heads, seq_len, seq_len)
          >>> bias = torch.randn(1, 8, 64, 64, device='mps', dtype=torch.float16)
-         >>> # Pre-scale bias since MFA uses unscaled scores
+         >>> # Pre-scale bias since default scale is 1/sqrt(head_dim)
          >>> scaled_bias = bias * math.sqrt(64) # sqrt(head_dim)
          >>> out = flash_attention_with_bias(q, k, v, scaled_bias)

+         >>> # With custom scale
+         >>> out = flash_attention_with_bias(q, k, v, bias, scale=0.1)
+
          >>> # Window attention with repeating bias pattern
          >>> n_windows = 16
          >>> q = torch.randn(n_windows * 4, 8, 49, 64, device='mps', dtype=torch.float16)
@@ -363,6 +606,20 @@ def flash_attention_with_bias(
      if not torch.backends.mps.is_available():
          raise RuntimeError("MPS not available")

+     # Validate scale parameter
+     if scale is not None:
+         if scale <= 0:
+             raise ValueError(f"scale must be positive, got {scale}")
+         # Warn about extreme scale values
+         default_scale = 1.0 / math.sqrt(query.shape[-1])
+         if scale < default_scale * 0.01 or scale > default_scale * 100:
+             warnings.warn(
+                 f"scale={scale:.6g} is very different from default {default_scale:.6g}, "
+                 "this may cause numerical issues",
+                 UserWarning,
+                 stacklevel=2
+             )
+
      # Validate device
      if query.device.type != 'mps':
          raise ValueError("query must be on MPS device")
@@ -373,7 +630,10 @@ def flash_attention_with_bias(
      if attn_bias.device.type != 'mps':
          raise ValueError("attn_bias must be on MPS device")

-     return _C.forward_with_bias(query, key, value, attn_bias, is_causal, window_size, bias_repeat_count)
+     # Use autograd Function for backward support
+     return FlashAttentionWithBiasFunction.apply(
+         query, key, value, attn_bias, is_causal, scale, window_size, bias_repeat_count
+     )


  def flash_attention_chunked(
@@ -452,13 +712,13 @@ def flash_attention_chunked(
          return _C.forward(query, key, value, is_causal, None, 0)

      # Initialize running statistics for online softmax
-     # m = running max, l = running sum of exp, acc = accumulated output
      device = query.device
      dtype = query.dtype

      # Use float32 for numerical stability of softmax statistics
-     running_max = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
-     running_sum = torch.zeros((B, H, seq_len_q, 1), device=device, dtype=torch.float32)
+     # running_L: base-2 logsumexp of all attention scores seen so far (-inf means no data yet)
+     # output_acc: weighted combination of outputs (weights sum to 1 after each update)
+     running_L = torch.full((B, H, seq_len_q, 1), float('-inf'), device=device, dtype=torch.float32)
      output_acc = torch.zeros((B, H, seq_len_q, D), device=device, dtype=torch.float32)

      # Process K/V in chunks
@@ -479,51 +739,81 @@ def flash_attention_chunked(
          # - Partial chunk (up to q) if start_idx <= q < end_idx
          # - None of chunk if q < start_idx

-         chunk_is_causal = is_causal and (end_idx <= seq_len_q)
+         if is_causal:
+             # Create explicit causal mask for this chunk
+             # Query positions: 0 to seq_len_q-1
+             # Key positions in chunk: start_idx to end_idx-1
+             chunk_len = end_idx - start_idx

-         # Compute attention for this chunk
-         # forward_with_lse returns (output, logsumexp) where logsumexp = m + log(l)
-         chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, chunk_is_causal, None, 0)
+             # Build mask: mask[q, k_local] = True means DON'T attend
+             # We want to attend when global_k_pos <= q
+             # global_k_pos = start_idx + k_local
+             # So: attend when start_idx + k_local <= q
+             # mask = start_idx + k_local > q

-         # chunk_lse shape: (B, H, seq_len_q)
-         # We need to convert logsumexp to (max, sum) for online algorithm
-         chunk_lse = chunk_lse.unsqueeze(-1) # (B, H, seq_len_q, 1)
+             q_pos = torch.arange(seq_len_q, device=device).view(1, 1, seq_len_q, 1)
+             k_pos = torch.arange(chunk_len, device=device).view(1, 1, 1, chunk_len) + start_idx
+             causal_mask = k_pos > q_pos # True = masked (don't attend)

-         # Convert chunk output to float32 for accumulation
-         chunk_out = chunk_out.float()
-
-         # Online softmax update:
-         # new_max = max(running_max, chunk_max)
-         # For flash attention, chunk_lse ≈ chunk_max + log(chunk_sum)
-         # We approximate chunk_max ≈ chunk_lse (valid when exp sum dominates)
+             # Expand to batch and heads
+             causal_mask = causal_mask.expand(B, H, seq_len_q, chunk_len)

-         chunk_max = chunk_lse # Approximation: logsumexp max when sum is dominated by max
-
-         # Compute new max
-         new_max = torch.maximum(running_max, chunk_max)
+             # Call forward with explicit mask (is_causal=False since we handle it)
+             chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, causal_mask, 0)
+         else:
+             # Non-causal: just process the chunk directly
+             chunk_out, chunk_lse = _C.forward_with_lse(query, k_chunk, v_chunk, False, None, 0)

-         # Rescale previous accumulator
-         # correction_old = exp(running_max - new_max)
-         correction_old = torch.exp(running_max - new_max)
-         # Clip to avoid inf * 0 issues when running_max was -inf
-         correction_old = torch.where(running_max == float('-inf'), torch.zeros_like(correction_old), correction_old)
+         # chunk_L shape: (B, H, seq_len_q)
+         # The kernel returns L = m + log2(l) where:
+         # m = max(scores * log2(e) / sqrt(D))
+         # l = sum(exp2(scores * log2(e) / sqrt(D) - m))
+         # This is a base-2 logsumexp: L = log2(sum(exp2(scaled_scores)))
+         chunk_L = chunk_lse.unsqueeze(-1).float() # (B, H, seq_len_q, 1)

-         # Rescale chunk output
-         # correction_new = exp(chunk_max - new_max)
-         correction_new = torch.exp(chunk_max - new_max)
+         # Convert chunk output to float32 for accumulation
+         chunk_out = chunk_out.float()

-         # For the sum, we need exp(chunk_lse - new_max) = exp(chunk_max + log(chunk_sum) - new_max)
-         # = exp(chunk_max - new_max) * chunk_sum
-         # But we only have logsumexp, so: exp(chunk_lse - new_max)
-         chunk_sum_scaled = torch.exp(chunk_lse - new_max)
+         # Online softmax algorithm using base-2 representation
+         #
+         # Flash attention returns: chunk_out = softmax(scores) @ V
+         # The output is already normalized. For online combination:
+         # new_L = log2(2^running_L + 2^chunk_L)
+         # = max(running_L, chunk_L) + log2(2^(running_L - max) + 2^(chunk_L - max))
+         #
+         # The weights for combining outputs are:
+         # old_weight = 2^(running_L - new_L)
+         # new_weight = 2^(chunk_L - new_L)
+         # These weights sum to 1, so: output = old_weight * old_out + new_weight * new_out
+
+         # Compute new base-2 logsumexp
+         max_L = torch.maximum(running_L, chunk_L)
+
+         # Handle -inf case (no previous data)
+         # Use exp2 for base-2 (matches kernel's internal representation)
+         running_exp2 = torch.where(
+             running_L == float('-inf'),
+             torch.zeros_like(running_L),
+             torch.exp2(running_L - max_L)
+         )
+         chunk_exp2 = torch.exp2(chunk_L - max_L)
+         new_L = max_L + torch.log2(running_exp2 + chunk_exp2)
+
+         # Compute correction factors using base-2 exp
+         old_weight = torch.where(
+             running_L == float('-inf'),
+             torch.zeros_like(running_L),
+             torch.exp2(running_L - new_L)
+         )
+         new_weight = torch.exp2(chunk_L - new_L)

          # Update accumulator
-         output_acc = output_acc * correction_old + chunk_out * correction_new
-         running_sum = running_sum * correction_old + chunk_sum_scaled
-         running_max = new_max
+         # Update accumulator
+         output_acc = output_acc * old_weight + chunk_out * new_weight
+         running_L = new_L

-     # Final normalization
-     output = output_acc / running_sum
+     # No final normalization needed - weights already sum to 1
+     output = output_acc

      # Convert back to original dtype
      return output.to(dtype)
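
Note: the rewritten chunked path above merges per-chunk outputs that are already normalized, using base-2 logsumexp weights, so no final division is needed. A plain-PyTorch reference of the merge rule (sketch; `chunk_forward` is a stand-in for `_C.forward_with_lse`, not the real Metal kernel):

```python
import math
import torch

torch.manual_seed(0)
N_q, N_kv, D = 4, 32, 16
q, k, v = torch.randn(N_q, D), torch.randn(N_kv, D), torch.randn(N_kv, D)
scale = 1.0 / math.sqrt(D)
LN2 = math.log(2.0)

def chunk_forward(k_c, v_c):
    """Stand-in for _C.forward_with_lse: normalized chunk output plus base-2 logsumexp."""
    t = (q @ k_c.T) * scale * math.log2(math.e)                  # scores in base-2 units
    L = torch.logsumexp(t * LN2, dim=-1, keepdim=True) / LN2     # log2(sum(2^t))
    return torch.exp2(t - L) @ v_c, L                            # softmax over this chunk only

out1, L1 = chunk_forward(k[:16], v[:16])
out2, L2 = chunk_forward(k[16:], v[16:])

max_L = torch.maximum(L1, L2)
new_L = max_L + torch.log2(torch.exp2(L1 - max_L) + torch.exp2(L2 - max_L))
merged = out1 * torch.exp2(L1 - new_L) + out2 * torch.exp2(L2 - new_L)   # weights sum to 1

reference = torch.softmax((q @ k.T) * scale, dim=-1) @ v
assert torch.allclose(merged, reference, atol=1e-5)
```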
@@ -745,6 +1035,11 @@ def flash_attention_fp8(
              scale_factor = scale / default_scale
              query = query * scale_factor

+     # Validate and expand broadcast mask
+     B, H, N_q, D = query.shape
+     N_kv = key.shape[2]
+     attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
      quant_type = QUANT_FP8_E5M2 if use_e5m2 else QUANT_FP8_E4M3
      return _C.forward_quantized(
          query, key, value, k_scale, v_scale,
@@ -798,6 +1093,11 @@ def flash_attention_int8(
              scale_factor = scale / default_scale
              query = query * scale_factor

+     # Validate and expand broadcast mask
+     B, H, N_q, D = query.shape
+     N_kv = key.shape[2]
+     attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
      return _C.forward_quantized(
          query, key, value, k_scale, v_scale,
          QUANT_INT8, is_causal, attn_mask, window_size
@@ -854,6 +1154,11 @@ def flash_attention_nf4(
              scale_factor = scale / default_scale
              query = query * scale_factor

+     # Validate and expand broadcast mask
+     B, H, N_q, D = query.shape
+     N_kv = key.shape[2]
+     attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
      return _C.forward_quantized(
          query, key, value, k_scale, v_scale,
          QUANT_NF4, is_causal, attn_mask, window_size
@@ -908,6 +1213,11 @@ def flash_attention_quantized(
              scale_factor = scale / default_scale
              query = query * scale_factor

+     # Validate and expand broadcast mask
+     B, H, N_q, D = query.shape
+     N_kv = key.shape[2]
+     attn_mask = _validate_and_expand_mask(attn_mask, B, H, N_q, N_kv)
+
      return _C.forward_quantized(
          query, key, value, k_scale, v_scale,
          quant_type, is_causal, attn_mask, window_size