mps-flash-attn 0.1.6.tar.gz → 0.1.8.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
Potentially problematic release: this version of mps-flash-attn might be problematic.
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/PKG-INFO +13 -3
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/README.md +12 -2
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/__init__.py +58 -45
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/csrc/mps_flash_attn.mm +131 -45
- mps_flash_attn-0.1.8/mps_flash_attn/lib/libMFABridge.dylib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/PKG-INFO +13 -3
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/pyproject.toml +1 -1
- mps_flash_attn-0.1.6/mps_flash_attn/lib/libMFABridge.dylib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/LICENSE +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/kernels/manifest.json +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/SOURCES.txt +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/requires.txt +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/top_level.txt +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/setup.cfg +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/setup.py +0 -0
- {mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/tests/test_attention.py +0 -0
{mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.6
+Version: 0.1.8
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -32,8 +32,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -98,6 +99,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -247,7 +258,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits
{mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/README.md

@@ -7,8 +7,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -73,6 +74,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -222,7 +233,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits
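The new attn_mask argument documented above (in README.md and mirrored in PKG-INFO) uses a boolean (B, 1, N, N) layout where True means "do not attend", which also covers padding masks. A minimal sketch built only on that documented shape and semantics; the sizes and the `lengths` tensor are illustrative, not taken from the package:

```python
import torch
from mps_flash_attn import flash_attention

B, H, N, D = 2, 8, 2048, 64
q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)

# Hypothetical valid lengths per batch element; keys beyond each length are padding.
lengths = torch.tensor([2048, 1536], device="mps")

# True = masked (don't attend), matching the README example's convention.
key_is_pad = torch.arange(N, device="mps")[None, :] >= lengths[:, None]   # (B, N)
mask = key_is_pad[:, None, None, :].expand(B, 1, N, N)                    # (B, 1, N, N)

out = flash_attention(q, k, v, attn_mask=mask)
```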
{mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/__init__.py

@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.1.6"
+__version__ = "0.1.8"
 
 import torch
 from typing import Optional
@@ -20,39 +20,9 @@ except ImportError as e:
     _HAS_MFA = False
     _IMPORT_ERROR = str(e)
 
-#
-
-
-try:
-    import ctypes
-    bridge_path = os.environ.get("MFA_BRIDGE_PATH")
-    if not bridge_path:
-        module_dir = os.path.dirname(__file__)
-        candidates = [
-            os.path.join(module_dir, "lib", "libMFABridge.dylib"),  # Bundled in wheel
-            os.path.join(module_dir, "..", "swift-bridge", ".build", "release", "libMFABridge.dylib"),
-            os.path.join(module_dir, "libMFABridge.dylib"),
-        ]
-        for path in candidates:
-            if os.path.exists(path):
-                bridge_path = path
-                break
-
-    if bridge_path and os.path.exists(bridge_path):
-        lib = ctypes.CDLL(bridge_path)
-
-        # Set shipped kernels directory (pre-compiled metallibs + pipeline binaries)
-        kernels_dir = os.path.join(os.path.dirname(__file__), "kernels")
-        if os.path.exists(kernels_dir):
-            lib.mfa_set_kernels_dir(kernels_dir.encode('utf-8'))
-
-        lib.mfa_init()
-except Exception:
-    pass  # Init is optional, will fall back to runtime compilation
-
-# Initialize shipped kernels on import
-if _HAS_MFA:
-    _init_shipped_kernels()
+# Note: The C++ extension handles loading libMFABridge.dylib via dlopen.
+# Set MFA_BRIDGE_PATH environment variable to specify the library location.
+# Do NOT load the library here via ctypes - that causes duplicate class warnings.
 
 
 def is_available() -> bool:
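The replacement comments above say the C++ extension now resolves libMFABridge.dylib itself and honors the MFA_BRIDGE_PATH environment variable. A small usage sketch for pointing the loader at a custom bridge build; the path shown is illustrative:

```python
import os

# Point the extension at a custom bridge build before the first attention call;
# MFA_BRIDGE_PATH is read with getenv() when the extension dlopen()s the dylib.
os.environ["MFA_BRIDGE_PATH"] = "/opt/mfa/libMFABridge.dylib"  # illustrative path

import torch
from mps_flash_attn import flash_attention

q = k = v = torch.randn(1, 8, 1024, 64, device="mps", dtype=torch.float16)
out = flash_attention(q, k, v)  # first call triggers loading of the bridge
```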
@@ -64,7 +34,7 @@ class FlashAttentionFunction(torch.autograd.Function):
     """Autograd function for Flash Attention with backward pass support."""
 
     @staticmethod
-    def forward(ctx, query, key, value, is_causal, scale):
+    def forward(ctx, query, key, value, is_causal, scale, attn_mask):
         # Apply scale if provided (MFA uses 1/sqrt(D) internally)
         scale_factor = 1.0
         if scale is not None:
@@ -74,10 +44,15 @@ class FlashAttentionFunction(torch.autograd.Function):
             query = query * scale_factor
 
         # Forward with logsumexp for backward
-        output, logsumexp = _C.forward_with_lse(query, key, value, is_causal)
+        output, logsumexp = _C.forward_with_lse(query, key, value, is_causal, attn_mask)
 
         # Save for backward
-        ctx.save_for_backward(query, key, value, output, logsumexp)
+        if attn_mask is not None:
+            ctx.save_for_backward(query, key, value, output, logsumexp, attn_mask)
+            ctx.has_mask = True
+        else:
+            ctx.save_for_backward(query, key, value, output, logsumexp)
+            ctx.has_mask = False
         ctx.is_causal = is_causal
         ctx.scale_factor = scale_factor
 
@@ -85,19 +60,23 @@ class FlashAttentionFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, grad_output):
-        query, key, value, output, logsumexp = ctx.saved_tensors
+        if ctx.has_mask:
+            query, key, value, output, logsumexp, attn_mask = ctx.saved_tensors
+        else:
+            query, key, value, output, logsumexp = ctx.saved_tensors
+            attn_mask = None
 
         # Compute gradients
         dQ, dK, dV = _C.backward(
-            grad_output, query, key, value, output, logsumexp, ctx.is_causal
+            grad_output, query, key, value, output, logsumexp, ctx.is_causal, attn_mask
         )
 
         # If we scaled the query in forward, scale the gradient back
         if ctx.scale_factor != 1.0:
            dQ = dQ * ctx.scale_factor
 
-        # Return gradients (None for is_causal and scale)
-        return dQ, dK, dV, None, None
+        # Return gradients (None for is_causal, scale, and attn_mask since they're not tensors or don't need grad)
+        return dQ, dK, dV, None, None, None
 
 
 def flash_attention(
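The forward/backward changes above follow the usual torch.autograd.Function recipe for an optional tensor argument: save it with save_for_backward only when present, record a has_mask flag on ctx, and return one extra None gradient. A self-contained toy example of that recipe, unrelated to this package's kernels (all names here are made up):

```python
import torch

class MaskedScale(torch.autograd.Function):
    """Toy example: scale x by 2, optionally zeroing masked positions."""

    @staticmethod
    def forward(ctx, x, mask):
        ctx.has_mask = mask is not None
        if mask is not None:
            ctx.save_for_backward(mask)
            return torch.where(mask, torch.zeros_like(x), x * 2.0)
        return x * 2.0

    @staticmethod
    def backward(ctx, grad_out):
        if ctx.has_mask:
            (mask,) = ctx.saved_tensors
            grad_x = torch.where(mask, torch.zeros_like(grad_out), grad_out * 2.0)
        else:
            grad_x = grad_out * 2.0
        # One gradient slot per forward argument: None for the non-differentiable mask.
        return grad_x, None

x = torch.randn(4, requires_grad=True)
mask = torch.tensor([True, False, False, True])
MaskedScale.apply(x, mask).sum().backward()
```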
@@ -106,6 +85,7 @@ def flash_attention(
     value: torch.Tensor,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    attn_mask: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Compute scaled dot-product attention using Flash Attention on MPS.
@@ -121,6 +101,9 @@ def flash_attention(
         value: Value tensor of shape (B, num_heads, seq_len, head_dim)
         is_causal: If True, applies causal masking (for autoregressive models)
         scale: Scaling factor for attention scores. Default: 1/sqrt(head_dim)
+        attn_mask: Optional boolean attention mask of shape (B, 1, seq_len_q, seq_len_kv)
+                   or (B, num_heads, seq_len_q, seq_len_kv). True values indicate
+                   positions to be masked (not attended to).
 
     Returns:
         Output tensor of shape (B, num_heads, seq_len, head_dim)
@@ -137,6 +120,11 @@ def flash_attention(
         >>> q.requires_grad = True
         >>> out = flash_attention(q, k, v)
         >>> out.sum().backward()  # Computes dQ
+
+        # With attention mask:
+        >>> mask = torch.zeros(2, 1, 4096, 4096, dtype=torch.bool, device='mps')
+        >>> mask[:, :, :, 2048:] = True  # mask out second half of keys
+        >>> out = flash_attention(q, k, v, attn_mask=mask)
     """
     if not _HAS_MFA:
         raise RuntimeError(
@@ -154,9 +142,23 @@ def flash_attention(
         raise ValueError("key must be on MPS device")
     if value.device.type != 'mps':
         raise ValueError("value must be on MPS device")
+    if attn_mask is not None and attn_mask.device.type != 'mps':
+        raise ValueError("attn_mask must be on MPS device")
+
+    # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
+    if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
+        # Apply scale if provided
+        if scale is not None:
+            default_scale = 1.0 / math.sqrt(query.shape[-1])
+            if abs(scale - default_scale) > 1e-6:
+                scale_factor = scale / default_scale
+                query = query * scale_factor
+
+        # Forward only - no logsumexp needed, no tensors saved
+        return _C.forward(query, key, value, is_causal, attn_mask)
 
     # Use autograd function for gradient support
-    return FlashAttentionFunction.apply(query, key, value, is_causal, scale)
+    return FlashAttentionFunction.apply(query, key, value, is_causal, scale, attn_mask)
 
 
 def replace_sdpa():
@@ -173,16 +175,27 @@ def replace_sdpa():
 
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None):
-        # Use MFA for MPS tensors without
+        # Use MFA for MPS tensors without dropout
         # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
         # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
         if (query.device.type == 'mps' and
-            attn_mask is None and
             dropout_p == 0.0 and
            _HAS_MFA and
            query.shape[2] >= 1024):
            try:
-                return flash_attention(query, key, value, is_causal=is_causal, scale=scale)
+                # Convert float mask to bool mask if needed
+                # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
+                # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
+                mfa_mask = None
+                if attn_mask is not None:
+                    if attn_mask.dtype == torch.bool:
+                        # Boolean mask: True means masked (don't attend)
+                        mfa_mask = attn_mask
+                    else:
+                        # Float mask: typically -inf for masked positions, 0 for unmasked
+                        # Convert: positions with large negative values -> True (masked)
+                        mfa_mask = attn_mask < -1e4
+                return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
             except Exception:
                 # Fall back to original on any error
                 pass
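The patched_sdpa shown above converts PyTorch's additive float masks to MFA's boolean convention with `attn_mask < -1e4` and only dispatches to MFA for MPS tensors with dropout_p == 0 and seq_len >= 1024. A usage sketch of that path, assuming replace_sdpa() patches torch.nn.functional.scaled_dot_product_attention (the diff shows the patch function but not the assignment); shapes are illustrative:

```python
import torch
import torch.nn.functional as F
import mps_flash_attn

mps_flash_attn.replace_sdpa()  # route eligible SDPA calls through MFA

B, H, N, D = 2, 8, 2048, 64
q = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
k = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)
v = torch.randn(B, H, N, D, device="mps", dtype=torch.float16)

# Additive float mask in PyTorch SDPA convention: 0 = attend, -inf = masked.
float_mask = torch.zeros(B, 1, N, N, device="mps", dtype=torch.float16)
float_mask[:, :, :, 1024:] = float("-inf")

# The patch converts this to a boolean mask (float_mask < -1e4) before calling
# flash_attention; shorter sequences or dropout fall back to stock PyTorch.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=float_mask)
```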
{mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn/csrc/mps_flash_attn.mm

@@ -12,22 +12,29 @@
 #import <Foundation/Foundation.h>
 
 #include <dlfcn.h>
+#include <string>
+#include <vector>
 
 // ============================================================================
 // MFA Bridge Function Types
 // ============================================================================
 
 typedef bool (*mfa_init_fn)();
-typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool);
+typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool);
 // New zero-sync encode functions that take PyTorch's command encoder
-
-typedef bool (*
-
+// Added mask_ptr and mask_offset parameters
+typedef bool (*mfa_forward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*,
+                                      int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                      int32_t, int32_t);
+typedef bool (*mfa_backward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                       int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                        int32_t, int32_t);
 // Legacy sync functions (fallback)
-typedef bool (*mfa_forward_fn)(void*, void*, void*, void*, void*, void*,
-
-
+typedef bool (*mfa_forward_fn)(void*, void*, void*, void*, void*, void*, void*,
+                               int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                               int32_t, int32_t);
+typedef bool (*mfa_backward_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                 int32_t, int32_t);
 typedef void (*mfa_release_kernel_fn)(void*);
 
@@ -46,31 +53,44 @@ static bool g_initialized = false;
 // Load MFA Bridge Library
 // ============================================================================
 
+// Get the directory containing this shared library
+static std::string get_module_dir() {
+    Dl_info info;
+    if (dladdr((void*)get_module_dir, &info) && info.dli_fname) {
+        std::string path(info.dli_fname);
+        size_t last_slash = path.rfind('/');
+        if (last_slash != std::string::npos) {
+            return path.substr(0, last_slash);
+        }
+    }
+    return ".";
+}
+
 static bool load_mfa_bridge() {
     if (g_dylib_handle) return true;
 
-    //
-
-
-
-
-
-
+    // First check environment variable (highest priority)
+    const char* mfa_path = getenv("MFA_BRIDGE_PATH");
+    if (mfa_path) {
+        g_dylib_handle = dlopen(mfa_path, RTLD_NOW);
+        if (g_dylib_handle) return true;
+    }
+
+    // Get the directory containing this extension module
+    std::string module_dir = get_module_dir();
+
+    // Try paths relative to the module directory
+    std::vector<std::string> paths = {
+        module_dir + "/lib/libMFABridge.dylib",  // Bundled in wheel
+        module_dir + "/../swift-bridge/.build/release/libMFABridge.dylib",  // Dev build
+        "libMFABridge.dylib",  // Current directory fallback
     };
 
-    for (
-        g_dylib_handle = dlopen(
+    for (const auto& path : paths) {
+        g_dylib_handle = dlopen(path.c_str(), RTLD_NOW);
         if (g_dylib_handle) break;
     }
 
-    if (!g_dylib_handle) {
-        // Try with absolute path from environment
-        const char* mfa_path = getenv("MFA_BRIDGE_PATH");
-        if (mfa_path) {
-            g_dylib_handle = dlopen(mfa_path, RTLD_NOW);
-        }
-    }
-
     if (!g_dylib_handle) {
         throw std::runtime_error(
             "Failed to load libMFABridge.dylib. Set MFA_BRIDGE_PATH environment variable.");
@@ -127,6 +147,7 @@ struct KernelCacheKey {
     bool low_precision;
     bool low_precision_outputs;
     bool causal;
+    bool has_mask;
 
     bool operator==(const KernelCacheKey& other) const {
         return seq_len_q == other.seq_len_q &&
@@ -134,7 +155,8 @@ struct KernelCacheKey {
                head_dim == other.head_dim &&
               low_precision == other.low_precision &&
               low_precision_outputs == other.low_precision_outputs &&
-               causal == other.causal;
+               causal == other.causal &&
+               has_mask == other.has_mask;
     }
 };
 
@@ -145,14 +167,15 @@ struct KernelCacheKeyHash {
               (std::hash<int64_t>()(k.head_dim) << 2) ^
               (std::hash<bool>()(k.low_precision) << 3) ^
               (std::hash<bool>()(k.low_precision_outputs) << 4) ^
-               (std::hash<bool>()(k.causal) << 5);
+               (std::hash<bool>()(k.causal) << 5) ^
+               (std::hash<bool>()(k.has_mask) << 6);
     }
 };
 
 static std::unordered_map<KernelCacheKey, void*, KernelCacheKeyHash> g_kernel_cache;
 
-static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal) {
-    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal};
+static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask) {
+    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask};
 
     auto it = g_kernel_cache.find(key);
     if (it != g_kernel_cache.end()) {
@@ -165,7 +188,8 @@ static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_di
         static_cast<int32_t>(head_dim),
         low_prec,
         low_prec_outputs,
-        causal
+        causal,
+        has_mask
     );
 
     if (!kernel) {
@@ -184,7 +208,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     const at::Tensor& query,   // (B, H, N, D)
     const at::Tensor& key,     // (B, H, N, D)
     const at::Tensor& value,   // (B, H, N, D)
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask  // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
 ) {
     // Initialize MFA on first call
     if (!g_initialized) {
@@ -248,6 +273,27 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     auto k = k_expanded.contiguous();
     auto v = v_expanded.contiguous();
 
+    // Handle attention mask
+    bool has_mask = attn_mask.has_value();
+    at::Tensor mask;
+    if (has_mask) {
+        mask = attn_mask.value();
+        TORCH_CHECK(mask.dim() == 4, "Attention mask must be 4D (B, H or 1, N_q, N_kv)");
+        TORCH_CHECK(mask.device().is_mps(), "Attention mask must be on MPS device");
+        // Convert to bool/uint8 if needed - kernel expects uchar (0 = attend, non-0 = mask out)
+        if (mask.scalar_type() == at::kBool) {
+            // Convert bool to uint8 for Metal compatibility
+            mask = mask.to(at::kByte);
+        }
+        TORCH_CHECK(mask.scalar_type() == at::kByte,
+                    "Attention mask must be bool or uint8");
+        // Expand mask heads if needed (B, 1, N_q, N_kv) -> (B, H, N_q, N_kv)
+        if (mask.size(1) == 1 && num_heads > 1) {
+            mask = mask.expand({batch_size, num_heads, seq_len_q, seq_len_kv});
+        }
+        mask = mask.contiguous();
+    }
+
     // Allocate output in the appropriate precision
     // With lowPrecisionOutputs=true, MFA writes FP16 directly
     at::Tensor output;
@@ -264,7 +310,7 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
                          query.options().dtype(at::kFloat));
 
     // Get or create kernel with matching output precision and causal mode
-    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal);
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal, has_mask);
 
     // Get Metal buffers with byte offsets
     auto q_info = getBufferInfo(q);
@@ -273,6 +319,12 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     auto o_info = getBufferInfo(output);
     auto l_info = getBufferInfo(logsumexp);
 
+    // Mask buffer info (may be nullptr if no mask)
+    BufferInfo mask_info = {nil, 0};
+    if (has_mask) {
+        mask_info = getBufferInfo(mask);
+    }
+
     // Use PyTorch's MPS stream command encoder for zero-sync integration
     @autoreleasepool {
         auto stream = at::mps::getCurrentMPSStream();
@@ -290,11 +342,13 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
             (__bridge void*)v_info.buffer,
             (__bridge void*)o_info.buffer,
             (__bridge void*)l_info.buffer,
+            has_mask ? (__bridge void*)mask_info.buffer : nullptr,
             q_info.byte_offset,
             k_info.byte_offset,
             v_info.byte_offset,
             o_info.byte_offset,
             l_info.byte_offset,
+            mask_info.byte_offset,
             static_cast<int32_t>(batch_size),
             static_cast<int32_t>(num_heads)
         );
@@ -317,9 +371,10 @@ at::Tensor mps_flash_attention_forward(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask
 ) {
-    auto [output, logsumexp] = mps_flash_attention_forward_with_lse(query, key, value, is_causal);
+    auto [output, logsumexp] = mps_flash_attention_forward_with_lse(query, key, value, is_causal, attn_mask);
     return output;
 }
 
@@ -334,7 +389,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     const at::Tensor& value,      // (B, H, N, D)
     const at::Tensor& output,     // (B, H, N, D)
     const at::Tensor& logsumexp,  // (B, H, N)
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask  // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
 ) {
     // Initialize MFA on first call
     if (!g_initialized) {
@@ -364,16 +420,35 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
                          query.scalar_type() == at::kBFloat16);
     bool low_precision_outputs = low_precision;
 
-    //
-
-
-
-
-
+    // Handle attention mask
+    bool has_mask = attn_mask.has_value();
+    at::Tensor mask;
+    if (has_mask) {
+        mask = attn_mask.value();
+        TORCH_CHECK(mask.dim() == 4, "Attention mask must be 4D (B, H or 1, N_q, N_kv)");
+        TORCH_CHECK(mask.device().is_mps(), "Attention mask must be on MPS device");
+        if (mask.scalar_type() == at::kBool) {
+            mask = mask.to(at::kByte);
+        }
+        TORCH_CHECK(mask.scalar_type() == at::kByte,
+                    "Attention mask must be bool or uint8");
+        if (mask.size(1) == 1 && num_heads > 1) {
+            mask = mask.expand({batch_size, num_heads, seq_len_q, seq_len_kv});
+        }
+        mask = mask.contiguous();
+    }
+
+    // Make inputs contiguous and upcast to fp32 for numerical stability
+    // The backward pass accumulates many small values, so fp32 precision is critical
+    auto q = query.contiguous().to(at::kFloat);
+    auto k = key.contiguous().to(at::kFloat);
+    auto v = value.contiguous().to(at::kFloat);
+    auto o = output.contiguous().to(at::kFloat);
+    auto dO = grad_output.contiguous().to(at::kFloat);
     auto lse = logsumexp.contiguous();
 
-    // Get or create kernel
-    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim,
+    // Get or create kernel - always use fp32 for backward pass
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, false, false, is_causal, has_mask);
 
     // Allocate D buffer (dO * O reduction, always fp32)
     auto D = at::empty({batch_size, num_heads, seq_len_q},
@@ -399,6 +474,12 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     auto dk_info = getBufferInfo(dK);
     auto dv_info = getBufferInfo(dV);
 
+    // Mask buffer info (may be nullptr if no mask)
+    BufferInfo mask_info = {nil, 0};
+    if (has_mask) {
+        mask_info = getBufferInfo(mask);
+    }
+
     // Use PyTorch's MPS stream command encoder for zero-sync integration
     @autoreleasepool {
         auto stream = at::mps::getCurrentMPSStream();
@@ -419,6 +500,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
             (__bridge void*)dq_info.buffer,
             (__bridge void*)dk_info.buffer,
             (__bridge void*)dv_info.buffer,
+            has_mask ? (__bridge void*)mask_info.buffer : nullptr,
             q_info.byte_offset,
             k_info.byte_offset,
             v_info.byte_offset,
@@ -429,6 +511,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
             dq_info.byte_offset,
             dk_info.byte_offset,
             dv_info.byte_offset,
+            mask_info.byte_offset,
             static_cast<int32_t>(batch_size),
             static_cast<int32_t>(num_heads)
         );
@@ -462,14 +545,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           py::arg("query"),
           py::arg("key"),
           py::arg("value"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 
     m.def("forward_with_lse", &mps_flash_attention_forward_with_lse,
           "Flash Attention forward pass (returns output and logsumexp for backward)",
           py::arg("query"),
           py::arg("key"),
           py::arg("value"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 
     m.def("backward", &mps_flash_attention_backward,
           "Flash Attention backward pass",
@@ -479,5 +564,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           py::arg("value"),
           py::arg("output"),
           py::arg("logsumexp"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 }
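The mask handling added to the forward and backward paths above normalizes the user's mask before it reaches the Metal kernel: require a 4-D tensor, convert bool to uint8 (0 = attend, non-zero = masked), broadcast a single-head mask across heads, and make it contiguous. A PyTorch-level sketch of that same normalization, for illustration only; the real work happens in the Objective-C++ above and the sizes here are made up:

```python
import torch

def normalize_mask(mask: torch.Tensor, batch_size: int, num_heads: int,
                   seq_len_q: int, seq_len_kv: int) -> torch.Tensor:
    """Python mirror of the extension's mask preprocessing (illustrative)."""
    assert mask.dim() == 4, "Attention mask must be 4D (B, H or 1, N_q, N_kv)"
    if mask.dtype == torch.bool:
        mask = mask.to(torch.uint8)          # kernel expects uchar: 0 = attend, non-0 = mask out
    assert mask.dtype == torch.uint8, "Attention mask must be bool or uint8"
    if mask.size(1) == 1 and num_heads > 1:  # broadcast (B, 1, Nq, Nkv) -> (B, H, Nq, Nkv)
        mask = mask.expand(batch_size, num_heads, seq_len_q, seq_len_kv)
    return mask.contiguous()

B, H, Nq, Nkv = 2, 8, 1024, 1024
m = torch.zeros(B, 1, Nq, Nkv, dtype=torch.bool)
m[:, :, :, 512:] = True
print(normalize_mask(m, B, H, Nq, Nkv).shape)  # torch.Size([2, 8, 1024, 1024])
```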
Binary file: mps_flash_attn-0.1.8/mps_flash_attn/lib/libMFABridge.dylib (added in 0.1.8)
{mps_flash_attn-0.1.6 → mps_flash_attn-0.1.8}/mps_flash_attn.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.6
+Version: 0.1.8
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -32,8 +32,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -98,6 +99,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -247,7 +258,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits
Binary file: mps_flash_attn-0.1.6/mps_flash_attn/lib/libMFABridge.dylib (removed)
31 files without changes (LICENSE, the pre-compiled .metallib and .bin kernels, manifest.json, the remaining egg-info files, setup.cfg, setup.py, and tests/test_attention.py — the +0 -0 entries in the file list above).