mps-flash-attn 0.1.4__tar.gz → 0.1.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/PKG-INFO +13 -3
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/README.md +12 -2
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/__init__.py +87 -46
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/csrc/mps_flash_attn.mm +244 -72
- mps_flash_attn-0.1.13/mps_flash_attn/lib/libMFABridge.dylib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/PKG-INFO +13 -3
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/pyproject.toml +1 -1
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/setup.py +23 -2
- mps_flash_attn-0.1.4/mps_flash_attn/lib/libMFABridge.dylib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/LICENSE +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/kernels/manifest.json +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/SOURCES.txt +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/requires.txt +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/top_level.txt +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/setup.cfg +0 -0
- {mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/tests/test_attention.py +0 -0

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.4
+Version: 0.1.13
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -32,8 +32,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -98,6 +99,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -247,7 +258,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits
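
The mask convention added in 0.1.13 is the inverse of a "keep" mask: `True` marks positions that must *not* be attended. Below is a minimal sketch of building such a mask from per-sequence valid lengths; the helper name `lengths_to_mfa_mask` and the shapes are illustrative only (not part of the package), and the example assumes an Apple-silicon machine with an MPS device.

```python
import torch

def lengths_to_mfa_mask(lengths, seq_len):
    """Build a (B, 1, N, N) boolean mask where True = do NOT attend.

    `lengths` holds the number of valid key positions per batch element;
    everything at or beyond that index is masked out.
    """
    B = len(lengths)
    positions = torch.arange(seq_len)                                   # (N,)
    keep = positions.unsqueeze(0) < torch.tensor(lengths).unsqueeze(1)  # (B, N), True = valid
    # MFA wants True = masked, so invert, then broadcast to (B, 1, N_q, N_kv)
    return (~keep)[:, None, None, :].expand(B, 1, seq_len, seq_len).to('mps')

# Example: batch of 2, padded to 1024 keys, with 700 and 1024 real tokens
mask = lengths_to_mfa_mask([700, 1024], 1024)   # dtype=torch.bool, device='mps'
# out = flash_attention(q, k, v, attn_mask=mask)
```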

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/README.md

@@ -7,8 +7,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -73,6 +74,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -222,7 +233,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/__init__.py

@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.1.4"
+__version__ = "0.1.13"
 
 import torch
 from typing import Optional
@@ -20,39 +20,9 @@ except ImportError as e:
     _HAS_MFA = False
     _IMPORT_ERROR = str(e)
 
-#
-
-
-try:
-    import ctypes
-    bridge_path = os.environ.get("MFA_BRIDGE_PATH")
-    if not bridge_path:
-        module_dir = os.path.dirname(__file__)
-        candidates = [
-            os.path.join(module_dir, "lib", "libMFABridge.dylib"),  # Bundled in wheel
-            os.path.join(module_dir, "..", "swift-bridge", ".build", "release", "libMFABridge.dylib"),
-            os.path.join(module_dir, "libMFABridge.dylib"),
-        ]
-        for path in candidates:
-            if os.path.exists(path):
-                bridge_path = path
-                break
-
-    if bridge_path and os.path.exists(bridge_path):
-        lib = ctypes.CDLL(bridge_path)
-
-        # Set shipped kernels directory (pre-compiled metallibs + pipeline binaries)
-        kernels_dir = os.path.join(os.path.dirname(__file__), "kernels")
-        if os.path.exists(kernels_dir):
-            lib.mfa_set_kernels_dir(kernels_dir.encode('utf-8'))
-
-        lib.mfa_init()
-except Exception:
-    pass  # Init is optional, will fall back to runtime compilation
-
-# Initialize shipped kernels on import
-if _HAS_MFA:
-    _init_shipped_kernels()
+# Note: The C++ extension handles loading libMFABridge.dylib via dlopen.
+# Set MFA_BRIDGE_PATH environment variable to specify the library location.
+# Do NOT load the library here via ctypes - that causes duplicate class warnings.
 
 
 def is_available() -> bool:
@@ -60,11 +30,35 @@ def is_available() -> bool:
     return _HAS_MFA and torch.backends.mps.is_available()
 
 
+def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+    """
+    Convert attention mask to MFA's boolean format.
+
+    MFA uses boolean masks where True = masked (don't attend).
+    PyTorch SDPA uses additive float masks where -inf/large negative = masked.
+
+    Args:
+        attn_mask: Optional mask, either:
+            - None: no mask
+            - bool tensor: already in MFA format (True = masked)
+            - float tensor: additive mask (large negative = masked)
+
+    Returns:
+        Boolean mask suitable for flash_attention(), or None
+    """
+    if attn_mask is None:
+        return None
+    if attn_mask.dtype == torch.bool:
+        return attn_mask
+    # Float mask: large negative values indicate masked positions
+    return attn_mask <= -1e3
+
+
 class FlashAttentionFunction(torch.autograd.Function):
     """Autograd function for Flash Attention with backward pass support."""
 
     @staticmethod
-    def forward(ctx, query, key, value, is_causal, scale):
+    def forward(ctx, query, key, value, is_causal, scale, attn_mask):
         # Apply scale if provided (MFA uses 1/sqrt(D) internally)
         scale_factor = 1.0
         if scale is not None:
@@ -74,10 +68,15 @@ class FlashAttentionFunction(torch.autograd.Function):
             query = query * scale_factor
 
         # Forward with logsumexp for backward
-        output, logsumexp = _C.forward_with_lse(query, key, value, is_causal)
+        output, logsumexp = _C.forward_with_lse(query, key, value, is_causal, attn_mask)
 
         # Save for backward
-        ctx.save_for_backward(query, key, value, output, logsumexp)
+        if attn_mask is not None:
+            ctx.save_for_backward(query, key, value, output, logsumexp, attn_mask)
+            ctx.has_mask = True
+        else:
+            ctx.save_for_backward(query, key, value, output, logsumexp)
+            ctx.has_mask = False
         ctx.is_causal = is_causal
         ctx.scale_factor = scale_factor
 
@@ -85,19 +84,23 @@ class FlashAttentionFunction(torch.autograd.Function):
 
     @staticmethod
     def backward(ctx, grad_output):
-        query, key, value, output, logsumexp = ctx.saved_tensors
+        if ctx.has_mask:
+            query, key, value, output, logsumexp, attn_mask = ctx.saved_tensors
+        else:
+            query, key, value, output, logsumexp = ctx.saved_tensors
+            attn_mask = None
 
         # Compute gradients
         dQ, dK, dV = _C.backward(
-            grad_output, query, key, value, output, logsumexp, ctx.is_causal
+            grad_output, query, key, value, output, logsumexp, ctx.is_causal, attn_mask
         )
 
         # If we scaled the query in forward, scale the gradient back
        if ctx.scale_factor != 1.0:
            dQ = dQ * ctx.scale_factor
 
-        # Return gradients (None for is_causal and scale)
-        return dQ, dK, dV, None, None
+        # Return gradients (None for is_causal, scale, and attn_mask since they're not tensors or don't need grad)
+        return dQ, dK, dV, None, None, None
 
 
 def flash_attention(
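
The `convert_mask` helper added above treats any additive value at or below -1e3 as a masked position and passes boolean masks through unchanged. A quick sketch of what that means in practice (assuming the helper is importable from the installed package):

```python
import torch
from mps_flash_attn import convert_mask

# Additive SDPA-style mask: 0.0 = attend, -inf (or any large negative) = masked
float_mask = torch.zeros(1, 1, 4, 4)
float_mask[..., 2:] = float('-inf')

bool_mask = convert_mask(float_mask)
print(bool_mask[0, 0, 0])   # tensor([False, False,  True,  True]) -> last two keys masked

# Boolean masks pass through untouched, and None stays None
assert convert_mask(None) is None
assert convert_mask(bool_mask) is bool_mask
```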
@@ -106,6 +109,7 @@ def flash_attention(
     value: torch.Tensor,
     is_causal: bool = False,
     scale: Optional[float] = None,
+    attn_mask: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Compute scaled dot-product attention using Flash Attention on MPS.
@@ -121,6 +125,9 @@
         value: Value tensor of shape (B, num_heads, seq_len, head_dim)
         is_causal: If True, applies causal masking (for autoregressive models)
         scale: Scaling factor for attention scores. Default: 1/sqrt(head_dim)
+        attn_mask: Optional boolean attention mask of shape (B, 1, seq_len_q, seq_len_kv)
+            or (B, num_heads, seq_len_q, seq_len_kv). True values indicate
+            positions to be masked (not attended to).
 
     Returns:
         Output tensor of shape (B, num_heads, seq_len, head_dim)
@@ -137,6 +144,11 @@
         >>> q.requires_grad = True
         >>> out = flash_attention(q, k, v)
         >>> out.sum().backward()  # Computes dQ
+
+        # With attention mask:
+        >>> mask = torch.zeros(2, 1, 4096, 4096, dtype=torch.bool, device='mps')
+        >>> mask[:, :, :, 2048:] = True  # mask out second half of keys
+        >>> out = flash_attention(q, k, v, attn_mask=mask)
     """
     if not _HAS_MFA:
         raise RuntimeError(
@@ -154,9 +166,23 @@
         raise ValueError("key must be on MPS device")
     if value.device.type != 'mps':
         raise ValueError("value must be on MPS device")
+    if attn_mask is not None and attn_mask.device.type != 'mps':
+        raise ValueError("attn_mask must be on MPS device")
+
+    # Fast path: inference mode (no grad) - skip autograd overhead and don't save tensors
+    if not torch.is_grad_enabled() or (not query.requires_grad and not key.requires_grad and not value.requires_grad):
+        # Apply scale if provided
+        if scale is not None:
+            default_scale = 1.0 / math.sqrt(query.shape[-1])
+            if abs(scale - default_scale) > 1e-6:
+                scale_factor = scale / default_scale
+                query = query * scale_factor
+
+        # Forward only - no logsumexp needed, no tensors saved
+        return _C.forward(query, key, value, is_causal, attn_mask)
 
     # Use autograd function for gradient support
-    return FlashAttentionFunction.apply(query, key, value, is_causal, scale)
+    return FlashAttentionFunction.apply(query, key, value, is_causal, scale, attn_mask)
 
 
 def replace_sdpa():
@@ -173,13 +199,28 @@ def replace_sdpa():
 
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None):
-        # Use MFA for MPS tensors without masks or dropout
+        # Use MFA for MPS tensors without dropout
+        # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
+        # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
         if (query.device.type == 'mps' and
-            attn_mask is None and
             dropout_p == 0.0 and
-            _HAS_MFA):
+            _HAS_MFA and
+            query.shape[2] >= 1024):
             try:
-                return flash_attention(query, key, value, is_causal=is_causal, scale=scale)
+                # Convert float mask to bool mask if needed
+                # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
+                # MFA uses boolean masks (False/0 = attend, True/non-zero = mask)
+                mfa_mask = None
+                if attn_mask is not None:
+                    if attn_mask.dtype == torch.bool:
+                        # Boolean mask: True means masked (don't attend)
+                        mfa_mask = attn_mask
+                    else:
+                        # Float mask: typically -inf for masked positions, 0 for unmasked
+                        # Convert: positions with large negative values -> True (masked)
+                        # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
+                        mfa_mask = attn_mask <= -1e3
+                return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
             except Exception:
                 # Fall back to original on any error
                 pass
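
After `replace_sdpa()` is called, `F.scaled_dot_product_attention` on MPS is routed to the MFA kernel only when there is no dropout and the query length is at least 1024; shorter sequences and any failure inside the patched path fall back to PyTorch's built-in implementation. A hedged usage sketch, with illustrative shapes, assuming an Apple-silicon machine with the package installed:

```python
import torch
import torch.nn.functional as F
from mps_flash_attn import replace_sdpa

replace_sdpa()  # monkey-patches F.scaled_dot_product_attention

q = torch.randn(1, 8, 2048, 64, device='mps', dtype=torch.float16)
k = torch.randn(1, 8, 2048, 64, device='mps', dtype=torch.float16)
v = torch.randn(1, 8, 2048, 64, device='mps', dtype=torch.float16)

# seq_len 2048 >= 1024 and dropout_p == 0.0 -> dispatched to the MFA kernel
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# seq_len 256 < 1024 -> falls through to PyTorch's own attention path
q_s = torch.randn(1, 8, 256, 64, device='mps', dtype=torch.float16)
out_s = F.scaled_dot_product_attention(q_s, q_s, q_s)
```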

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn/csrc/mps_flash_attn.mm

@@ -12,22 +12,39 @@
 #import <Foundation/Foundation.h>
 
 #include <dlfcn.h>
+#include <string>
+#include <vector>
 
 // ============================================================================
 // MFA Bridge Function Types
 // ============================================================================
 
 typedef bool (*mfa_init_fn)();
-typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool);
-typedef
-
-
+typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool);
+typedef void* (*mfa_create_kernel_v2_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool);
+// New zero-sync encode functions that take PyTorch's command encoder
+// Added mask_ptr and mask_offset parameters
+typedef bool (*mfa_forward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*,
+                                      int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                      int32_t, int32_t);
+typedef bool (*mfa_backward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                       int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                       int32_t, int32_t);
+// Legacy sync functions (fallback)
+typedef bool (*mfa_forward_fn)(void*, void*, void*, void*, void*, void*, void*,
+                               int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                               int32_t, int32_t);
+typedef bool (*mfa_backward_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
                                 int32_t, int32_t);
 typedef void (*mfa_release_kernel_fn)(void*);
 
 // Global function pointers
 static mfa_init_fn g_mfa_init = nullptr;
 static mfa_create_kernel_fn g_mfa_create_kernel = nullptr;
+static mfa_create_kernel_v2_fn g_mfa_create_kernel_v2 = nullptr;
+static mfa_forward_encode_fn g_mfa_forward_encode = nullptr;
+static mfa_backward_encode_fn g_mfa_backward_encode = nullptr;
 static mfa_forward_fn g_mfa_forward = nullptr;
 static mfa_backward_fn g_mfa_backward = nullptr;
 static mfa_release_kernel_fn g_mfa_release_kernel = nullptr;
@@ -38,31 +55,44 @@ static bool g_initialized = false;
 // Load MFA Bridge Library
 // ============================================================================
 
+// Get the directory containing this shared library
+static std::string get_module_dir() {
+    Dl_info info;
+    if (dladdr((void*)get_module_dir, &info) && info.dli_fname) {
+        std::string path(info.dli_fname);
+        size_t last_slash = path.rfind('/');
+        if (last_slash != std::string::npos) {
+            return path.substr(0, last_slash);
+        }
+    }
+    return ".";
+}
+
 static bool load_mfa_bridge() {
     if (g_dylib_handle) return true;
 
-    //
-
-
-
-
-
-
+    // First check environment variable (highest priority)
+    const char* mfa_path = getenv("MFA_BRIDGE_PATH");
+    if (mfa_path) {
+        g_dylib_handle = dlopen(mfa_path, RTLD_NOW);
+        if (g_dylib_handle) return true;
+    }
+
+    // Get the directory containing this extension module
+    std::string module_dir = get_module_dir();
+
+    // Try paths relative to the module directory
+    std::vector<std::string> paths = {
+        module_dir + "/lib/libMFABridge.dylib",                             // Bundled in wheel
+        module_dir + "/../swift-bridge/.build/release/libMFABridge.dylib",  // Dev build
+        "libMFABridge.dylib",                                               // Current directory fallback
     };
 
-    for (
-    g_dylib_handle = dlopen(
+    for (const auto& path : paths) {
+        g_dylib_handle = dlopen(path.c_str(), RTLD_NOW);
         if (g_dylib_handle) break;
     }
 
-    if (!g_dylib_handle) {
-        // Try with absolute path from environment
-        const char* mfa_path = getenv("MFA_BRIDGE_PATH");
-        if (mfa_path) {
-            g_dylib_handle = dlopen(mfa_path, RTLD_NOW);
-        }
-    }
-
     if (!g_dylib_handle) {
         throw std::runtime_error(
             "Failed to load libMFABridge.dylib. Set MFA_BRIDGE_PATH environment variable.");
@@ -71,11 +101,15 @@ static bool load_mfa_bridge() {
     // Load function pointers
     g_mfa_init = (mfa_init_fn)dlsym(g_dylib_handle, "mfa_init");
     g_mfa_create_kernel = (mfa_create_kernel_fn)dlsym(g_dylib_handle, "mfa_create_kernel");
+    g_mfa_create_kernel_v2 = (mfa_create_kernel_v2_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v2");
+    g_mfa_forward_encode = (mfa_forward_encode_fn)dlsym(g_dylib_handle, "mfa_forward_encode");
+    g_mfa_backward_encode = (mfa_backward_encode_fn)dlsym(g_dylib_handle, "mfa_backward_encode");
     g_mfa_forward = (mfa_forward_fn)dlsym(g_dylib_handle, "mfa_forward");
     g_mfa_backward = (mfa_backward_fn)dlsym(g_dylib_handle, "mfa_backward");
     g_mfa_release_kernel = (mfa_release_kernel_fn)dlsym(g_dylib_handle, "mfa_release_kernel");
 
-
+    // Require at least init, create_kernel, forward_encode (for zero-sync path)
+    if (!g_mfa_init || !g_mfa_create_kernel || !g_mfa_forward_encode) {
         throw std::runtime_error("Failed to load MFA bridge functions");
     }
 
@@ -116,6 +150,8 @@ struct KernelCacheKey {
     bool low_precision;
     bool low_precision_outputs;
     bool causal;
+    bool has_mask;
+    bool use_bf16;
 
     bool operator==(const KernelCacheKey& other) const {
         return seq_len_q == other.seq_len_q &&
@@ -123,7 +159,9 @@ struct KernelCacheKey {
                head_dim == other.head_dim &&
                low_precision == other.low_precision &&
                low_precision_outputs == other.low_precision_outputs &&
-               causal == other.causal;
+               causal == other.causal &&
+               has_mask == other.has_mask &&
+               use_bf16 == other.use_bf16;
     }
 };
 
@@ -134,28 +172,47 @@ struct KernelCacheKeyHash {
                (std::hash<int64_t>()(k.head_dim) << 2) ^
                (std::hash<bool>()(k.low_precision) << 3) ^
                (std::hash<bool>()(k.low_precision_outputs) << 4) ^
-               (std::hash<bool>()(k.causal) << 5);
+               (std::hash<bool>()(k.causal) << 5) ^
+               (std::hash<bool>()(k.has_mask) << 6) ^
+               (std::hash<bool>()(k.use_bf16) << 7);
     }
 };
 
 static std::unordered_map<KernelCacheKey, void*, KernelCacheKeyHash> g_kernel_cache;
 
-static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal) {
-    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal};
+static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask, bool use_bf16 = false) {
+    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask, use_bf16};
 
     auto it = g_kernel_cache.find(key);
     if (it != g_kernel_cache.end()) {
         return it->second;
     }
 
-    void* kernel =
-
-
-
-
-
-
+    void* kernel = nullptr;
+    if (use_bf16 && g_mfa_create_kernel_v2) {
+        // Use v2 API with BF16 support
+        kernel = g_mfa_create_kernel_v2(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask,
+            use_bf16
+        );
+    } else {
+        // Legacy API
+        kernel = g_mfa_create_kernel(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask
+        );
+    }
 
     if (!kernel) {
         throw std::runtime_error("Failed to create MFA kernel");
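
The loader above checks `MFA_BRIDGE_PATH` before any bundled or dev-build location, so a custom build of the Swift bridge can be swapped in without reinstalling the wheel. A small sketch of using the override (the dylib path is an example, not a shipped default):

```python
import os

# The bridge is loaded lazily on the first kernel creation, so the override
# must be in place before the first flash_attention() call in this process.
os.environ["MFA_BRIDGE_PATH"] = "/opt/mfa/libMFABridge.dylib"  # hypothetical path

import torch
from mps_flash_attn import flash_attention, is_available

print(is_available())  # True only on an MPS machine where the extension imported cleanly
```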
@@ -173,7 +230,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     const at::Tensor& query,   // (B, H, N, D)
     const at::Tensor& key,     // (B, H, N, D)
     const at::Tensor& value,   // (B, H, N, D)
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask  // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
 ) {
     // Initialize MFA on first call
     if (!g_initialized) {
@@ -193,37 +251,95 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     TORCH_CHECK(value.device().is_mps(), "Value must be on MPS device");
 
     const int64_t batch_size = query.size(0);
-    const int64_t num_heads = query.size(1);
+    const int64_t num_heads_q = query.size(1);
+    const int64_t num_heads_kv = key.size(1);
     const int64_t seq_len_q = query.size(2);
     const int64_t head_dim = query.size(3);
     const int64_t seq_len_kv = key.size(2);
 
     TORCH_CHECK(key.size(0) == batch_size && value.size(0) == batch_size,
                 "Batch size mismatch");
-    TORCH_CHECK(key.size(1) == num_heads && value.size(1) == num_heads,
-                "Number of heads mismatch");
     TORCH_CHECK(key.size(3) == head_dim && value.size(3) == head_dim,
                 "Head dimension mismatch");
+    TORCH_CHECK(key.size(1) == value.size(1),
+                "K and V must have same number of heads");
+
+    // Handle GQA (Grouped Query Attention): expand K/V if fewer heads than Q
+    const int64_t num_heads = num_heads_q;
+    at::Tensor k_expanded, v_expanded;
+
+    if (num_heads_kv != num_heads_q) {
+        // GQA: num_heads_q must be divisible by num_heads_kv
+        TORCH_CHECK(num_heads_q % num_heads_kv == 0,
+                    "num_heads_q (", num_heads_q, ") must be divisible by num_heads_kv (", num_heads_kv, ")");
+        int64_t repeat_factor = num_heads_q / num_heads_kv;
+
+        // Expand K and V to match Q's head count: (B, H_kv, S, D) -> (B, H_q, S, D)
+        // Use repeat_interleave for proper GQA expansion
+        k_expanded = key.repeat_interleave(repeat_factor, /*dim=*/1);
+        v_expanded = value.repeat_interleave(repeat_factor, /*dim=*/1);
+    } else {
+        k_expanded = key;
+        v_expanded = value;
+    }
 
-    // Determine precision
-    bool
-
+    // Determine precision - MFA kernel supports FP16, BF16, and FP32
+    bool is_bfloat16 = (query.scalar_type() == at::kBFloat16);
+    bool is_fp16 = (query.scalar_type() == at::kHalf);
 
-    //
-    bool
+    // Use native BF16 kernel if available, otherwise fall back to FP32
+    bool use_bf16_kernel = is_bfloat16 && g_mfa_create_kernel_v2;
+    bool low_precision = is_fp16;  // FP16 path
+    bool low_precision_outputs = is_fp16 || use_bf16_kernel;
 
     // Make inputs contiguous
     auto q = query.contiguous();
-    auto k = key.contiguous();
-    auto v = value.contiguous();
+    auto k = k_expanded.contiguous();
+    auto v = v_expanded.contiguous();
+
+    // For BF16 without native kernel support, convert to FP32 (avoids FP16 overflow)
+    if (is_bfloat16 && !use_bf16_kernel) {
+        q = q.to(at::kFloat);
+        k = k.to(at::kFloat);
+        v = v.to(at::kFloat);
+    }
+
+    // Handle attention mask
+    bool has_mask = attn_mask.has_value();
+    at::Tensor mask;
+    if (has_mask) {
+        mask = attn_mask.value();
+        TORCH_CHECK(mask.dim() == 4, "Attention mask must be 4D (B, H or 1, N_q, N_kv)");
+        TORCH_CHECK(mask.device().is_mps(), "Attention mask must be on MPS device");
+        // Convert to bool/uint8 if needed - kernel expects uchar (0 = attend, non-0 = mask out)
+        if (mask.scalar_type() == at::kBool) {
+            // Convert bool to uint8 for Metal compatibility
+            mask = mask.to(at::kByte);
+        }
+        TORCH_CHECK(mask.scalar_type() == at::kByte,
+                    "Attention mask must be bool or uint8");
+        // Expand mask dimensions if needed -> (B, H, N_q, N_kv)
+        // Handle (B, 1, N_q, N_kv) -> expand heads
+        // Handle (B, H, 1, N_kv) -> expand query dim (1D key mask)
+        // Handle (B, 1, 1, N_kv) -> expand both
+        if (mask.size(1) == 1 || mask.size(2) == 1) {
+            mask = mask.expand({batch_size, num_heads, seq_len_q, seq_len_kv});
+        }
+        mask = mask.contiguous();
+    }
 
     // Allocate output in the appropriate precision
-    // With lowPrecisionOutputs=true, MFA writes FP16 directly
     at::Tensor output;
-    if (
+    if (use_bf16_kernel) {
+        // Native BF16 kernel outputs BF16
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kBFloat16));
+    } else if (low_precision_outputs) {
+        // FP16 kernel outputs FP16
         output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
                            query.options().dtype(at::kHalf));
     } else {
+        // FP32 kernel outputs FP32
         output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
                            query.options().dtype(at::kFloat));
     }
@@ -232,8 +348,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     auto logsumexp = at::empty({batch_size, num_heads, seq_len_q},
                                query.options().dtype(at::kFloat));
 
-    // Get or create kernel with matching
-    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal);
+    // Get or create kernel with matching precision and causal mode
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal, has_mask, use_bf16_kernel);
 
     // Get Metal buffers with byte offsets
     auto q_info = getBufferInfo(q);
@@ -242,25 +358,36 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     auto o_info = getBufferInfo(output);
     auto l_info = getBufferInfo(logsumexp);
 
-    //
+    // Mask buffer info (may be nullptr if no mask)
+    BufferInfo mask_info = {nil, 0};
+    if (has_mask) {
+        mask_info = getBufferInfo(mask);
+    }
+
+    // Use PyTorch's MPS stream command encoder for zero-sync integration
     @autoreleasepool {
-        // Wait for PyTorch operations to complete
         auto stream = at::mps::getCurrentMPSStream();
-        stream->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);
 
-        //
-
+        // Get PyTorch's shared command encoder - this is the key for zero-sync!
+        // All our dispatches go onto the same encoder that PyTorch uses.
+        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();
+
+        // Execute MFA using the shared encoder (no sync needed!)
+        bool success = g_mfa_forward_encode(
             kernel,
+            (__bridge void*)encoder,  // PyTorch's shared command encoder
             (__bridge void*)q_info.buffer,
             (__bridge void*)k_info.buffer,
             (__bridge void*)v_info.buffer,
             (__bridge void*)o_info.buffer,
             (__bridge void*)l_info.buffer,
+            has_mask ? (__bridge void*)mask_info.buffer : nullptr,
             q_info.byte_offset,
             k_info.byte_offset,
             v_info.byte_offset,
             o_info.byte_offset,
             l_info.byte_offset,
+            mask_info.byte_offset,
             static_cast<int32_t>(batch_size),
             static_cast<int32_t>(num_heads)
         );
@@ -268,9 +395,17 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
         if (!success) {
             throw std::runtime_error("MFA forward pass failed");
         }
+
+        // No commit needed - PyTorch will commit when it needs the results
+        // The encoder stays open for coalescing more kernels
+    }
+
+    // Convert output back to BF16 if input was BF16 and we used FP32 fallback
+    // (native BF16 kernel already outputs BF16, no conversion needed)
+    if (is_bfloat16 && !use_bf16_kernel) {
+        output = output.to(at::kBFloat16);
     }
 
-    // Output is already in the correct dtype (fp16 or fp32)
     // Return both output and logsumexp (needed for backward pass)
     return std::make_tuple(output, logsumexp);
 }
@@ -280,9 +415,10 @@ at::Tensor mps_flash_attention_forward(
     const at::Tensor& query,
     const at::Tensor& key,
     const at::Tensor& value,
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask
 ) {
-    auto [output, logsumexp] = mps_flash_attention_forward_with_lse(query, key, value, is_causal);
+    auto [output, logsumexp] = mps_flash_attention_forward_with_lse(query, key, value, is_causal, attn_mask);
     return output;
 }
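
The GQA handling above expands K/V heads with `repeat_interleave` so that each group of query heads sees its own copy of the shared KV head. The same expansion, written out in plain PyTorch with illustrative shapes:

```python
import torch

B, H_q, H_kv, S, D = 2, 8, 2, 128, 64            # 4 query heads per KV head
key = torch.randn(B, H_kv, S, D)
value = torch.randn(B, H_kv, S, D)

repeat_factor = H_q // H_kv                       # must divide evenly, as the kernel checks
k_expanded = key.repeat_interleave(repeat_factor, dim=1)    # (B, H_q, S, D)
v_expanded = value.repeat_interleave(repeat_factor, dim=1)

# Query head i attends against KV head i // repeat_factor
assert torch.equal(k_expanded[:, 5], key[:, 5 // repeat_factor])
```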
@@ -297,7 +433,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     const at::Tensor& value,      // (B, H, N, D)
     const at::Tensor& output,     // (B, H, N, D)
     const at::Tensor& logsumexp,  // (B, H, N)
-    bool is_causal
+    bool is_causal,
+    const c10::optional<at::Tensor>& attn_mask  // Optional (B, 1, N_q, N_kv) or (B, H, N_q, N_kv)
 ) {
     // Initialize MFA on first call
     if (!g_initialized) {
@@ -327,16 +464,35 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
                           query.scalar_type() == at::kBFloat16);
     bool low_precision_outputs = low_precision;
 
-    // Make inputs contiguous
-    auto q = query.contiguous();
-    auto k = key.contiguous();
-    auto v = value.contiguous();
-    auto o = output.contiguous();
-    auto dO = grad_output.contiguous();
+    // Handle attention mask
+    bool has_mask = attn_mask.has_value();
+    at::Tensor mask;
+    if (has_mask) {
+        mask = attn_mask.value();
+        TORCH_CHECK(mask.dim() == 4, "Attention mask must be 4D (B, H or 1, N_q, N_kv)");
+        TORCH_CHECK(mask.device().is_mps(), "Attention mask must be on MPS device");
+        if (mask.scalar_type() == at::kBool) {
+            mask = mask.to(at::kByte);
+        }
+        TORCH_CHECK(mask.scalar_type() == at::kByte,
+                    "Attention mask must be bool or uint8");
+        if (mask.size(1) == 1 && num_heads > 1) {
+            mask = mask.expand({batch_size, num_heads, seq_len_q, seq_len_kv});
+        }
+        mask = mask.contiguous();
+    }
+
+    // Make inputs contiguous and upcast to fp32 for numerical stability
+    // The backward pass accumulates many small values, so fp32 precision is critical
+    auto q = query.contiguous().to(at::kFloat);
+    auto k = key.contiguous().to(at::kFloat);
+    auto v = value.contiguous().to(at::kFloat);
+    auto o = output.contiguous().to(at::kFloat);
+    auto dO = grad_output.contiguous().to(at::kFloat);
     auto lse = logsumexp.contiguous();
 
-    // Get or create kernel
-    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal);
+    // Get or create kernel - always use fp32 for backward pass
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, false, false, is_causal, has_mask);
 
     // Allocate D buffer (dO * O reduction, always fp32)
     auto D = at::empty({batch_size, num_heads, seq_len_q},
@@ -362,13 +518,22 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
     auto dk_info = getBufferInfo(dK);
     auto dv_info = getBufferInfo(dV);
 
-    //
+    // Mask buffer info (may be nullptr if no mask)
+    BufferInfo mask_info = {nil, 0};
+    if (has_mask) {
+        mask_info = getBufferInfo(mask);
+    }
+
+    // Use PyTorch's MPS stream command encoder for zero-sync integration
     @autoreleasepool {
         auto stream = at::mps::getCurrentMPSStream();
-        stream->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);
 
-        bool success = g_mfa_backward(
+        // Get PyTorch's shared command encoder
+        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();
+
+        bool success = g_mfa_backward_encode(
             kernel,
+            (__bridge void*)encoder,  // PyTorch's shared command encoder
             (__bridge void*)q_info.buffer,
             (__bridge void*)k_info.buffer,
             (__bridge void*)v_info.buffer,
@@ -379,6 +544,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
             (__bridge void*)dq_info.buffer,
             (__bridge void*)dk_info.buffer,
             (__bridge void*)dv_info.buffer,
+            has_mask ? (__bridge void*)mask_info.buffer : nullptr,
             q_info.byte_offset,
             k_info.byte_offset,
             v_info.byte_offset,
@@ -389,6 +555,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
             dq_info.byte_offset,
             dk_info.byte_offset,
             dv_info.byte_offset,
+            mask_info.byte_offset,
             static_cast<int32_t>(batch_size),
             static_cast<int32_t>(num_heads)
         );
@@ -396,6 +563,8 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
         if (!success) {
            throw std::runtime_error("MFA backward pass failed");
         }
+
+        // No commit needed - PyTorch will commit when it needs the results
     }
 
     // Convert gradients back to input dtype if needed
@@ -420,14 +589,16 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           py::arg("query"),
           py::arg("key"),
           py::arg("value"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 
     m.def("forward_with_lse", &mps_flash_attention_forward_with_lse,
           "Flash Attention forward pass (returns output and logsumexp for backward)",
           py::arg("query"),
           py::arg("key"),
           py::arg("value"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 
     m.def("backward", &mps_flash_attention_backward,
           "Flash Attention backward pass",
@@ -437,5 +608,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
           py::arg("value"),
           py::arg("output"),
           py::arg("logsumexp"),
-          py::arg("is_causal") = false);
+          py::arg("is_causal") = false,
+          py::arg("attn_mask") = py::none());
 }
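
With the pybind signatures above, the autograd path accepts the same `attn_mask` as the forward call, and the backward kernel always runs in fp32 regardless of the input dtype, converting gradients back afterwards. A short training-style sketch with illustrative shapes, assuming an MPS device and the package installed:

```python
import torch
from mps_flash_attn import flash_attention

B, H, N, D = 1, 4, 1024, 64
q = torch.randn(B, H, N, D, device='mps', dtype=torch.float16, requires_grad=True)
k = torch.randn(B, H, N, D, device='mps', dtype=torch.float16, requires_grad=True)
v = torch.randn(B, H, N, D, device='mps', dtype=torch.float16, requires_grad=True)

# Mask out the last 256 key positions for every query (True = don't attend)
mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
mask[..., N - 256:] = True

out = flash_attention(q, k, v, attn_mask=mask)
out.float().sum().backward()        # dQ/dK/dV computed by the fp32 backward kernel

print(q.grad.dtype, q.grad.shape)   # gradients are returned in the input dtype
```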

Binary file: mps_flash_attn-0.1.13/mps_flash_attn/lib/libMFABridge.dylib (contents not shown)

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/mps_flash_attn.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.4
+Version: 0.1.13
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
@@ -32,8 +32,9 @@ Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4).
 ## Features
 
 - **Forward pass**: 2-5x faster than PyTorch SDPA
-- **Backward pass**: Full gradient support for training
+- **Backward pass**: Full gradient support for training (fp32 precision)
 - **Causal masking**: Native kernel support (only 5% overhead)
+- **Attention masks**: Full boolean mask support for arbitrary masking patterns
 - **FP16/FP32**: Native fp16 output (no conversion overhead)
 - **Pre-compiled kernels**: Zero-compilation cold start (~6ms)
 
@@ -98,6 +99,16 @@ out = flash_attention(q, k, v)
 out = flash_attention(q, k, v, is_causal=True)
 ```
 
+### Attention masks (for custom masking patterns)
+
+```python
+# Boolean mask: True = masked (don't attend), False = attend
+mask = torch.zeros(B, 1, N, N, dtype=torch.bool, device='mps')
+mask[:, :, :, 512:] = True  # Mask out positions after 512
+
+out = flash_attention(q, k, v, attn_mask=mask)
+```
+
 ### Training with gradients
 
 ```python
@@ -247,7 +258,6 @@ python scripts/build_metallibs.py
 **Known limitations:**
 - Sequence length must be divisible by block size (typically 64)
 - Head dimension: Best with 32, 64, 96, 128
-- No arbitrary attention masks (only causal or none)
 - No dropout
 
 ## Credits

{mps_flash_attn-0.1.4 → mps_flash_attn-0.1.13}/setup.py

@@ -4,6 +4,7 @@ Setup script for MPS Flash Attention
 
 import os
 import sys
+import shutil
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext
 
@@ -36,6 +37,26 @@ class ObjCppBuildExt(build_ext):
 
         super().build_extensions()
 
+        # Copy libMFABridge.dylib to lib/ after building
+        self._copy_swift_bridge()
+
+    def _copy_swift_bridge(self):
+        """Copy Swift bridge dylib to package lib/ directory."""
+        src_path = os.path.join(
+            os.path.dirname(__file__),
+            "swift-bridge", ".build", "release", "libMFABridge.dylib"
+        )
+        dst_dir = os.path.join(os.path.dirname(__file__), "mps_flash_attn", "lib")
+        dst_path = os.path.join(dst_dir, "libMFABridge.dylib")
+
+        if os.path.exists(src_path):
+            os.makedirs(dst_dir, exist_ok=True)
+            shutil.copy2(src_path, dst_path)
+            print(f"Copied libMFABridge.dylib to {dst_path}")
+        else:
+            print(f"Warning: {src_path} not found. Build swift-bridge first with:")
+            print("  cd swift-bridge && swift build -c release")
+
 
 def get_extensions():
     if sys.platform != "darwin":
@@ -44,14 +65,14 @@ def get_extensions():
     return [Extension(
         name="mps_flash_attn._C",
         sources=["mps_flash_attn/csrc/mps_flash_attn.mm"],
-        extra_compile_args=["-std=c++17", "-O3"],
+        extra_compile_args=["-std=c++17", "-O3", "-DTORCH_EXTENSION_NAME=_C"],
         extra_link_args=["-framework", "Metal", "-framework", "Foundation"],
     )]
 
 
 setup(
     name="mps-flash-attn",
-    version="0.1.4",
+    version="0.1.5",
     packages=find_packages(),
     package_data={
         "mps_flash_attn": [
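
The build step above bundles `libMFABridge.dylib` under `mps_flash_attn/lib/`, which is the first location the extension's loader probes after the environment variable. A quick way to check what an installed copy actually shipped (a diagnostic sketch, not part of the package):

```python
import os
import mps_flash_attn

pkg_dir = os.path.dirname(mps_flash_attn.__file__)
bridge = os.path.join(pkg_dir, "lib", "libMFABridge.dylib")
kernels = os.path.join(pkg_dir, "kernels")

print("bridge dylib bundled:", os.path.exists(bridge))
print("pre-compiled kernels:", len(os.listdir(kernels)), "files")
print("version:", mps_flash_attn.__version__)
```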

Binary file: mps_flash_attn-0.1.4/mps_flash_attn/lib/libMFABridge.dylib (contents not shown)

All remaining files listed above (LICENSE, pre-compiled kernels, egg-info metadata, setup.cfg, tests) are unchanged between 0.1.4 and 0.1.13.