mps-flash-attn 0.1.8.tar.gz → 0.1.14.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/PKG-INFO +1 -1
  2. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/__init__.py +30 -4
  3. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/csrc/mps_flash_attn.mm +69 -25
  4. mps_flash_attn-0.1.14/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  5. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn.egg-info/PKG-INFO +1 -1
  6. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/pyproject.toml +1 -1
  7. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/setup.py +21 -0
  8. mps_flash_attn-0.1.8/mps_flash_attn/lib/libMFABridge.dylib +0 -0
  9. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/LICENSE +0 -0
  10. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/README.md +0 -0
  11. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  12. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  13. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  14. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  15. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  16. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  17. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  18. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  19. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  20. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  21. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  22. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  23. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  24. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  25. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  26. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  27. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  28. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  29. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  30. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  31. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  32. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  33. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn/kernels/manifest.json +0 -0
  34. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn.egg-info/SOURCES.txt +0 -0
  35. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn.egg-info/dependency_links.txt +0 -0
  36. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn.egg-info/requires.txt +0 -0
  37. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/mps_flash_attn.egg-info/top_level.txt +0 -0
  38. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/setup.cfg +0 -0
  39. {mps_flash_attn-0.1.8 → mps_flash_attn-0.1.14}/tests/test_attention.py +0 -0
--- mps_flash_attn-0.1.8/PKG-INFO
+++ mps_flash_attn-0.1.14/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.8
+Version: 0.1.14
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
--- mps_flash_attn-0.1.8/mps_flash_attn/__init__.py
+++ mps_flash_attn-0.1.14/mps_flash_attn/__init__.py
@@ -4,7 +4,7 @@ MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
 This package provides memory-efficient attention using Metal Flash Attention kernels.
 """
 
-__version__ = "0.1.8"
+__version__ = "0.1.14"
 
 import torch
 from typing import Optional
@@ -30,6 +30,30 @@ def is_available() -> bool:
     return _HAS_MFA and torch.backends.mps.is_available()
 
 
+def convert_mask(attn_mask: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
+    """
+    Convert attention mask to MFA's boolean format.
+
+    MFA uses boolean masks where True = masked (don't attend).
+    PyTorch SDPA uses additive float masks where -inf/large negative = masked.
+
+    Args:
+        attn_mask: Optional mask, either:
+            - None: no mask
+            - bool tensor: already in MFA format (True = masked)
+            - float tensor: additive mask (large negative = masked)
+
+    Returns:
+        Boolean mask suitable for flash_attention(), or None
+    """
+    if attn_mask is None:
+        return None
+    if attn_mask.dtype == torch.bool:
+        return attn_mask
+    # Float mask: large negative values indicate masked positions
+    return attn_mask <= -1e3
+
+
 class FlashAttentionFunction(torch.autograd.Function):
     """Autograd function for Flash Attention with backward pass support."""
 
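For context, a minimal usage sketch of the new convert_mask helper (values are illustrative, not taken from the package's tests):

import torch
from mps_flash_attn import convert_mask

# Additive SDPA-style mask: 0.0 = attend, -inf = masked
additive = torch.zeros(1, 1, 4, 4)
additive[..., 2:] = float("-inf")

bool_mask = convert_mask(additive)  # True where masked
assert bool_mask.dtype == torch.bool
assert bool_mask[..., 2:].all() and not bool_mask[..., :2].any()

# Boolean masks pass through unchanged; None stays None
assert convert_mask(None) is None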
@@ -176,12 +200,13 @@ def replace_sdpa():
     def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
                      is_causal=False, scale=None):
         # Use MFA for MPS tensors without dropout
-        # Only use MFA for seq_len >= 1024 where it outperforms PyTorch's math backend
+        # Only use MFA for seq_len >= 1536 where it outperforms PyTorch's math backend
         # For shorter sequences, PyTorch's simpler matmul+softmax approach is faster
+        # Benchmark (BF16, heads=30, head_dim=128): crossover is ~1200-1500
         if (query.device.type == 'mps' and
             dropout_p == 0.0 and
             _HAS_MFA and
-            query.shape[2] >= 1024):
+            query.shape[2] >= 1536):
             try:
                 # Convert float mask to bool mask if needed
                 # PyTorch SDPA uses additive masks (0 = attend, -inf = mask)
@@ -194,7 +219,8 @@ def replace_sdpa():
                 else:
                     # Float mask: typically -inf for masked positions, 0 for unmasked
                     # Convert: positions with large negative values -> True (masked)
-                    mfa_mask = attn_mask < -1e4
+                    # Use -1e3 threshold to catch -1000, -10000, -inf, etc.
+                    mfa_mask = attn_mask <= -1e3
                 return flash_attention(query, key, value, is_causal=is_causal, scale=scale, attn_mask=mfa_mask)
             except Exception:
                 # Fall back to original on any error
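A usage sketch of the retuned dispatch, assuming replace_sdpa() installs patched_sdpa over torch.nn.functional.scaled_dot_product_attention as its body suggests; shapes and dtypes below are illustrative:

import torch
import torch.nn.functional as F
import mps_flash_attn

mps_flash_attn.replace_sdpa()

q = torch.randn(1, 8, 2048, 64, device="mps", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)

# seq_len = q.shape[2] = 2048 >= 1536 and dropout_p == 0.0, so this call
# should be routed to MFA (with fallback to stock SDPA on any error).
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Below the new 1536 threshold the patched function falls through to
# PyTorch's own SDPA, which is faster for short sequences.
q_short = torch.randn(1, 8, 512, 64, device="mps", dtype=torch.float16)
out_short = F.scaled_dot_product_attention(q_short, q_short, q_short)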
--- mps_flash_attn-0.1.8/mps_flash_attn/csrc/mps_flash_attn.mm
+++ mps_flash_attn-0.1.14/mps_flash_attn/csrc/mps_flash_attn.mm
@@ -21,6 +21,7 @@
 
 typedef bool (*mfa_init_fn)();
 typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool);
+typedef void* (*mfa_create_kernel_v2_fn)(int32_t, int32_t, int32_t, bool, bool, bool, bool, bool);
 // New zero-sync encode functions that take PyTorch's command encoder
 // Added mask_ptr and mask_offset parameters
 typedef bool (*mfa_forward_encode_fn)(void*, void*, void*, void*, void*, void*, void*, void*,
@@ -41,6 +42,7 @@ typedef void (*mfa_release_kernel_fn)(void*);
 // Global function pointers
 static mfa_init_fn g_mfa_init = nullptr;
 static mfa_create_kernel_fn g_mfa_create_kernel = nullptr;
+static mfa_create_kernel_v2_fn g_mfa_create_kernel_v2 = nullptr;
 static mfa_forward_encode_fn g_mfa_forward_encode = nullptr;
 static mfa_backward_encode_fn g_mfa_backward_encode = nullptr;
 static mfa_forward_fn g_mfa_forward = nullptr;
@@ -99,6 +101,7 @@ static bool load_mfa_bridge() {
     // Load function pointers
     g_mfa_init = (mfa_init_fn)dlsym(g_dylib_handle, "mfa_init");
     g_mfa_create_kernel = (mfa_create_kernel_fn)dlsym(g_dylib_handle, "mfa_create_kernel");
+    g_mfa_create_kernel_v2 = (mfa_create_kernel_v2_fn)dlsym(g_dylib_handle, "mfa_create_kernel_v2");
     g_mfa_forward_encode = (mfa_forward_encode_fn)dlsym(g_dylib_handle, "mfa_forward_encode");
     g_mfa_backward_encode = (mfa_backward_encode_fn)dlsym(g_dylib_handle, "mfa_backward_encode");
     g_mfa_forward = (mfa_forward_fn)dlsym(g_dylib_handle, "mfa_forward");
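For illustration only, the new entry point can be poked at from Python with ctypes, mirroring the mfa_create_kernel_v2 typedef above; the dylib path and the idea of loading it directly are assumptions (the extension normally loads the bridge itself):

import ctypes
import os
import mps_flash_attn

lib_path = os.path.join(os.path.dirname(mps_flash_attn.__file__),
                        "lib", "libMFABridge.dylib")
bridge = ctypes.CDLL(lib_path)

# Per the typedef: (seq_q, seq_kv, head_dim, low_precision,
# low_precision_outputs, causal, has_mask, use_bf16) -> opaque kernel handle
bridge.mfa_create_kernel_v2.restype = ctypes.c_void_p
bridge.mfa_create_kernel_v2.argtypes = [ctypes.c_int32] * 3 + [ctypes.c_bool] * 5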
@@ -148,6 +151,7 @@ struct KernelCacheKey {
     bool low_precision_outputs;
     bool causal;
     bool has_mask;
+    bool use_bf16;
 
     bool operator==(const KernelCacheKey& other) const {
         return seq_len_q == other.seq_len_q &&
@@ -156,7 +160,8 @@ struct KernelCacheKey {
                low_precision == other.low_precision &&
                low_precision_outputs == other.low_precision_outputs &&
                causal == other.causal &&
-               has_mask == other.has_mask;
+               has_mask == other.has_mask &&
+               use_bf16 == other.use_bf16;
     }
 };
 
@@ -168,29 +173,46 @@ struct KernelCacheKeyHash {
                (std::hash<bool>()(k.low_precision) << 3) ^
                (std::hash<bool>()(k.low_precision_outputs) << 4) ^
                (std::hash<bool>()(k.causal) << 5) ^
-               (std::hash<bool>()(k.has_mask) << 6);
+               (std::hash<bool>()(k.has_mask) << 6) ^
+               (std::hash<bool>()(k.use_bf16) << 7);
     }
 };
 
 static std::unordered_map<KernelCacheKey, void*, KernelCacheKeyHash> g_kernel_cache;
 
-static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask) {
-    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask};
+static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal, bool has_mask, bool use_bf16 = false) {
+    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal, has_mask, use_bf16};
 
     auto it = g_kernel_cache.find(key);
     if (it != g_kernel_cache.end()) {
         return it->second;
     }
 
-    void* kernel = g_mfa_create_kernel(
-        static_cast<int32_t>(seq_q),
-        static_cast<int32_t>(seq_kv),
-        static_cast<int32_t>(head_dim),
-        low_prec,
-        low_prec_outputs,
-        causal,
-        has_mask
-    );
+    void* kernel = nullptr;
+    if (use_bf16 && g_mfa_create_kernel_v2) {
+        // Use v2 API with BF16 support
+        kernel = g_mfa_create_kernel_v2(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask,
+            use_bf16
+        );
+    } else {
+        // Legacy API
+        kernel = g_mfa_create_kernel(
+            static_cast<int32_t>(seq_q),
+            static_cast<int32_t>(seq_kv),
+            static_cast<int32_t>(head_dim),
+            low_prec,
+            low_prec_outputs,
+            causal,
+            has_mask
+        );
+    }
 
     if (!kernel) {
         throw std::runtime_error("Failed to create MFA kernel");
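A Python rendering of the caching logic above, to make the reason for the extra key field explicit (names are illustrative and _create_kernel is a hypothetical stand-in for the v2/legacy C calls):

_kernel_cache = {}

def get_or_create_kernel(seq_q, seq_kv, head_dim, low_prec, low_prec_outputs,
                         causal, has_mask, use_bf16=False):
    # use_bf16 must be part of the cache key: without it, a BF16 request for a
    # shape already compiled as FP16/FP32 would silently reuse the wrong kernel.
    key = (seq_q, seq_kv, head_dim, low_prec, low_prec_outputs,
           causal, has_mask, use_bf16)
    kernel = _kernel_cache.get(key)
    if kernel is None:
        kernel = _create_kernel(*key)
        _kernel_cache[key] = kernel
    return kernel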
@@ -261,18 +283,27 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
         v_expanded = value;
     }
 
-    // Determine precision
-    bool low_precision = (query.scalar_type() == at::kHalf ||
-                          query.scalar_type() == at::kBFloat16);
+    // Determine precision - MFA kernel supports FP16, BF16, and FP32
+    bool is_bfloat16 = (query.scalar_type() == at::kBFloat16);
+    bool is_fp16 = (query.scalar_type() == at::kHalf);
 
-    // For fp16 inputs, we can now output directly to fp16 (no extra conversion needed!)
-    bool low_precision_outputs = low_precision;
+    // Use native BF16 kernel if available, otherwise fall back to FP32
+    bool use_bf16_kernel = is_bfloat16 && g_mfa_create_kernel_v2;
+    bool low_precision = is_fp16;  // FP16 path
+    bool low_precision_outputs = is_fp16 || use_bf16_kernel;
 
     // Make inputs contiguous
     auto q = query.contiguous();
     auto k = k_expanded.contiguous();
     auto v = v_expanded.contiguous();
 
+    // For BF16 without native kernel support, convert to FP32 (avoids FP16 overflow)
+    if (is_bfloat16 && !use_bf16_kernel) {
+        q = q.to(at::kFloat);
+        k = k.to(at::kFloat);
+        v = v.to(at::kFloat);
+    }
+
     // Handle attention mask
     bool has_mask = attn_mask.has_value();
     at::Tensor mask;
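A minimal Python sketch of the dtype-to-flag mapping this hunk introduces (illustrative; the authoritative logic is the C++ above):

import torch

def choose_precision_flags(dtype, bf16_kernel_available):
    is_bf16 = dtype == torch.bfloat16
    is_fp16 = dtype == torch.float16
    use_bf16_kernel = is_bf16 and bf16_kernel_available
    return {
        "use_bf16_kernel": use_bf16_kernel,
        "low_precision": is_fp16,
        "low_precision_outputs": is_fp16 or use_bf16_kernel,
        "convert_inputs_to_fp32": is_bf16 and not use_bf16_kernel,
    }

assert choose_precision_flags(torch.bfloat16, False)["convert_inputs_to_fp32"]
assert choose_precision_flags(torch.bfloat16, True)["low_precision_outputs"]
assert choose_precision_flags(torch.float16, True)["low_precision"]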
@@ -287,20 +318,28 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
         }
         TORCH_CHECK(mask.scalar_type() == at::kByte,
                     "Attention mask must be bool or uint8");
-        // Expand mask heads if needed (B, 1, N_q, N_kv) -> (B, H, N_q, N_kv)
-        if (mask.size(1) == 1 && num_heads > 1) {
+        // Expand mask dimensions if needed -> (B, H, N_q, N_kv)
+        // Handle (B, 1, N_q, N_kv) -> expand heads
+        // Handle (B, H, 1, N_kv) -> expand query dim (1D key mask)
+        // Handle (B, 1, 1, N_kv) -> expand both
+        if (mask.size(1) == 1 || mask.size(2) == 1) {
             mask = mask.expand({batch_size, num_heads, seq_len_q, seq_len_kv});
         }
         mask = mask.contiguous();
     }
 
     // Allocate output in the appropriate precision
-    // With lowPrecisionOutputs=true, MFA writes FP16 directly
     at::Tensor output;
-    if (low_precision_outputs) {
+    if (use_bf16_kernel) {
+        // Native BF16 kernel outputs BF16
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kBFloat16));
+    } else if (low_precision_outputs) {
+        // FP16 kernel outputs FP16
         output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
                            query.options().dtype(at::kHalf));
     } else {
+        // FP32 kernel outputs FP32
         output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
                            query.options().dtype(at::kFloat));
     }
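The broadened expansion accepts any mask with a singleton head or query dimension; the broadcast it performs matches torch.Tensor.expand, e.g. (shapes illustrative):

import torch

B, H, N_q, N_kv = 2, 8, 1024, 1024
for shape in [(B, 1, N_q, N_kv),   # per-batch mask, shared across heads
              (B, H, 1, N_kv),     # key-padding mask, shared across queries
              (B, 1, 1, N_kv)]:    # key-padding mask, shared across heads and queries
    mask = torch.zeros(shape, dtype=torch.bool)
    assert mask.expand(B, H, N_q, N_kv).shape == (B, H, N_q, N_kv)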
@@ -309,8 +348,8 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
     auto logsumexp = at::empty({batch_size, num_heads, seq_len_q},
                                query.options().dtype(at::kFloat));
 
-    // Get or create kernel with matching output precision and causal mode
-    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal, has_mask);
+    // Get or create kernel with matching precision and causal mode
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal, has_mask, use_bf16_kernel);
 
     // Get Metal buffers with byte offsets
     auto q_info = getBufferInfo(q);
@@ -361,7 +400,12 @@ std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
         // The encoder stays open for coalescing more kernels
     }
 
-    // Output is already in the correct dtype (fp16 or fp32)
+    // Convert output back to BF16 if input was BF16 and we used FP32 fallback
+    // (native BF16 kernel already outputs BF16, no conversion needed)
+    if (is_bfloat16 && !use_bf16_kernel) {
+        output = output.to(at::kBFloat16);
+    }
+
     // Return both output and logsumexp (needed for backward pass)
     return std::make_tuple(output, logsumexp);
 }
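Taken together with the allocation change above, BF16 inputs should now come back as BF16 whether the native BF16 kernel or the FP32 fallback ran; a quick check (illustrative, assuming an MPS device is available):

import torch
from mps_flash_attn import flash_attention

q = torch.randn(1, 8, 2048, 128, device="mps", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

out = flash_attention(q, k, v, is_causal=True)
# Native BF16 kernel writes BF16 directly; the FP32 fallback converts back.
assert out.dtype == torch.bfloat16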
--- mps_flash_attn-0.1.8/mps_flash_attn.egg-info/PKG-INFO
+++ mps_flash_attn-0.1.14/mps_flash_attn.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mps-flash-attn
-Version: 0.1.8
+Version: 0.1.14
 Summary: Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)
 Author: imperatormk
 License-Expression: MIT
--- mps_flash_attn-0.1.8/pyproject.toml
+++ mps_flash_attn-0.1.14/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mps-flash-attn"
-version = "0.1.8"
+version = "0.1.14"
 description = "Flash Attention for PyTorch on Apple Silicon (M1/M2/M3/M4)"
 readme = "README.md"
 license = "MIT"
--- mps_flash_attn-0.1.8/setup.py
+++ mps_flash_attn-0.1.14/setup.py
@@ -4,6 +4,7 @@ Setup script for MPS Flash Attention
 
 import os
 import sys
+import shutil
 from setuptools import setup, find_packages, Extension
 from setuptools.command.build_ext import build_ext
 
@@ -36,6 +37,26 @@ class ObjCppBuildExt(build_ext):
 
         super().build_extensions()
 
+        # Copy libMFABridge.dylib to lib/ after building
+        self._copy_swift_bridge()
+
+    def _copy_swift_bridge(self):
+        """Copy Swift bridge dylib to package lib/ directory."""
+        src_path = os.path.join(
+            os.path.dirname(__file__),
+            "swift-bridge", ".build", "release", "libMFABridge.dylib"
+        )
+        dst_dir = os.path.join(os.path.dirname(__file__), "mps_flash_attn", "lib")
+        dst_path = os.path.join(dst_dir, "libMFABridge.dylib")
+
+        if os.path.exists(src_path):
+            os.makedirs(dst_dir, exist_ok=True)
+            shutil.copy2(src_path, dst_path)
+            print(f"Copied libMFABridge.dylib to {dst_path}")
+        else:
+            print(f"Warning: {src_path} not found. Build swift-bridge first with:")
+            print(" cd swift-bridge && swift build -c release")
+
 
 def get_extensions():
     if sys.platform != "darwin":
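The build step now bundles the Swift bridge under mps_flash_attn/lib/ (the libMFABridge.dylib entries in the file list above); a quick post-install sanity check (illustrative):

import os
import mps_flash_attn

dylib = os.path.join(os.path.dirname(mps_flash_attn.__file__),
                     "lib", "libMFABridge.dylib")
print(dylib, "exists:", os.path.exists(dylib))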