mps-flash-attn 0.1.0 (cp314-cp314-macosx_15_0_arm64.whl)

This diff shows the content of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.

Potentially problematic release: this version of mps-flash-attn might be problematic.

Files changed (33)
  1. mps_flash_attn/_C.cpython-314-darwin.so +0 -0
  2. mps_flash_attn/__init__.py +246 -0
  3. mps_flash_attn/csrc/mps_flash_attn.cpp +441 -0
  4. mps_flash_attn/csrc/mps_flash_attn.mm +441 -0
  5. mps_flash_attn/kernels/06c421e7a01418cf64aafa07f6b1df0558148583959c596d9a7ce260987f89f0.metallib +0 -0
  6. mps_flash_attn/kernels/09b9615289be632fdf05444004a0b3b67fb1b70b05a7e0fce8e0ba3a95e3921c.metallib +0 -0
  7. mps_flash_attn/kernels/0c36461301fb52cbad786d0642b020ad2bfc7229b487ccb5dff44d198423b347.metallib +0 -0
  8. mps_flash_attn/kernels/2ca9312d1151f792e1a95617db9186928300e3d0ffbe016f0ad53b62ab840bac.metallib +0 -0
  9. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2.metallib +0 -0
  10. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_1024_1024.bin +0 -0
  11. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_2048_2048.bin +0 -0
  12. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_4096_4096.bin +0 -0
  13. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_512_512.bin +0 -0
  14. mps_flash_attn/kernels/73254c55475c6b7f7b009f095398994b1f9ae8215beafcf810f100357ccc99b2_8192_8192.bin +0 -0
  15. mps_flash_attn/kernels/771935bf47d248650e287da82bc82e04bff7c4c52964823e7a12462ccd23408e.metallib +0 -0
  16. mps_flash_attn/kernels/975aece2b4d3d78035be08a0735a7deacf2e544adee5af81c9c0a3a42a926129.metallib +0 -0
  17. mps_flash_attn/kernels/a5e2c5c401e3872af0899c1fb3e30b5f52a6070fc49c9dac02982cc1c2f25849.metallib +0 -0
  18. mps_flash_attn/kernels/ac4573fb201e92310867c59bd569a8ae68f859d60a9352d9d4d5d41c1547c83c.metallib +0 -0
  19. mps_flash_attn/kernels/adc0f77ff05156bbda8fe78afd9ba8a8d3c890ba8fea0902ae79a6ae8c4f04c3.metallib +0 -0
  20. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f.metallib +0 -0
  21. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_1024_1024.bin +0 -0
  22. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_2048_2048.bin +0 -0
  23. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_4096_4096.bin +0 -0
  24. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_512_512.bin +0 -0
  25. mps_flash_attn/kernels/eab4f40de4b0ebd2765b41c25dba7ccab5db4abf6a6eb87d76fff7b5e0829b2f_8192_8192.bin +0 -0
  26. mps_flash_attn/kernels/f08fe0efd72e055177e068154dae01e08c4d52d3cb883330a04f1431d274aece.metallib +0 -0
  27. mps_flash_attn/kernels/manifest.json +27 -0
  28. mps_flash_attn/lib/libMFABridge.dylib +0 -0
  29. mps_flash_attn-0.1.0.dist-info/METADATA +264 -0
  30. mps_flash_attn-0.1.0.dist-info/RECORD +33 -0
  31. mps_flash_attn-0.1.0.dist-info/WHEEL +5 -0
  32. mps_flash_attn-0.1.0.dist-info/licenses/LICENSE +27 -0
  33. mps_flash_attn-0.1.0.dist-info/top_level.txt +1 -0
Binary files (the compiled extension, the .metallib kernel libraries, the .bin pipeline binaries, and libMFABridge.dylib) have no textual diff and are not shown.
mps_flash_attn/__init__.py
@@ -0,0 +1,246 @@
+"""
+MPS Flash Attention - Flash Attention for PyTorch on Apple Silicon
+
+This package provides memory-efficient attention using Metal Flash Attention kernels.
+"""
+
+import torch
+from typing import Optional
+import math
+import threading
+import os
+
+# Try to import the C++ extension
+try:
+    from . import _C
+    _HAS_MFA = True
+except ImportError as e:
+    _HAS_MFA = False
+    _IMPORT_ERROR = str(e)
+
+# Set up shipped kernels directory for zero-compilation loading
+def _init_shipped_kernels():
+    """Point the Swift bridge to pre-shipped kernel binaries."""
+    try:
+        import ctypes
+        bridge_path = os.environ.get("MFA_BRIDGE_PATH")
+        if not bridge_path:
+            module_dir = os.path.dirname(__file__)
+            candidates = [
+                os.path.join(module_dir, "lib", "libMFABridge.dylib"),  # Bundled in wheel
+                os.path.join(module_dir, "..", "swift-bridge", ".build", "release", "libMFABridge.dylib"),
+                os.path.join(module_dir, "libMFABridge.dylib"),
+            ]
+            for path in candidates:
+                if os.path.exists(path):
+                    bridge_path = path
+                    break
+
+        if bridge_path and os.path.exists(bridge_path):
+            lib = ctypes.CDLL(bridge_path)
+
+            # Set shipped kernels directory (pre-compiled metallibs + pipeline binaries)
+            kernels_dir = os.path.join(os.path.dirname(__file__), "kernels")
+            if os.path.exists(kernels_dir):
+                lib.mfa_set_kernels_dir(kernels_dir.encode('utf-8'))
+
+            lib.mfa_init()
+    except Exception:
+        pass  # Init is optional, will fall back to runtime compilation
+
+# Initialize shipped kernels on import
+if _HAS_MFA:
+    _init_shipped_kernels()
+
+
+def is_available() -> bool:
+    """Check if MPS Flash Attention is available."""
+    return _HAS_MFA and torch.backends.mps.is_available()
+
+
+class FlashAttentionFunction(torch.autograd.Function):
+    """Autograd function for Flash Attention with backward pass support."""
+
+    @staticmethod
+    def forward(ctx, query, key, value, is_causal, scale):
+        # Apply scale if provided (MFA uses 1/sqrt(D) internally)
+        scale_factor = 1.0
+        if scale is not None:
+            default_scale = 1.0 / math.sqrt(query.shape[-1])
+            if abs(scale - default_scale) > 1e-6:
+                scale_factor = scale / default_scale
+                query = query * scale_factor
+
+        # Forward with logsumexp for backward
+        output, logsumexp = _C.forward_with_lse(query, key, value, is_causal)
+
+        # Save for backward
+        ctx.save_for_backward(query, key, value, output, logsumexp)
+        ctx.is_causal = is_causal
+        ctx.scale_factor = scale_factor
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        query, key, value, output, logsumexp = ctx.saved_tensors
+
+        # Compute gradients
+        dQ, dK, dV = _C.backward(
+            grad_output, query, key, value, output, logsumexp, ctx.is_causal
+        )
+
+        # If we scaled the query in forward, scale the gradient back
+        if ctx.scale_factor != 1.0:
+            dQ = dQ * ctx.scale_factor
+
+        # Return gradients (None for is_causal and scale since they're not tensors)
+        return dQ, dK, dV, None, None
+
+
+def flash_attention(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    is_causal: bool = False,
+    scale: Optional[float] = None,
+) -> torch.Tensor:
+    """
+    Compute scaled dot-product attention using Flash Attention on MPS.
+
+    This function provides O(N) memory complexity instead of O(N²) by using
+    tiled computation, allowing much longer sequences on limited GPU memory.
+
+    Supports both forward and backward passes for training.
+
+    Args:
+        query: Query tensor of shape (B, num_heads, seq_len, head_dim)
+        key: Key tensor of shape (B, num_heads, seq_len, head_dim)
+        value: Value tensor of shape (B, num_heads, seq_len, head_dim)
+        is_causal: If True, applies causal masking (for autoregressive models)
+        scale: Scaling factor for attention scores. Default: 1/sqrt(head_dim)
+
+    Returns:
+        Output tensor of shape (B, num_heads, seq_len, head_dim)
+
+    Example:
+        >>> import torch
+        >>> from mps_flash_attn import flash_attention
+        >>> q = torch.randn(2, 8, 4096, 64, device='mps', dtype=torch.float16)
+        >>> k = torch.randn(2, 8, 4096, 64, device='mps', dtype=torch.float16)
+        >>> v = torch.randn(2, 8, 4096, 64, device='mps', dtype=torch.float16)
+        >>> out = flash_attention(q, k, v)
+
+        # With gradients:
+        >>> q.requires_grad = True
+        >>> out = flash_attention(q, k, v)
+        >>> out.sum().backward()  # Computes dQ
+    """
+    if not _HAS_MFA:
+        raise RuntimeError(
+            f"MPS Flash Attention C++ extension not available: {_IMPORT_ERROR}\n"
+            "Please rebuild with: pip install -e ."
+        )
+
+    if not torch.backends.mps.is_available():
+        raise RuntimeError("MPS not available")
+
+    # Validate device
+    if query.device.type != 'mps':
+        raise ValueError("query must be on MPS device")
+    if key.device.type != 'mps':
+        raise ValueError("key must be on MPS device")
+    if value.device.type != 'mps':
+        raise ValueError("value must be on MPS device")
+
+    # Use autograd function for gradient support
+    return FlashAttentionFunction.apply(query, key, value, is_causal, scale)
+
+
+def replace_sdpa():
+    """
+    Monkey-patch torch.nn.functional.scaled_dot_product_attention to use
+    Flash Attention on MPS devices.
+
+    Call this at the start of your script to automatically use Flash Attention
+    for all attention operations.
+    """
+    import torch.nn.functional as F
+
+    original_sdpa = F.scaled_dot_product_attention
+
+    def patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0,
+                     is_causal=False, scale=None):
+        # Use MFA for MPS tensors without mask/dropout
+        if (query.device.type == 'mps' and
+                attn_mask is None and
+                dropout_p == 0.0 and
+                _HAS_MFA):
+            try:
+                return flash_attention(query, key, value, is_causal=is_causal, scale=scale)
+            except Exception:
+                # Fall back to original on any error
+                pass
+
+        return original_sdpa(query, key, value, attn_mask, dropout_p, is_causal, scale)
+
+    F.scaled_dot_product_attention = patched_sdpa
+    print("MPS Flash Attention: Patched F.scaled_dot_product_attention")
+
+
+def precompile():
+    """
+    Pre-compile Metal kernels for common configurations.
+
+    Call this once after installation to eliminate runtime compilation overhead.
+    Pre-compiled kernels are cached to disk and loaded instantly on subsequent runs.
+
+    This compiles kernels for:
+    - Sequence lengths: 64, 128, 256, 512, 1024, 2048, 4096, 8192
+    - Head dimensions: 32, 48, 64, 80, 96, 128
+    - Both fp32 and fp16 precision
+
+    Total: 96 kernel configurations
+    """
+    if not _HAS_MFA:
+        raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
+
+    import ctypes
+    import os
+
+    # Load the Swift bridge directly
+    bridge_path = os.environ.get("MFA_BRIDGE_PATH")
+    if not bridge_path:
+        # Try common locations
+        module_dir = os.path.dirname(__file__)
+        candidates = [
+            os.path.join(module_dir, "lib", "libMFABridge.dylib"),  # Bundled in wheel
+            os.path.join(module_dir, "..", "swift-bridge", ".build", "release", "libMFABridge.dylib"),
+            os.path.join(module_dir, "libMFABridge.dylib"),
+        ]
+        for path in candidates:
+            if os.path.exists(path):
+                bridge_path = path
+                break
+
+    if not bridge_path or not os.path.exists(bridge_path):
+        raise RuntimeError("Cannot find libMFABridge.dylib. Set MFA_BRIDGE_PATH environment variable.")
+
+    lib = ctypes.CDLL(bridge_path)
+    lib.mfa_precompile()
+    print("\nPre-compilation complete! Kernels cached to disk.")
+
+
+def clear_cache():
+    """Clear the pre-compiled kernel cache."""
+    if not _HAS_MFA:
+        raise RuntimeError(f"MPS Flash Attention not available: {_IMPORT_ERROR}")
+
+    import ctypes
+    import os
+
+    bridge_path = os.environ.get("MFA_BRIDGE_PATH")
+    if bridge_path and os.path.exists(bridge_path):
+        lib = ctypes.CDLL(bridge_path)
+        lib.mfa_clear_cache()
+        print("Cache cleared.")
mps_flash_attn/csrc/mps_flash_attn.cpp (a .mm copy with the same +441 line count is also shipped)
@@ -0,0 +1,441 @@
+/**
+ * MPS Flash Attention - PyTorch C++ Extension
+ *
+ * Bridges PyTorch MPS tensors to the MFA Swift library.
+ */
+
+#include <torch/extension.h>
+#include <ATen/mps/MPSStream.h>
+#include <ATen/native/mps/OperationUtils.h>
+
+#import <Metal/Metal.h>
+#import <Foundation/Foundation.h>
+
+#include <dlfcn.h>
+
+// ============================================================================
+// MFA Bridge Function Types
+// ============================================================================
+
+typedef bool (*mfa_init_fn)();
+typedef void* (*mfa_create_kernel_fn)(int32_t, int32_t, int32_t, bool, bool, bool);  // Added causal param
+typedef bool (*mfa_forward_fn)(void*, void*, void*, void*, void*, void*, int64_t, int64_t, int64_t, int64_t, int64_t, int32_t, int32_t);
+typedef bool (*mfa_backward_fn)(void*, void*, void*, void*, void*, void*, void*, void*, void*, void*, void*,
+                                int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t, int64_t,
+                                int32_t, int32_t);
+typedef void (*mfa_release_kernel_fn)(void*);
+
+// Global function pointers
+static mfa_init_fn g_mfa_init = nullptr;
+static mfa_create_kernel_fn g_mfa_create_kernel = nullptr;
+static mfa_forward_fn g_mfa_forward = nullptr;
+static mfa_backward_fn g_mfa_backward = nullptr;
+static mfa_release_kernel_fn g_mfa_release_kernel = nullptr;
+static void* g_dylib_handle = nullptr;
+static bool g_initialized = false;
+
+// ============================================================================
+// Load MFA Bridge Library
+// ============================================================================
+
+static bool load_mfa_bridge() {
+    if (g_dylib_handle) return true;
+
+    // Try to find the dylib relative to this extension
+    // First try the standard location
+    const char* paths[] = {
+        "libMFABridge.dylib",
+        "./libMFABridge.dylib",
+        "../swift-bridge/.build/release/libMFABridge.dylib",
+        nullptr
+    };
+
+    for (int i = 0; paths[i] != nullptr; i++) {
+        g_dylib_handle = dlopen(paths[i], RTLD_NOW);
+        if (g_dylib_handle) break;
+    }
+
+    if (!g_dylib_handle) {
+        // Try with absolute path from environment
+        const char* mfa_path = getenv("MFA_BRIDGE_PATH");
+        if (mfa_path) {
+            g_dylib_handle = dlopen(mfa_path, RTLD_NOW);
+        }
+    }
+
+    if (!g_dylib_handle) {
+        throw std::runtime_error(
+            "Failed to load libMFABridge.dylib. Set MFA_BRIDGE_PATH environment variable.");
+    }
+
+    // Load function pointers
+    g_mfa_init = (mfa_init_fn)dlsym(g_dylib_handle, "mfa_init");
+    g_mfa_create_kernel = (mfa_create_kernel_fn)dlsym(g_dylib_handle, "mfa_create_kernel");
+    g_mfa_forward = (mfa_forward_fn)dlsym(g_dylib_handle, "mfa_forward");
+    g_mfa_backward = (mfa_backward_fn)dlsym(g_dylib_handle, "mfa_backward");
+    g_mfa_release_kernel = (mfa_release_kernel_fn)dlsym(g_dylib_handle, "mfa_release_kernel");
+
+    if (!g_mfa_init || !g_mfa_create_kernel || !g_mfa_forward || !g_mfa_backward || !g_mfa_release_kernel) {
+        throw std::runtime_error("Failed to load MFA bridge functions");
+    }
+
+    return true;
+}
+
+// ============================================================================
+// Get MTLBuffer from PyTorch MPS Tensor
+// ============================================================================
+
+struct BufferInfo {
+    id<MTLBuffer> buffer;
+    int64_t byte_offset;
+};
+
+static BufferInfo getBufferInfo(const at::Tensor& tensor) {
+    TORCH_CHECK(tensor.device().is_mps(), "Tensor must be on MPS device");
+    TORCH_CHECK(tensor.is_contiguous(), "Tensor must be contiguous");
+
+    // Get the underlying Metal buffer (covers entire storage)
+    id<MTLBuffer> buffer = at::native::mps::getMTLBufferStorage(tensor);
+
+    // Calculate byte offset: storage_offset() is in elements, multiply by element size
+    int64_t element_size = tensor.element_size();
+    int64_t byte_offset = tensor.storage_offset() * element_size;
+
+    return {buffer, byte_offset};
+}
+
+// ============================================================================
+// Kernel Cache
+// ============================================================================
+
+struct KernelCacheKey {
+    int64_t seq_len_q;
+    int64_t seq_len_kv;
+    int64_t head_dim;
+    bool low_precision;
+    bool low_precision_outputs;
+    bool causal;
+
+    bool operator==(const KernelCacheKey& other) const {
+        return seq_len_q == other.seq_len_q &&
+               seq_len_kv == other.seq_len_kv &&
+               head_dim == other.head_dim &&
+               low_precision == other.low_precision &&
+               low_precision_outputs == other.low_precision_outputs &&
+               causal == other.causal;
+    }
+};
+
+struct KernelCacheKeyHash {
+    size_t operator()(const KernelCacheKey& k) const {
+        return std::hash<int64_t>()(k.seq_len_q) ^
+               (std::hash<int64_t>()(k.seq_len_kv) << 1) ^
+               (std::hash<int64_t>()(k.head_dim) << 2) ^
+               (std::hash<bool>()(k.low_precision) << 3) ^
+               (std::hash<bool>()(k.low_precision_outputs) << 4) ^
+               (std::hash<bool>()(k.causal) << 5);
+    }
+};
+
+static std::unordered_map<KernelCacheKey, void*, KernelCacheKeyHash> g_kernel_cache;
+
+static void* get_or_create_kernel(int64_t seq_q, int64_t seq_kv, int64_t head_dim, bool low_prec, bool low_prec_outputs, bool causal) {
+    KernelCacheKey key{seq_q, seq_kv, head_dim, low_prec, low_prec_outputs, causal};
+
+    auto it = g_kernel_cache.find(key);
+    if (it != g_kernel_cache.end()) {
+        return it->second;
+    }
+
+    void* kernel = g_mfa_create_kernel(
+        static_cast<int32_t>(seq_q),
+        static_cast<int32_t>(seq_kv),
+        static_cast<int32_t>(head_dim),
+        low_prec,
+        low_prec_outputs,
+        causal
+    );
+
+    if (!kernel) {
+        throw std::runtime_error("Failed to create MFA kernel");
+    }
+
+    g_kernel_cache[key] = kernel;
+    return kernel;
+}
+
+// ============================================================================
+// Flash Attention Forward
+// ============================================================================
+
+std::tuple<at::Tensor, at::Tensor> mps_flash_attention_forward_with_lse(
+    const at::Tensor& query,   // (B, H, N, D)
+    const at::Tensor& key,     // (B, H, N, D)
+    const at::Tensor& value,   // (B, H, N, D)
+    bool is_causal
+) {
+    // Initialize MFA on first call
+    if (!g_initialized) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized = true;
+    }
+
+    // Validate inputs
+    TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
+    TORCH_CHECK(key.dim() == 4, "Key must be 4D (B, H, N, D)");
+    TORCH_CHECK(value.dim() == 4, "Value must be 4D (B, H, N, D)");
+    TORCH_CHECK(query.device().is_mps(), "Query must be on MPS device");
+    TORCH_CHECK(key.device().is_mps(), "Key must be on MPS device");
+    TORCH_CHECK(value.device().is_mps(), "Value must be on MPS device");
+
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t seq_len_q = query.size(2);
+    const int64_t head_dim = query.size(3);
+    const int64_t seq_len_kv = key.size(2);
+
+    TORCH_CHECK(key.size(0) == batch_size && value.size(0) == batch_size,
+                "Batch size mismatch");
+    TORCH_CHECK(key.size(1) == num_heads && value.size(1) == num_heads,
+                "Number of heads mismatch");
+    TORCH_CHECK(key.size(3) == head_dim && value.size(3) == head_dim,
+                "Head dimension mismatch");
+
+    // Determine precision
+    bool low_precision = (query.scalar_type() == at::kHalf ||
+                          query.scalar_type() == at::kBFloat16);
+
+    // For fp16 inputs, we can now output directly to fp16 (no extra conversion needed!)
+    bool low_precision_outputs = low_precision;
+
+    // Make inputs contiguous
+    auto q = query.contiguous();
+    auto k = key.contiguous();
+    auto v = value.contiguous();
+
+    // Allocate output in the appropriate precision
+    // With lowPrecisionOutputs=true, MFA writes FP16 directly
+    at::Tensor output;
+    if (low_precision_outputs) {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kHalf));
+    } else {
+        output = at::empty({batch_size, num_heads, seq_len_q, head_dim},
+                           query.options().dtype(at::kFloat));
+    }
+
+    // Allocate logsumexp (for backward pass, always fp32)
+    auto logsumexp = at::empty({batch_size, num_heads, seq_len_q},
+                               query.options().dtype(at::kFloat));
+
+    // Get or create kernel with matching output precision and causal mode
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal);
+
+    // Get Metal buffers with byte offsets
+    auto q_info = getBufferInfo(q);
+    auto k_info = getBufferInfo(k);
+    auto v_info = getBufferInfo(v);
+    auto o_info = getBufferInfo(output);
+    auto l_info = getBufferInfo(logsumexp);
+
+    // Synchronize with PyTorch's MPS stream
+    @autoreleasepool {
+        // Wait for PyTorch operations to complete
+        auto stream = at::mps::getCurrentMPSStream();
+        stream->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);
+
+        // Execute MFA with storage byte offsets
+        bool success = g_mfa_forward(
+            kernel,
+            (__bridge void*)q_info.buffer,
+            (__bridge void*)k_info.buffer,
+            (__bridge void*)v_info.buffer,
+            (__bridge void*)o_info.buffer,
+            (__bridge void*)l_info.buffer,
+            q_info.byte_offset,
+            k_info.byte_offset,
+            v_info.byte_offset,
+            o_info.byte_offset,
+            l_info.byte_offset,
+            static_cast<int32_t>(batch_size),
+            static_cast<int32_t>(num_heads)
+        );
+
+        if (!success) {
+            throw std::runtime_error("MFA forward pass failed");
+        }
+    }
+
+    // Output is already in the correct dtype (fp16 or fp32)
+    // Return both output and logsumexp (needed for backward pass)
+    return std::make_tuple(output, logsumexp);
+}
+
+// Simple forward that only returns output (for inference)
+at::Tensor mps_flash_attention_forward(
+    const at::Tensor& query,
+    const at::Tensor& key,
+    const at::Tensor& value,
+    bool is_causal
+) {
+    auto [output, logsumexp] = mps_flash_attention_forward_with_lse(query, key, value, is_causal);
+    return output;
+}
+
+// ============================================================================
+// Flash Attention Backward
+// ============================================================================
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> mps_flash_attention_backward(
+    const at::Tensor& grad_output,  // (B, H, N, D)
+    const at::Tensor& query,        // (B, H, N, D)
+    const at::Tensor& key,          // (B, H, N, D)
+    const at::Tensor& value,        // (B, H, N, D)
+    const at::Tensor& output,       // (B, H, N, D)
+    const at::Tensor& logsumexp,    // (B, H, N)
+    bool is_causal
+) {
+    // Initialize MFA on first call
+    if (!g_initialized) {
+        load_mfa_bridge();
+        if (!g_mfa_init()) {
+            throw std::runtime_error("Failed to initialize MFA");
+        }
+        g_initialized = true;
+    }
+
+    // Validate inputs
+    TORCH_CHECK(grad_output.dim() == 4, "grad_output must be 4D (B, H, N, D)");
+    TORCH_CHECK(query.dim() == 4, "Query must be 4D (B, H, N, D)");
+    TORCH_CHECK(key.dim() == 4, "Key must be 4D (B, H, N, D)");
+    TORCH_CHECK(value.dim() == 4, "Value must be 4D (B, H, N, D)");
+    TORCH_CHECK(output.dim() == 4, "Output must be 4D (B, H, N, D)");
+    TORCH_CHECK(logsumexp.dim() == 3, "Logsumexp must be 3D (B, H, N)");
+
+    const int64_t batch_size = query.size(0);
+    const int64_t num_heads = query.size(1);
+    const int64_t seq_len_q = query.size(2);
+    const int64_t head_dim = query.size(3);
+    const int64_t seq_len_kv = key.size(2);
+
+    // Determine precision
+    bool low_precision = (query.scalar_type() == at::kHalf ||
+                          query.scalar_type() == at::kBFloat16);
+    bool low_precision_outputs = low_precision;
+
+    // Make inputs contiguous
+    auto q = query.contiguous();
+    auto k = key.contiguous();
+    auto v = value.contiguous();
+    auto o = output.contiguous();
+    auto dO = grad_output.contiguous();
+    auto lse = logsumexp.contiguous();
+
+    // Get or create kernel (with causal mode)
+    void* kernel = get_or_create_kernel(seq_len_q, seq_len_kv, head_dim, low_precision, low_precision_outputs, is_causal);
+
+    // Allocate D buffer (dO * O reduction, always fp32)
+    auto D = at::empty({batch_size, num_heads, seq_len_q},
+                       query.options().dtype(at::kFloat));
+
+    // Allocate gradients (always fp32 for numerical stability)
+    auto dQ = at::zeros({batch_size, num_heads, seq_len_q, head_dim},
+                        query.options().dtype(at::kFloat));
+    auto dK = at::zeros({batch_size, num_heads, seq_len_kv, head_dim},
+                        query.options().dtype(at::kFloat));
+    auto dV = at::zeros({batch_size, num_heads, seq_len_kv, head_dim},
+                        query.options().dtype(at::kFloat));
+
+    // Get Metal buffers with byte offsets
+    auto q_info = getBufferInfo(q);
+    auto k_info = getBufferInfo(k);
+    auto v_info = getBufferInfo(v);
+    auto o_info = getBufferInfo(o);
+    auto do_info = getBufferInfo(dO);
+    auto l_info = getBufferInfo(lse);
+    auto d_info = getBufferInfo(D);
+    auto dq_info = getBufferInfo(dQ);
+    auto dk_info = getBufferInfo(dK);
+    auto dv_info = getBufferInfo(dV);
+
+    // Execute backward pass
+    @autoreleasepool {
+        auto stream = at::mps::getCurrentMPSStream();
+        stream->synchronize(at::mps::SyncType::COMMIT_AND_WAIT);
+
+        bool success = g_mfa_backward(
+            kernel,
+            (__bridge void*)q_info.buffer,
+            (__bridge void*)k_info.buffer,
+            (__bridge void*)v_info.buffer,
+            (__bridge void*)o_info.buffer,
+            (__bridge void*)do_info.buffer,
+            (__bridge void*)l_info.buffer,
+            (__bridge void*)d_info.buffer,
+            (__bridge void*)dq_info.buffer,
+            (__bridge void*)dk_info.buffer,
+            (__bridge void*)dv_info.buffer,
+            q_info.byte_offset,
+            k_info.byte_offset,
+            v_info.byte_offset,
+            o_info.byte_offset,
+            do_info.byte_offset,
+            l_info.byte_offset,
+            d_info.byte_offset,
+            dq_info.byte_offset,
+            dk_info.byte_offset,
+            dv_info.byte_offset,
+            static_cast<int32_t>(batch_size),
+            static_cast<int32_t>(num_heads)
+        );
+
+        if (!success) {
+            throw std::runtime_error("MFA backward pass failed");
+        }
+    }
+
+    // Convert gradients back to input dtype if needed
+    if (low_precision) {
+        dQ = dQ.to(query.scalar_type());
+        dK = dK.to(query.scalar_type());
+        dV = dV.to(query.scalar_type());
+    }
+
+    return std::make_tuple(dQ, dK, dV);
+}
+
+// ============================================================================
+// Python Bindings
+// ============================================================================
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.doc() = "MPS Flash Attention - Metal accelerated attention for Apple Silicon";
+
+    m.def("forward", &mps_flash_attention_forward,
+          "Flash Attention forward pass (returns output only)",
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("is_causal") = false);
+
+    m.def("forward_with_lse", &mps_flash_attention_forward_with_lse,
+          "Flash Attention forward pass (returns output and logsumexp for backward)",
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("is_causal") = false);
+
+    m.def("backward", &mps_flash_attention_backward,
+          "Flash Attention backward pass",
+          py::arg("grad_output"),
+          py::arg("query"),
+          py::arg("key"),
+          py::arg("value"),
+          py::arg("output"),
+          py::arg("logsumexp"),
+          py::arg("is_causal") = false);
+}
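
The bindings above expose forward, forward_with_lse, and backward to Python. A short sanity-check sketch, not shipped with the package, that compares the extension's forward path against PyTorch's built-in scaled_dot_product_attention on a small fp32 problem; the printed difference is only illustrative, and actual numerical agreement depends on the Metal kernels.

import torch
import torch.nn.functional as F

from mps_flash_attn import flash_attention

if torch.backends.mps.is_available():
    # Small (B, H, N, D) problem in fp32 so both paths return fp32 outputs.
    q = torch.randn(2, 4, 256, 64, device="mps")
    k = torch.randn(2, 4, 256, 64, device="mps")
    v = torch.randn(2, 4, 256, 64, device="mps")

    # MFA path (calls _C.forward_with_lse under the hood) vs. PyTorch's reference SDPA.
    out_mfa = flash_attention(q, k, v, is_causal=False)
    out_ref = F.scaled_dot_product_attention(q, k, v, is_causal=False)

    print("max abs difference:", (out_mfa - out_ref).abs().max().item())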