mps_conv3d-0.1.5-cp314-cp314-macosx_15_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mps_conv3d/_C.cpython-314-darwin.so ADDED
Binary file
mps_conv3d/__init__.py ADDED
@@ -0,0 +1,259 @@
"""
MPS Conv3D - 3D Convolution for Apple Silicon

Drop-in replacement for torch.nn.functional.conv3d on MPS.
Used in video models: Synchformer, I3D, SlowFast, C3D, etc.
"""

import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.autograd import Function
from typing import Optional, Tuple, Union

__version__ = "0.1.5"


def _load_library():
    """Load the native extension, JIT-compiling it from the bundled
    sources if the prebuilt binary is unavailable."""
    try:
        from mps_conv3d import _C as _lib
        return _lib
    except ImportError:
        import os
        from torch.utils.cpp_extension import load

        src_dir = os.path.join(os.path.dirname(__file__), "csrc")
        _lib = load(
            name="mps_conv3d",
            sources=[os.path.join(src_dir, "conv3d_mps.mm")],
            extra_cflags=["-std=c++17"],
            extra_ldflags=["-framework", "Metal", "-framework", "Foundation"],
            verbose=False,
        )
        return _lib


_lib_cache = None


def _get_lib():
    """Return the native library, loading it on first use."""
    global _lib_cache
    if _lib_cache is None:
        _lib_cache = _load_library()
    return _lib_cache


class _Conv3DFunction(Function):
    """Autograd function wrapping the native Conv3D kernels."""

    @staticmethod
    def forward(
        ctx,
        input: Tensor,
        weight: Tensor,
        bias: Optional[Tensor],
        stride: Tuple[int, int, int],
        padding: Tuple[int, int, int],
        dilation: Tuple[int, int, int],
        groups: int,
    ) -> Tensor:
        ctx.save_for_backward(input, weight, bias)
        ctx.stride = stride
        ctx.padding = padding
        ctx.dilation = dilation
        ctx.groups = groups

        lib = _get_lib()
        output = lib.conv3d_forward(
            input, weight,
            stride[0], stride[1], stride[2],
            padding[0], padding[1], padding[2],
            dilation[0], dilation[1], dilation[2],
            groups
        )

        if bias is not None:
            # Bias is added here rather than in the kernel: one value per
            # output channel, broadcast over (N, C_out, D, H, W).
            output = output + bias.view(1, -1, 1, 1, 1)

        return output

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        input, weight, bias = ctx.saved_tensors
        lib = _get_lib()

        grad_input = grad_weight = grad_bias = None

        if ctx.needs_input_grad[0]:
            grad_input = lib.conv3d_backward_input(
                grad_output, weight, input.shape,
                ctx.stride[0], ctx.stride[1], ctx.stride[2],
                ctx.padding[0], ctx.padding[1], ctx.padding[2],
                ctx.dilation[0], ctx.dilation[1], ctx.dilation[2],
                ctx.groups
            )

        if ctx.needs_input_grad[1]:
            grad_weight = lib.conv3d_backward_weight(
                grad_output, input, weight.shape,
                ctx.stride[0], ctx.stride[1], ctx.stride[2],
                ctx.padding[0], ctx.padding[1], ctx.padding[2],
                ctx.dilation[0], ctx.dilation[1], ctx.dilation[2],
                ctx.groups
            )

        if bias is not None and ctx.needs_input_grad[2]:
            # Bias gradient: sum grad_output over every dim except channels.
            grad_bias = grad_output.sum(dim=[0, 2, 3, 4])

        # One return value per forward argument; the four hyperparameters get None.
        return grad_input, grad_weight, grad_bias, None, None, None, None


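The Function above can be sanity-checked against the CPU reference implementation. A minimal parity sketch for both passes (ours, for illustration; tolerances are arbitrary):

```python
import torch
import torch.nn.functional as F
import mps_conv3d

x = torch.randn(2, 3, 8, 16, 16, device="mps", requires_grad=True)
w = torch.randn(4, 3, 3, 3, 3, device="mps", requires_grad=True)

out = mps_conv3d.conv3d(x, w, padding=1)
out.sum().backward()

# CPU reference with independent leaf copies of the same values.
x_ref = x.detach().cpu().requires_grad_(True)
w_ref = w.detach().cpu().requires_grad_(True)
ref = F.conv3d(x_ref, w_ref, padding=1)
ref.sum().backward()

print(torch.allclose(out.cpu(), ref, atol=1e-4))
print(torch.allclose(x.grad.cpu(), x_ref.grad, atol=1e-4))
print(torch.allclose(w.grad.cpu(), w_ref.grad, atol=1e-4))
```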
def _normalize_tuple(value, n, name):
    """Convert an int or a length-1/length-n sequence to an n-tuple."""
    if isinstance(value, int):
        return (value,) * n
    if isinstance(value, (list, tuple)):
        if len(value) == n:
            return tuple(value)
        elif len(value) == 1:
            return (value[0],) * n
    raise ValueError(f"{name} must be an int or {n}-tuple")


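For reference, the accepted forms (illustrative assertions; `_normalize_tuple` is a private helper, imported here only to show the contract):

```python
from mps_conv3d import _normalize_tuple  # private helper, for illustration

assert _normalize_tuple(2, 3, "stride") == (2, 2, 2)
assert _normalize_tuple([3], 3, "padding") == (3, 3, 3)
assert _normalize_tuple((1, 2, 2), 3, "stride") == (1, 2, 2)
```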
def conv3d(
    input: Tensor,
    weight: Tensor,
    bias: Optional[Tensor] = None,
    stride: Union[int, Tuple[int, int, int]] = 1,
    padding: Union[int, Tuple[int, int, int]] = 0,
    dilation: Union[int, Tuple[int, int, int]] = 1,
    groups: int = 1,
) -> Tensor:
    """
    3D convolution on MPS.

    Drop-in replacement for torch.nn.functional.conv3d.

    Args:
        input: Input tensor (N, C_in, D, H, W)
        weight: Weight tensor (C_out, C_in/groups, kD, kH, kW)
        bias: Optional bias tensor (C_out,)
        stride: Stride of the convolution
        padding: Padding added to the input
        dilation: Dilation of the kernel
        groups: Number of groups

    Returns:
        Output tensor (N, C_out, D_out, H_out, W_out)
    """
    if input.device.type != "mps":
        # Fall back to PyTorch for non-MPS tensors.
        return F.conv3d(input, weight, bias, stride, padding, dilation, groups)

    stride = _normalize_tuple(stride, 3, "stride")
    padding = _normalize_tuple(padding, 3, "padding")
    dilation = _normalize_tuple(dilation, 3, "dilation")

    return _Conv3DFunction.apply(
        input.contiguous(), weight.contiguous(), bias,
        stride, padding, dilation, groups
    )


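The output sizes in the docstring follow the usual convolution arithmetic, `out = (in + 2*pad - dilation*(kernel - 1) - 1) // stride + 1` per axis. A small helper (ours, for illustration) makes that concrete:

```python
def conv3d_out_shape(in_dhw, kernel, stride, pad, dilation=(1, 1, 1)):
    """Spatial output size for one (D, H, W) triple."""
    return tuple(
        (i + 2 * p - d * (k - 1) - 1) // s + 1
        for i, k, s, p, d in zip(in_dhw, kernel, stride, pad, dilation)
    )

# e.g. a (16, 112, 112) clip with kernel (3, 7, 7),
# stride (1, 2, 2), padding (1, 3, 3) -> (16, 56, 56)
assert conv3d_out_shape((16, 112, 112), (3, 7, 7), (1, 2, 2), (1, 3, 3)) == (16, 56, 56)
```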
class Conv3d(nn.Module):
    """
    3D convolution layer for MPS.

    Drop-in replacement for torch.nn.Conv3d.

    Args:
        in_channels: Number of input channels
        out_channels: Number of output channels
        kernel_size: Size of the convolving kernel
        stride: Stride of the convolution
        padding: Padding added to the input
        dilation: Dilation of the kernel elements
        groups: Number of blocked connections
        bias: If True, adds a learnable bias
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, Tuple[int, int, int]],
        stride: Union[int, Tuple[int, int, int]] = 1,
        padding: Union[int, Tuple[int, int, int]] = 0,
        dilation: Union[int, Tuple[int, int, int]] = 1,
        groups: int = 1,
        bias: bool = True,
    ):
        super().__init__()
        kernel_size = _normalize_tuple(kernel_size, 3, "kernel_size")
        self.stride = _normalize_tuple(stride, 3, "stride")
        self.padding = _normalize_tuple(padding, 3, "padding")
        self.dilation = _normalize_tuple(dilation, 3, "dilation")
        self.groups = groups

        self.weight = nn.Parameter(
            torch.empty(out_channels, in_channels // groups, *kernel_size)
        )
        if bias:
            self.bias = nn.Parameter(torch.empty(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        # Same scheme as torch.nn.Conv3d: Kaiming-uniform with a = sqrt(5).
        nn.init.kaiming_uniform_(self.weight, a=5 ** 0.5)
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / (fan_in ** 0.5)
            nn.init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        return conv3d(
            input, self.weight, self.bias,
            self.stride, self.padding, self.dilation, self.groups
        )


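Because the constructor mirrors `torch.nn.Conv3d`, an existing (groups=1) model can be migrated by swapping its conv modules and copying parameters over. A hedged sketch; the `swap_conv3d` traversal helper is our own, not part of the package:

```python
import torch
from torch import nn

import mps_conv3d

def swap_conv3d(module: nn.Module) -> nn.Module:
    """Recursively replace nn.Conv3d children with mps_conv3d.Conv3d."""
    for name, child in module.named_children():
        if isinstance(child, nn.Conv3d):
            # Assumes numeric padding; string paddings like 'same'
            # are not handled by this sketch.
            repl = mps_conv3d.Conv3d(
                child.in_channels, child.out_channels, child.kernel_size,
                stride=child.stride, padding=child.padding,
                dilation=child.dilation, groups=child.groups,
                bias=child.bias is not None,
            )
            with torch.no_grad():
                repl.weight.copy_(child.weight)
                if child.bias is not None:
                    repl.bias.copy_(child.bias)
            setattr(module, name, repl)
        else:
            swap_conv3d(child)
    return module
```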
_original_conv3d = None


def patch_conv3d():
    """
    Monkey-patch torch.nn.functional.conv3d to use the MPS implementation.

    Call this at the start of your script to route all 3D convolutions on
    MPS tensors through this package; everything else falls through to the
    original PyTorch implementation.
    """
    global _original_conv3d

    if _original_conv3d is not None:
        return  # Already patched

    _original_conv3d = F.conv3d

    def patched_conv3d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1):
        if input.device.type == 'mps':
            return conv3d(input, weight, bias, stride, padding, dilation, groups)
        return _original_conv3d(input, weight, bias, stride, padding, dilation, groups)

    F.conv3d = patched_conv3d
    print("MPS Conv3D: Patched F.conv3d")


def unpatch_conv3d():
    """Restore the original torch.nn.functional.conv3d."""
    global _original_conv3d
    if _original_conv3d is not None:
        F.conv3d = _original_conv3d
        _original_conv3d = None


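Since the patch is global process state, it can be convenient to scope it. A possible wrapper built on the two functions above (our own sketch; `mps_conv3d_patched` is not part of the package):

```python
from contextlib import contextmanager

import mps_conv3d

@contextmanager
def mps_conv3d_patched():
    """Apply the monkey-patch for the duration of a with-block."""
    mps_conv3d.patch_conv3d()
    try:
        yield
    finally:
        mps_conv3d.unpatch_conv3d()

# with mps_conv3d_patched():
#     out = model(video_batch)  # F.conv3d on MPS routed through Metal
```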
def is_available() -> bool:
    """Check whether the MPS backend is available."""
    return torch.backends.mps.is_available()
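A typical guard at startup, using `is_available()` to pick the device (illustrative):

```python
import torch
import mps_conv3d

device = "mps" if mps_conv3d.is_available() else "cpu"
if device == "mps":
    mps_conv3d.patch_conv3d()

x = torch.randn(1, 3, 8, 32, 32, device=device)
```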
mps_conv3d/csrc/conv3d_mps.mm ADDED
@@ -0,0 +1,505 @@
// MPS Conv3D - Metal implementation of 3D convolution
// Strategy: im2col + PyTorch matmul (leverage Apple's optimized GEMM)

#include <torch/extension.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/native/mps/OperationUtils.h>
#include <Metal/Metal.h>
#include <Foundation/Foundation.h>

static id<MTLDevice> g_device = nil;
static id<MTLLibrary> g_library = nil;
static id<MTLComputePipelineState> g_im2col3d_fp32 = nil;
static id<MTLComputePipelineState> g_im2col3d_fp16 = nil;
static id<MTLComputePipelineState> g_col2im3d_fp32 = nil;

static const char* METAL_SHADER = R"(
#include <metal_stdlib>
using namespace metal;

// im2col3d: Extract 3D patches into column matrix for GEMM
// Input: (N, C_in, D, H, W)
// Output: (N * D_out * H_out * W_out, C_in * kD * kH * kW)
// Each thread handles one output spatial position
kernel void im2col3d_fp32(
    device const float* input [[buffer(0)]],
    device float* col [[buffer(1)]],
    constant int& batch [[buffer(2)]],
    constant int& in_channels [[buffer(3)]],
    constant int& in_depth [[buffer(4)]],
    constant int& in_height [[buffer(5)]],
    constant int& in_width [[buffer(6)]],
    constant int& out_depth [[buffer(7)]],
    constant int& out_height [[buffer(8)]],
    constant int& out_width [[buffer(9)]],
    constant int& kernel_d [[buffer(10)]],
    constant int& kernel_h [[buffer(11)]],
    constant int& kernel_w [[buffer(12)]],
    constant int& stride_d [[buffer(13)]],
    constant int& stride_h [[buffer(14)]],
    constant int& stride_w [[buffer(15)]],
    constant int& pad_d [[buffer(16)]],
    constant int& pad_h [[buffer(17)]],
    constant int& pad_w [[buffer(18)]],
    constant int& dilation_d [[buffer(19)]],
    constant int& dilation_h [[buffer(20)]],
    constant int& dilation_w [[buffer(21)]],
    uint gid [[thread_position_in_grid]]
) {
    // gid = linear index into (batch, out_d, out_h, out_w)
    int out_spatial = out_depth * out_height * out_width;
    int total = batch * out_spatial;
    if (int(gid) >= total) return;

    int b = gid / out_spatial;
    int spatial_idx = gid % out_spatial;
    int od = spatial_idx / (out_height * out_width);
    int oh = (spatial_idx / out_width) % out_height;
    int ow = spatial_idx % out_width;

    int col_width = in_channels * kernel_d * kernel_h * kernel_w;
    int col_row = gid;  // Row in column matrix

    // Input base for this batch
    int input_batch_offset = b * in_channels * in_depth * in_height * in_width;

    // Fill one row of column matrix
    int col_idx = 0;
    for (int ic = 0; ic < in_channels; ic++) {
        for (int kd = 0; kd < kernel_d; kd++) {
            int id = od * stride_d - pad_d + kd * dilation_d;
            for (int kh = 0; kh < kernel_h; kh++) {
                int ih = oh * stride_h - pad_h + kh * dilation_h;
                for (int kw = 0; kw < kernel_w; kw++) {
                    int iw = ow * stride_w - pad_w + kw * dilation_w;

                    float val = 0.0f;
                    if (id >= 0 && id < in_depth && ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                        int input_idx = input_batch_offset +
                            ic * in_depth * in_height * in_width +
                            id * in_height * in_width +
                            ih * in_width + iw;
                        val = input[input_idx];
                    }

                    col[col_row * col_width + col_idx] = val;
                    col_idx++;
                }
            }
        }
    }
}

kernel void im2col3d_fp16(
    device const half* input [[buffer(0)]],
    device half* col [[buffer(1)]],
    constant int& batch [[buffer(2)]],
    constant int& in_channels [[buffer(3)]],
    constant int& in_depth [[buffer(4)]],
    constant int& in_height [[buffer(5)]],
    constant int& in_width [[buffer(6)]],
    constant int& out_depth [[buffer(7)]],
    constant int& out_height [[buffer(8)]],
    constant int& out_width [[buffer(9)]],
    constant int& kernel_d [[buffer(10)]],
    constant int& kernel_h [[buffer(11)]],
    constant int& kernel_w [[buffer(12)]],
    constant int& stride_d [[buffer(13)]],
    constant int& stride_h [[buffer(14)]],
    constant int& stride_w [[buffer(15)]],
    constant int& pad_d [[buffer(16)]],
    constant int& pad_h [[buffer(17)]],
    constant int& pad_w [[buffer(18)]],
    constant int& dilation_d [[buffer(19)]],
    constant int& dilation_h [[buffer(20)]],
    constant int& dilation_w [[buffer(21)]],
    uint gid [[thread_position_in_grid]]
) {
    int out_spatial = out_depth * out_height * out_width;
    int total = batch * out_spatial;
    if (int(gid) >= total) return;

    int b = gid / out_spatial;
    int spatial_idx = gid % out_spatial;
    int od = spatial_idx / (out_height * out_width);
    int oh = (spatial_idx / out_width) % out_height;
    int ow = spatial_idx % out_width;

    int col_width = in_channels * kernel_d * kernel_h * kernel_w;
    int col_row = gid;

    int input_batch_offset = b * in_channels * in_depth * in_height * in_width;

    int col_idx = 0;
    for (int ic = 0; ic < in_channels; ic++) {
        for (int kd = 0; kd < kernel_d; kd++) {
            int id = od * stride_d - pad_d + kd * dilation_d;
            for (int kh = 0; kh < kernel_h; kh++) {
                int ih = oh * stride_h - pad_h + kh * dilation_h;
                for (int kw = 0; kw < kernel_w; kw++) {
                    int iw = ow * stride_w - pad_w + kw * dilation_w;

                    half val = 0.0h;
                    if (id >= 0 && id < in_depth && ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                        int input_idx = input_batch_offset +
                            ic * in_depth * in_height * in_width +
                            id * in_height * in_width +
                            ih * in_width + iw;
                        val = input[input_idx];
                    }

                    col[col_row * col_width + col_idx] = val;
                    col_idx++;
                }
            }
        }
    }
}

// col2im3d: Scatter column matrix back to image (for backward)
// Atomic add since multiple output positions may write to same input
inline void atomic_add_float(device atomic_uint* addr, float value) {
    uint expected = atomic_load_explicit(addr, memory_order_relaxed);
    float current_val = as_type<float>(expected);
    float new_val = current_val + value;
    uint new_bits = as_type<uint>(new_val);

    while (!atomic_compare_exchange_weak_explicit(
            addr, &expected, new_bits,
            memory_order_relaxed, memory_order_relaxed)) {
        current_val = as_type<float>(expected);
        new_val = current_val + value;
        new_bits = as_type<uint>(new_val);
    }
}

kernel void col2im3d_fp32(
    device const float* col [[buffer(0)]],
    device atomic_uint* output [[buffer(1)]],
    constant int& batch [[buffer(2)]],
    constant int& in_channels [[buffer(3)]],
    constant int& in_depth [[buffer(4)]],
    constant int& in_height [[buffer(5)]],
    constant int& in_width [[buffer(6)]],
    constant int& out_depth [[buffer(7)]],
    constant int& out_height [[buffer(8)]],
    constant int& out_width [[buffer(9)]],
    constant int& kernel_d [[buffer(10)]],
    constant int& kernel_h [[buffer(11)]],
    constant int& kernel_w [[buffer(12)]],
    constant int& stride_d [[buffer(13)]],
    constant int& stride_h [[buffer(14)]],
    constant int& stride_w [[buffer(15)]],
    constant int& pad_d [[buffer(16)]],
    constant int& pad_h [[buffer(17)]],
    constant int& pad_w [[buffer(18)]],
    constant int& dilation_d [[buffer(19)]],
    constant int& dilation_h [[buffer(20)]],
    constant int& dilation_w [[buffer(21)]],
    uint gid [[thread_position_in_grid]]
) {
    int out_spatial = out_depth * out_height * out_width;
    int total = batch * out_spatial;
    if (int(gid) >= total) return;

    int b = gid / out_spatial;
    int spatial_idx = gid % out_spatial;
    int od = spatial_idx / (out_height * out_width);
    int oh = (spatial_idx / out_width) % out_height;
    int ow = spatial_idx % out_width;

    int col_width = in_channels * kernel_d * kernel_h * kernel_w;
    int col_row = gid;

    int output_batch_offset = b * in_channels * in_depth * in_height * in_width;

    int col_idx = 0;
    for (int ic = 0; ic < in_channels; ic++) {
        for (int kd = 0; kd < kernel_d; kd++) {
            int id = od * stride_d - pad_d + kd * dilation_d;
            for (int kh = 0; kh < kernel_h; kh++) {
                int ih = oh * stride_h - pad_h + kh * dilation_h;
                for (int kw = 0; kw < kernel_w; kw++) {
                    int iw = ow * stride_w - pad_w + kw * dilation_w;

                    if (id >= 0 && id < in_depth && ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                        int output_idx = output_batch_offset +
                            ic * in_depth * in_height * in_width +
                            id * in_height * in_width +
                            ih * in_width + iw;
                        float val = col[col_row * col_width + col_idx];
                        atomic_add_float(&output[output_idx], val);
                    }
                    col_idx++;
                }
            }
        }
    }
}
)";

static void ensure_initialized() {
    if (g_device != nil) return;

    g_device = MTLCreateSystemDefaultDevice();
    NSError* error = nil;

    NSString* source = [NSString stringWithUTF8String:METAL_SHADER];
    g_library = [g_device newLibraryWithSource:source options:nil error:&error];

    if (error) {
        NSLog(@"Failed to compile Metal shader: %@", error);
        throw std::runtime_error("Failed to compile Metal shader");
    }

    id<MTLFunction> im2col_fp32 = [g_library newFunctionWithName:@"im2col3d_fp32"];
    id<MTLFunction> im2col_fp16 = [g_library newFunctionWithName:@"im2col3d_fp16"];
    id<MTLFunction> col2im_fp32 = [g_library newFunctionWithName:@"col2im3d_fp32"];

    g_im2col3d_fp32 = [g_device newComputePipelineStateWithFunction:im2col_fp32 error:&error];
    g_im2col3d_fp16 = [g_device newComputePipelineStateWithFunction:im2col_fp16 error:&error];
    g_col2im3d_fp32 = [g_device newComputePipelineStateWithFunction:col2im_fp32 error:&error];

    // Fail loudly here rather than dispatching with a nil pipeline later.
    TORCH_CHECK(g_im2col3d_fp32 != nil && g_im2col3d_fp16 != nil && g_col2im3d_fp32 != nil,
                "Failed to create Metal compute pipeline states");
}

// im2col3d: Metal kernel to extract patches
torch::Tensor im2col3d_mps(
    const torch::Tensor& input,
    int kernel_d, int kernel_h, int kernel_w,
    int stride_d, int stride_h, int stride_w,
    int pad_d, int pad_h, int pad_w,
    int dilation_d, int dilation_h, int dilation_w
) {
    ensure_initialized();

    int batch = input.size(0);
    int in_channels = input.size(1);
    int in_depth = input.size(2);
    int in_height = input.size(3);
    int in_width = input.size(4);

    int out_depth = (in_depth + 2 * pad_d - dilation_d * (kernel_d - 1) - 1) / stride_d + 1;
    int out_height = (in_height + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
    int out_width = (in_width + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;

    int col_height = batch * out_depth * out_height * out_width;
    int col_width = in_channels * kernel_d * kernel_h * kernel_w;

    auto input_contig = input.contiguous();
    auto col = torch::empty({col_height, col_width}, input_contig.options());

    id<MTLBuffer> input_buf = at::native::mps::getMTLBufferStorage(input_contig);
    id<MTLBuffer> col_buf = at::native::mps::getMTLBufferStorage(col);

    @autoreleasepool {
        auto stream = at::mps::getCurrentMPSStream();
        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();

        id<MTLComputePipelineState> pso = (input_contig.scalar_type() == at::kHalf)
            ? g_im2col3d_fp16 : g_im2col3d_fp32;

        [encoder setComputePipelineState:pso];
        [encoder setBuffer:input_buf offset:input_contig.storage_offset() * input_contig.element_size() atIndex:0];
        [encoder setBuffer:col_buf offset:0 atIndex:1];

        [encoder setBytes:&batch length:sizeof(int) atIndex:2];
        [encoder setBytes:&in_channels length:sizeof(int) atIndex:3];
        [encoder setBytes:&in_depth length:sizeof(int) atIndex:4];
        [encoder setBytes:&in_height length:sizeof(int) atIndex:5];
        [encoder setBytes:&in_width length:sizeof(int) atIndex:6];
        [encoder setBytes:&out_depth length:sizeof(int) atIndex:7];
        [encoder setBytes:&out_height length:sizeof(int) atIndex:8];
        [encoder setBytes:&out_width length:sizeof(int) atIndex:9];
        [encoder setBytes:&kernel_d length:sizeof(int) atIndex:10];
        [encoder setBytes:&kernel_h length:sizeof(int) atIndex:11];
        [encoder setBytes:&kernel_w length:sizeof(int) atIndex:12];
        [encoder setBytes:&stride_d length:sizeof(int) atIndex:13];
        [encoder setBytes:&stride_h length:sizeof(int) atIndex:14];
        [encoder setBytes:&stride_w length:sizeof(int) atIndex:15];
        [encoder setBytes:&pad_d length:sizeof(int) atIndex:16];
        [encoder setBytes:&pad_h length:sizeof(int) atIndex:17];
        [encoder setBytes:&pad_w length:sizeof(int) atIndex:18];
        [encoder setBytes:&dilation_d length:sizeof(int) atIndex:19];
        [encoder setBytes:&dilation_h length:sizeof(int) atIndex:20];
        [encoder setBytes:&dilation_w length:sizeof(int) atIndex:21];

        MTLSize gridSize = MTLSizeMake(col_height, 1, 1);
        NSUInteger threadGroupSize = std::min((NSUInteger)256, pso.maxTotalThreadsPerThreadgroup);
        MTLSize tgSize = MTLSizeMake(threadGroupSize, 1, 1);
        [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
    }

    return col;
}

// Forward: im2col + matmul
torch::Tensor conv3d_forward_mps(
    const torch::Tensor& input,
    const torch::Tensor& weight,
    int stride_d, int stride_h, int stride_w,
    int pad_d, int pad_h, int pad_w,
    int dilation_d, int dilation_h, int dilation_w,
    int groups
) {
    TORCH_CHECK(input.device().type() == torch::kMPS, "input must be on MPS");
    TORCH_CHECK(weight.device().type() == torch::kMPS, "weight must be on MPS");
    TORCH_CHECK(groups == 1, "groups > 1 not yet supported in im2col path");

    int batch = input.size(0);
    int out_channels = weight.size(0);
    int in_channels = weight.size(1);
    int kernel_d = weight.size(2);
    int kernel_h = weight.size(3);
    int kernel_w = weight.size(4);

    int in_depth = input.size(2);
    int in_height = input.size(3);
    int in_width = input.size(4);

    int out_depth = (in_depth + 2 * pad_d - dilation_d * (kernel_d - 1) - 1) / stride_d + 1;
    int out_height = (in_height + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
    int out_width = (in_width + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;

    // im2col: (B*D_out*H_out*W_out, C_in*kD*kH*kW)
    auto col = im2col3d_mps(input, kernel_d, kernel_h, kernel_w,
                            stride_d, stride_h, stride_w,
                            pad_d, pad_h, pad_w,
                            dilation_d, dilation_h, dilation_w);

    // Weight: (C_out, C_in*kD*kH*kW) -> transposed for matmul
    auto weight_col = weight.view({out_channels, -1});

    // Matmul: (B*D*H*W, C_in*k*k*k) @ (C_in*k*k*k, C_out) = (B*D*H*W, C_out)
    auto output_col = torch::mm(col, weight_col.t());

    // Reshape: (B, D_out, H_out, W_out, C_out) -> (B, C_out, D_out, H_out, W_out)
    auto output = output_col.view({batch, out_depth, out_height, out_width, out_channels});
    output = output.permute({0, 4, 1, 2, 3}).contiguous();

    return output;
}

// Backward input: matmul + col2im
torch::Tensor conv3d_backward_input_mps(
    const torch::Tensor& grad_output,
    const torch::Tensor& weight,
    std::vector<int64_t> input_shape,
    int stride_d, int stride_h, int stride_w,
    int pad_d, int pad_h, int pad_w,
    int dilation_d, int dilation_h, int dilation_w,
    int groups
) {
    ensure_initialized();
    TORCH_CHECK(groups == 1, "groups > 1 not yet supported");

    int batch = input_shape[0];
    int in_channels = input_shape[1];
    int in_depth = input_shape[2];
    int in_height = input_shape[3];
    int in_width = input_shape[4];

    int out_channels = weight.size(0);
    int kernel_d = weight.size(2);
    int kernel_h = weight.size(3);
    int kernel_w = weight.size(4);

    int out_depth = grad_output.size(2);
    int out_height = grad_output.size(3);
    int out_width = grad_output.size(4);

    // grad_output: (B, C_out, D_out, H_out, W_out) -> (B*D*H*W, C_out)
    auto grad_out_col = grad_output.permute({0, 2, 3, 4, 1}).contiguous();
    grad_out_col = grad_out_col.view({-1, out_channels});

    // Weight: (C_out, C_in*k*k*k)
    auto weight_col = weight.view({out_channels, -1});

    // grad_col = grad_out_col @ weight_col: (B*D*H*W, C_in*k*k*k)
    // Accumulate in fp32 because the col2im kernel uses float atomics.
    auto grad_col = torch::mm(grad_out_col.to(at::kFloat), weight_col.to(at::kFloat));

    // col2im: scatter-add the columns back into the input layout
    auto grad_input = torch::zeros(input_shape, grad_col.options());

    int col_height = batch * out_depth * out_height * out_width;

    id<MTLBuffer> col_buf = at::native::mps::getMTLBufferStorage(grad_col);
    id<MTLBuffer> output_buf = at::native::mps::getMTLBufferStorage(grad_input);

    @autoreleasepool {
        auto stream = at::mps::getCurrentMPSStream();
        id<MTLComputeCommandEncoder> encoder = stream->commandEncoder();

        [encoder setComputePipelineState:g_col2im3d_fp32];
        [encoder setBuffer:col_buf offset:0 atIndex:0];
        [encoder setBuffer:output_buf offset:0 atIndex:1];

        [encoder setBytes:&batch length:sizeof(int) atIndex:2];
        [encoder setBytes:&in_channels length:sizeof(int) atIndex:3];
        [encoder setBytes:&in_depth length:sizeof(int) atIndex:4];
        [encoder setBytes:&in_height length:sizeof(int) atIndex:5];
        [encoder setBytes:&in_width length:sizeof(int) atIndex:6];
        [encoder setBytes:&out_depth length:sizeof(int) atIndex:7];
        [encoder setBytes:&out_height length:sizeof(int) atIndex:8];
        [encoder setBytes:&out_width length:sizeof(int) atIndex:9];
        [encoder setBytes:&kernel_d length:sizeof(int) atIndex:10];
        [encoder setBytes:&kernel_h length:sizeof(int) atIndex:11];
        [encoder setBytes:&kernel_w length:sizeof(int) atIndex:12];
        [encoder setBytes:&stride_d length:sizeof(int) atIndex:13];
        [encoder setBytes:&stride_h length:sizeof(int) atIndex:14];
        [encoder setBytes:&stride_w length:sizeof(int) atIndex:15];
        [encoder setBytes:&pad_d length:sizeof(int) atIndex:16];
        [encoder setBytes:&pad_h length:sizeof(int) atIndex:17];
        [encoder setBytes:&pad_w length:sizeof(int) atIndex:18];
        [encoder setBytes:&dilation_d length:sizeof(int) atIndex:19];
        [encoder setBytes:&dilation_h length:sizeof(int) atIndex:20];
        [encoder setBytes:&dilation_w length:sizeof(int) atIndex:21];

        MTLSize gridSize = MTLSizeMake(col_height, 1, 1);
        // Clamp to the pipeline's limit, as in the forward path.
        NSUInteger threadGroupSize = std::min((NSUInteger)256, g_col2im3d_fp32.maxTotalThreadsPerThreadgroup);
        MTLSize tgSize = MTLSizeMake(threadGroupSize, 1, 1);
        [encoder dispatchThreads:gridSize threadsPerThreadgroup:tgSize];
    }

    return grad_input.to(grad_output.scalar_type());
}

// Backward weight: grad_output^T @ im2col(input)
torch::Tensor conv3d_backward_weight_mps(
    const torch::Tensor& grad_output,
    const torch::Tensor& input,
    std::vector<int64_t> weight_shape,
    int stride_d, int stride_h, int stride_w,
    int pad_d, int pad_h, int pad_w,
    int dilation_d, int dilation_h, int dilation_w,
    int groups
) {
    TORCH_CHECK(groups == 1, "groups > 1 not yet supported");

    int out_channels = weight_shape[0];
    int kernel_d = weight_shape[2];
    int kernel_h = weight_shape[3];
    int kernel_w = weight_shape[4];

    // im2col on the input (accumulated in fp32)
    auto col = im2col3d_mps(input.to(at::kFloat), kernel_d, kernel_h, kernel_w,
                            stride_d, stride_h, stride_w,
                            pad_d, pad_h, pad_w,
                            dilation_d, dilation_h, dilation_w);

    // grad_output: (B, C_out, D_out, H_out, W_out) -> (B*D*H*W, C_out)
    auto grad_out_col = grad_output.permute({0, 2, 3, 4, 1}).contiguous();
    grad_out_col = grad_out_col.view({-1, out_channels}).to(at::kFloat);

    // grad_weight = grad_out_col^T @ col: (C_out, C_in*k*k*k)
    auto grad_weight_col = torch::mm(grad_out_col.t(), col);

    // Reshape to the weight layout
    auto grad_weight = grad_weight_col.view(weight_shape);

    return grad_weight.to(grad_output.scalar_type());
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("conv3d_forward", &conv3d_forward_mps, "Conv3D forward (MPS) - im2col + GEMM");
    m.def("conv3d_backward_input", &conv3d_backward_input_mps, "Conv3D backward input (MPS)");
    m.def("conv3d_backward_weight", &conv3d_backward_weight_mps, "Conv3D backward weight (MPS)");
    m.def("im2col3d", &im2col3d_mps, "im2col3d (MPS)");
}
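The forward path above can be written in a few lines of pure PyTorch, which makes the im2col layout easy to see. This sketch (ours, for illustration; CPU-only and unoptimized) builds the same column matrix — one row per output position, columns ordered by (channel, kd, kh, kw) like the Metal kernel — and checks the GEMM result against `F.conv3d`:

```python
import torch
import torch.nn.functional as F

N, C, D, H, W = 2, 3, 6, 8, 8
Co, k, pad, stride = 4, 3, 1, 1

x = torch.randn(N, C, D, H, W)
w = torch.randn(Co, C, k, k, k)

Do = (D + 2 * pad - k) // stride + 1
Ho = (H + 2 * pad - k) // stride + 1
Wo = (W + 2 * pad - k) // stride + 1

# im2col: one row per output position, one column per (channel, kd, kh, kw).
cols = (
    F.pad(x, (pad,) * 6)                 # pad W, H, D symmetrically
    .unfold(2, k, stride)                # slide over depth
    .unfold(3, k, stride)                # slide over height
    .unfold(4, k, stride)                # slide over width -> (N, C, Do, Ho, Wo, k, k, k)
    .permute(0, 2, 3, 4, 1, 5, 6, 7)     # rows ordered like the Metal kernel
    .reshape(N * Do * Ho * Wo, C * k * k * k)
)

# GEMM against the flattened weight, then restore (N, Co, Do, Ho, Wo).
out = (cols @ w.reshape(Co, -1).t()).view(N, Do, Ho, Wo, Co).permute(0, 4, 1, 2, 3)

assert torch.allclose(out, F.conv3d(x, w, padding=pad), atol=1e-5)
```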
mps_conv3d-0.1.5.dist-info/METADATA ADDED
@@ -0,0 +1,148 @@
Metadata-Version: 2.4
Name: mps-conv3d
Version: 0.1.5
Summary: 3D Convolution for Apple Silicon (MPS)
Author: mpsops
License-Expression: MIT
Project-URL: Homepage, https://github.com/mpsops/mps-conv3d
Project-URL: Repository, https://github.com/mpsops/mps-conv3d
Project-URL: Issues, https://github.com/mpsops/mps-conv3d/issues
Keywords: pytorch,mps,apple-silicon,conv3d,video,3d-convolution
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.10
Description-Content-Type: text/markdown
Requires-Dist: torch>=2.0.0
Dynamic: requires-python

# MPS Conv3D

3D Convolution for Apple Silicon (M1/M2/M3/M4).

**Drop-in replacement** for `torch.nn.functional.conv3d` on MPS.

## Why?

3D convolutions are essential for video models:
- **Synchformer**: Audio-visual synchronization
- **I3D**: Video classification
- **SlowFast**: Action recognition
- **C3D**: Video feature extraction
- **MMAudio**: Audio generation from video

But PyTorch's MPS backend doesn't support 3D convolutions:
```
NotImplementedError: aten::slow_conv3d_forward is not implemented for MPS
```

This package provides a native Metal implementation.

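Depending on the PyTorch build, the stock operator may or may not be present; a runtime probe (our sketch, not part of the package) can apply the patch only when it is actually needed:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 8, 32, 32, device="mps")
w = torch.randn(8, 3, 3, 3, 3, device="mps")

try:
    out = F.conv3d(x, w, padding=1)   # stock operator, missing on some builds
except NotImplementedError:
    from mps_conv3d import patch_conv3d
    patch_conv3d()                    # route conv3d on MPS through Metal
    out = F.conv3d(x, w, padding=1)
```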
## Installation

```bash
pip install mps-conv3d
```

Or from source:

```bash
git clone https://github.com/mpsops/mps-conv3d
cd mps-conv3d
pip install -e .
```

## Quick Start

### Patch All Conv3D Operations (Recommended)

```python
from mps_conv3d import patch_conv3d

# Patch at the start of your script
patch_conv3d()

# Now all conv3d operations use MPS!
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 16, 112, 112, device='mps')
w = torch.randn(64, 3, 3, 7, 7, device='mps')
out = F.conv3d(x, w, padding=(1, 3, 3))  # Uses MPS!
```

### Direct Usage

```python
import torch
from mps_conv3d import conv3d

x = torch.randn(1, 3, 16, 112, 112, device='mps')
w = torch.randn(64, 3, 3, 7, 7, device='mps')

out = conv3d(x, w, stride=1, padding=(1, 3, 3))
```

### Conv3d Module

```python
import torch
from mps_conv3d import Conv3d

conv = Conv3d(
    in_channels=3,
    out_channels=64,
    kernel_size=(3, 7, 7),
    stride=(1, 2, 2),
    padding=(1, 3, 3)
).to('mps')

x = torch.randn(1, 3, 16, 112, 112, device='mps')
out = conv(x)
```

## API Reference

### `conv3d(input, weight, bias, stride, padding, dilation, groups)`

Same signature as `torch.nn.functional.conv3d`.

| Parameter | Type | Description |
|-----------|------|-------------|
| `input` | Tensor | Input tensor (N, C_in, D, H, W) |
| `weight` | Tensor | Weight tensor (C_out, C_in/groups, kD, kH, kW) |
| `bias` | Tensor | Optional bias (C_out,) |
| `stride` | int/tuple | Stride of the convolution |
| `padding` | int/tuple | Padding added to the input |
| `dilation` | int/tuple | Dilation of the kernel |
| `groups` | int | Number of groups (currently must be 1; see Features) |

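An end-to-end example using every parameter from the table (shapes chosen arbitrarily; `groups` left at 1, which is what the current kernel supports):

```python
import torch
from mps_conv3d import conv3d

x = torch.randn(2, 16, 8, 32, 32, device="mps")   # (N, C_in, D, H, W)
w = torch.randn(32, 16, 3, 3, 3, device="mps")    # (C_out, C_in/groups, kD, kH, kW)
b = torch.randn(32, device="mps")

out = conv3d(x, w, bias=b, stride=1, padding=1, dilation=1, groups=1)
assert out.shape == (2, 32, 8, 32, 32)
```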
### `patch_conv3d()`

Monkey-patches `torch.nn.functional.conv3d` to use the MPS implementation for MPS tensors.

### `unpatch_conv3d()`

Restores the original `torch.nn.functional.conv3d`.

## Compatibility

- **PyTorch**: 2.0+
- **macOS**: 12.0+ (Monterey)
- **Hardware**: Apple Silicon (M1/M2/M3/M4)

## Features

- Full forward and backward pass (training supported; see the sketch below)
- fp32 and fp16 supported
- Dilation supported; groups > 1 is not yet supported (the im2col path raises an error)
- Drop-in compatible with the PyTorch API

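To exercise the backward pass end to end, a minimal training step might look like this (our sketch; shapes and hyperparameters are arbitrary):

```python
import torch
from mps_conv3d import Conv3d

conv = Conv3d(3, 8, kernel_size=3, padding=1).to("mps")
opt = torch.optim.SGD(conv.parameters(), lr=1e-2)

x = torch.randn(2, 3, 8, 16, 16, device="mps")
target = torch.randn(2, 8, 8, 16, 16, device="mps")

loss = torch.nn.functional.mse_loss(conv(x), target)
loss.backward()   # runs the custom Metal backward kernels
opt.step()
```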
## License

MIT
mps_conv3d-0.1.5.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
mps_conv3d/_C.cpython-314-darwin.so,sha256=ZG2iBtp1Z12ChIEiW46iFB7CFH48XUBVxciWjuyqJ7A,267336
mps_conv3d/__init__.py,sha256=UxjYD3ud-FU2ogQDKtgrXiPtSeiOkWbxsrZG73k0JWQ,7729
mps_conv3d/csrc/conv3d_mps.mm,sha256=kSYI07eLvWxwO4LLBWG1-A0UvCdxt-7dMGS0hF8kD30,20998
mps_conv3d-0.1.5.dist-info/METADATA,sha256=rE525f6iQecRG5ZSqr2rBXqY2cCdMXHb5I1mPBNvXX4,3636
mps_conv3d-0.1.5.dist-info/WHEEL,sha256=uAzMRtb2noxPlbYLbRgeD25pPgKOo3k59IS71Dg5Qjs,110
mps_conv3d-0.1.5.dist-info/top_level.txt,sha256=TjxUUmzGDVNDsIKTolOKbln2HXVSrh9M3DU84ug60II,11
mps_conv3d-0.1.5.dist-info/RECORD,,
mps_conv3d-0.1.5.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (80.10.2)
Root-Is-Purelib: false
Tag: cp314-cp314-macosx_15_0_arm64

mps_conv3d-0.1.5.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
mps_conv3d