mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx/__main__.py +27 -0
- mlx/_reprlib_fix.py +16 -0
- mlx/extension.py +88 -0
- mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
- mlx/include/mlx/allocator.h +73 -0
- mlx/include/mlx/array.h +645 -0
- mlx/include/mlx/backend/common/binary.h +97 -0
- mlx/include/mlx/backend/common/broadcasting.h +11 -0
- mlx/include/mlx/backend/common/buffer_cache.h +157 -0
- mlx/include/mlx/backend/common/compiled.h +77 -0
- mlx/include/mlx/backend/common/copy.h +50 -0
- mlx/include/mlx/backend/common/hadamard.h +109 -0
- mlx/include/mlx/backend/common/matmul.h +67 -0
- mlx/include/mlx/backend/common/reduce.h +59 -0
- mlx/include/mlx/backend/common/slicing.h +20 -0
- mlx/include/mlx/backend/common/ternary.h +85 -0
- mlx/include/mlx/backend/common/unary.h +29 -0
- mlx/include/mlx/backend/common/utils.h +205 -0
- mlx/include/mlx/backend/cpu/arange.h +28 -0
- mlx/include/mlx/backend/cpu/available.h +9 -0
- mlx/include/mlx/backend/cpu/binary.h +517 -0
- mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
- mlx/include/mlx/backend/cpu/binary_two.h +166 -0
- mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
- mlx/include/mlx/backend/cpu/copy.h +36 -0
- mlx/include/mlx/backend/cpu/encoder.h +67 -0
- mlx/include/mlx/backend/cpu/eval.h +12 -0
- mlx/include/mlx/backend/cpu/gemm.h +26 -0
- mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
- mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
- mlx/include/mlx/backend/cpu/lapack.h +80 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
- mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
- mlx/include/mlx/backend/cpu/simd/math.h +193 -0
- mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
- mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
- mlx/include/mlx/backend/cpu/simd/type.h +11 -0
- mlx/include/mlx/backend/cpu/slicing.h +21 -0
- mlx/include/mlx/backend/cpu/ternary.h +154 -0
- mlx/include/mlx/backend/cpu/threefry.h +21 -0
- mlx/include/mlx/backend/cpu/unary.h +281 -0
- mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
- mlx/include/mlx/backend/cuda/allocator.h +89 -0
- mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
- mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
- mlx/include/mlx/backend/cuda/cuda.h +10 -0
- mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
- mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
- mlx/include/mlx/backend/cuda/device/config.h +12 -0
- mlx/include/mlx/backend/cuda/device.h +189 -0
- mlx/include/mlx/backend/cuda/event.h +78 -0
- mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
- mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
- mlx/include/mlx/backend/cuda/jit_module.h +119 -0
- mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
- mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
- mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
- mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
- mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
- mlx/include/mlx/backend/cuda/utils.h +46 -0
- mlx/include/mlx/backend/cuda/worker.h +55 -0
- mlx/include/mlx/backend/gpu/available.h +9 -0
- mlx/include/mlx/backend/gpu/copy.h +57 -0
- mlx/include/mlx/backend/gpu/eval.h +18 -0
- mlx/include/mlx/backend/gpu/slicing.h +36 -0
- mlx/include/mlx/backend/metal/allocator.h +79 -0
- mlx/include/mlx/backend/metal/binary.h +33 -0
- mlx/include/mlx/backend/metal/device.h +283 -0
- mlx/include/mlx/backend/metal/jit/includes.h +57 -0
- mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
- mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
- mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
- mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
- mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
- mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
- mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
- mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
- mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
- mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
- mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
- mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
- mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
- mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
- mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
- mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
- mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
- mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
- mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
- mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
- mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
- mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
- mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
- mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
- mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
- mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
- mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
- mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
- mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
- mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
- mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
- mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
- mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
- mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
- mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
- mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
- mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
- mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
- mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
- mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
- mlx/include/mlx/backend/metal/matmul.h +144 -0
- mlx/include/mlx/backend/metal/metal.h +22 -0
- mlx/include/mlx/backend/metal/reduce.h +41 -0
- mlx/include/mlx/backend/metal/resident.h +32 -0
- mlx/include/mlx/backend/metal/scan.h +17 -0
- mlx/include/mlx/backend/metal/ternary.h +21 -0
- mlx/include/mlx/backend/metal/unary.h +21 -0
- mlx/include/mlx/backend/metal/utils.h +84 -0
- mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
- mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
- mlx/include/mlx/compile.h +44 -0
- mlx/include/mlx/compile_impl.h +69 -0
- mlx/include/mlx/device.h +31 -0
- mlx/include/mlx/distributed/distributed.h +60 -0
- mlx/include/mlx/distributed/distributed_impl.h +59 -0
- mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
- mlx/include/mlx/distributed/nccl/nccl.h +12 -0
- mlx/include/mlx/distributed/ops.h +56 -0
- mlx/include/mlx/distributed/primitives.h +156 -0
- mlx/include/mlx/distributed/reduction_ops.h +38 -0
- mlx/include/mlx/distributed/ring/ring.h +12 -0
- mlx/include/mlx/distributed/utils.h +67 -0
- mlx/include/mlx/dtype.h +115 -0
- mlx/include/mlx/dtype_utils.h +119 -0
- mlx/include/mlx/einsum.h +22 -0
- mlx/include/mlx/event.h +58 -0
- mlx/include/mlx/export.h +136 -0
- mlx/include/mlx/export_impl.h +98 -0
- mlx/include/mlx/fast.h +102 -0
- mlx/include/mlx/fast_primitives.h +427 -0
- mlx/include/mlx/fence.h +39 -0
- mlx/include/mlx/fft.h +167 -0
- mlx/include/mlx/graph_utils.h +66 -0
- mlx/include/mlx/io/gguf.h +20 -0
- mlx/include/mlx/io/load.h +175 -0
- mlx/include/mlx/io.h +61 -0
- mlx/include/mlx/linalg.h +111 -0
- mlx/include/mlx/memory.h +78 -0
- mlx/include/mlx/mlx.h +25 -0
- mlx/include/mlx/ops.h +1627 -0
- mlx/include/mlx/primitives.h +2524 -0
- mlx/include/mlx/random.h +282 -0
- mlx/include/mlx/scheduler.h +188 -0
- mlx/include/mlx/small_vector.h +540 -0
- mlx/include/mlx/stream.h +41 -0
- mlx/include/mlx/threadpool.h +133 -0
- mlx/include/mlx/transforms.h +229 -0
- mlx/include/mlx/transforms_impl.h +86 -0
- mlx/include/mlx/types/bf16.h +187 -0
- mlx/include/mlx/types/complex.h +113 -0
- mlx/include/mlx/types/fp16.h +234 -0
- mlx/include/mlx/types/half_types.h +58 -0
- mlx/include/mlx/types/limits.h +70 -0
- mlx/include/mlx/utils.h +175 -0
- mlx/include/mlx/version.h +20 -0
- mlx/lib/libmlx.so +0 -0
- mlx/py.typed +1 -0
- mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
- mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
- mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
- mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
- mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
- mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
- mlx/share/cmake/MLX/extension.cmake +50 -0
- mlx/utils.py +325 -0
- mlx_cpu-0.30.1.dist-info/METADATA +142 -0
- mlx_cpu-0.30.1.dist-info/RECORD +231 -0
- mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
- mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
- mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
- mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
- mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
- mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
- mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
- mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.

#pragma once

///////////////////////////////////////////////////////////////////////////////
// Attn param classes
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Launch-time parameters for the steel attention kernels.
///
/// Block counts (NQ/NK), aligned block counts, and remainders are
/// precomputed on the host so the kernel does not have to derive them
/// per thread. Stride arrays hold the (B, H, L) strides of each tensor;
/// the innermost D stride is implicitly 1 (contiguous head dim).
struct AttnParams {
  int B; ///< Batch Size
  int H; ///< Heads
  int D; ///< Head Dim

  int qL; ///< Query Sequence Length
  int kL; ///< Key Sequence Length

  int gqa_factor; ///< Group Query factor
  float scale; ///< Attention scale

  int NQ; ///< Number of query blocks
  int NK; ///< Number of key/value blocks

  int NQ_aligned; ///< Number of full query blocks
  int NK_aligned; ///< Number of full key/value blocks

  int qL_rem; ///< Remainder in last query block
  int kL_rem; ///< Remainder in last key/value block
  int qL_off; ///< Offset in query sequence start

  int64_t Q_strides[3]; ///< Query strides (B, H, L, D = 1)
  int64_t K_strides[3]; ///< Key strides (B, H, L, D = 1)
  int64_t V_strides[3]; ///< Value strides (B, H, L, D = 1)
  int64_t O_strides[3]; ///< Output strides (B, H, L, D = 1)
};

/// Strides of an optional attention-mask tensor; passed separately so the
/// unmasked kernel variants can omit the buffer entirely.
struct AttnMaskParams {
  int64_t M_strides[3]; ///< Mask strides (B, H, qL, kL = 1)
};

} // namespace steel
} // namespace mlx
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.

#pragma once

#include "mlx/backend/metal/kernels/steel/utils.h"

///////////////////////////////////////////////////////////////////////////////
// Transforms and Epilogues
///////////////////////////////////////////////////////////////////////////////

namespace mlx {
namespace steel {

/// Epilogue that only converts the accumulator to the output type.
/// The two-argument overload ignores the existing C value.
template <typename OutT, typename InT>
struct TransformNone {
  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  static METAL_FUNC OutT apply(InT x, OutT) {
    return static_cast<OutT>(x);
  }
};

/// Epilogue computing `x + c` (accumulator plus existing C value).
/// The constructor takes (alpha, beta) only to match the TransformAxpby
/// interface; both are intentionally ignored.
template <typename OutT, typename InT>
struct TransformAdd {
  TransformAdd(const float, const float) {}

  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  static METAL_FUNC OutT apply(InT x, OutT c) {
    return static_cast<OutT>(x) + c;
  }
};

/// Epilogue computing `alpha * x + beta * c` (GEMM-style axpby).
/// Unlike the other transforms the two-argument apply is a non-static
/// const member, since it reads the stored alpha/beta.
template <typename OutT, typename InT>
struct TransformAxpby {
  const float alpha;
  const float beta;

  TransformAxpby(const float alpha_, const float beta_)
      : alpha(alpha_), beta(beta_) {}

  static METAL_FUNC OutT apply(InT x) {
    return static_cast<OutT>(x);
  }

  METAL_FUNC OutT apply(InT x, OutT c) const {
    return static_cast<OutT>(x * alpha + (beta * c));
  }
};

/// Maps an element type to its accumulation type; accumulation is always
/// done in float here regardless of T.
template <typename T>
struct AccumHelper {
  typedef float accum_type;
};

/// Remaps threadgroup ids so that consecutive threadgroups cover a 2D
/// tile pattern (2^swizzle_log wide), improving L2 locality of C tiles.
struct BlockSwizzle {
  static METAL_FUNC int2
  swizzle(uint3 tid [[threadgroup_position_in_grid]], const int swizzle_log) {
    const int tid_x = (tid.x) >> swizzle_log;
    const int tid_y =
        ((tid.y) << swizzle_log) + ((tid.x) & ((1 << swizzle_log) - 1));
    return int2(tid_x, tid_y);
  }
};

} // namespace steel
} // namespace mlx
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/backend/metal/kernels/steel/defines.h"
|
|
6
|
+
#include "mlx/backend/metal/kernels/steel/utils.h"
|
|
7
|
+
|
|
8
|
+
#include "mlx/backend/metal/kernels/steel/conv/loader.h"
|
|
9
|
+
#include "mlx/backend/metal/kernels/steel/conv/params.h"
|
|
10
|
+
#include "mlx/backend/metal/kernels/steel/gemm/mma.h"
|
|
11
|
+
|
|
12
|
+
using namespace metal;
|
|
13
|
+
using namespace mlx::steel;
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.

#include <metal_stdlib>

using namespace metal;

/// 2D convolution via implicit GEMM: the im2col gather is performed on the
/// fly by the block loaders, and the matmul is done with BlockMMA.
///
/// Template parameters:
///   T            — element type of A (input), B (weights), C (output)
///   BM/BN/BK     — threadgroup tile sizes of the implicit GEMM
///   WM/WN        — simdgroup grid per threadgroup (tgp size = WM*WN*32)
///   N_CHANNELS   — if in (1..4], dispatch the small-channel loaders
///   SMALL_FILTER — selects the small-filter input loader specialization
///
/// Grid mapping: (tid.x, tid.y) index GEMM tiles (after swizzle),
/// tid.z indexes the convolution group.
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    int N_CHANNELS = 0,
    bool SMALL_FILTER = false>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void
implicit_gemm_conv_2d(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant MLXConvParams<2>* params [[buffer(3)]],
    const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  using namespace mlx::steel;

  (void)lid;

  // A is the (implicit) im2col matrix, B the weights used transposed.
  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;
  // Pad threadgroup tiles to a 16-byte row boundary to avoid bank conflicts.
  constexpr short tgp_padding_a = 16 / sizeof(T);
  constexpr short tgp_padding_b = 16 / sizeof(T);

  constexpr short shape_a_cols = (transpose_a ? BM : BK) + tgp_padding_a;
  constexpr short shape_b_cols = (transpose_b ? BK : BN) + tgp_padding_b;
  constexpr short shape_a_rows = (transpose_a ? BK : BM);
  constexpr short shape_b_rows = (transpose_b ? BN : BK);
  constexpr short tgp_mem_size_a = shape_a_cols * shape_a_rows;
  constexpr short tgp_mem_size_b = shape_b_cols * shape_b_rows;

  constexpr short tgp_size = WM * WN * 32;

  // Input loader

  using loader_a_t = typename metal::conditional_t<
      // Check for small channel specialization
      N_CHANNELS != 0 && N_CHANNELS <= 4,

      // Go to small channel specialization
      Conv2DInputBlockLoaderSmallChannels<
          T,
          BM,
          BN,
          BK,
          tgp_size,
          N_CHANNELS,
          tgp_padding_a>,

      // Else go to general loader
      typename metal::conditional_t<
          // Check if filter size is small enough
          SMALL_FILTER,

          // Go to small filter specialization
          Conv2DInputBlockLoaderSmallFilter<
              T,
              BM,
              BN,
              BK,
              tgp_size,
              tgp_padding_a>,

          // Else go to large filter generalization
          Conv2DInputBlockLoaderLargeFilter<
              T,
              BM,
              BN,
              BK,
              tgp_size,
              tgp_padding_a>>>;

  // Weight loader
  using loader_b_t = typename metal::conditional_t<
      // Check for small channel specialization
      N_CHANNELS != 0 && N_CHANNELS <= 4,

      // Go to small channel specialization
      Conv2DWeightBlockLoaderSmallChannels<
          T,
          BM,
          BN,
          BK,
          tgp_size,
          N_CHANNELS,
          tgp_padding_b>,

      // Else go to general loader
      Conv2DWeightBlockLoader<T, BM, BN, BK, tgp_size, tgp_padding_b>>;

  using mma_t = BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      shape_a_cols,
      shape_b_cols>;

  threadgroup T As[tgp_mem_size_a];
  threadgroup T Bs[tgp_mem_size_b];

  // Un-swizzle the threadgroup id into the (tile_m, tile_n) it owns.
  const int tid_y = ((tid.y) << gemm_params->swizzle_log) +
      ((tid.x) & ((1 << gemm_params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> gemm_params->swizzle_log;

  // Swizzled grid may overhang the tile grid; drop out-of-range groups.
  if (gemm_params->tiles_n <= tid_x || gemm_params->tiles_m <= tid_y) {
    return;
  }

  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int K = gemm_params->K;
  const int N = gemm_params->N;
  const int C_per_group = params->C / params->groups;

  // Groups: tid.z selects the conv group; offset each operand to its slice.
  A += tid.z * C_per_group;
  B += tid.z * N * K;
  C += tid.z * N;

  B += c_col * K;
  // Output rows are strided by N * groups (channels of all groups interleaved).
  C += c_row * (N * params->groups) + c_col;

  const int2 offsets_a(0, c_row);
  const int2 offsets_b(0, c_col);

  // Prepare threadgroup loading operations
  loader_a_t loader_a(
      A, As, offsets_a, params, gemm_params, simd_gid, simd_lid);
  loader_b_t loader_b(
      B, Bs, offsets_b, params, gemm_params, simd_gid, simd_lid);

  // Prepare threadgroup mma operation
  mma_t mma_op(simd_gid, simd_lid);

  // Main K loop: load a BK slab of A and B, multiply-accumulate, advance.
  int gemm_k_iterations = gemm_params->gemm_k_iterations;
  for (int k = 0; k < gemm_k_iterations; k++) {
    threadgroup_barrier(mem_flags::mem_threadgroup);
    // Load elements into threadgroup
    loader_a.load_unsafe();
    loader_b.load_unsafe();

    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Multiply and accumulate threadgroup elements
    mma_op.mma(As, Bs);

    // Prepare for next iteration
    loader_a.next();
    loader_b.next();
  }

  // Execution barrier only: results live in simdgroup registers, no
  // threadgroup memory needs to be flushed before the store.
  threadgroup_barrier(mem_flags::mem_none);

  // Store results to device memory (edge tiles clipped via store_result_safe)
  short tgp_bm = min(BM, gemm_params->M - c_row);
  short tgp_bn = min(BN, gemm_params->N - c_col);
  const int ldc = N * params->groups;
  mma_op.store_result_safe(C, ldc, short2(tgp_bn, tgp_bm));
}
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.

#include "mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h"

// Host-set function constant: true when the channel count C is a multiple
// of BK, so every K-slab can be loaded with the fast unsafe path.
constant bool align_C [[function_constant(200)]];

/// General 2D convolution via implicit GEMM, handling arbitrary strides /
/// dilations by splitting the output into f_out_jump_h x f_out_jump_w
/// phases: tid.z selects the phase, and the per-phase base_h / base_w
/// tables give the weight window (base, size) valid for that phase.
///
/// Template parameters:
///   T         — element type
///   BM/BN/BK  — threadgroup GEMM tile sizes
///   WM/WN     — simdgroup grid per threadgroup (tgp size = WM*WN*32)
///   AccumType — accumulation type (float)
///   Epilogue  — output transform applied per element on store
template <
    typename T,
    int BM,
    int BN,
    int BK,
    int WM,
    int WN,
    typename AccumType = float,
    typename Epilogue = TransformNone<T, AccumType>>
[[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] void
implicit_gemm_conv_2d_general(
    const device T* A [[buffer(0)]],
    const device T* B [[buffer(1)]],
    device T* C [[buffer(2)]],
    const constant MLXConvParams<2>* params [[buffer(3)]],
    const constant ImplicitGemmConv2DParams* gemm_params [[buffer(4)]],
    const constant Conv2DGeneralJumpParams* jump_params [[buffer(5)]],
    const constant Conv2DGeneralBaseInfo* base_h [[buffer(6)]],
    const constant Conv2DGeneralBaseInfo* base_w [[buffer(7)]],
    uint3 tid [[threadgroup_position_in_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {
  (void)lid;

  // A is the (implicit) im2col matrix, B the weights used transposed.
  constexpr bool transpose_a = false;
  constexpr bool transpose_b = true;
  // Pad threadgroup tiles to a 16-byte row boundary to avoid bank conflicts.
  constexpr short tgp_padding_a = 16 / sizeof(T);
  constexpr short tgp_padding_b = 16 / sizeof(T);

  constexpr short shape_a_cols = (transpose_a ? BM : BK) + tgp_padding_a;
  constexpr short shape_b_cols = (transpose_b ? BK : BN) + tgp_padding_b;
  constexpr short shape_a_rows = (transpose_a ? BK : BM);
  constexpr short shape_b_rows = (transpose_b ? BN : BK);
  constexpr short tgp_mem_size_a = shape_a_cols * shape_a_rows;
  constexpr short tgp_mem_size_b = shape_b_cols * shape_b_rows;

  constexpr short tgp_size = WM * WN * 32;

  // Input loader
  using loader_a_t =
      Conv2DInputBlockLoaderGeneral<T, BM, BN, BK, tgp_size, tgp_padding_a>;

  // Weight loader
  using loader_b_t =
      Conv2DWeightBlockLoaderGeneral<T, BM, BN, BK, tgp_size, tgp_padding_b>;

  using mma_t = BlockMMA<
      T,
      T,
      BM,
      BN,
      BK,
      WM,
      WN,
      transpose_a,
      transpose_b,
      shape_a_cols,
      shape_b_cols>;

  threadgroup T As[tgp_mem_size_a];
  threadgroup T Bs[tgp_mem_size_b];

  // Un-swizzle the threadgroup id into the (tile_m, tile_n) it owns.
  const int tid_y = ((tid.y) << gemm_params->swizzle_log) +
      ((tid.x) & ((1 << gemm_params->swizzle_log) - 1));
  const int tid_x = (tid.x) >> gemm_params->swizzle_log;

  // Swizzled grid may overhang the tile grid; drop out-of-range groups.
  if (gemm_params->tiles_n <= tid_x || gemm_params->tiles_m <= tid_y) {
    return;
  }

  const int tid_z = tid.z;

  // Decompose the phase index into its (h, w) output-phase coordinates.
  const int base_oh = tid_z / jump_params->f_out_jump_w;
  const int base_ow = tid_z % jump_params->f_out_jump_w;

  // Weight window (start and extent) valid for this output phase.
  const int base_wh = base_h[base_oh].weight_base;
  const int base_ww = base_w[base_ow].weight_base;

  const int base_wh_size = base_h[base_oh].weight_size;
  const int base_ww_size = base_w[base_ow].weight_size;

  const int c_row = tid_y * BM;
  const int c_col = tid_x * BN;
  const int K = gemm_params->K;

  B += c_col * K;

  const int4 offsets_a(0, c_row, base_oh, base_ow);
  const int2 offsets_b(0, c_col);

  // Prepare threadgroup loading operations
  loader_a_t loader_a(
      A,
      As,
      offsets_a,
      params,
      jump_params,
      base_wh,
      base_ww,
      simd_gid,
      simd_lid);
  loader_b_t loader_b(
      B,
      Bs,
      offsets_b,
      params,
      jump_params,
      base_wh,
      base_ww,
      simd_gid,
      simd_lid);

  // Prepare threadgroup mma operation
  mma_t mma_op(simd_gid, simd_lid);

  if (align_C) {
    // C divides evenly into BK: every slab uses the fast unsafe load.
    int gemm_k_iterations =
        base_wh_size * base_ww_size * gemm_params->gemm_k_iterations;

    for (int k = 0; k < gemm_k_iterations; k++) {
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Load elements into threadgroup
      loader_a.load_unsafe();
      loader_b.load_unsafe();

      threadgroup_barrier(mem_flags::mem_threadgroup);

      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);

      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }
  }

  else {
    // Unaligned C: run all but the last K iteration with unsafe loads
    // (hence k starts at 1), then handle the C % BK tail with load_safe.
    for (int k = 1; k < gemm_params->gemm_k_iterations; k++) {
      for (int j = 0; j < base_wh_size * base_ww_size; j++) {
        threadgroup_barrier(mem_flags::mem_threadgroup);
        // Load elements into threadgroup
        loader_a.load_unsafe();
        loader_b.load_unsafe();

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // Multiply and accumulate threadgroup elements
        mma_op.mma(As, Bs);

        // Prepare for next iteration
        loader_a.next();
        loader_b.next();
      }
    }
    const short remaining_k = params->C % BK;
    for (int j = 0; j < base_wh_size * base_ww_size; j++) {
      // Load elements into threadgroup
      threadgroup_barrier(mem_flags::mem_threadgroup);
      loader_a.load_safe(remaining_k);
      loader_b.load_safe(remaining_k);
      threadgroup_barrier(mem_flags::mem_threadgroup);
      // Multiply and accumulate threadgroup elements
      mma_op.mma(As, Bs);
      // Prepare for next iteration
      loader_a.next();
      loader_b.next();
    }
  }

  // Execution barrier only: results live in simdgroup registers, no
  // threadgroup memory needs to be flushed before the store.
  threadgroup_barrier(mem_flags::mem_none);

  // Store results to device memory
  {
    // Adjust for simdgroup and thread location
    int offset_m = c_row + mma_op.sm;
    int offset_n = c_col + mma_op.sn;
    C += offset_n;

    if (offset_n >= gemm_params->N)
      return;

    short diff = gemm_params->N - offset_n;

    STEEL_PRAGMA_UNROLL
    for (int i = 0; i < mma_t::TM; i++) {
      int cm = offset_m + i * mma_t::TM_stride;

      // Map the GEMM row back to (batch n, output y, output x); rows of a
      // phase are spaced f_out_jump apart in the full output.
      int n = cm / jump_params->adj_out_hw;
      int hw = cm % jump_params->adj_out_hw;
      int oh =
          (hw / jump_params->adj_out_w) * jump_params->f_out_jump_h + base_oh;
      int ow =
          (hw % jump_params->adj_out_w) * jump_params->f_out_jump_w + base_ow;

      if (n < params->N && oh < params->oS[0] && ow < params->oS[1]) {
        int offset_cm = n * params->out_strides[0] +
            oh * params->out_strides[1] + ow * params->out_strides[2];

        STEEL_PRAGMA_UNROLL
        for (int j = 0; j < mma_t::TN; j++) {
          // Get accumulated result and associated offset in C
          thread const auto& accum = mma_op.Ctile.frag_at(i, j);
          int offset = offset_cm + (j * mma_t::TN_stride);

          constexpr short kelems = decltype(mma_op.Ctile)::kElemsPerFrag;

          // Apply epilogue and output C
          STEEL_PRAGMA_UNROLL
          for (short k = 0; k < kelems; k++) {
            // Guard the N (channel) edge of the tile.
            if ((j * mma_t::TN_stride + k) < diff) {
              C[offset + k] = Epilogue::apply(accum[k]);
            }
          }
        }
      }
    }
  }
}
|