PyPI - mlx-cpu - Versions diffs - 0.30.1__py3-none-manylinux_2_35_x86_64.whl - Mend

mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

mlx/__main__.py +27 -0
mlx/_reprlib_fix.py +16 -0
mlx/extension.py +88 -0
mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
mlx/include/mlx/allocator.h +73 -0
mlx/include/mlx/array.h +645 -0
mlx/include/mlx/backend/common/binary.h +97 -0
mlx/include/mlx/backend/common/broadcasting.h +11 -0
mlx/include/mlx/backend/common/buffer_cache.h +157 -0
mlx/include/mlx/backend/common/compiled.h +77 -0
mlx/include/mlx/backend/common/copy.h +50 -0
mlx/include/mlx/backend/common/hadamard.h +109 -0
mlx/include/mlx/backend/common/matmul.h +67 -0
mlx/include/mlx/backend/common/reduce.h +59 -0
mlx/include/mlx/backend/common/slicing.h +20 -0
mlx/include/mlx/backend/common/ternary.h +85 -0
mlx/include/mlx/backend/common/unary.h +29 -0
mlx/include/mlx/backend/common/utils.h +205 -0
mlx/include/mlx/backend/cpu/arange.h +28 -0
mlx/include/mlx/backend/cpu/available.h +9 -0
mlx/include/mlx/backend/cpu/binary.h +517 -0
mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
mlx/include/mlx/backend/cpu/binary_two.h +166 -0
mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
mlx/include/mlx/backend/cpu/copy.h +36 -0
mlx/include/mlx/backend/cpu/encoder.h +67 -0
mlx/include/mlx/backend/cpu/eval.h +12 -0
mlx/include/mlx/backend/cpu/gemm.h +26 -0
mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
mlx/include/mlx/backend/cpu/lapack.h +80 -0
mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
mlx/include/mlx/backend/cpu/simd/math.h +193 -0
mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
mlx/include/mlx/backend/cpu/simd/type.h +11 -0
mlx/include/mlx/backend/cpu/slicing.h +21 -0
mlx/include/mlx/backend/cpu/ternary.h +154 -0
mlx/include/mlx/backend/cpu/threefry.h +21 -0
mlx/include/mlx/backend/cpu/unary.h +281 -0
mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
mlx/include/mlx/backend/cuda/allocator.h +89 -0
mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
mlx/include/mlx/backend/cuda/cuda.h +10 -0
mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
mlx/include/mlx/backend/cuda/device/config.h +12 -0
mlx/include/mlx/backend/cuda/device.h +189 -0
mlx/include/mlx/backend/cuda/event.h +78 -0
mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
mlx/include/mlx/backend/cuda/jit_module.h +119 -0
mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
mlx/include/mlx/backend/cuda/utils.h +46 -0
mlx/include/mlx/backend/cuda/worker.h +55 -0
mlx/include/mlx/backend/gpu/available.h +9 -0
mlx/include/mlx/backend/gpu/copy.h +57 -0
mlx/include/mlx/backend/gpu/eval.h +18 -0
mlx/include/mlx/backend/gpu/slicing.h +36 -0
mlx/include/mlx/backend/metal/allocator.h +79 -0
mlx/include/mlx/backend/metal/binary.h +33 -0
mlx/include/mlx/backend/metal/device.h +283 -0
mlx/include/mlx/backend/metal/jit/includes.h +57 -0
mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
mlx/include/mlx/backend/metal/matmul.h +144 -0
mlx/include/mlx/backend/metal/metal.h +22 -0
mlx/include/mlx/backend/metal/reduce.h +41 -0
mlx/include/mlx/backend/metal/resident.h +32 -0
mlx/include/mlx/backend/metal/scan.h +17 -0
mlx/include/mlx/backend/metal/ternary.h +21 -0
mlx/include/mlx/backend/metal/unary.h +21 -0
mlx/include/mlx/backend/metal/utils.h +84 -0
mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
mlx/include/mlx/compile.h +44 -0
mlx/include/mlx/compile_impl.h +69 -0
mlx/include/mlx/device.h +31 -0
mlx/include/mlx/distributed/distributed.h +60 -0
mlx/include/mlx/distributed/distributed_impl.h +59 -0
mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
mlx/include/mlx/distributed/mpi/mpi.h +12 -0
mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
mlx/include/mlx/distributed/nccl/nccl.h +12 -0
mlx/include/mlx/distributed/ops.h +56 -0
mlx/include/mlx/distributed/primitives.h +156 -0
mlx/include/mlx/distributed/reduction_ops.h +38 -0
mlx/include/mlx/distributed/ring/ring.h +12 -0
mlx/include/mlx/distributed/utils.h +67 -0
mlx/include/mlx/dtype.h +115 -0
mlx/include/mlx/dtype_utils.h +119 -0
mlx/include/mlx/einsum.h +22 -0
mlx/include/mlx/event.h +58 -0
mlx/include/mlx/export.h +136 -0
mlx/include/mlx/export_impl.h +98 -0
mlx/include/mlx/fast.h +102 -0
mlx/include/mlx/fast_primitives.h +427 -0
mlx/include/mlx/fence.h +39 -0
mlx/include/mlx/fft.h +167 -0
mlx/include/mlx/graph_utils.h +66 -0
mlx/include/mlx/io/gguf.h +20 -0
mlx/include/mlx/io/load.h +175 -0
mlx/include/mlx/io.h +61 -0
mlx/include/mlx/linalg.h +111 -0
mlx/include/mlx/memory.h +78 -0
mlx/include/mlx/mlx.h +25 -0
mlx/include/mlx/ops.h +1627 -0
mlx/include/mlx/primitives.h +2524 -0
mlx/include/mlx/random.h +282 -0
mlx/include/mlx/scheduler.h +188 -0
mlx/include/mlx/small_vector.h +540 -0
mlx/include/mlx/stream.h +41 -0
mlx/include/mlx/threadpool.h +133 -0
mlx/include/mlx/transforms.h +229 -0
mlx/include/mlx/transforms_impl.h +86 -0
mlx/include/mlx/types/bf16.h +187 -0
mlx/include/mlx/types/complex.h +113 -0
mlx/include/mlx/types/fp16.h +234 -0
mlx/include/mlx/types/half_types.h +58 -0
mlx/include/mlx/types/limits.h +70 -0
mlx/include/mlx/utils.h +175 -0
mlx/include/mlx/version.h +20 -0
mlx/lib/libmlx.so +0 -0
mlx/py.typed +1 -0
mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
mlx/share/cmake/MLX/extension.cmake +50 -0
mlx/utils.py +325 -0
mlx_cpu-0.30.1.dist-info/METADATA +142 -0
mlx_cpu-0.30.1.dist-info/RECORD +231 -0
mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h ADDED Viewed

@@ -0,0 +1,319 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/backend/metal/kernels/steel/utils.h"
+#include "mlx/backend/metal/kernels/steel/conv/params.h"
+///////////////////////////////////////////////////////////////////////////////
+// Loading helper
+///////////////////////////////////////////////////////////////////////////////
+namespace mlx {
+namespace steel {
+// Compile-time table mapping a channel count to the per-thread vector width
+// used by the small-channel loaders in this header.
+//   n_channels : the channel count itself
+//   vec_size   : number of consecutive elements one thread writes per row
+//   excess     : vec_size - n_channels
+// Generic case: 4-wide when n_channels_ <= 4, otherwise 8-wide. Note that
+// excess evaluates to a negative value here when n_channels_ > 8.
+template <short n_channels_>
+struct ChannelHelper {
+  STEEL_CONST short n_channels = n_channels_;
+  STEEL_CONST short vec_size = n_channels_ <= 4 ? 4 : 8;
+  STEEL_CONST short excess = vec_size - n_channels_;
+};
+// 1 channel: width 1, no excess (differs from the generic formula).
+template <>
+struct ChannelHelper<1> {
+  STEEL_CONST short n_channels = 1;
+  STEEL_CONST short vec_size = 1;
+  STEEL_CONST short excess = 0;
+};
+// 2 channels: width 2, no excess (differs from the generic formula).
+template <>
+struct ChannelHelper<2> {
+  STEEL_CONST short n_channels = 2;
+  STEEL_CONST short vec_size = 2;
+  STEEL_CONST short excess = 0;
+};
+// 3 channels: width 4 with one excess slot (same values as the generic case).
+template <>
+struct ChannelHelper<3> {
+  STEEL_CONST short n_channels = 3;
+  STEEL_CONST short vec_size = 4;
+  STEEL_CONST short excess = 1;
+};
+// 4 channels: width 4, no excess (same values as the generic case).
+template <>
+struct ChannelHelper<4> {
+  STEEL_CONST short n_channels = 4;
+  STEEL_CONST short vec_size = 4;
+  STEEL_CONST short excess = 0;
+};
+// Loads a BM x BK tile of convolution input into threadgroup memory for
+// inputs with a small, compile-time channel count.
+//
+// Each thread owns one vec_size-wide column slot (TCOLS slots per tile row)
+// and fills it for n_rows rows spaced TROWS apart. A column slot corresponds
+// to one flattened kernel position `weight_hw`; the n_channels values at that
+// position are copied and the remainder of the slot is zero-filled.
+//
+// Template parameters:
+//   T           element type of both source and destination
+//   BM, BK      rows / columns of the destination tile
+//   BN          not referenced by this loader
+//   tgp_size    divided by TCOLS to obtain TROWS; presumably the number of
+//               threads in the threadgroup - TODO confirm against the kernel
+//   n_channels  number of channels copied per kernel position
+//   tgp_padding extra elements added to the destination row stride
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short n_channels,
+    short tgp_padding = 0>
+struct Conv2DInputBlockLoaderSmallChannels {
+  // Destination dimensions
+  STEEL_CONST short BROWS = BM;
+  STEEL_CONST short BCOLS = BK;
+  // Read dimensions: destination row stride and per-thread slot width
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  STEEL_CONST short vec_size = ChannelHelper<n_channels>::vec_size;
+  // Thread read shape: slots per row, and rows covered by one pass of threads
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Rows / strided reads within the block
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Thread location indices: linear thread index, tile row, tile column
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // threadgroup and device memory
+  // dst already points at this thread's (bi, bj) element of the tile
+  threadgroup T* dst;
+  const constant MLXConvParams<2>* params;
+  // Stored by the constructor; not read anywhere in this struct
+  const constant ImplicitGemmConv2DParams* gemm_params;
+  // Flattened kernel position (h * wS[1] + w) this thread currently loads
+  int weight_hw;
+  // Per-row source pointer and the batch / input coordinates it refers to
+  const device T* src[n_rows];
+  int read_n[n_rows];
+  int read_ih[n_rows];
+  int read_iw[n_rows];
+  /* Constructor
+   *
+   * src_          base pointer the input strides are applied to
+   * dst_          base pointer of the destination tile
+   * offsets       only offsets.y is read: the first tile row, as a flattened
+   *               (n, oh, ow) output-pixel index
+   * params_       convolution parameters
+   * gemm_params_  stored but not read by this loader
+   * simd_group_id, simd_lane_id
+   *               combined into a linear thread index with a simdgroup width
+   *               of 32 hard-coded
+   */
+  METAL_FUNC Conv2DInputBlockLoaderSmallChannels(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant ImplicitGemmConv2DParams* gemm_params_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        params(params_),
+        gemm_params(gemm_params_),
+        weight_hw(thread_idx % TCOLS) {
+    int out_n_pixels = params->oS[0] * params->oS[1];
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0; i < n_rows; ++i) {
+      // Flattened output-pixel index of this thread's i-th row
+      int offset_nhw = offsets.y + bi + i * TROWS;
+      // Split it into batch index and output (h, w)
+      int n = offset_nhw / out_n_pixels;
+      int hw = offset_nhw % out_n_pixels;
+      int oh = hw / params->oS[1];
+      int ow = hw % params->oS[1];
+      // Input coordinate for kernel offset (0, 0); negative inside the padding
+      int ih = oh * params->str[0] - params->pad[0];
+      int iw = ow * params->str[1] - params->pad[1];
+      // Pointer to that input pixel. It may lie outside the tensor; it is only
+      // dereferenced in load_unsafe() after the bounds test there.
+      src[i] = src_ + n * params->in_strides[0] + ih * params->in_strides[1] +
+          iw * params->in_strides[2];
+      read_n[i] = n;
+      read_ih[i] = ih;
+      read_iw[i] = iw;
+    }
+  }
+  /* Load from device memory into threadgroup memory. Kernel positions past the
+   * end of the window and input pixels outside the image are written as
+   * zeros. */
+  METAL_FUNC void load_unsafe() const {
+    // This thread's kernel position is beyond the wS[0] x wS[1] window:
+    // zero every slot it owns and stop.
+    if (weight_hw >= params->wS[1] * params->wS[0]) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < BROWS; i += TROWS) {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; j++) {
+          dst[i * dst_ld + j] = T(0);
+        }
+      }
+      return;
+    }
+    // Unflatten the kernel position, mirror it when params->flip is set, and
+    // scale by the kernel dilation to get the offset in input pixels.
+    int wh = (weight_hw / params->wS[1]);
+    int ww = (weight_hw % params->wS[1]);
+    int flip_h = params->flip ? params->wS[0] - wh - 1 : wh;
+    int flip_w = params->flip ? params->wS[1] - ww - 1 : ww;
+    int weight_h = flip_h * params->kdil[0];
+    int weight_w = flip_w * params->kdil[1];
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
+      // Find bounds
+      int n = read_n[i];
+      int ih = read_ih[i] + weight_h;
+      int iw = read_iw[i] + weight_w;
+      // Read from input if in bounds
+      if ((n < params->N) && (ih >= 0 && ih < params->iS[0]) &&
+          (iw >= 0 && iw < params->iS[1])) {
+        const device T* curr_src = src[i] + weight_h * params->in_strides[1] +
+            weight_w * params->in_strides[2];
+        // Copy the real channels ...
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < n_channels; ++j) {
+          dst[is * dst_ld + j] = curr_src[j];
+        }
+        // ... and zero the rest of the slot
+        STEEL_PRAGMA_UNROLL
+        for (short j = n_channels; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = T(0);
+        }
+      }
+      // Zero pad otherwise
+      else {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = T(0);
+        }
+      }
+    }
+  }
+  /* Iteration helper: move to the kernel position one tile width (TCOLS
+   * positions) further on */
+  METAL_FUNC void next() {
+    weight_hw += TCOLS;
+  }
+};
+// Loads a BN x BK tile of convolution weights into threadgroup memory for
+// weights with a small, compile-time channel count.
+//
+// Each thread owns one vec_size-wide column slot and fills it for rows
+// 0, TROWS, 2 * TROWS, ... below BROWS. A column slot corresponds to one
+// flattened kernel position `weight_hw`; the n_channels values at that
+// position are copied and the remainder of the slot is zero-filled.
+//
+// Template parameters:
+//   T           element type of both source and destination
+//   BN, BK      rows / columns of the destination tile
+//   BM          not referenced by this loader
+//   tgp_size    divided by TCOLS to obtain TROWS; presumably the number of
+//               threads in the threadgroup - TODO confirm against the kernel
+//   n_channels  number of channels copied per kernel position
+//   tgp_padding extra elements added to the destination row stride
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short n_channels,
+    short tgp_padding = 0>
+struct Conv2DWeightBlockLoaderSmallChannels {
+  // Destination dimensions
+  STEEL_CONST short BROWS = BN;
+  STEEL_CONST short BCOLS = BK;
+  // Read dimensions: destination row stride and per-thread slot width
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  STEEL_CONST short vec_size = ChannelHelper<n_channels>::vec_size;
+  // Thread read shape: slots per row, and rows covered by one pass of threads
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Rows / strided reads within the block
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Leading dimension for src (taken from params->wt_strides[0])
+  const int src_ld;
+  // Thread location indices: linear thread index, tile row, tile column
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // threadgroup and device memory
+  // dst points at this thread's (bi, bj) tile element, src at its row bi
+  threadgroup T* dst;
+  const device T* src;
+  const constant MLXConvParams<2>* params;
+  // Flattened kernel position (h * wS[1] + w) this thread currently loads
+  int weight_hw;
+  // Index of this thread's first weight row (compared against params->O)
+  const int read_n;
+  // True when read_n + BN does not exceed gemm_params_->N
+  const bool do_read;
+  /* Constructor
+   *
+   * src_          base pointer of the weights
+   * dst_          base pointer of the destination tile
+   * offsets       only offsets.y is read: the first weight row of the tile
+   * params_       convolution parameters
+   * gemm_params_  read once to initialise do_read; not stored
+   * simd_group_id, simd_lane_id
+   *               combined into a linear thread index with a simdgroup width
+   *               of 32 hard-coded
+   */
+  METAL_FUNC Conv2DWeightBlockLoaderSmallChannels(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant ImplicitGemmConv2DParams* gemm_params_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : src_ld(params_->wt_strides[0]),
+        thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        src(src_ + bi * src_ld),
+        params(params_),
+        weight_hw(thread_idx % TCOLS),
+        read_n(offsets.y + bi),
+        do_read(read_n + BN <= gemm_params_->N) {}
+  /* Load from device memory into threadgroup memory. Rows at or past
+   * params->O and kernel positions past the end of the window are written as
+   * zeros. */
+  METAL_FUNC void load_unsafe() const {
+    // Threads whose (bi, bj) falls outside the tile have nothing to write
+    if (bi >= BROWS || bj >= BCOLS)
+      return;
+    // First row already out of range, or kernel position beyond the
+    // wS[0] x wS[1] window: zero every slot this thread owns and stop.
+    if (read_n >= params->O || weight_hw >= params->wS[1] * params->wS[0]) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < BROWS; i += TROWS) {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; j++) {
+          dst[i * dst_ld + j] = T(0);
+        }
+      }
+      return;
+    }
+    // Step to kernel position weight_hw; consecutive positions are
+    // C / groups elements apart.
+    const device T* curr_src = src + weight_hw * (params->C / params->groups);
+    // Fast path: rows are copied without a per-row range test. It is taken
+    // for every BN other than 8, and for BN == 8 only when do_read is set.
+    if (BN != 8 || do_read) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < BROWS; i += TROWS) {
+        // Copy the real channels ...
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < n_channels; j++) {
+          dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+        }
+        // ... and zero the rest of the slot
+        STEEL_PRAGMA_UNROLL
+        for (short j = n_channels; j < vec_size; j++) {
+          dst[i * dst_ld + j] = T(0);
+        }
+      }
+    } else {
+      // Slow path: test each row against params->O and zero the ones past it
+      for (short i = 0; i < BROWS; i += TROWS) {
+        if (((read_n + i) < params->O)) {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < n_channels; j++) {
+            dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+          }
+          STEEL_PRAGMA_UNROLL
+          for (short j = n_channels; j < vec_size; j++) {
+            dst[i * dst_ld + j] = T(0);
+          }
+        } else {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; j++) {
+            dst[i * dst_ld + j] = T(0);
+          }
+        }
+      }
+    }
+  }
+  /* Iteration helper: move to the kernel position one tile width (TCOLS
+   * positions) further on */
+  METAL_FUNC void next() {
+    weight_hw += TCOLS;
+  }
+};
+} // namespace steel
+} // namespace mlx

mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h ADDED Viewed

@@ -0,0 +1,381 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/backend/metal/kernels/steel/defines.h"
+///////////////////////////////////////////////////////////////////////////////
+// Loading helper
+///////////////////////////////////////////////////////////////////////////////
+namespace mlx {
+namespace steel {
+// Loads a BM x BK tile of convolution input into threadgroup memory for the
+// general 2D convolution path, which additionally handles input dilation
+// (params->idil) and the output / weight jumps in Conv2DGeneralJumpParams.
+//
+// Each thread owns vec_size consecutive columns starting at bj and fills them
+// for n_rows rows spaced TROWS apart. The kernel tap being loaded is tracked
+// as (weight_h, weight_w) and advanced by next().
+//
+// Template parameters:
+//   T           element type of both source and destination
+//   BM, BK      rows / columns of the destination tile
+//   BN          not referenced by this loader
+//   tgp_size    divided by TCOLS to obtain TROWS; presumably the number of
+//               threads in the threadgroup - TODO confirm against the kernel
+//   tgp_padding extra elements added to the destination row stride
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short tgp_padding = 0>
+struct Conv2DInputBlockLoaderGeneral {
+  // Destination dimensions
+  STEEL_CONST short BROWS = BM;
+  STEEL_CONST short BCOLS = BK;
+  // Read dimensions
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  // 8 when the integer quotient tgp_size / (BROWS * BCOLS) is at least 8,
+  // otherwise 4
+  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;
+  // Thread read shape
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Rows / strided reads within the block
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Thread location indices: linear thread index, tile row, tile column
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // threadgroup and device memory
+  // dst already points at this thread's (bi, bj) element of the tile
+  threadgroup T* dst;
+  const constant MLXConvParams<2>* params;
+  const constant Conv2DGeneralJumpParams* jump_params;
+  // Kernel tap the iteration starts from and wraps back to
+  const short base_wh;
+  const short base_ww;
+  // Kernel tap currently being loaded
+  short weight_h;
+  short weight_w;
+  // Per-row source pointer (batch offset + bj) and the batch / input
+  // coordinates for kernel offset (0, 0)
+  const device T* src[n_rows];
+  int read_n[n_rows];
+  int read_ih[n_rows];
+  int read_iw[n_rows];
+  /* Constructor
+   *
+   * src_          base pointer the input strides are applied to
+   * dst_          base pointer of the destination tile
+   * offsets       .y is the first tile row (flattened index over
+   *               n * adj_out_hw), .z / .w are added to the output h / w
+   *               coordinates; .x is not read
+   * params_       convolution parameters
+   * jump_params_  adjusted output extents and jump factors
+   * base_wh_, base_ww_
+   *               starting kernel tap
+   * simd_group_id, simd_lane_id
+   *               combined into a linear thread index with a simdgroup width
+   *               of 32 hard-coded
+   */
+  METAL_FUNC Conv2DInputBlockLoaderGeneral(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int4 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant Conv2DGeneralJumpParams* jump_params_,
+      const short base_wh_,
+      const short base_ww_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        params(params_),
+        jump_params(jump_params_),
+        base_wh(base_wh_),
+        base_ww(base_ww_),
+        weight_h(base_wh_),
+        weight_w(base_ww_) {
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0; i < n_rows; ++i) {
+      // Flattened index of this thread's i-th row
+      int offset_nhw = offsets.y + bi + i * TROWS;
+      // Split it into batch index and position within the adjusted output
+      int n = offset_nhw / jump_params->adj_out_hw;
+      int hw = offset_nhw % jump_params->adj_out_hw;
+      // Scale by the output jump and add the base offset to get output (h, w)
+      int oh =
+          (hw / jump_params->adj_out_w) * jump_params->f_out_jump_h + offsets.z;
+      int ow =
+          (hw % jump_params->adj_out_w) * jump_params->f_out_jump_w + offsets.w;
+      // Input coordinate for kernel offset (0, 0); negative inside the padding
+      int ih = oh * params->str[0] - params->pad[0];
+      int iw = ow * params->str[1] - params->pad[1];
+      read_n[i] = n;
+      read_ih[i] = ih;
+      read_iw[i] = iw;
+      // Only the batch and column offsets are baked into the pointer; the
+      // spatial offset is added per load.
+      src[i] = src_ + n * params->in_strides[0] + bj;
+    }
+  }
+  /* Load from device memory into threadgroup memory. Input pixels outside the
+   * image are written as zeros; the column range is not checked (see
+   * load_safe for that). */
+  METAL_FUNC void load_unsafe() const {
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
+      // Find bounds
+      int n = read_n[i];
+      // Mirror the kernel tap when params->flip is set
+      int h_flip = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
+      int w_flip = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
+      // Coordinate in the input-dilated grid
+      int ih_dil = read_ih[i] + h_flip * params->kdil[0];
+      int iw_dil = read_iw[i] + w_flip * params->kdil[1];
+      // Map back to an input index.
+      // NOTE(review): no remainder check is made here; presumably the caller's
+      // base / jump parameters only yield multiples of idil - TODO confirm.
+      int ih = ih_dil / params->idil[0];
+      int iw = iw_dil / params->idil[1];
+      // Computed before the bounds test but only used inside it
+      size_t offset = ih * params->in_strides[1] + iw * params->in_strides[2];
+      // Read from input if in bounds
+      if ((n < params->N) && (ih_dil >= 0 && ih < params->iS[0]) &&
+          (iw_dil >= 0 && iw < params->iS[1])) {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = (src[i])[offset + j];
+        }
+      }
+      // Zero pad otherwise
+      else {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = T(0);
+        }
+      }
+    }
+  }
+  /* Same as load_unsafe(), but columns at or beyond remaining_k (measured
+   * from the start of the tile) are written as zeros instead of being read. */
+  METAL_FUNC void load_safe(const short remaining_k) const {
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
+      // Find bounds
+      int n = read_n[i];
+      // Mirror the kernel tap when params->flip is set
+      int h_flip = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
+      int w_flip = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
+      // Coordinate in the input-dilated grid, then mapped back to the input
+      int ih_dil = read_ih[i] + h_flip * params->kdil[0];
+      int iw_dil = read_iw[i] + w_flip * params->kdil[1];
+      int ih = ih_dil / params->idil[0];
+      int iw = iw_dil / params->idil[1];
+      size_t offset = ih * params->in_strides[1] + iw * params->in_strides[2];
+      // Read from input if in bounds
+      if ((n < params->N) && (ih_dil >= 0 && ih < params->iS[0]) &&
+          (iw_dil >= 0 && iw < params->iS[1])) {
+        // Whole slot lies inside the remaining columns: copy it all
+        if (bj + vec_size <= remaining_k) {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; ++j) {
+            dst[is * dst_ld + j] = (src[i])[offset + j];
+          }
+        } else {
+          // Slot straddles the end: copy element by element, zero the tail
+          for (short j = 0; j < vec_size; ++j) {
+            if (bj + j < remaining_k) {
+              dst[is * dst_ld + j] = (src[i])[offset + j];
+            } else {
+              dst[is * dst_ld + j] = T(0);
+            }
+          }
+        }
+      }
+      // Zero pad otherwise
+      else {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = T(0);
+        }
+      }
+    }
+  }
+  /* Iteration helper: step the kernel tap along w; when w runs past wS[1],
+   * reset it and step along h; when h runs past wS[0], reset it and advance
+   * every source pointer by BK elements. */
+  METAL_FUNC void next() {
+    weight_w += jump_params->f_wgt_jump_w;
+    if (weight_w < params->wS[1]) {
+      return;
+    }
+    weight_w = base_ww;
+    weight_h += jump_params->f_wgt_jump_h;
+    if (weight_h < params->wS[0]) {
+      return;
+    }
+    weight_h = base_wh;
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0; i < n_rows; i++) {
+      src[i] += BK;
+    }
+  }
+};
+// Loads a BN x BK tile of convolution weights into threadgroup memory for the
+// general 2D convolution path.
+//
+// Each thread owns vec_size consecutive columns starting at bj and fills them
+// for rows 0, TROWS, 2 * TROWS, ... below BN. The kernel tap being loaded is
+// tracked as (weight_h, weight_w) and advanced by next().
+//
+// Template parameters:
+//   T           element type of both source and destination
+//   BN, BK      rows / columns of the destination tile
+//   BM          not referenced by this loader
+//   tgp_size    divided by TCOLS to obtain TROWS; presumably the number of
+//               threads in the threadgroup - TODO confirm against the kernel
+//   tgp_padding extra elements added to the destination row stride
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short tgp_padding = 0>
+struct Conv2DWeightBlockLoaderGeneral {
+  // Destination dimensions
+  STEEL_CONST short BROWS = BN;
+  STEEL_CONST short BCOLS = BK;
+  // Read dimensions
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  // 1 when BN == 8; otherwise 8 when the integer quotient
+  // tgp_size / (BROWS * BCOLS) is at least 8, else 4
+  STEEL_CONST short vec_size =
+      (BN == 8) ? 1 : (tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4);
+  // Thread read shape
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Rows / strided reads within the block
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Leading dimension for src (taken from params->wt_strides[0])
+  const int src_ld;
+  // Thread location indices: linear thread index, tile row, tile column
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // threadgroup and device memory
+  // Both pointers are already offset to this thread's (bi, bj) element
+  threadgroup T* dst;
+  const device T* src;
+  const constant MLXConvParams<2>* params;
+  const constant Conv2DGeneralJumpParams* jump_params;
+  // Kernel tap the iteration starts from and wraps back to
+  const short base_wh;
+  const short base_ww;
+  // Kernel tap currently being loaded
+  short weight_h;
+  short weight_w;
+  // Index of this thread's first weight row (compared against params->O)
+  const int start_row;
+  /* Constructor
+   *
+   * src_          base pointer of the weights
+   * dst_          base pointer of the destination tile
+   * offsets       only offsets.y is read: the first weight row of the tile
+   * params_       convolution parameters
+   * jump_params_  weight jump factors used by next()
+   * base_wh_, base_ww_
+   *               starting kernel tap
+   * simd_group_id, simd_lane_id
+   *               combined into a linear thread index with a simdgroup width
+   *               of 32 hard-coded
+   */
+  METAL_FUNC Conv2DWeightBlockLoaderGeneral(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant Conv2DGeneralJumpParams* jump_params_,
+      const short base_wh_,
+      const short base_ww_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : src_ld(params_->wt_strides[0]),
+        thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        src(src_ + bi * src_ld + bj),
+        params(params_),
+        jump_params(jump_params_),
+        base_wh(base_wh_),
+        base_ww(base_ww_),
+        weight_h(base_wh_),
+        weight_w(base_ww_),
+        start_row(offsets.y + bi) {}
+  /* Load from device memory into threadgroup memory. Rows at or past
+   * params->O are written as zeros; the column range is not checked (see
+   * load_safe for that). */
+  METAL_FUNC void load_unsafe() const {
+    // Offset the source to the current kernel tap
+    const device T* curr_src = src + weight_h * params->wt_strides[1] +
+        weight_w * params->wt_strides[2];
+    // Fast path: start_row + BN fits within params->O, so no per-row test
+    if ((start_row + BN <= params->O)) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < BN; i += TROWS) {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; j++) {
+          dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+        }
+      }
+    } else {
+      // Slow path: test each row against params->O and zero the ones past it
+      for (short i = 0; i < BN; i += TROWS) {
+        if ((start_row + i) < params->O) {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; j++) {
+            dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+          }
+        } else {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; j++) {
+            dst[i * dst_ld + j] = T(0);
+          }
+        }
+      }
+    }
+  }
+  /* Same as load_unsafe(), but columns at or beyond remaining_k (measured
+   * from the start of the tile) are written as zeros instead of being read. */
+  METAL_FUNC void load_safe(const short remaining_k) const {
+    // Offset the source to the current kernel tap
+    const device T* curr_src = src + weight_h * params->wt_strides[1] +
+        weight_w * params->wt_strides[2];
+    // Fast path over rows: start_row + BN fits within params->O
+    if ((start_row + BN <= params->O)) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < BN; i += TROWS) {
+        // Whole slot lies inside the remaining columns: copy it all
+        if (bj + vec_size <= remaining_k) {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; j++) {
+            dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+          }
+        } else {
+          // Slot straddles the end: copy element by element, zero the tail
+          for (short j = 0; j < vec_size; j++) {
+            if (bj + j < remaining_k) {
+              dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+            } else {
+              dst[i * dst_ld + j] = T(0);
+            }
+          }
+        }
+      }
+    } else {
+      // Slow path over rows: additionally test each row against params->O
+      for (short i = 0; i < BN; i += TROWS) {
+        if ((start_row + i) < params->O) {
+          if (bj + vec_size <= remaining_k) {
+            STEEL_PRAGMA_UNROLL
+            for (short j = 0; j < vec_size; j++) {
+              dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+            }
+          } else {
+            for (short j = 0; j < vec_size; j++) {
+              if (bj + j < remaining_k) {
+                dst[i * dst_ld + j] = curr_src[i * src_ld + j];
+              } else {
+                dst[i * dst_ld + j] = T(0);
+              }
+            }
+          }
+        } else {
+          STEEL_PRAGMA_UNROLL
+          for (short j = 0; j < vec_size; j++) {
+            dst[i * dst_ld + j] = T(0);
+          }
+        }
+      }
+    }
+  }
+  /* Iteration helper: step the kernel tap along w; when w runs past wS[1],
+   * reset it and step along h; when h runs past wS[0], reset it and advance
+   * the source pointer by BK elements. */
+  METAL_FUNC void next() {
+    weight_w += jump_params->f_wgt_jump_w;
+    if (weight_w < params->wS[1]) {
+      return;
+    }
+    weight_w = base_ww;
+    weight_h += jump_params->f_wgt_jump_h;
+    if (weight_h < params->wS[0]) {
+      return;
+    }
+    weight_h = base_wh;
+    src += BK;
+  }
+};
+} // namespace steel
+} // namespace mlx