PyPI - mlx-cpu - Versions diffs - 0.30.1__py3-none-manylinux_2_35_x86_64.whl - Mend

mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

mlx/__main__.py +27 -0
mlx/_reprlib_fix.py +16 -0
mlx/extension.py +88 -0
mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
mlx/include/mlx/allocator.h +73 -0
mlx/include/mlx/array.h +645 -0
mlx/include/mlx/backend/common/binary.h +97 -0
mlx/include/mlx/backend/common/broadcasting.h +11 -0
mlx/include/mlx/backend/common/buffer_cache.h +157 -0
mlx/include/mlx/backend/common/compiled.h +77 -0
mlx/include/mlx/backend/common/copy.h +50 -0
mlx/include/mlx/backend/common/hadamard.h +109 -0
mlx/include/mlx/backend/common/matmul.h +67 -0
mlx/include/mlx/backend/common/reduce.h +59 -0
mlx/include/mlx/backend/common/slicing.h +20 -0
mlx/include/mlx/backend/common/ternary.h +85 -0
mlx/include/mlx/backend/common/unary.h +29 -0
mlx/include/mlx/backend/common/utils.h +205 -0
mlx/include/mlx/backend/cpu/arange.h +28 -0
mlx/include/mlx/backend/cpu/available.h +9 -0
mlx/include/mlx/backend/cpu/binary.h +517 -0
mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
mlx/include/mlx/backend/cpu/binary_two.h +166 -0
mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
mlx/include/mlx/backend/cpu/copy.h +36 -0
mlx/include/mlx/backend/cpu/encoder.h +67 -0
mlx/include/mlx/backend/cpu/eval.h +12 -0
mlx/include/mlx/backend/cpu/gemm.h +26 -0
mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
mlx/include/mlx/backend/cpu/lapack.h +80 -0
mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
mlx/include/mlx/backend/cpu/simd/math.h +193 -0
mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
mlx/include/mlx/backend/cpu/simd/type.h +11 -0
mlx/include/mlx/backend/cpu/slicing.h +21 -0
mlx/include/mlx/backend/cpu/ternary.h +154 -0
mlx/include/mlx/backend/cpu/threefry.h +21 -0
mlx/include/mlx/backend/cpu/unary.h +281 -0
mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
mlx/include/mlx/backend/cuda/allocator.h +89 -0
mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
mlx/include/mlx/backend/cuda/cuda.h +10 -0
mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
mlx/include/mlx/backend/cuda/device/config.h +12 -0
mlx/include/mlx/backend/cuda/device.h +189 -0
mlx/include/mlx/backend/cuda/event.h +78 -0
mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
mlx/include/mlx/backend/cuda/jit_module.h +119 -0
mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
mlx/include/mlx/backend/cuda/utils.h +46 -0
mlx/include/mlx/backend/cuda/worker.h +55 -0
mlx/include/mlx/backend/gpu/available.h +9 -0
mlx/include/mlx/backend/gpu/copy.h +57 -0
mlx/include/mlx/backend/gpu/eval.h +18 -0
mlx/include/mlx/backend/gpu/slicing.h +36 -0
mlx/include/mlx/backend/metal/allocator.h +79 -0
mlx/include/mlx/backend/metal/binary.h +33 -0
mlx/include/mlx/backend/metal/device.h +283 -0
mlx/include/mlx/backend/metal/jit/includes.h +57 -0
mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
mlx/include/mlx/backend/metal/matmul.h +144 -0
mlx/include/mlx/backend/metal/metal.h +22 -0
mlx/include/mlx/backend/metal/reduce.h +41 -0
mlx/include/mlx/backend/metal/resident.h +32 -0
mlx/include/mlx/backend/metal/scan.h +17 -0
mlx/include/mlx/backend/metal/ternary.h +21 -0
mlx/include/mlx/backend/metal/unary.h +21 -0
mlx/include/mlx/backend/metal/utils.h +84 -0
mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
mlx/include/mlx/compile.h +44 -0
mlx/include/mlx/compile_impl.h +69 -0
mlx/include/mlx/device.h +31 -0
mlx/include/mlx/distributed/distributed.h +60 -0
mlx/include/mlx/distributed/distributed_impl.h +59 -0
mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
mlx/include/mlx/distributed/mpi/mpi.h +12 -0
mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
mlx/include/mlx/distributed/nccl/nccl.h +12 -0
mlx/include/mlx/distributed/ops.h +56 -0
mlx/include/mlx/distributed/primitives.h +156 -0
mlx/include/mlx/distributed/reduction_ops.h +38 -0
mlx/include/mlx/distributed/ring/ring.h +12 -0
mlx/include/mlx/distributed/utils.h +67 -0
mlx/include/mlx/dtype.h +115 -0
mlx/include/mlx/dtype_utils.h +119 -0
mlx/include/mlx/einsum.h +22 -0
mlx/include/mlx/event.h +58 -0
mlx/include/mlx/export.h +136 -0
mlx/include/mlx/export_impl.h +98 -0
mlx/include/mlx/fast.h +102 -0
mlx/include/mlx/fast_primitives.h +427 -0
mlx/include/mlx/fence.h +39 -0
mlx/include/mlx/fft.h +167 -0
mlx/include/mlx/graph_utils.h +66 -0
mlx/include/mlx/io/gguf.h +20 -0
mlx/include/mlx/io/load.h +175 -0
mlx/include/mlx/io.h +61 -0
mlx/include/mlx/linalg.h +111 -0
mlx/include/mlx/memory.h +78 -0
mlx/include/mlx/mlx.h +25 -0
mlx/include/mlx/ops.h +1627 -0
mlx/include/mlx/primitives.h +2524 -0
mlx/include/mlx/random.h +282 -0
mlx/include/mlx/scheduler.h +188 -0
mlx/include/mlx/small_vector.h +540 -0
mlx/include/mlx/stream.h +41 -0
mlx/include/mlx/threadpool.h +133 -0
mlx/include/mlx/transforms.h +229 -0
mlx/include/mlx/transforms_impl.h +86 -0
mlx/include/mlx/types/bf16.h +187 -0
mlx/include/mlx/types/complex.h +113 -0
mlx/include/mlx/types/fp16.h +234 -0
mlx/include/mlx/types/half_types.h +58 -0
mlx/include/mlx/types/limits.h +70 -0
mlx/include/mlx/utils.h +175 -0
mlx/include/mlx/version.h +20 -0
mlx/lib/libmlx.so +0 -0
mlx/py.typed +1 -0
mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
mlx/share/cmake/MLX/extension.cmake +50 -0
mlx/utils.py +325 -0
mlx_cpu-0.30.1.dist-info/METADATA +142 -0
mlx_cpu-0.30.1.dist-info/RECORD +231 -0
mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h ADDED Viewed

@@ -0,0 +1,451 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/backend/metal/kernels/steel/utils.h"
+#include "mlx/backend/metal/kernels/steel/conv/params.h"
+///////////////////////////////////////////////////////////////////////////////
+// Loading helper
+///////////////////////////////////////////////////////////////////////////////
+namespace mlx {
+namespace steel {
+/**
+ * Loads a BROWS x BCOLS (= BM x BK) tile of 2D-convolution input from device
+ * memory into threadgroup memory, one filter tap per load.
+ *
+ * Every thread owns n_rows rows of the tile and copies vec_size consecutive
+ * elements per row. The batch index and the input coordinates of filter tap
+ * (0, 0) are stored per row, and the image-bounds test is re-evaluated on
+ * every load_unsafe() call.
+ *
+ * Template parameters:
+ *   T           - element type
+ *   BM, BK      - tile rows / columns (BN is not used by this loader)
+ *   tgp_size    - thread count used to derive the per-thread read shape
+ *   tgp_padding - extra elements appended to each destination row (dst_ld)
+ */
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short tgp_padding = 0>
+struct Conv2DInputBlockLoaderLargeFilter {
+  // Destination dimensions
+  STEEL_CONST short BROWS = BM;
+  STEEL_CONST short BCOLS = BK;
+  // Read dimensions: dst_ld is the destination row pitch in elements
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;
+  // Thread read shape: TCOLS threads span one row, TROWS rows per pass
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Rows / strided reads within the block
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Thread location indices: (bi, bj) is this thread's first (row, column)
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // threadgroup and device memory
+  threadgroup T* dst;
+  const constant MLXConvParams<2>* params;
+  const constant ImplicitGemmConv2DParams* gemm_params;
+  // Current position of the walk over the filter window (see next())
+  short weight_h;
+  short weight_w;
+  // Per-row source pointer for the current step of the walk
+  const device T* src[n_rows];
+  // Per-row batch index and input coordinates of filter tap (0, 0)
+  int read_n[n_rows];
+  int read_ih[n_rows];
+  int read_iw[n_rows];
+  /**
+   * Constructor.
+   *
+   * src_        - base pointer of the input tensor
+   * dst_        - base pointer of the threadgroup tile
+   * offsets     - tile offsets; only offsets.y (first row of the tile in the
+   *               flattened batch * out_h * out_w index) is read here
+   * params_     - convolution parameters
+   * gemm_params_ - implicit-GEMM parameters (source pointer jumps)
+   * simd_group_id, simd_lane_id - combined into the linear thread index
+   */
+  METAL_FUNC Conv2DInputBlockLoaderLargeFilter(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant ImplicitGemmConv2DParams* gemm_params_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        params(params_),
+        gemm_params(gemm_params_),
+        weight_h(0),
+        weight_w(0) {
+    int out_n_pixels = params->oS[0] * params->oS[1];
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0; i < n_rows; ++i) {
+      // Decompose the flat row index into (batch, out_h, out_w)
+      int offset_nhw = offsets.y + bi + i * TROWS;
+      int n = offset_nhw / out_n_pixels;
+      int hw = offset_nhw % out_n_pixels;
+      int oh = hw / params->oS[1];
+      int ow = hw % params->oS[1];
+      // Input coordinates of filter tap (0, 0); negative inside the padding
+      int ih = oh * params->str[0] - params->pad[0];
+      int iw = ow * params->str[1] - params->pad[1];
+      read_n[i] = n;
+      read_ih[i] = ih;
+      read_iw[i] = iw;
+      // Adjust for flip: the walk then starts at the last filter tap
+      if (params->flip) {
+        ih += (params->wS[0] - 1) * params->kdil[0];
+        iw += (params->wS[1] - 1) * params->kdil[1];
+      }
+      // Source pointer for the first step. It may lie outside the image;
+      // load_unsafe() decides whether it is dereferenced.
+      src[i] = src_ + n * params->in_strides[0] + ih * params->in_strides[1] +
+          iw * params->in_strides[2] + bj;
+    }
+  }
+  /**
+   * Copy the current filter tap's tile from device to threadgroup memory.
+   *
+   * "unsafe" refers to the tile itself (all n_rows x vec_size elements are
+   * written unconditionally); rows whose batch index or input coordinates
+   * fall outside the image are zero-filled instead of read.
+   */
+  METAL_FUNC void load_unsafe() const {
+    // With a flipped kernel the constructor starts src at the last filter
+    // tap, so step (weight_h, weight_w) of the walk reads tap
+    // (wS[0] - 1 - weight_h, wS[1] - 1 - weight_w). The bounds test must use
+    // that mirrored tap, matching the mask construction in
+    // Conv2DInputBlockLoaderSmallFilter.
+    // NOTE(review): presumes inp_jump_w / inp_jump_h step backwards when
+    // flip is set - confirm against the host-side gemm_params setup.
+    const int tap_h = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
+    const int tap_w = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
+      // Input coordinates actually addressed by src[i] at this step
+      int n = read_n[i];
+      int ih = read_ih[i] + tap_h * params->kdil[0];
+      int iw = read_iw[i] + tap_w * params->kdil[1];
+      // Read from input if in bounds
+      if ((n < params->N) && (ih >= 0 && ih < params->iS[0]) &&
+          (iw >= 0 && iw < params->iS[1])) {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = src[i][j];
+        }
+      }
+      // Zero pad otherwise
+      else {
+        STEEL_PRAGMA_UNROLL
+        for (short j = 0; j < vec_size; ++j) {
+          dst[is * dst_ld + j] = T(0);
+        }
+      }
+    }
+  }
+  /**
+   * Advance to the next step of the walk: filter width first, then filter
+   * height, then (after the whole window) the next channel block. Each case
+   * moves every row pointer by the matching precomputed jump.
+   */
+  METAL_FUNC void next() {
+    if (++weight_w < params->wS[1]) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < n_rows; i++) {
+        src[i] += gemm_params->inp_jump_w;
+      }
+      return;
+    }
+    weight_w = 0;
+    if (++weight_h < params->wS[0]) {
+      STEEL_PRAGMA_UNROLL
+      for (short i = 0; i < n_rows; i++) {
+        src[i] += gemm_params->inp_jump_h;
+      }
+      return;
+    }
+    weight_h = 0;
+    STEEL_PRAGMA_UNROLL
+    for (short i = 0; i < n_rows; i++) {
+      src[i] += gemm_params->inp_jump_c;
+    }
+  }
+};
+/**
+ * Loads a BROWS x BCOLS (= BM x BK) tile of 2D-convolution input from device
+ * memory into threadgroup memory, one filter tap per load.
+ *
+ * The image-bounds test for every filter row and column is evaluated once in
+ * the constructor and packed into per-row bitmasks (bit k <-> step k of the
+ * walk), so load_unsafe() only tests two bits per row. mask_t is a short, so
+ * this only works while each filter dimension fits in its bits.
+ *
+ * Template parameters:
+ *   T           - element type
+ *   BM, BK      - tile rows / columns (BN is not used by this loader)
+ *   tgp_size    - thread count used to derive the per-thread read shape
+ *   tgp_padding - extra elements appended to each destination row (dst_ld)
+ */
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short tgp_padding = 0>
+struct Conv2DInputBlockLoaderSmallFilter {
+  // Tile shape written to threadgroup memory
+  STEEL_CONST short BROWS = BM;
+  STEEL_CONST short BCOLS = BK;
+  // Destination row pitch (elements) and elements copied per thread per row
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  STEEL_CONST short vec_size = tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4;
+  // TCOLS threads cover one row; TROWS rows are covered per pass
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Number of rows each thread is responsible for
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // One bit per step of the walk along a filter dimension
+  using mask_t = short;
+  // Linear thread index and this thread's first (row, column) in the tile
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // Destination (threadgroup) pointer and parameter blocks
+  threadgroup T* dst;
+  const constant MLXConvParams<2>* params;
+  const constant ImplicitGemmConv2DParams* gemm_params;
+  // Current step of the walk over the filter window (see next())
+  short weight_h;
+  short weight_w;
+  // Per-row source pointer for the current step
+  const device T* src[n_rows];
+  // Per-row validity bitmasks along filter height / width
+  mask_t mask_h[n_rows];
+  mask_t mask_w[n_rows];
+  /**
+   * Constructor: sets up the per-row source pointers and validity masks.
+   *
+   * Only offsets.y (first row of the tile in the flattened
+   * batch * out_h * out_w index) is read from offsets.
+   */
+  METAL_FUNC Conv2DInputBlockLoaderSmallFilter(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant ImplicitGemmConv2DParams* gemm_params_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        params(params_),
+        gemm_params(gemm_params_),
+        weight_h(0),
+        weight_w(0) {
+    const int pixels_per_image = params->oS[0] * params->oS[1];
+    // Per-row batch index and input coordinates of filter tap (0, 0)
+    int row_n[n_rows];
+    int row_ih[n_rows];
+    int row_iw[n_rows];
+    STEEL_PRAGMA_UNROLL
+    for (short r = 0; r < n_rows; ++r) {
+      // Split the flat row index into (batch, out_h, out_w)
+      const int flat = offsets.y + bi + r * TROWS;
+      const int batch = flat / pixels_per_image;
+      const int pixel = flat % pixels_per_image;
+      const int out_h = pixel / params->oS[1];
+      const int out_w = pixel % params->oS[1];
+      row_n[r] = batch;
+      row_ih[r] = out_h * params->str[0] - params->pad[0];
+      row_iw[r] = out_w * params->str[1] - params->pad[1];
+      // A flipped kernel starts its walk at the last filter tap
+      int first_ih = row_ih[r];
+      int first_iw = row_iw[r];
+      if (params->flip) {
+        first_ih += (params->wS[0] - 1) * params->kdil[0];
+        first_iw += (params->wS[1] - 1) * params->kdil[1];
+      }
+      // May point outside the image; the masks guard the dereference
+      src[r] = src_ + batch * params->in_strides[0] +
+          first_ih * params->in_strides[1] + first_iw * params->in_strides[2] +
+          bj;
+      // Masks start empty and are filled in below
+      mask_h[r] = 0;
+      mask_w[r] = 0;
+    }
+    // Bit kh of mask_h: is the input row touched at height step kh valid?
+    for (short kh = 0; kh < params->wS[0]; kh++) {
+      short tap_h = kh;
+      if (params->flip) {
+        tap_h = params->wS[0] - kh - 1;
+      }
+      STEEL_PRAGMA_UNROLL
+      for (short r = 0; r < n_rows; ++r) {
+        const int ih = row_ih[r] + tap_h * params->kdil[0];
+        const bool valid =
+            row_n[r] < params->N && ih >= 0 && ih < params->iS[0];
+        mask_h[r] |= valid ? (1 << kh) : 0;
+      }
+    }
+    // Bit kw of mask_w: is the input column touched at width step kw valid?
+    for (short kw = 0; kw < params->wS[1]; kw++) {
+      short tap_w = kw;
+      if (params->flip) {
+        tap_w = params->wS[1] - kw - 1;
+      }
+      STEEL_PRAGMA_UNROLL
+      for (short r = 0; r < n_rows; ++r) {
+        const int iw = row_iw[r] + tap_w * params->kdil[1];
+        const bool valid = iw >= 0 && iw < params->iS[1];
+        mask_w[r] |= valid ? (1 << kw) : 0;
+      }
+    }
+  }
+  /**
+   * Copy the current filter tap's tile from device to threadgroup memory.
+   * All n_rows x vec_size elements are written; rows whose mask bits mark the
+   * current step as out of bounds are zero-filled instead of read.
+   */
+  METAL_FUNC void load_unsafe() const {
+    const mask_t h_bit = mask_t(1) << weight_h;
+    const mask_t w_bit = mask_t(1) << weight_w;
+    STEEL_PRAGMA_UNROLL
+    for (short r = 0; r < n_rows; ++r) {
+      threadgroup T* out = dst + r * TROWS * dst_ld;
+      const bool in_image = (mask_h[r] & h_bit) && (mask_w[r] & w_bit);
+      if (!in_image) {
+        // Padding region: write zeros
+        STEEL_PRAGMA_UNROLL
+        for (short c = 0; c < vec_size; ++c) {
+          out[c] = T(0);
+        }
+        continue;
+      }
+      // Valid input pixel: copy vec_size elements
+      const device T* in = src[r];
+      STEEL_PRAGMA_UNROLL
+      for (short c = 0; c < vec_size; ++c) {
+        out[c] = in[c];
+      }
+    }
+  }
+  /**
+   * Advance to the next step of the walk: filter width first, then filter
+   * height, then (after the whole window) the next channel block. Each case
+   * moves every row pointer by the matching precomputed jump.
+   */
+  METAL_FUNC void next() {
+    if (++weight_w < params->wS[1]) {
+      // Next tap along the filter width
+      STEEL_PRAGMA_UNROLL
+      for (short r = 0; r < n_rows; ++r) {
+        src[r] += gemm_params->inp_jump_w;
+      }
+    } else {
+      weight_w = 0;
+      if (++weight_h < params->wS[0]) {
+        // Width exhausted: move to the next filter row
+        STEEL_PRAGMA_UNROLL
+        for (short r = 0; r < n_rows; ++r) {
+          src[r] += gemm_params->inp_jump_h;
+        }
+      } else {
+        // Whole window done: restart it on the next channel block
+        weight_h = 0;
+        STEEL_PRAGMA_UNROLL
+        for (short r = 0; r < n_rows; ++r) {
+          src[r] += gemm_params->inp_jump_c;
+        }
+      }
+    }
+  }
+};
+/**
+ * Loads a BROWS x BCOLS (= BN x BK) tile of convolution weights from device
+ * memory into threadgroup memory, one filter tap per load.
+ *
+ * Each thread copies vec_size consecutive elements for every TROWS-th row of
+ * the tile, starting at its own row bi. When BN == 8, rows at or beyond
+ * params->O are zero-filled unless the whole tile is known to be in range
+ * (do_read); for any other BN no row check is performed.
+ *
+ * Template parameters:
+ *   T           - element type
+ *   BN, BK      - tile rows / columns (BM is not used by this loader)
+ *   tgp_size    - thread count used to derive the per-thread read shape
+ *   tgp_padding - extra elements appended to each destination row (dst_ld)
+ */
+template <
+    typename T,
+    short BM,
+    short BN,
+    short BK,
+    short tgp_size,
+    short tgp_padding = 0>
+struct Conv2DWeightBlockLoader {
+  // Tile shape written to threadgroup memory
+  STEEL_CONST short BROWS = BN;
+  STEEL_CONST short BCOLS = BK;
+  // Destination row pitch (elements) and elements copied per thread per row;
+  // the BN == 8 specialization copies one element at a time
+  STEEL_CONST short dst_ld = BCOLS + tgp_padding;
+  STEEL_CONST short vec_size =
+      (BN == 8) ? 1 : (tgp_size / (BROWS * BCOLS) >= 8 ? 8 : 4);
+  // TCOLS threads cover one row; TROWS rows are covered per pass
+  STEEL_CONST short TCOLS = BCOLS / vec_size;
+  STEEL_CONST short TROWS = tgp_size / TCOLS;
+  // Number of rows each thread is responsible for
+  STEEL_CONST short n_rows = BROWS / TROWS;
+  // Source row pitch, taken from params_->wt_strides[0]
+  const int src_ld;
+  // Linear thread index and this thread's first (row, column) in the tile
+  const short thread_idx;
+  const short bi;
+  const short bj;
+  // Destination (threadgroup) and source (device) pointers, parameter block
+  threadgroup T* dst;
+  const device T* src;
+  const constant MLXConvParams<2>* params;
+  // Index of the current filter tap in the flattened window (see next())
+  int weight_hw;
+  // Source elements between consecutive taps: params->C / params->groups
+  int weight_step;
+  // Global index of this thread's first weight row (offsets.y + bi)
+  const int read_n;
+  // True when read_n + n_rows * TROWS does not exceed gemm_params_->N, in
+  // which case load_unsafe() skips the per-row check
+  const bool do_read;
+  /**
+   * Constructor: positions dst and src at this thread's first element.
+   * Only offsets.y is read from offsets; gemm_params_ is used only to
+   * compute do_read.
+   */
+  METAL_FUNC Conv2DWeightBlockLoader(
+      const device T* src_,
+      threadgroup T* dst_,
+      const int2 offsets,
+      const constant MLXConvParams<2>* params_,
+      const constant ImplicitGemmConv2DParams* gemm_params_,
+      uint simd_group_id [[simdgroup_index_in_threadgroup]],
+      uint simd_lane_id [[thread_index_in_simdgroup]])
+      : src_ld(params_->wt_strides[0]),
+        thread_idx(simd_group_id * 32 + simd_lane_id),
+        bi(thread_idx / TCOLS),
+        bj(vec_size * (thread_idx % TCOLS)),
+        dst(dst_ + bi * dst_ld + bj),
+        src(src_ + bi * src_ld + bj),
+        params(params_),
+        weight_hw(0),
+        weight_step(params->C / params->groups),
+        read_n(offsets.y + bi),
+        do_read(read_n + n_rows * TROWS <= gemm_params_->N) {}
+  /**
+   * Copy the current filter tap's tile from device to threadgroup memory.
+   * Only the BN == 8 specialization with do_read == false checks rows
+   * against params->O and zero-fills those outside it.
+   */
+  METAL_FUNC void load_unsafe() const {
+    if (BN == 8 && !do_read) {
+      // Checked path: the tile may extend past the last valid weight row
+      for (short row = 0; row < BN; row += TROWS) {
+        threadgroup T* out = dst + row * dst_ld;
+        if ((read_n + row) < params->O) {
+          const device T* in = src + row * src_ld;
+          STEEL_PRAGMA_UNROLL
+          for (short c = 0; c < vec_size; c++) {
+            out[c] = in[c];
+          }
+        } else {
+          STEEL_PRAGMA_UNROLL
+          for (short c = 0; c < vec_size; c++) {
+            out[c] = T(0);
+          }
+        }
+      }
+    } else {
+      // Unchecked path: copy every row this thread owns
+      STEEL_PRAGMA_UNROLL
+      for (short row = 0; row < BN; row += TROWS) {
+        threadgroup T* out = dst + row * dst_ld;
+        const device T* in = src + row * src_ld;
+        STEEL_PRAGMA_UNROLL
+        for (short c = 0; c < vec_size; c++) {
+          out[c] = in[c];
+        }
+      }
+    }
+  }
+  /**
+   * Advance to the next filter tap. After the last tap of the window, rewind
+   * to the first tap and move BK elements forward to the next channel block.
+   */
+  METAL_FUNC void next() {
+    const auto n_taps = params->wS[1] * params->wS[0];
+    if (++weight_hw >= n_taps) {
+      // Undo the (n_taps - 1) tap steps taken so far, then advance by BK
+      weight_hw = 0;
+      src += BK - (n_taps - 1) * weight_step;
+    } else {
+      src += weight_step;
+    }
+  }
+};
+} // namespace steel
+} // namespace mlx