mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. mlx/__main__.py +27 -0
  2. mlx/_reprlib_fix.py +16 -0
  3. mlx/extension.py +88 -0
  4. mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
  5. mlx/include/mlx/allocator.h +73 -0
  6. mlx/include/mlx/array.h +645 -0
  7. mlx/include/mlx/backend/common/binary.h +97 -0
  8. mlx/include/mlx/backend/common/broadcasting.h +11 -0
  9. mlx/include/mlx/backend/common/buffer_cache.h +157 -0
  10. mlx/include/mlx/backend/common/compiled.h +77 -0
  11. mlx/include/mlx/backend/common/copy.h +50 -0
  12. mlx/include/mlx/backend/common/hadamard.h +109 -0
  13. mlx/include/mlx/backend/common/matmul.h +67 -0
  14. mlx/include/mlx/backend/common/reduce.h +59 -0
  15. mlx/include/mlx/backend/common/slicing.h +20 -0
  16. mlx/include/mlx/backend/common/ternary.h +85 -0
  17. mlx/include/mlx/backend/common/unary.h +29 -0
  18. mlx/include/mlx/backend/common/utils.h +205 -0
  19. mlx/include/mlx/backend/cpu/arange.h +28 -0
  20. mlx/include/mlx/backend/cpu/available.h +9 -0
  21. mlx/include/mlx/backend/cpu/binary.h +517 -0
  22. mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
  23. mlx/include/mlx/backend/cpu/binary_two.h +166 -0
  24. mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
  25. mlx/include/mlx/backend/cpu/copy.h +36 -0
  26. mlx/include/mlx/backend/cpu/encoder.h +67 -0
  27. mlx/include/mlx/backend/cpu/eval.h +12 -0
  28. mlx/include/mlx/backend/cpu/gemm.h +26 -0
  29. mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
  30. mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
  31. mlx/include/mlx/backend/cpu/lapack.h +80 -0
  32. mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
  33. mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
  34. mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
  35. mlx/include/mlx/backend/cpu/simd/math.h +193 -0
  36. mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
  37. mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
  38. mlx/include/mlx/backend/cpu/simd/type.h +11 -0
  39. mlx/include/mlx/backend/cpu/slicing.h +21 -0
  40. mlx/include/mlx/backend/cpu/ternary.h +154 -0
  41. mlx/include/mlx/backend/cpu/threefry.h +21 -0
  42. mlx/include/mlx/backend/cpu/unary.h +281 -0
  43. mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
  44. mlx/include/mlx/backend/cuda/allocator.h +89 -0
  45. mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
  46. mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
  47. mlx/include/mlx/backend/cuda/cuda.h +10 -0
  48. mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
  49. mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
  50. mlx/include/mlx/backend/cuda/device/config.h +12 -0
  51. mlx/include/mlx/backend/cuda/device.h +189 -0
  52. mlx/include/mlx/backend/cuda/event.h +78 -0
  53. mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
  54. mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
  55. mlx/include/mlx/backend/cuda/jit_module.h +119 -0
  56. mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
  57. mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
  58. mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
  59. mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
  60. mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
  61. mlx/include/mlx/backend/cuda/utils.h +46 -0
  62. mlx/include/mlx/backend/cuda/worker.h +55 -0
  63. mlx/include/mlx/backend/gpu/available.h +9 -0
  64. mlx/include/mlx/backend/gpu/copy.h +57 -0
  65. mlx/include/mlx/backend/gpu/eval.h +18 -0
  66. mlx/include/mlx/backend/gpu/slicing.h +36 -0
  67. mlx/include/mlx/backend/metal/allocator.h +79 -0
  68. mlx/include/mlx/backend/metal/binary.h +33 -0
  69. mlx/include/mlx/backend/metal/device.h +283 -0
  70. mlx/include/mlx/backend/metal/jit/includes.h +57 -0
  71. mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
  72. mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
  73. mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
  74. mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
  75. mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
  76. mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
  77. mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
  78. mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
  79. mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
  80. mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
  81. mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
  82. mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
  83. mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
  84. mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
  85. mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
  86. mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
  87. mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
  88. mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
  89. mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
  90. mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
  91. mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
  92. mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
  93. mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
  94. mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
  95. mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
  96. mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
  97. mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
  98. mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
  99. mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
  100. mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
  101. mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
  102. mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
  103. mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
  104. mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
  105. mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
  106. mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
  107. mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
  108. mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
  109. mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
  110. mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
  111. mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
  112. mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
  113. mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
  114. mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
  115. mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
  116. mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
  117. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
  118. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
  119. mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
  120. mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
  121. mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
  122. mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
  123. mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
  124. mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
  125. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
  126. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
  127. mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
  128. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
  129. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
  130. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
  131. mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
  132. mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
  133. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
  134. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
  135. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
  136. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
  137. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
  138. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
  139. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
  140. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
  141. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
  142. mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
  143. mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
  144. mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
  145. mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
  146. mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
  147. mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
  148. mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
  149. mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
  150. mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
  151. mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
  152. mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
  153. mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
  154. mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
  155. mlx/include/mlx/backend/metal/matmul.h +144 -0
  156. mlx/include/mlx/backend/metal/metal.h +22 -0
  157. mlx/include/mlx/backend/metal/reduce.h +41 -0
  158. mlx/include/mlx/backend/metal/resident.h +32 -0
  159. mlx/include/mlx/backend/metal/scan.h +17 -0
  160. mlx/include/mlx/backend/metal/ternary.h +21 -0
  161. mlx/include/mlx/backend/metal/unary.h +21 -0
  162. mlx/include/mlx/backend/metal/utils.h +84 -0
  163. mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
  164. mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
  165. mlx/include/mlx/compile.h +44 -0
  166. mlx/include/mlx/compile_impl.h +69 -0
  167. mlx/include/mlx/device.h +31 -0
  168. mlx/include/mlx/distributed/distributed.h +60 -0
  169. mlx/include/mlx/distributed/distributed_impl.h +59 -0
  170. mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
  171. mlx/include/mlx/distributed/mpi/mpi.h +12 -0
  172. mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
  173. mlx/include/mlx/distributed/nccl/nccl.h +12 -0
  174. mlx/include/mlx/distributed/ops.h +56 -0
  175. mlx/include/mlx/distributed/primitives.h +156 -0
  176. mlx/include/mlx/distributed/reduction_ops.h +38 -0
  177. mlx/include/mlx/distributed/ring/ring.h +12 -0
  178. mlx/include/mlx/distributed/utils.h +67 -0
  179. mlx/include/mlx/dtype.h +115 -0
  180. mlx/include/mlx/dtype_utils.h +119 -0
  181. mlx/include/mlx/einsum.h +22 -0
  182. mlx/include/mlx/event.h +58 -0
  183. mlx/include/mlx/export.h +136 -0
  184. mlx/include/mlx/export_impl.h +98 -0
  185. mlx/include/mlx/fast.h +102 -0
  186. mlx/include/mlx/fast_primitives.h +427 -0
  187. mlx/include/mlx/fence.h +39 -0
  188. mlx/include/mlx/fft.h +167 -0
  189. mlx/include/mlx/graph_utils.h +66 -0
  190. mlx/include/mlx/io/gguf.h +20 -0
  191. mlx/include/mlx/io/load.h +175 -0
  192. mlx/include/mlx/io.h +61 -0
  193. mlx/include/mlx/linalg.h +111 -0
  194. mlx/include/mlx/memory.h +78 -0
  195. mlx/include/mlx/mlx.h +25 -0
  196. mlx/include/mlx/ops.h +1627 -0
  197. mlx/include/mlx/primitives.h +2524 -0
  198. mlx/include/mlx/random.h +282 -0
  199. mlx/include/mlx/scheduler.h +188 -0
  200. mlx/include/mlx/small_vector.h +540 -0
  201. mlx/include/mlx/stream.h +41 -0
  202. mlx/include/mlx/threadpool.h +133 -0
  203. mlx/include/mlx/transforms.h +229 -0
  204. mlx/include/mlx/transforms_impl.h +86 -0
  205. mlx/include/mlx/types/bf16.h +187 -0
  206. mlx/include/mlx/types/complex.h +113 -0
  207. mlx/include/mlx/types/fp16.h +234 -0
  208. mlx/include/mlx/types/half_types.h +58 -0
  209. mlx/include/mlx/types/limits.h +70 -0
  210. mlx/include/mlx/utils.h +175 -0
  211. mlx/include/mlx/version.h +20 -0
  212. mlx/lib/libmlx.so +0 -0
  213. mlx/py.typed +1 -0
  214. mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
  215. mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
  216. mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
  217. mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
  218. mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
  219. mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
  220. mlx/share/cmake/MLX/extension.cmake +50 -0
  221. mlx/utils.py +325 -0
  222. mlx_cpu-0.30.1.dist-info/METADATA +142 -0
  223. mlx_cpu-0.30.1.dist-info/RECORD +231 -0
  224. mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
  225. mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
  226. mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
  227. mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
  228. mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
  229. mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
  230. mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
  231. mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
@@ -0,0 +1,514 @@
1
+ // Copyright © 2023-2024 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include "mlx/backend/metal/kernels/binary_ops.h"
6
+
7
// Injects a `simd_scan` member (inclusive scan across a 32-lane simdgroup
// using this struct's operator()) into a scan-op struct.
//
// - For element types narrower than 8 bytes, defer to the struct's
//   `simd_scan_impl`, which wraps a hardware prefix intrinsic.
// - For 8-byte types (not covered by the Metal prefix intrinsics), build the
//   scan out of log2(32) = 5 shuffle-up-and-combine steps; lanes shifted in
//   from below are filled with the operator's identity `init`, so low lanes
//   combine with a no-op.
//
// Requires the enclosing struct to define `init`, `operator()`, and
// `simd_scan_impl`.
#define DEFINE_SIMD_SCAN()                                               \
  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
  T simd_scan(T val) {                                                   \
    return simd_scan_impl(val);                                          \
  }                                                                      \
                                                                         \
  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
  T simd_scan(T val) {                                                   \
    for (int i = 1; i <= 16; i *= 2) {                                   \
      val = operator()(val, simd_shuffle_and_fill_up(val, init, i));     \
    }                                                                    \
    return val;                                                          \
  }
20
+
21
// Injects a `simd_exclusive_scan` member (exclusive scan across a 32-lane
// simdgroup) into a scan-op struct.
//
// - For element types narrower than 8 bytes, defer to the struct's
//   `simd_exclusive_scan_impl` (hardware exclusive-prefix intrinsic).
// - For 8-byte types, compute the inclusive scan (via the companion
//   DEFINE_SIMD_SCAN above) and shift it up one lane, filling lane 0 with
//   the identity `init` — the standard inclusive-to-exclusive conversion.
//
// Requires the enclosing struct to define `init`, `simd_scan`, and
// `simd_exclusive_scan_impl`.
#define DEFINE_SIMD_EXCLUSIVE_SCAN()                                     \
  template <typename T, metal::enable_if_t<sizeof(T) < 8, bool> = true>  \
  T simd_exclusive_scan(T val) {                                         \
    return simd_exclusive_scan_impl(val);                                \
  }                                                                      \
                                                                         \
  template <typename T, metal::enable_if_t<sizeof(T) == 8, bool> = true> \
  T simd_exclusive_scan(T val) {                                         \
    val = simd_scan(val);                                                \
    return simd_shuffle_and_fill_up(val, init, 1);                       \
  }
32
+
33
// Cumulative-sum scan operator: combines with `+`, identity 0.
// U is the accumulator type; operator() accepts any addable element type T.
template <typename U>
struct CumSum {
  DEFINE_SIMD_SCAN()
  DEFINE_SIMD_EXCLUSIVE_SCAN()

  // Identity element for addition.
  static constexpr constant U init = static_cast<U>(0);

  // Fold one element into the running sum.
  template <typename T>
  U operator()(U a, T b) {
    return a + b;
  }

  // Hardware inclusive prefix sum across the simdgroup (sub-8-byte types).
  U simd_scan_impl(U x) {
    return simd_prefix_inclusive_sum(x);
  }

  // Hardware exclusive prefix sum across the simdgroup (sub-8-byte types).
  U simd_exclusive_scan_impl(U x) {
    return simd_prefix_exclusive_sum(x);
  }
};
53
+
54
// Cumulative-product scan operator: combines with `*`, identity 1.
template <typename U>
struct CumProd {
  DEFINE_SIMD_SCAN()
  DEFINE_SIMD_EXCLUSIVE_SCAN()

  // Identity element for multiplication.
  static constexpr constant U init = static_cast<U>(1.0f);

  // Fold one element into the running product.
  template <typename T>
  U operator()(U a, T b) {
    return a * b;
  }

  // Hardware inclusive prefix product across the simdgroup.
  U simd_scan_impl(U x) {
    return simd_prefix_inclusive_product(x);
  }

  // Hardware exclusive prefix product across the simdgroup.
  U simd_exclusive_scan_impl(U x) {
    return simd_prefix_exclusive_product(x);
  }
};
74
+
75
// Boolean specialization of CumProd: a running product of bools is a
// running logical AND, so the shuffle-based scans are written directly
// instead of using the prefix-product intrinsics.
template <>
struct CumProd<bool> {
  // Identity for AND.
  static constexpr constant bool init = true;

  // Fold one element in; `&` (non-short-circuiting) keeps this branch-free.
  template <typename T>
  bool operator()(bool a, T b) {
    return a & static_cast<bool>(b);
  }

  // Inclusive AND-scan across the simdgroup via 5 shuffle-up steps;
  // shifted-in lanes are filled with `init` (true), a no-op for AND.
  bool simd_scan(bool x) {
    for (int i = 1; i <= 16; i *= 2) {
      bool other = simd_shuffle_and_fill_up(x, init, i);
      x &= other;
    }
    return x;
  }

  // Exclusive scan = inclusive scan shifted up one lane, lane 0 = init.
  bool simd_exclusive_scan(bool x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};
97
+
98
// Cumulative-maximum scan operator. The identity is Limits<U>::min
// (declared elsewhere in the MLX headers; presumably the lowest value of U,
// so combining with it is a no-op).
template <typename U>
struct CumMax {
  static constexpr constant U init = Limits<U>::min;

  // Keep the larger of the two; `>=` keeps `a` on ties.
  template <typename T>
  U operator()(U a, T b) {
    return (a >= b) ? a : b;
  }

  // Inclusive max-scan across the simdgroup via 5 shuffle-up steps.
  U simd_scan(U x) {
    for (int i = 1; i <= 16; i *= 2) {
      U other = simd_shuffle_and_fill_up(x, init, i);
      x = (x >= other) ? x : other;
    }
    return x;
  }

  // Exclusive scan = inclusive scan shifted up one lane, lane 0 = init.
  U simd_exclusive_scan(U x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};
120
+
121
// Cumulative-minimum scan operator; mirror image of CumMax. The identity is
// Limits<U>::max (declared elsewhere in the MLX headers).
template <typename U>
struct CumMin {
  static constexpr constant U init = Limits<U>::max;

  // Keep the smaller of the two; `<=` keeps `a` on ties.
  template <typename T>
  U operator()(U a, T b) {
    return (a <= b) ? a : b;
  }

  // Inclusive min-scan across the simdgroup via 5 shuffle-up steps.
  U simd_scan(U x) {
    for (int i = 1; i <= 16; i *= 2) {
      U other = simd_shuffle_and_fill_up(x, init, i);
      x = (x <= other) ? x : other;
    }
    return x;
  }

  // Exclusive scan = inclusive scan shifted up one lane, lane 0 = init.
  U simd_exclusive_scan(U x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};
143
+
144
// Cumulative log-add-exp scan operator: combines with LogAddExp (defined in
// the included binary_ops.h). Identity is Limits<U>::min — presumably the
// most negative value of U, which acts as log(0) for logaddexp.
template <typename U>
struct CumLogaddexp {
  static constexpr constant U init = Limits<U>::min;

  // Fold one element in: logaddexp(a, b) = log(exp(a) + exp(b)),
  // delegated to the shared LogAddExp binary op.
  template <typename T>
  U operator()(U a, T b) {
    return LogAddExp{}(a, static_cast<U>(b));
  }

  // Inclusive logaddexp-scan across the simdgroup via 5 shuffle-up steps.
  U simd_scan(U x) {
    for (int i = 1; i <= 16; i *= 2) {
      U other = simd_shuffle_and_fill_up(x, init, i);
      x = LogAddExp{}(x, other);
    }
    return x;
  }

  // Exclusive scan = inclusive scan shifted up one lane, lane 0 = init.
  U simd_exclusive_scan(U x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};
166
+
167
+ template <typename T, typename U, int N_READS, bool reverse>
168
+ inline void load_unsafe(U values[N_READS], const device T* input) {
169
+ if (reverse) {
170
+ for (int i = 0; i < N_READS; i++) {
171
+ values[N_READS - i - 1] = input[i];
172
+ }
173
+ } else {
174
+ for (int i = 0; i < N_READS; i++) {
175
+ values[i] = input[i];
176
+ }
177
+ }
178
+ }
179
+
180
+ template <typename T, typename U, int N_READS, bool reverse>
181
+ inline void load_safe(
182
+ U values[N_READS],
183
+ const device T* input,
184
+ int start,
185
+ int total,
186
+ U init) {
187
+ if (reverse) {
188
+ for (int i = 0; i < N_READS; i++) {
189
+ values[N_READS - i - 1] =
190
+ (start + N_READS - i - 1 < total) ? input[i] : init;
191
+ }
192
+ } else {
193
+ for (int i = 0; i < N_READS; i++) {
194
+ values[i] = (start + i < total) ? input[i] : init;
195
+ }
196
+ }
197
+ }
198
+
199
+ template <typename U, int N_READS, bool reverse>
200
+ inline void write_unsafe(U values[N_READS], device U* out) {
201
+ if (reverse) {
202
+ for (int i = 0; i < N_READS; i++) {
203
+ out[i] = values[N_READS - i - 1];
204
+ }
205
+ } else {
206
+ for (int i = 0; i < N_READS; i++) {
207
+ out[i] = values[i];
208
+ }
209
+ }
210
+ }
211
+
212
+ template <typename U, int N_READS, bool reverse>
213
+ inline void write_safe(U values[N_READS], device U* out, int start, int total) {
214
+ if (reverse) {
215
+ for (int i = 0; i < N_READS; i++) {
216
+ if (start + N_READS - i - 1 < total) {
217
+ out[i] = values[N_READS - i - 1];
218
+ }
219
+ }
220
+ } else {
221
+ for (int i = 0; i < N_READS; i++) {
222
+ if (start + i < total) {
223
+ out[i] = values[i];
224
+ }
225
+ }
226
+ }
227
+ }
228
+
229
// Scan (cumsum/cumprod/cummax/...) along a contiguous innermost axis.
//
// One threadgroup processes one row of `axis_size` elements (gid.y/gid.z
// select the row). The row is consumed in tiles of lsize.x * N_READS
// elements; the running total of all earlier tiles is carried in `prefix`.
//
// Template parameters:
//   T, U       input / accumulator element types
//   Op         scan operator (CumSum, CumMax, ...) providing `init`,
//              `operator()`, `simd_scan`, `simd_exclusive_scan`
//   N_READS    elements handled per thread per tile
//   inclusive  whether each element is included in its own output
//   reverse    scan from the end of the axis toward the start
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    bool inclusive,
    bool reverse>
[[kernel]] void contiguous_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  Op op;

  // Position the pointers at the start of this threadgroup's row.
  size_t offset = (gid.y + gsize.y * size_t(gid.z)) * axis_size;
  in += offset;
  out += offset;

  // Compute the number of simd_groups.
  uint simd_groups = lsize.x / simd_size;

  // Allocate memory. `prefix` carries the combined total of all previous
  // tiles; `simdgroup_sums` holds one partial per simdgroup (fixed size 32
  // — assumes at most 32 simdgroups, i.e. lsize.x <= 1024).
  U prefix = Op::init;
  U values[N_READS];
  threadgroup U simdgroup_sums[32];

  // Loop over the reduced axis in blocks of size ceildiv(axis_size,
  // N_READS*lsize):
  //   Read block
  //   Compute inclusive scan of the block
  //     Compute inclusive scan per thread
  //     Compute exclusive scan of thread sums in simdgroup
  //     Write simdgroup sums in SM
  //     Compute exclusive scan of simdgroup sums
  //     Compute the output by scanning prefix, prev_simdgroup, prev_thread,
  //     value
  //   Write block
  for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize.x); r++) {
    // Compute the block offset — this thread's element offset within the
    // row (shadows the row offset above).
    uint offset = r * lsize.x * N_READS + lid.x * N_READS;

    // Read the values. For reverse scans the chunk is read from the mirror
    // position at the end of the row; load_safe fills out-of-range slots
    // with Op::init so they are no-ops in the combine.
    if (reverse) {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(
            values, in + axis_size - offset - N_READS);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values,
            in + axis_size - offset - N_READS,
            offset,
            axis_size,
            Op::init);
      }
    } else {
      if ((offset + N_READS) < axis_size) {
        load_unsafe<T, U, N_READS, reverse>(values, in + offset);
      } else {
        load_safe<T, U, N_READS, reverse>(
            values, in + offset, offset, axis_size, Op::init);
      }
    }

    // Compute an inclusive scan per thread over its N_READS registers.
    for (int i = 1; i < N_READS; i++) {
      values[i] = op(values[i], values[i - 1]);
    }

    // Compute exclusive scan of thread sums: combined total of all
    // lower-lane threads in this simdgroup.
    U prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);

    // Write simdgroup_sums to SM: lane 31 knows the whole simdgroup's
    // total (its exclusive prefix combined with its own last value).
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (simd_lane_id == simd_size - 1) {
      simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute exclusive scan of simdgroup_sums (a single simdgroup
    // suffices since there are at most 32 entries).
    if (simd_group_id == 0) {
      U prev_simdgroup = op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
      simdgroup_sums[simd_lane_id] = prev_simdgroup;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute the output: fold in the inter-tile prefix, the preceding
    // simdgroups' total, and the preceding threads' total.
    for (int i = 0; i < N_READS; i++) {
      values[i] = op(values[i], prefix);
      values[i] = op(values[i], simdgroup_sums[simd_group_id]);
      values[i] = op(values[i], prev_thread);
    }

    // Write the values. For an exclusive scan every write is shifted one
    // element toward the open end of the axis, and the boundary element
    // (last for reverse, first otherwise) is set to Op::init once, by the
    // thread handling offset 0.
    if (reverse) {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + axis_size - offset - N_READS, offset, axis_size);
        }
      } else {
        if (lid.x == 0 && offset == 0) {
          out[axis_size - 1] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(
              values, out + axis_size - offset - 1 - N_READS);
        } else {
          write_safe<U, N_READS, reverse>(
              values,
              out + axis_size - offset - 1 - N_READS,
              offset + 1,
              axis_size);
        }
      }
    } else {
      if (inclusive) {
        if ((offset + N_READS) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + offset, offset, axis_size);
        }
      } else {
        if (lid.x == 0 && offset == 0) {
          out[0] = Op::init;
        }
        if ((offset + N_READS + 1) < axis_size) {
          write_unsafe<U, N_READS, reverse>(values, out + offset + 1);
        } else {
          write_safe<U, N_READS, reverse>(
              values, out + offset + 1, offset + 1, axis_size);
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Share the prefix: the last lane of the last simdgroup holds the
    // tile's grand total; broadcast it through shared memory.
    if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
      simdgroup_sums[0] = values[N_READS - 1];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    prefix = simdgroup_sums[0];
  }
}
384
+
385
// Scan along a non-contiguous axis with element stride `stride`.
//
// The `stride` dimension is split into blocks of BN columns
// (`stride_blocks` per scanned row group); each threadgroup owns one
// column block of one row group and walks the scan axis in tiles of BM
// rows. Each tile is staged in threadgroup memory (rows padded to BN_pad —
// presumably to avoid shared-memory bank conflicts), re-read so that each
// simdgroup lane holds one row of `n_scans` columns, scanned down the 32
// rows with op.simd_scan, and written back. Per-column running totals are
// carried across tiles in `prefix` via a lane-31 broadcast shuffle.
template <
    typename T,
    typename U,
    typename Op,
    int N_READS,
    bool inclusive,
    bool reverse>
[[kernel]] void strided_scan(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    const constant size_t& stride [[buffer(3)]],
    const constant size_t& stride_blocks [[buffer(4)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  constexpr int BM = 32; // tile rows (along the scan axis)
  constexpr int BN = 32; // tile columns (along the stride)
  // Row pitch in shared memory; the extra 16/sizeof(U) elements skew rows.
  constexpr int BN_pad = 32 + 16 / sizeof(U);
  constexpr int n_simds = BN / N_READS;
  constexpr int n_scans = BN / n_simds; // columns scanned per thread
  Op op;

  threadgroup U read_buffer[BM * BN_pad];
  U values[n_scans];
  U prefix[n_scans];
  for (int i = 0; i < n_scans; i++) {
    prefix[i] = Op::init;
  }

  // Compute offsets: which row group and which BN-column block this
  // threadgroup handles.
  size_t full_gid = gid.y + gsize.y * size_t(gid.z);
  size_t offset = full_gid / stride_blocks * axis_size * stride;
  size_t global_index_x = full_gid % stride_blocks * BN;
  // Coordinates used when staging the tile into shared memory ...
  uint read_offset_y = (lid.x * N_READS) / BN;
  uint read_offset_x = (lid.x * N_READS) % BN;
  // ... and when re-reading it for the scan (lane = row, simdgroup picks
  // its n_scans columns).
  uint scan_offset_y = simd_lane_id;
  uint scan_offset_x = simd_group_id * n_scans;

  // Number of valid columns left in this block (guards the last, possibly
  // partial, column block).
  uint stride_limit = stride - global_index_x;
  in += offset + global_index_x + read_offset_x;
  out += offset + global_index_x + read_offset_x;
  threadgroup U* read_into =
      read_buffer + read_offset_y * BN_pad + read_offset_x;
  threadgroup U* read_from =
      read_buffer + scan_offset_y * BN_pad + scan_offset_x;

  for (uint j = 0; j < axis_size; j += BM) {
    // Calculate the indices for the current thread: `check_index_y` is the
    // logical scan position (for bounds checks), `index_y` the physical row
    // (mirrored when scanning in reverse).
    uint index_y = j + read_offset_y;
    uint check_index_y = index_y;
    if (reverse) {
      index_y = axis_size - 1 - index_y;
    }

    // Read the tile into SM; out-of-range slots get Op::init so they are
    // no-ops in the combine.
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        read_into[i] = in[index_y * stride + i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          read_into[i] = in[index_y * stride + i];
        } else {
          read_into[i] = Op::init;
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Read strided into registers: lane i of each simdgroup takes row i of
    // its column slice.
    for (int i = 0; i < n_scans; i++) {
      values[i] = read_from[i];
    }
    simdgroup_barrier(mem_flags::mem_threadgroup);

    // Perform the scan: inclusive simd scan down the 32 rows, fold in the
    // carry from previous tiles, then broadcast lane 31's total as the next
    // tile's carry.
    for (int i = 0; i < n_scans; i++) {
      values[i] = op.simd_scan(values[i]);
      values[i] = op(values[i], prefix[i]);
      prefix[i] = simd_shuffle(values[i], simd_size - 1);
    }

    // Write scanned values back to SM (same slots they were read from, so
    // after the barrier `read_into` holds this thread's scanned outputs).
    for (int i = 0; i < n_scans; i++) {
      read_from[i] = values[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Write to device memory. For an exclusive scan, the boundary row
    // (logical position 0) is written as Op::init and every other write is
    // shifted one row toward the open end of the axis.
    if (!inclusive) {
      if (check_index_y == 0) {
        if ((read_offset_x + N_READS) < stride_limit) {
          for (int i = 0; i < N_READS; i++) {
            out[index_y * stride + i] = Op::init;
          }
        } else {
          for (int i = 0; i < N_READS; i++) {
            if ((read_offset_x + i) < stride_limit) {
              out[index_y * stride + i] = Op::init;
            }
          }
        }
      }
      // Shift the write position by one row. A uint underflow of index_y
      // here (reverse, index_y == 0) is harmless: the matching
      // check_index_y becomes axis_size and fails the bound check below.
      if (reverse) {
        index_y -= 1;
        check_index_y += 1;
      } else {
        index_y += 1;
        check_index_y += 1;
      }
    }
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        out[index_y * stride + i] = read_into[i];
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          out[index_y * stride + i] = read_into[i];
        }
      }
    }
  }
}