mlx_cpu-0.30.1-py3-none-manylinux_2_35_x86_64.whl
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- mlx/__main__.py +27 -0
- mlx/_reprlib_fix.py +16 -0
- mlx/extension.py +88 -0
- mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
- mlx/include/mlx/allocator.h +73 -0
- mlx/include/mlx/array.h +645 -0
- mlx/include/mlx/backend/common/binary.h +97 -0
- mlx/include/mlx/backend/common/broadcasting.h +11 -0
- mlx/include/mlx/backend/common/buffer_cache.h +157 -0
- mlx/include/mlx/backend/common/compiled.h +77 -0
- mlx/include/mlx/backend/common/copy.h +50 -0
- mlx/include/mlx/backend/common/hadamard.h +109 -0
- mlx/include/mlx/backend/common/matmul.h +67 -0
- mlx/include/mlx/backend/common/reduce.h +59 -0
- mlx/include/mlx/backend/common/slicing.h +20 -0
- mlx/include/mlx/backend/common/ternary.h +85 -0
- mlx/include/mlx/backend/common/unary.h +29 -0
- mlx/include/mlx/backend/common/utils.h +205 -0
- mlx/include/mlx/backend/cpu/arange.h +28 -0
- mlx/include/mlx/backend/cpu/available.h +9 -0
- mlx/include/mlx/backend/cpu/binary.h +517 -0
- mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
- mlx/include/mlx/backend/cpu/binary_two.h +166 -0
- mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
- mlx/include/mlx/backend/cpu/copy.h +36 -0
- mlx/include/mlx/backend/cpu/encoder.h +67 -0
- mlx/include/mlx/backend/cpu/eval.h +12 -0
- mlx/include/mlx/backend/cpu/gemm.h +26 -0
- mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
- mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
- mlx/include/mlx/backend/cpu/lapack.h +80 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
- mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
- mlx/include/mlx/backend/cpu/simd/math.h +193 -0
- mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
- mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
- mlx/include/mlx/backend/cpu/simd/type.h +11 -0
- mlx/include/mlx/backend/cpu/slicing.h +21 -0
- mlx/include/mlx/backend/cpu/ternary.h +154 -0
- mlx/include/mlx/backend/cpu/threefry.h +21 -0
- mlx/include/mlx/backend/cpu/unary.h +281 -0
- mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
- mlx/include/mlx/backend/cuda/allocator.h +89 -0
- mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
- mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
- mlx/include/mlx/backend/cuda/cuda.h +10 -0
- mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
- mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
- mlx/include/mlx/backend/cuda/device/config.h +12 -0
- mlx/include/mlx/backend/cuda/device.h +189 -0
- mlx/include/mlx/backend/cuda/event.h +78 -0
- mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
- mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
- mlx/include/mlx/backend/cuda/jit_module.h +119 -0
- mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
- mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
- mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
- mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
- mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
- mlx/include/mlx/backend/cuda/utils.h +46 -0
- mlx/include/mlx/backend/cuda/worker.h +55 -0
- mlx/include/mlx/backend/gpu/available.h +9 -0
- mlx/include/mlx/backend/gpu/copy.h +57 -0
- mlx/include/mlx/backend/gpu/eval.h +18 -0
- mlx/include/mlx/backend/gpu/slicing.h +36 -0
- mlx/include/mlx/backend/metal/allocator.h +79 -0
- mlx/include/mlx/backend/metal/binary.h +33 -0
- mlx/include/mlx/backend/metal/device.h +283 -0
- mlx/include/mlx/backend/metal/jit/includes.h +57 -0
- mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
- mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
- mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
- mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
- mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
- mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
- mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
- mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
- mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
- mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
- mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
- mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
- mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
- mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
- mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
- mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
- mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
- mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
- mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
- mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
- mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
- mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
- mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
- mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
- mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
- mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
- mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
- mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
- mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
- mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
- mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
- mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
- mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
- mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
- mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
- mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
- mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
- mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
- mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
- mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
- mlx/include/mlx/backend/metal/matmul.h +144 -0
- mlx/include/mlx/backend/metal/metal.h +22 -0
- mlx/include/mlx/backend/metal/reduce.h +41 -0
- mlx/include/mlx/backend/metal/resident.h +32 -0
- mlx/include/mlx/backend/metal/scan.h +17 -0
- mlx/include/mlx/backend/metal/ternary.h +21 -0
- mlx/include/mlx/backend/metal/unary.h +21 -0
- mlx/include/mlx/backend/metal/utils.h +84 -0
- mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
- mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
- mlx/include/mlx/compile.h +44 -0
- mlx/include/mlx/compile_impl.h +69 -0
- mlx/include/mlx/device.h +31 -0
- mlx/include/mlx/distributed/distributed.h +60 -0
- mlx/include/mlx/distributed/distributed_impl.h +59 -0
- mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
- mlx/include/mlx/distributed/nccl/nccl.h +12 -0
- mlx/include/mlx/distributed/ops.h +56 -0
- mlx/include/mlx/distributed/primitives.h +156 -0
- mlx/include/mlx/distributed/reduction_ops.h +38 -0
- mlx/include/mlx/distributed/ring/ring.h +12 -0
- mlx/include/mlx/distributed/utils.h +67 -0
- mlx/include/mlx/dtype.h +115 -0
- mlx/include/mlx/dtype_utils.h +119 -0
- mlx/include/mlx/einsum.h +22 -0
- mlx/include/mlx/event.h +58 -0
- mlx/include/mlx/export.h +136 -0
- mlx/include/mlx/export_impl.h +98 -0
- mlx/include/mlx/fast.h +102 -0
- mlx/include/mlx/fast_primitives.h +427 -0
- mlx/include/mlx/fence.h +39 -0
- mlx/include/mlx/fft.h +167 -0
- mlx/include/mlx/graph_utils.h +66 -0
- mlx/include/mlx/io/gguf.h +20 -0
- mlx/include/mlx/io/load.h +175 -0
- mlx/include/mlx/io.h +61 -0
- mlx/include/mlx/linalg.h +111 -0
- mlx/include/mlx/memory.h +78 -0
- mlx/include/mlx/mlx.h +25 -0
- mlx/include/mlx/ops.h +1627 -0
- mlx/include/mlx/primitives.h +2524 -0
- mlx/include/mlx/random.h +282 -0
- mlx/include/mlx/scheduler.h +188 -0
- mlx/include/mlx/small_vector.h +540 -0
- mlx/include/mlx/stream.h +41 -0
- mlx/include/mlx/threadpool.h +133 -0
- mlx/include/mlx/transforms.h +229 -0
- mlx/include/mlx/transforms_impl.h +86 -0
- mlx/include/mlx/types/bf16.h +187 -0
- mlx/include/mlx/types/complex.h +113 -0
- mlx/include/mlx/types/fp16.h +234 -0
- mlx/include/mlx/types/half_types.h +58 -0
- mlx/include/mlx/types/limits.h +70 -0
- mlx/include/mlx/utils.h +175 -0
- mlx/include/mlx/version.h +20 -0
- mlx/lib/libmlx.so +0 -0
- mlx/py.typed +1 -0
- mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
- mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
- mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
- mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
- mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
- mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
- mlx/share/cmake/MLX/extension.cmake +50 -0
- mlx/utils.py +325 -0
- mlx_cpu-0.30.1.dist-info/METADATA +142 -0
- mlx_cpu-0.30.1.dist-info/RECORD +231 -0
- mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
- mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
- mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
- mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
- mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
- mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
- mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
- mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/backend/metal/kernels/complex.h
@@ -0,0 +1,173 @@
+// Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <metal_stdlib>
+
+using namespace metal;
+
+struct complex64_t;
+
+template <typename T>
+static constexpr constant bool can_convert_to_complex64 =
+    !is_same_v<T, complex64_t> && is_convertible_v<T, float>;
+
+template <typename T>
+static constexpr constant bool can_convert_from_complex64 =
+    !is_same_v<T, complex64_t> &&
+    (is_convertible_v<float, T> || is_convertible_v<bfloat16_t, T>);
+
+struct complex64_t {
+  float real;
+  float imag;
+
+  // Constructors
+  constexpr complex64_t(float real, float imag) : real(real), imag(imag) {};
+  constexpr complex64_t() : real(0), imag(0) {};
+  constexpr complex64_t() threadgroup : real(0), imag(0) {};
+
+  // Conversions to complex64_t
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_complex64<T>>::type>
+  constexpr complex64_t(T x) thread : real(x), imag(0) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_complex64<T>>::type>
+  constexpr complex64_t(T x) threadgroup : real(x), imag(0) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_complex64<T>>::type>
+  constexpr complex64_t(T x) device : real(x), imag(0) {}
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_to_complex64<T>>::type>
+  constexpr complex64_t(T x) constant : real(x), imag(0) {}
+
+  // Conversions from complex64_t
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_complex64<T>>::type>
+  constexpr operator T() const thread {
+    return static_cast<T>(real);
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_complex64<T>>::type>
+  constexpr operator T() const threadgroup {
+    return static_cast<T>(real);
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_complex64<T>>::type>
+  constexpr operator T() const device {
+    return static_cast<T>(real);
+  }
+
+  template <
+      typename T,
+      typename = typename enable_if<can_convert_from_complex64<T>>::type>
+  constexpr operator T() const constant {
+    return static_cast<T>(real);
+  }
+};
+
+constexpr complex64_t operator-(complex64_t x) {
+  return {-x.real, -x.imag};
+}
+
+constexpr bool operator>=(complex64_t a, complex64_t b) {
+  return (a.real > b.real) || (a.real == b.real && a.imag >= b.imag);
+}
+
+constexpr bool operator>(complex64_t a, complex64_t b) {
+  return (a.real > b.real) || (a.real == b.real && a.imag > b.imag);
+}
+
+constexpr bool operator<=(complex64_t a, complex64_t b) {
+  return operator>=(b, a);
+}
+
+constexpr bool operator<(complex64_t a, complex64_t b) {
+  return operator>(b, a);
+}
+
+constexpr bool operator==(complex64_t a, complex64_t b) {
+  return a.real == b.real && a.imag == b.imag;
+}
+
+constexpr complex64_t operator+(complex64_t a, complex64_t b) {
+  return {a.real + b.real, a.imag + b.imag};
+}
+
+constexpr thread complex64_t& operator+=(thread complex64_t& a, complex64_t b) {
+  a.real += b.real;
+  a.imag += b.imag;
+  return a;
+}
+
+constexpr threadgroup complex64_t& operator+=(
+    threadgroup complex64_t& a,
+    complex64_t b) {
+  a.real += b.real;
+  a.imag += b.imag;
+  return a;
+}
+
+constexpr device complex64_t& operator+=(device complex64_t& a, complex64_t b) {
+  a.real += b.real;
+  a.imag += b.imag;
+  return a;
+}
+
+constexpr complex64_t operator+(float a, complex64_t b) {
+  return {a + b.real, b.imag};
+}
+constexpr complex64_t operator+(complex64_t a, float b) {
+  return {a.real + b, a.imag};
+}
+
+constexpr complex64_t operator-(complex64_t a, complex64_t b) {
+  return {a.real - b.real, a.imag - b.imag};
+}
+constexpr complex64_t operator-(float a, complex64_t b) {
+  return {a - b.real, -b.imag};
+}
+constexpr complex64_t operator-(complex64_t a, float b) {
+  return {a.real - b, a.imag};
+}
+
+constexpr complex64_t operator*(complex64_t a, complex64_t b) {
+  return {a.real * b.real - a.imag * b.imag, a.real * b.imag + a.imag * b.real};
+}
+
+constexpr complex64_t operator/(complex64_t a, complex64_t b) {
+  auto denom = b.real * b.real + b.imag * b.imag;
+  auto x = a.real * b.real + a.imag * b.imag;
+  auto y = a.imag * b.real - a.real * b.imag;
+  return {x / denom, y / denom};
+}
+
+constexpr complex64_t operator/(float a, complex64_t b) {
+  auto denom = b.real * b.real + b.imag * b.imag;
+  auto x = a * b.real;
+  auto y = -a * b.imag;
+  return {x / denom, y / denom};
+}
+
+constexpr complex64_t operator%(complex64_t a, complex64_t b) {
+  auto real = a.real - (b.real * static_cast<int64_t>(a.real / b.real));
+  auto imag = a.imag - (b.imag * static_cast<int64_t>(a.imag / b.imag));
+  if (real != 0 && (real < 0 != b.real < 0)) {
+    real += b.real;
+  }
+  if (imag != 0 && (imag < 0 != b.imag < 0)) {
+    imag += b.imag;
+  }
+  return {real, imag};
+}
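
For reference, the operator/ overload above is the standard multiply-by-the-conjugate formula,

\[ \frac{a}{b} = \frac{a\,\overline{b}}{|b|^2} = \frac{(a_r b_r + a_i b_i) + i\,(a_i b_r - a_r b_i)}{b_r^2 + b_i^2}, \]

which maps directly onto denom, x, and y in the code. The comparison operators implement a lexicographic order (real part first, imaginary part as tie-breaker); that is a sorting convention, not a mathematical ordering of the complex numbers.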

mlx/include/mlx/backend/metal/kernels/copy.h
@@ -0,0 +1,276 @@
+// Copyright © 2024 Apple Inc.
+
+template <typename T, typename U, int N = WorkPerThread<U>::n>
+[[kernel]] void copy_s(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant uint& size,
+    uint index [[thread_position_in_grid]]) {
+  index *= N;
+  if (N > 1 && index + N > size) {
+    for (int i = 0; index + i < size; ++i) {
+      dst[index + i] = static_cast<U>(src[0]);
+    }
+  } else {
+    for (int i = 0; i < N; ++i) {
+      dst[index + i] = static_cast<U>(src[0]);
+    }
+  }
+}
+
+template <typename T, typename U, int N = WorkPerThread<U>::n>
+[[kernel]] void copy_v(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant uint& size,
+    uint index [[thread_position_in_grid]]) {
+  index *= N;
+  if (N > 1 && index + N > size) {
+    for (int i = 0; index + i < size; ++i) {
+      dst[index + i] = static_cast<U>(src[index + i]);
+    }
+  } else {
+    for (int i = 0; i < N; ++i) {
+      dst[index + i] = static_cast<U>(src[index + i]);
+    }
+  }
+}
+
+template <typename T, typename U, int N = WorkPerThread<U>::n>
+[[kernel]] void copy_s2(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant int64_t& size,
+    uint2 index [[thread_position_in_grid]],
+    uint2 grid_dim [[threads_per_grid]]) {
+  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
+  if (N > 1 && offset + N > size) {
+    for (int i = 0; offset + i < size; ++i) {
+      dst[offset + i] = static_cast<U>(src[0]);
+    }
+  } else {
+    for (int i = 0; i < N; ++i) {
+      dst[offset + i] = static_cast<U>(src[0]);
+    }
+  }
+}
+
+template <typename T, typename U, int N = WorkPerThread<U>::n>
+[[kernel]] void copy_v2(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant int64_t& size,
+    uint2 index [[thread_position_in_grid]],
+    uint2 grid_dim [[threads_per_grid]]) {
+  int64_t offset = N * (index.x + grid_dim.x * int64_t(index.y));
+  if (N > 1 && offset + N > size) {
+    for (int i = 0; offset + i < size; ++i) {
+      dst[offset + i] = static_cast<U>(src[offset + i]);
+    }
+  } else {
+    for (int i = 0; i < N; ++i) {
+      dst[offset + i] = static_cast<U>(src[offset + i]);
+    }
+  }
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_g_nd1(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t& src_stride [[buffer(3)]],
+    uint index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
+  dst[index] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_g_nd2(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    uint2 index [[thread_position_in_grid]],
+    uint2 grid_dim [[threads_per_grid]]) {
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
+  IdxT dst_idx = index.x + IdxT(grid_dim.x) * index.y;
+  dst[dst_idx] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_g_nd3(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    uint3 index [[thread_position_in_grid]],
+    uint3 grid_dim [[threads_per_grid]]) {
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
+  IdxT dst_idx =
+      index.x + IdxT(grid_dim.x) * (index.y + IdxT(grid_dim.y) * index.z);
+  dst[dst_idx] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, int N = 1, typename IdxT = int64_t>
+[[kernel]] void copy_g(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int* src_shape [[buffer(2)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int& ndim [[buffer(5)]],
+    uint3 index [[thread_position_in_grid]],
+    uint3 grid_dim [[threads_per_grid]]) {
+  auto src_idx = elem_to_loc<IdxT>(
+      {N * index.x, index.y, index.z}, src_shape, src_strides, ndim);
+  if (N == 1) {
+    IdxT dst_idx =
+        index.x + grid_dim.x * (index.y + IdxT(grid_dim.y) * index.z);
+    dst[dst_idx] = static_cast<U>(src[src_idx]);
+    return;
+  }
+  auto xshape = src_shape[ndim - 1];
+  IdxT dst_idx = N * index.x + xshape * (index.y + IdxT(grid_dim.y) * index.z);
+  auto src_xstride = src_strides[ndim - 1];
+  for (int i = 0; i < N && (int(N * index.x) + i) < xshape; ++i) {
+    dst[dst_idx + i] = static_cast<U>(src[src_idx]);
+    src_idx += src_xstride;
+  }
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_nd1(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t& src_stride [[buffer(3)]],
+    constant const int64_t& dst_stride [[buffer(4)]],
+    uint index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
+  auto dst_idx = elem_to_loc_1<IdxT>(index, dst_stride);
+  dst[dst_idx] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_nd2(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    uint2 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_2<IdxT>(index, dst_strides);
+  dst[dst_idx] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_nd3(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    uint3 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_3<IdxT>(index, dst_strides);
+  dst[dst_idx] = static_cast<U>(src[src_idx]);
+}
+
+template <typename T, typename U, int N = 1, typename IdxT = int64_t>
+[[kernel]] void copy_gg(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int* src_shape [[buffer(2)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int& ndim [[buffer(5)]],
+    uint3 index [[thread_position_in_grid]]) {
+  auto idx = elem_to_loc_2_nd<IdxT>(
+      {N * index.x, index.y, index.z},
+      src_shape,
+      src_strides,
+      dst_strides,
+      ndim);
+  if (N == 1) {
+    dst[idx.y] = static_cast<U>(src[idx.x]);
+    return;
+  }
+  IdxT src_xstride = src_strides[ndim - 1];
+  IdxT dst_xstride = dst_strides[ndim - 1];
+  auto xshape = src_shape[ndim - 1];
+  for (int i = 0; i < N && (int(N * index.x) + i) < xshape; ++i) {
+    dst[idx.y] = static_cast<U>(src[idx.x]);
+    idx.x += src_xstride;
+    idx.y += dst_xstride;
+  }
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd1(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t& src_stride [[buffer(3)]],
+    constant const int64_t& dst_stride [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]],
+    constant const int64_t& dst_offset [[buffer(7)]],
+    uint index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_1<IdxT>(index, src_stride);
+  auto dst_idx = elem_to_loc_1<IdxT>(index, dst_stride);
+  dst[dst_idx + dst_offset] = src[src_idx + src_offset];
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd2(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]],
+    constant const int64_t& dst_offset [[buffer(7)]],
+    uint2 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_2<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_2<IdxT>(index, dst_strides);
+  dst[dst_idx + dst_offset] = src[src_idx + src_offset];
+}
+
+template <typename T, typename U, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic_nd3(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int64_t& src_offset [[buffer(6)]],
+    constant const int64_t& dst_offset [[buffer(7)]],
+    uint3 index [[thread_position_in_grid]]) {
+  auto src_idx = elem_to_loc_3<IdxT>(index, src_strides);
+  auto dst_idx = elem_to_loc_3<IdxT>(index, dst_strides);
+  dst[dst_idx + dst_offset] = src[src_idx + src_offset];
+}
+
+template <typename T, typename U, int N = 1, typename IdxT = int64_t>
+[[kernel]] void copy_gg_dynamic(
+    device const T* src [[buffer(0)]],
+    device U* dst [[buffer(1)]],
+    constant const int* src_shape [[buffer(2)]],
+    constant const int64_t* src_strides [[buffer(3)]],
+    constant const int64_t* dst_strides [[buffer(4)]],
+    constant const int& ndim [[buffer(5)]],
+    constant const int64_t& src_offset [[buffer(6)]],
+    constant const int64_t& dst_offset [[buffer(7)]],
+    uint3 index [[thread_position_in_grid]]) {
+  src += src_offset;
+  dst += dst_offset;
+  auto idx = elem_to_loc_2_nd<IdxT>(
+      {N * index.x, index.y, index.z},
+      src_shape,
+      src_strides,
+      dst_strides,
+      ndim);
+  if (N == 1) {
+    dst[idx.y] = src[idx.x];
+    return;
+  }
+  IdxT src_xstride = src_strides[ndim - 1];
+  IdxT dst_xstride = dst_strides[ndim - 1];
+  auto xshape = src_shape[ndim - 1];
+  for (int i = 0; i < N && (int(N * index.x) + i) < xshape; ++i) {
+    dst[idx.y] = src[idx.x];
+    idx.x += src_xstride;
+    idx.y += dst_xstride;
+  }
+}
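
Reading across the kernels above: copy_s broadcasts a scalar and copy_v copies contiguously, with each thread handling N elements (N comes from WorkPerThread); the s2/v2 variants apply the same logic on a 2-D grid with 64-bit sizes; and the g/gg variants route indices through shape/stride tables via the elem_to_loc helpers. A minimal host-side C++ analogue of the N-per-thread tail handling (illustrative only, not MLX code):

// Self-contained sketch of the copy_v tiling pattern: each "thread" owns N
// contiguous elements, and only the ragged last tile pays for bounds checks.
#include <cstdio>
#include <vector>

template <typename T, typename U, int N = 4>
void copy_v_like(const T* src, U* dst, unsigned size, unsigned thread_index) {
  unsigned index = thread_index * N; // first element owned by this "thread"
  if (N > 1 && index + N > size) {
    // Ragged tail: guard every store.
    for (int i = 0; index + i < size; ++i) {
      dst[index + i] = static_cast<U>(src[index + i]);
    }
  } else {
    // Full tile: fixed trip count the compiler can unroll.
    for (int i = 0; i < N; ++i) {
      dst[index + i] = static_cast<U>(src[index + i]);
    }
  }
}

int main() {
  std::vector<float> src = {1, 2, 3, 4, 5, 6, 7};
  std::vector<double> dst(src.size());
  unsigned size = static_cast<unsigned>(src.size());
  unsigned n_threads = (size + 3) / 4; // ceil(size / N), i.e. the grid size
  for (unsigned t = 0; t < n_threads; ++t) {
    copy_v_like<float, double>(src.data(), dst.data(), size, t);
  }
  std::printf("%g %g\n", dst[0], dst[6]); // prints: 1 7
  return 0;
}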

mlx/include/mlx/backend/metal/kernels/defines.h
@@ -0,0 +1,24 @@
+// Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#if defined __METAL__ || defined MLX_METAL_JIT
+#define MTL_CONST constant
+#else
+#define MTL_CONST
+#endif
+
+static MTL_CONST constexpr int MAX_REDUCE_SPECIALIZED_DIMS = 4;
+static MTL_CONST constexpr int REDUCE_N_READS = 4;
+static MTL_CONST constexpr int REDUCE_N_WRITES = 4;
+static MTL_CONST constexpr int SOFTMAX_N_READS = 4;
+static MTL_CONST constexpr int RMS_N_READS = 4;
+static MTL_CONST constexpr int RMS_LOOPED_LIMIT = 4096;
+
+// Instantiate a templated kernel.
+// Extra args are used as template parameters:
+// e.g. instantiate_kernel(binary_int, binary, a, b) ->
+// [[host_name(binary_int)]] [kernel] binary<a, b>
+#define instantiate_kernel(name, func, ...) \
+  template [[host_name(                     \
+      name)]] [[kernel]] decltype(func<__VA_ARGS__>) func<__VA_ARGS__>;
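
As a hedged illustration of how the instantiate_kernel macro above is used (the host name and type pair here are hypothetical, not taken from this package), a .metal source would stamp out a named specialization of one of the copy kernels like this:

// Hypothetical instantiation: makes copy_v<float, half> visible to the host
// as the compute pipeline name "copy_v_float_half". Expands to:
//   template [[host_name("copy_v_float_half")]] [[kernel]]
//       decltype(copy_v<float, half>) copy_v<float, half>;
instantiate_kernel("copy_v_float_half", copy_v, float, half)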

mlx/include/mlx/backend/metal/kernels/erf.h
@@ -0,0 +1,69 @@
+// Copyright © 2023 Apple Inc.
+
+#pragma once
+#include <metal_math>
+
+/*
+ * Approximation to the error function.
+ * Based on code from:
+ * https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff#answer-35148199
+ */
+float erf(float a) {
+  float r, s, t, u;
+  t = metal::abs(a);
+  s = a * a;
+  if (t > 0.927734375f) {
+    // maximum error 0.99527 ulp
+    r = metal::fma(
+        -1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+    u = metal::fma(
+        -3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+    r = metal::fma(r, s, u);
+    r = metal::fma(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
+    r = metal::fma(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
+    r = metal::fma(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
+    r = metal::fma(r, t, -t);
+    // TODO, replace with expm1 when implemented
+    r = 1.0f - metal::exp(r);
+    r = metal::copysign(r, a);
+  } else {
+    // maximum error 0.98929 ulp
+    r = -5.96761703e-4f; // -0x1.38e000p-11
+    r = metal::fma(r, s, 4.99119423e-3f); // 0x1.471a58p-8
+    r = metal::fma(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
+    r = metal::fma(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4
+    r = metal::fma(r, s, -3.76125336e-1f); // -0x1.812700p-2
+    r = metal::fma(r, s, 1.28379166e-1f); // 0x1.06eba8p-3
+    r = metal::fma(r, a, a);
+  }
+  return r;
+}
+
+float erfinv(float a) {
+  auto t = metal::fma(a, 0.0f - a, 1.0f);
+  t = metal::log(t);
+  float p;
+  if (metal::abs(t) > 6.125f) { // maximum ulp error = 2.35793
+    p = 3.03697567e-10f; // 0x1.4deb44p-32
+    p = metal::fma(p, t, 2.93243101e-8f); // 0x1.f7c9aep-26
+    p = metal::fma(p, t, 1.22150334e-6f); // 0x1.47e512p-20
+    p = metal::fma(p, t, 2.84108955e-5f); // 0x1.dca7dep-16
+    p = metal::fma(p, t, 3.93552968e-4f); // 0x1.9cab92p-12
+    p = metal::fma(p, t, 3.02698812e-3f); // 0x1.8cc0dep-9
+    p = metal::fma(p, t, 4.83185798e-3f); // 0x1.3ca920p-8
+    p = metal::fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
+    p = metal::fma(p, t, 8.40016484e-1f); // 0x1.ae16a4p-1
+  } else { // maximum ulp error = 2.35002
+    p = 5.43877832e-9f; // 0x1.75c000p-28
+    p = metal::fma(p, t, 1.43285448e-7f); // 0x1.33b402p-23
+    p = metal::fma(p, t, 1.22774793e-6f); // 0x1.499232p-20
+    p = metal::fma(p, t, 1.12963626e-7f); // 0x1.e52cd2p-24
+    p = metal::fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
+    p = metal::fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
+    p = metal::fma(p, t, 2.31468678e-3f); // 0x1.2f6400p-9
+    p = metal::fma(p, t, 1.15392581e-2f); // 0x1.7a1e50p-7
+    p = metal::fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
+    p = metal::fma(p, t, 8.86226892e-1f); // 0x1.c5bf88p-1
+  }
+  return a * p;
+}
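
For context, the function being approximated is the Gauss error function, with erfinv its inverse on (-1, 1):

\[ \operatorname{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2}\,dt, \qquad \operatorname{erfinv}(\operatorname{erf}(x)) = x. \]

The branch point at |x| = 0.927734375 splits the domain: near zero, erf(x) is x times a minimax polynomial in x^2; in the tail, where erf saturates toward ±1, it is evaluated as copysign(1 - exp(q(|x|)), x) for a polynomial q, which preserves accuracy as the result approaches 1.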

mlx/include/mlx/backend/metal/kernels/expm1f.h
@@ -0,0 +1,90 @@
+// Copyright © 2023 Apple Inc.
+
+#pragma once
+
+#include <metal_math>
+
+// Original license copied below:
+// Copyright (c) 2015-2023 Norbert Juffa
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+/* Compute exponential base e minus 1. Maximum ulp error = 0.997458
+
+   i = rint(a/log(2)), f = a-i*log(2). Then expm1(a) = 2**i * (expm1(f)+1) - 1.
+   Compute r = expm1(f). Then expm1(a)= 2 * (0.5 * 2**i * r + 0.5 * 2**i - 0.5).
+   With t = 0.5*2**i, expm1(a) = 2*(r * t + t-0.5). However, for best accuracy,
+   when i == 1, expm1(a)= 2*(r + 0.5), and when i == 0, expm1(a) = r.
+
+   NOTE: Scale factor b is only applied if i < 0 or i > 1 (should be power of 2)
+*/
+float expm1f_scaled_unchecked(float a, float b) {
+  float f, j, r, s, t, u, v, x, y;
+  int i;
+
+  // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
+  j = fma(1.442695f, a, 12582912.f); // 0x1.715476p0, 0x1.8p23
+  j = j - 12582912.0f; // 0x1.8p23
+  i = (int)j;
+  f = fma(j, -6.93145752e-1f, a);
+
+  // approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2]
+  s = f * f;
+  if (a == 0.0f)
+    s = a; // ensure -0 is passed through
+  // err = 0.997458 ulp1 = 11081805
+  r = 1.97350979e-4f; // 0x1.9de000p-13
+  r = fma(r, f, 1.39309070e-3f); // 0x1.6d30bcp-10
+  r = fma(r, f, 8.33343994e-3f); // 0x1.1111f6p-7
+  r = fma(r, f, 4.16668020e-2f); // 0x1.55559ep-5
+  r = fma(r, f, 1.66666716e-1f); // 0x1.55555cp-3
+  r = fma(r, f, 4.99999970e-1f); // 0x1.fffffep-2
+  u = (j == 1) ? (f + 0.5f) : f;
+  v = fma(r, s, u);
+  s = 0.5f * b;
+  t = ldexp(s, i);
+  y = t - s;
+  x = (t - y) - s; // double-float canonicalization of difference
+  r = fma(v, t, x) + y;
+  r = r + r;
+  if (j == 0)
+    r = v;
+  if (j == 1)
+    r = v + v;
+  return r;
+}
+
+/* Compute exponential base e minus 1. max ulp err = 0.99746 */
+float expm1f(float a) {
+  float r;
+
+  r = expm1f_scaled_unchecked(a, 1.0f);
+  /* handle severe overflow and underflow */
+  if (abs(a - 1.0f) > 88.0f) {
+    r = pow(2, a);
+    r = fma(r, r, -1.0f);
+  }
+  return r;
+}
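
Written out, the range reduction described in the comment block is: with

\[ i = \operatorname{rint}\!\left(\frac{a}{\ln 2}\right), \qquad f = a - i \ln 2, \qquad r \approx e^f - 1, \]

the code evaluates

\[ e^a - 1 = 2^i (r + 1) - 1 = 2\left(r t + \left(t - \tfrac{1}{2}\right)\right), \qquad t = 2^{i-1}, \]

with the exact cases e^a - 1 = r for i = 0 and e^a - 1 = 2(r + 1/2) for i = 1 restored at the end of the function. The fma with 12582912 = 1.5 * 2^23 is the standard binary32 round-to-nearest-integer trick: adding and then subtracting 0x1.8p23 leaves rint(a / ln 2) in j.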