mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx/__main__.py +27 -0
- mlx/_reprlib_fix.py +16 -0
- mlx/extension.py +88 -0
- mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
- mlx/include/mlx/allocator.h +73 -0
- mlx/include/mlx/array.h +645 -0
- mlx/include/mlx/backend/common/binary.h +97 -0
- mlx/include/mlx/backend/common/broadcasting.h +11 -0
- mlx/include/mlx/backend/common/buffer_cache.h +157 -0
- mlx/include/mlx/backend/common/compiled.h +77 -0
- mlx/include/mlx/backend/common/copy.h +50 -0
- mlx/include/mlx/backend/common/hadamard.h +109 -0
- mlx/include/mlx/backend/common/matmul.h +67 -0
- mlx/include/mlx/backend/common/reduce.h +59 -0
- mlx/include/mlx/backend/common/slicing.h +20 -0
- mlx/include/mlx/backend/common/ternary.h +85 -0
- mlx/include/mlx/backend/common/unary.h +29 -0
- mlx/include/mlx/backend/common/utils.h +205 -0
- mlx/include/mlx/backend/cpu/arange.h +28 -0
- mlx/include/mlx/backend/cpu/available.h +9 -0
- mlx/include/mlx/backend/cpu/binary.h +517 -0
- mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
- mlx/include/mlx/backend/cpu/binary_two.h +166 -0
- mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
- mlx/include/mlx/backend/cpu/copy.h +36 -0
- mlx/include/mlx/backend/cpu/encoder.h +67 -0
- mlx/include/mlx/backend/cpu/eval.h +12 -0
- mlx/include/mlx/backend/cpu/gemm.h +26 -0
- mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
- mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
- mlx/include/mlx/backend/cpu/lapack.h +80 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
- mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
- mlx/include/mlx/backend/cpu/simd/math.h +193 -0
- mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
- mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
- mlx/include/mlx/backend/cpu/simd/type.h +11 -0
- mlx/include/mlx/backend/cpu/slicing.h +21 -0
- mlx/include/mlx/backend/cpu/ternary.h +154 -0
- mlx/include/mlx/backend/cpu/threefry.h +21 -0
- mlx/include/mlx/backend/cpu/unary.h +281 -0
- mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
- mlx/include/mlx/backend/cuda/allocator.h +89 -0
- mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
- mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
- mlx/include/mlx/backend/cuda/cuda.h +10 -0
- mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
- mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
- mlx/include/mlx/backend/cuda/device/config.h +12 -0
- mlx/include/mlx/backend/cuda/device.h +189 -0
- mlx/include/mlx/backend/cuda/event.h +78 -0
- mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
- mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
- mlx/include/mlx/backend/cuda/jit_module.h +119 -0
- mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
- mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
- mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
- mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
- mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
- mlx/include/mlx/backend/cuda/utils.h +46 -0
- mlx/include/mlx/backend/cuda/worker.h +55 -0
- mlx/include/mlx/backend/gpu/available.h +9 -0
- mlx/include/mlx/backend/gpu/copy.h +57 -0
- mlx/include/mlx/backend/gpu/eval.h +18 -0
- mlx/include/mlx/backend/gpu/slicing.h +36 -0
- mlx/include/mlx/backend/metal/allocator.h +79 -0
- mlx/include/mlx/backend/metal/binary.h +33 -0
- mlx/include/mlx/backend/metal/device.h +283 -0
- mlx/include/mlx/backend/metal/jit/includes.h +57 -0
- mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
- mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
- mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
- mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
- mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
- mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
- mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
- mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
- mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
- mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
- mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
- mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
- mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
- mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
- mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
- mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
- mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
- mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
- mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
- mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
- mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
- mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
- mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
- mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
- mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
- mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
- mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
- mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
- mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
- mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
- mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
- mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
- mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
- mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
- mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
- mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
- mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
- mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
- mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
- mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
- mlx/include/mlx/backend/metal/matmul.h +144 -0
- mlx/include/mlx/backend/metal/metal.h +22 -0
- mlx/include/mlx/backend/metal/reduce.h +41 -0
- mlx/include/mlx/backend/metal/resident.h +32 -0
- mlx/include/mlx/backend/metal/scan.h +17 -0
- mlx/include/mlx/backend/metal/ternary.h +21 -0
- mlx/include/mlx/backend/metal/unary.h +21 -0
- mlx/include/mlx/backend/metal/utils.h +84 -0
- mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
- mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
- mlx/include/mlx/compile.h +44 -0
- mlx/include/mlx/compile_impl.h +69 -0
- mlx/include/mlx/device.h +31 -0
- mlx/include/mlx/distributed/distributed.h +60 -0
- mlx/include/mlx/distributed/distributed_impl.h +59 -0
- mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
- mlx/include/mlx/distributed/nccl/nccl.h +12 -0
- mlx/include/mlx/distributed/ops.h +56 -0
- mlx/include/mlx/distributed/primitives.h +156 -0
- mlx/include/mlx/distributed/reduction_ops.h +38 -0
- mlx/include/mlx/distributed/ring/ring.h +12 -0
- mlx/include/mlx/distributed/utils.h +67 -0
- mlx/include/mlx/dtype.h +115 -0
- mlx/include/mlx/dtype_utils.h +119 -0
- mlx/include/mlx/einsum.h +22 -0
- mlx/include/mlx/event.h +58 -0
- mlx/include/mlx/export.h +136 -0
- mlx/include/mlx/export_impl.h +98 -0
- mlx/include/mlx/fast.h +102 -0
- mlx/include/mlx/fast_primitives.h +427 -0
- mlx/include/mlx/fence.h +39 -0
- mlx/include/mlx/fft.h +167 -0
- mlx/include/mlx/graph_utils.h +66 -0
- mlx/include/mlx/io/gguf.h +20 -0
- mlx/include/mlx/io/load.h +175 -0
- mlx/include/mlx/io.h +61 -0
- mlx/include/mlx/linalg.h +111 -0
- mlx/include/mlx/memory.h +78 -0
- mlx/include/mlx/mlx.h +25 -0
- mlx/include/mlx/ops.h +1627 -0
- mlx/include/mlx/primitives.h +2524 -0
- mlx/include/mlx/random.h +282 -0
- mlx/include/mlx/scheduler.h +188 -0
- mlx/include/mlx/small_vector.h +540 -0
- mlx/include/mlx/stream.h +41 -0
- mlx/include/mlx/threadpool.h +133 -0
- mlx/include/mlx/transforms.h +229 -0
- mlx/include/mlx/transforms_impl.h +86 -0
- mlx/include/mlx/types/bf16.h +187 -0
- mlx/include/mlx/types/complex.h +113 -0
- mlx/include/mlx/types/fp16.h +234 -0
- mlx/include/mlx/types/half_types.h +58 -0
- mlx/include/mlx/types/limits.h +70 -0
- mlx/include/mlx/utils.h +175 -0
- mlx/include/mlx/version.h +20 -0
- mlx/lib/libmlx.so +0 -0
- mlx/py.typed +1 -0
- mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
- mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
- mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
- mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
- mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
- mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
- mlx/share/cmake/MLX/extension.cmake +50 -0
- mlx/utils.py +325 -0
- mlx_cpu-0.30.1.dist-info/METADATA +142 -0
- mlx_cpu-0.30.1.dist-info/RECORD +231 -0
- mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
- mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
- mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
- mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
- mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
- mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
- mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
- mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/backend/cpu/simd/type.h"
|
|
6
|
+
|
|
7
|
+
namespace mlx::core::simd {
|
|
8
|
+
|
|
9
|
+
constexpr float inf = std::numeric_limits<float>::infinity();
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Compute exp(x) in an optimizer friendly way as follows:
|
|
13
|
+
*
|
|
14
|
+
* First change the problem to computing 2**y where y = x / ln(2).
|
|
15
|
+
*
|
|
16
|
+
* Now we will compute 2**y as 2**y1 * 2**y2 where y1 is the integer part
|
|
17
|
+
* `ipart` and y2 is fractional part. For the integer part we perform bit
|
|
18
|
+
* shifting and for the fractional part we use a polynomial approximation.
|
|
19
|
+
*
|
|
20
|
+
* The algorithm and constants of the polynomial taken from
|
|
21
|
+
* https://github.com/akohlmey/fastermath/blob/master/src/exp.c which took them
|
|
22
|
+
* from Cephes math library.
|
|
23
|
+
*
|
|
24
|
+
* Note: The implementation below is a general fast exp. There could be faster
|
|
25
|
+
* implementations for numbers strictly < 0.
|
|
26
|
+
*/
|
|
27
|
+
template <typename T, int N>
|
|
28
|
+
Simd<T, N> exp(Simd<T, N> in) {
|
|
29
|
+
if constexpr (is_complex<T>) {
|
|
30
|
+
return Simd<T, 1>{std::exp(in.value)};
|
|
31
|
+
} else {
|
|
32
|
+
Simd<float, N> x_init = in;
|
|
33
|
+
auto x = x_init * 1.442695f; // multiply with log_2(e)
|
|
34
|
+
Simd<float, N> ipart, fpart;
|
|
35
|
+
ipart = floor(x + 0.5);
|
|
36
|
+
fpart = x - ipart;
|
|
37
|
+
|
|
38
|
+
x = 1.535336188319500e-4f;
|
|
39
|
+
x = fma(x, fpart, 1.339887440266574e-3f);
|
|
40
|
+
x = fma(x, fpart, 9.618437357674640e-3f);
|
|
41
|
+
x = fma(x, fpart, 5.550332471162809e-2f);
|
|
42
|
+
x = fma(x, fpart, 2.402264791363012e-1f);
|
|
43
|
+
x = fma(x, fpart, 6.931472028550421e-1f);
|
|
44
|
+
x = fma(x, fpart, 1.000000000000000f);
|
|
45
|
+
|
|
46
|
+
// generate 2**ipart in the floating point representation using integer
|
|
47
|
+
// bitshifting
|
|
48
|
+
Simd<int, N> epart = (Simd<int, N>(ipart) + 127) << 23;
|
|
49
|
+
|
|
50
|
+
// Deal with NaN and Inf
|
|
51
|
+
auto result = select(isnan(x_init), x_init, (*(Simd<float, N>*)&epart) * x);
|
|
52
|
+
result = select(x_init > 88.0f, Simd<float, N>(inf), result);
|
|
53
|
+
result = select(x_init < -88.0f, Simd<float, N>(0), result);
|
|
54
|
+
return Simd<T, N>(result);
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/* Implementation from:
|
|
59
|
+
* https://github.com/JishinMaster/simd_utils/blob/3c1433a86fb38edcc9b02039f3c9a65b16640976/neon_mathfun.h#L357
|
|
60
|
+
* which originally came from the Cephes math library.
|
|
61
|
+
*/
|
|
62
|
+
// Compute sin(in) when Sine is true, cos(in) otherwise, using Cephes-style
// argument reduction into an octant plus two polynomial branches evaluated
// over the reduced range. Works on the float lanes of `in`.
template <bool Sine, typename T, int N>
Simd<T, N> sincos(Simd<T, N> in) {
  // sine is odd: remember the input sign, then work on |in|
  auto sign_mask_sin = in < 0;
  in = abs(in);
  Simd<float, N> x = in;

  // scale by 4/Pi
  auto y = x * 1.27323954473516f;

  // store the integer part of y in mm0
  Simd<uint32_t, N> emm2 = y;

  // j=(j+1) & (~1) (see the cephes sources)
  emm2 = emm2 + 1;
  emm2 = emm2 & ~1;

  y = emm2;

  // Get the polynomial selection mask. There is one polynomial for
  // 0 <= x <= Pi/4 and another one for Pi/4 < x <= Pi/2. Both branches
  // will be computed.
  auto poly_mask = (emm2 & 2) != 0;

  // The magic pass: "Extended precision modular arithmetic"
  // x = ((x - y * DP1) - y * DP2) - y * DP3
  x = fma(y, Simd<float, N>(-0.78515625f), x);
  x = fma(y, Simd<float, N>(-2.4187564849853515625e-4f), x);
  x = fma(y, Simd<float, N>(-3.77489497744594108e-8f), x);

  // Quadrant-dependent sign flips for sine and cosine.
  sign_mask_sin = sign_mask_sin ^ ((emm2 & 4) != 0);
  auto sign_mask_cos = ((emm2 - 2) & 4) != 0;

  // Evaluate the first polynomial (cosine branch) in y1,
  // and the second polynomial (sine branch) in y2.
  auto z = x * x;

  auto y1 =
      fma(z, Simd<float, N>(2.443315711809948e-5f), -1.388731625493765e-3f);
  auto y2 = fma(z, Simd<float, N>(-1.9515295891e-4f), 8.3321608736e-3f);
  y1 = fma(y1, z, 4.166664568298827e-2f);
  y2 = fma(y2, z, -1.6666654611e-1f);
  y1 = y1 * z;
  y2 = y2 * z;
  y1 = y1 * z;
  y2 = fma(x, y2, x); // y2 ~ sin(x) on the reduced range
  y1 = fma(z, Simd<float, N>(-0.5f), y1);
  y1 = y1 + 1.0f; // y1 ~ cos(x) on the reduced range

  if constexpr (Sine) {
    auto ys = select(poly_mask, y1, y2);
    return select(sign_mask_sin, -ys, ys);
  } else {
    auto yc = select(poly_mask, y2, y1);
    return select(sign_mask_cos, yc, -yc);
  }
}
|
|
117
|
+
|
|
118
|
+
template <typename T, int N>
|
|
119
|
+
Simd<T, N> sin(Simd<T, N> x) {
|
|
120
|
+
if constexpr (is_complex<T>) {
|
|
121
|
+
return std::sin(x.value);
|
|
122
|
+
} else {
|
|
123
|
+
return sincos<true>(x);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
template <typename T, int N>
|
|
128
|
+
Simd<T, N> cos(Simd<T, N> x) {
|
|
129
|
+
if constexpr (is_complex<T>) {
|
|
130
|
+
return std::cos(x.value);
|
|
131
|
+
} else {
|
|
132
|
+
return sincos<false>(x);
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
template <typename T, int N>
|
|
137
|
+
Simd<T, N> erf(Simd<T, N> x) {
|
|
138
|
+
// https://github.com/pytorch/pytorch/blob/abf28982a8cb43342e7669d859de9543fd804cc9/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L175
|
|
139
|
+
Simd<float, N> v = x;
|
|
140
|
+
auto t = recip(fma(Simd<float, N>(0.3275911f), abs(v), 1.0f));
|
|
141
|
+
auto r = fma(Simd<float, N>(1.061405429f), t, -1.453152027f);
|
|
142
|
+
r = fma(r, t, 1.421413741f);
|
|
143
|
+
r = fma(r, t, -0.284496736f);
|
|
144
|
+
r = fma(r, t, 0.254829592f);
|
|
145
|
+
auto e = -exp(-v * v);
|
|
146
|
+
auto result = Simd<T, N>(fma(e * t, r, 1.0f));
|
|
147
|
+
return select(x > 0, result, -result);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// Inverse error function in single precision, using two polynomial branches
// selected on t = log(1 - a^2): one for the tails (|a| near 1, large |t|)
// and one for the central region. Coefficients are exact hex floats.
template <typename T, int N>
Simd<T, N> erfinv(Simd<T, N> a_) {
  Simd<float, N> a = a_;
  // t = log(1 - a * a); written as fma(a, -a, 1) for accuracy.
  auto t = fma(a, 0.0f - a, 1.0f);
  t = log(t);
  // Tail-region polynomial in t.
  auto lhs = [](auto t) {
    Simd<float, N> p;
    p = 3.03697567e-10f; // 0x1.4deb44p-32
    p = fma(p, t, 2.93243101e-8f); // 0x1.f7c9aep-26
    p = fma(p, t, 1.22150334e-6f); // 0x1.47e512p-20
    p = fma(p, t, 2.84108955e-5f); // 0x1.dca7dep-16
    p = fma(p, t, 3.93552968e-4f); // 0x1.9cab92p-12
    p = fma(p, t, 3.02698812e-3f); // 0x1.8cc0dep-9
    p = fma(p, t, 4.83185798e-3f); // 0x1.3ca920p-8
    p = fma(p, t, -2.64646143e-1f); // -0x1.0eff66p-2
    return fma(p, t, 8.40016484e-1f); // 0x1.ae16a4p-1
  };
  // Central-region polynomial in t.
  auto rhs = [](auto t) {
    Simd<float, N> p;
    p = 5.43877832e-9f; // 0x1.75c000p-28
    p = fma(p, t, 1.43285448e-7f); // 0x1.33b402p-23
    p = fma(p, t, 1.22774793e-6f); // 0x1.499232p-20
    p = fma(p, t, 1.12963626e-7f); // 0x1.e52cd2p-24
    p = fma(p, t, -5.61530760e-5f); // -0x1.d70bd0p-15
    p = fma(p, t, -1.47697632e-4f); // -0x1.35be90p-13
    p = fma(p, t, 2.31468678e-3f); // 0x1.2f6400p-9
    p = fma(p, t, 1.15392581e-2f); // 0x1.7a1e50p-7
    p = fma(p, t, -2.32015476e-1f); // -0x1.db2aeep-3
    return fma(p, t, 8.86226892e-1f); // 0x1.c5bf88p-1
  };
  auto thresh = 6.125f;
  // Compute both branches and select if N > 1; scalar lanes can branch.
  if constexpr (N == 1) {
    if ((abs(t) > thresh).value) { // maximum ulp error = 2.35793
      return a * lhs(t);
    } else { // maximum ulp error = 2.35002
      return a * rhs(t);
    }
  } else {
    return a * select(abs(t) > thresh, lhs(t), rhs(t));
  }
}
|
|
192
|
+
|
|
193
|
+
} // namespace mlx::core::simd
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include <arm_neon.h>
|
|
4
|
+
|
|
5
|
+
#include "mlx/backend/cpu/simd/base_simd.h"
|
|
6
|
+
|
|
7
|
+
namespace mlx::core::simd {
|
|
8
|
+
|
|
9
|
+
constexpr int N = 8;
|
|
10
|
+
|
|
11
|
+
// 8-lane half-precision vector wrapping a NEON float16x8_t register.
template <>
struct Simd<float16_t, N> {
  static constexpr int size = N;
  using scalar_t = float16_t;

  Simd<float16_t, N>() {}

  // Broadcast a scalar into all 8 lanes.
  template <typename U>
  Simd<float16_t, N>(U v) : value(vdupq_n_f16(v)){};

  // Wrap a raw NEON register.
  Simd<float16_t, N>(float16x8_t v) : value(v){};

  // Narrowing conversion from 8 floats (reads the two float32x4_t halves).
  Simd<float16_t, N>(Simd<float, N> other) {
    auto f32x4_a = *(float32x4_t*)(&other);
    auto f32x4_b = *((float32x4_t*)(&other) + 1);
    value = vcvt_high_f16_f32(vcvt_f16_f32(f32x4_a), f32x4_b);
  };

  // Conversion from unsigned 16-bit lanes (e.g. comparison masks).
  Simd<float16_t, N>(Simd<uint16_t, N> other) {
    value = vcvtq_f16_u16(*(uint16x8_t*)(&other.value));
  };

  // Truncating conversion to signed 16-bit lanes.
  operator Simd<int16_t, N>() {
    auto v = vcvtq_s16_f16(value);
    return load<int16_t, N>((int16_t*)&v);
  };

  // Widening conversion to 8 floats.
  operator Simd<float, N>() {
    float32x4x2_t v;
    v.val[0] = vcvt_f32_f16(*(float16x4_t*)(&value));
    v.val[1] = vcvt_high_f32_f16(value);
    return load<float, N>((float*)&v);
  }

  // Lane access by index (0 <= idx < 8; no bounds check).
  float16_t operator[](int idx) const {
    return reinterpret_cast<const float16_t*>(&value)[idx];
  }

  float16_t& operator[](int idx) {
    return reinterpret_cast<float16_t*>(&value)[idx];
  }

  float16x8_t value;
};
|
|
54
|
+
|
|
55
|
+
// Wrap a single-argument NEON intrinsic as a Simd unary function.
#define DEFINE_NEON_UNARY_OP(name, op)                   \
  inline Simd<float16_t, N> name(Simd<float16_t, N> a) { \
    return Simd<float16_t, N>{op(a.value)};              \
  }

DEFINE_NEON_UNARY_OP(abs, vabsq_f16)
DEFINE_NEON_UNARY_OP(ceil, vrndpq_f16)
DEFINE_NEON_UNARY_OP(floor, vrndmq_f16)
DEFINE_NEON_UNARY_OP(sqrt, vsqrtq_f16)
// NOTE: vrsqrteq_f16 / vrecpeq_f16 are low-precision estimate instructions
// and no Newton-Raphson refinement step is applied here, so rsqrt and recip
// are approximate.
DEFINE_NEON_UNARY_OP(rsqrt, vrsqrteq_f16)
DEFINE_NEON_UNARY_OP(recip, vrecpeq_f16)
DEFINE_NEON_UNARY_OP(rint, vrndnq_f16)
|
|
67
|
+
|
|
68
|
+
// Wrap a two-argument NEON intrinsic as a Simd binary function, with
// scalar-broadcast overloads on either side.
#define DEFINE_NEON_BINARY_OP(name, op)                                        \
  inline Simd<float16_t, N> name(Simd<float16_t, N> a, Simd<float16_t, N> b) { \
    return op(a.value, b.value);                                               \
  }                                                                            \
  template <typename T>                                                        \
  Simd<float16_t, N> name(Simd<float16_t, N> a, T b) {                         \
    return op(a.value, Simd<float16_t, N>(b).value);                           \
  }                                                                            \
  template <typename T>                                                        \
  Simd<float16_t, N> name(T a, Simd<float16_t, N> b) {                         \
    return op(Simd<float16_t, N>(a).value, b.value);                           \
  }

// Logical NOT: lanes that compare equal to zero produce all-ones masks.
inline Simd<float16_t, N> operator!(Simd<float16_t, N> v) {
  auto out = vceqzq_f16(v.value);
  // NOTE(review): this cast dereferences only the first 16-bit lane and then
  // broadcasts it — it looks like it should reinterpret the full uint16x8_t
  // instead; confirm against the upstream source.
  return Simd<uint16_t, N>(*(uint16_t*)&out);
}

// Lanewise negation.
inline Simd<float16_t, N> operator-(Simd<float16_t, N> v) {
  return vnegq_f16(v.value);
}

DEFINE_NEON_BINARY_OP(maximum, vmaxq_f16)
DEFINE_NEON_BINARY_OP(minimum, vminq_f16)
DEFINE_NEON_BINARY_OP(operator+, vaddq_f16)
DEFINE_NEON_BINARY_OP(operator-, vsubq_f16)
DEFINE_NEON_BINARY_OP(operator*, vmulq_f16)
DEFINE_NEON_BINARY_OP(operator/, vdivq_f16)
|
|
96
|
+
|
|
97
|
+
// Wrap a NEON comparison intrinsic; the lane masks are converted into a
// Simd<bool, N> via the uint16 constructor. Scalar operands broadcast.
// NOTE(review): as in operator! above, `*(uint16_t*)(&out)` reads only the
// first lane before broadcasting — verify this matches the upstream code,
// which reinterprets the whole uint16x8_t.
#define DEFINE_NEON_COMPARISON(Op, op)                   \
  template <typename T>                                  \
  Simd<bool, N> operator Op(Simd<float16_t, N> a, T b) { \
    auto out = op(a.value, Simd<float16_t, N>(b).value); \
    return Simd<uint16_t, N>(*(uint16_t*)(&out));        \
  }                                                      \
  template <typename T>                                  \
  Simd<bool, N> operator Op(T a, Simd<float16_t, N> b) { \
    auto out = op(Simd<float16_t, N>(a).value, b.value); \
    return Simd<uint16_t, N>(*(uint16_t*)(&out));        \
  }                                                      \
  inline Simd<bool, N> operator Op(                      \
      Simd<float16_t, N> a, Simd<float16_t, N> b) {      \
    auto out = op(a.value, b.value);                     \
    return Simd<uint16_t, N>(*(uint16_t*)(&out));        \
  }

DEFINE_NEON_COMPARISON(==, vceqq_f16)
DEFINE_NEON_COMPARISON(>=, vcgeq_f16)
DEFINE_NEON_COMPARISON(<=, vcleq_f16)
DEFINE_NEON_COMPARISON(>, vcgtq_f16)
DEFINE_NEON_COMPARISON(<, vcltq_f16)
|
|
119
|
+
|
|
120
|
+
// operator!= is derived from equality plus logical NOT.
template <typename T>
Simd<bool, N> operator!=(Simd<float16_t, N> a, T b) {
  return !(a == b);
}
template <typename T>
Simd<bool, N> operator!=(T a, Simd<float16_t, N> b) {
  return !(a == b);
}
inline Simd<bool, N> operator!=(Simd<float16_t, N> a, Simd<float16_t, N> b) {
  return !(a == b);
}

// Logical OR / AND: an operand counts as true where its lane is nonzero;
// the boolean mask result is converted back through Simd<uint16_t, N>.
inline Simd<float16_t, N> operator||(
    Simd<float16_t, N> a,
    Simd<float16_t, N> b) {
  return Simd<uint16_t, N>((a != 0) || (b != 0));
}
template <typename T>
Simd<float16_t, N> operator||(Simd<float16_t, N> a, T b) {
  return Simd<uint16_t, N>((a != 0) || (b != 0));
}
template <typename T>
Simd<float16_t, N> operator||(T a, Simd<float16_t, N> b) {
  return Simd<uint16_t, N>((a != 0) || (b != 0));
}
inline Simd<float16_t, N> operator&&(
    Simd<float16_t, N> a,
    Simd<float16_t, N> b) {
  return Simd<uint16_t, N>((a != 0) && (b != 0));
}
template <typename T>
Simd<float16_t, N> operator&&(Simd<float16_t, N> a, T b) {
  return Simd<uint16_t, N>((a != 0) && (b != 0));
}
template <typename T>
Simd<float16_t, N> operator&&(T a, Simd<float16_t, N> b) {
  return Simd<uint16_t, N>((a != 0) && (b != 0));
}
|
|
158
|
+
|
|
159
|
+
// NaN test: an IEEE NaN is the only value that compares unequal to itself.
template <>
inline Simd<bool, N> isnan(Simd<float16_t, N> v) {
  return v != v;
}

// Clamp each lane into [min, max].
template <>
inline Simd<float16_t, N>
clamp(Simd<float16_t, N> v, Simd<float16_t, N> min, Simd<float16_t, N> max) {
  return minimum(maximum(v, min), max);
}
|
|
169
|
+
|
|
170
|
+
template <typename T>
|
|
171
|
+
Simd<float16_t, N> fma(Simd<float16_t, N> x, Simd<float16_t, N> y, T z) {
|
|
172
|
+
return vfmaq_f16(x.value, y.value, Simd<float16_t, N>(z).value);
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
// Lanewise select: lanes where the mask is set come from x, the rest from y
// (vbslq is a bitwise select, so masks must be all-ones / all-zeros).
template <typename MaskT>
Simd<float16_t, N>
select(Simd<MaskT, N> mask, Simd<float16_t, N> x, Simd<float16_t, N> y) {
  return vbslq_f16(Simd<uint16_t, N>(mask).value, x.value, y.value);
}
|
|
180
|
+
|
|
181
|
+
// Reductions
// Horizontal max over all 8 lanes via pairwise folding (3 vpmax steps).
inline float16_t max(Simd<float16_t, N> x) {
  float16x4_t y;
  y = vpmax_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  y = vpmax_f16(y, y);
  y = vpmax_f16(y, y);
  return vget_lane_f16(y, 0);
}
// Horizontal min over all 8 lanes via pairwise folding.
inline float16_t min(Simd<float16_t, N> x) {
  float16x4_t y;
  y = vpmin_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  y = vpmin_f16(y, y);
  y = vpmin_f16(y, y);
  return vget_lane_f16(y, 0);
}
// Horizontal sum over all 8 lanes via pairwise addition.
inline float16_t sum(Simd<float16_t, N> x) {
  float16x4_t y;
  y = vpadd_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  y = vpadd_f16(y, y);
  y = vpadd_f16(y, y);
  return vget_lane_f16(y, 0);
}
|
|
203
|
+
// Product reduction over all 8 lanes: fold the upper half into the lower
// half with one vector multiply, then multiply the 4 remaining lanes
// serially. (Removed a dead local that copied hx[0] and was never read.)
inline float16_t prod(Simd<float16_t, N> x) {
  auto hx = vmul_f16(vget_low_f16(x.value), vget_high_f16(x.value));
  hx[0] *= hx[1];
  hx[0] *= hx[2];
  hx[0] *= hx[3];
  return hx[0];
}
|
|
211
|
+
|
|
212
|
+
} // namespace mlx::core::simd
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
#include "mlx/backend/cpu/simd/base_simd.h"
|
|
4
|
+
|
|
5
|
+
#ifdef MLX_USE_ACCELERATE
|
|
6
|
+
#if defined(__x86_64__)
|
|
7
|
+
// the accelerate_simd implementation requires NEON -- use the base implementation
|
|
8
|
+
#else
|
|
9
|
+
#include "mlx/backend/cpu/simd/accelerate_simd.h"
|
|
10
|
+
#endif
|
|
11
|
+
#endif
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/array.h"
|
|
6
|
+
|
|
7
|
+
namespace mlx::core {
|
|
8
|
+
|
|
9
|
+
// Compute the (data offset, strides) pair describing a view of `in` that
// starts at `start_indices` and steps by `strides` along each axis.
// No data is copied. NOTE(review): the unit of the returned offset
// (elements vs bytes) is not visible here — confirm at the call sites.
std::tuple<int64_t, Strides> prepare_slice(
    const array& in,
    const Shape& start_indices,
    const Shape& strides);

// Point `out` at `in`'s buffer as a shared view with the given strides,
// offset and logical data size (no allocation or copy).
void shared_buffer_slice(
    const array& in,
    const Strides& out_strides,
    size_t data_offset,
    size_t data_size,
    array& out);
|
|
20
|
+
|
|
21
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
// Copyright © 2023 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
#include "mlx/array.h"
|
|
5
|
+
#include "mlx/backend/common/ternary.h"
|
|
6
|
+
#include "mlx/backend/common/utils.h"
|
|
7
|
+
#include "mlx/backend/cpu/encoder.h"
|
|
8
|
+
|
|
9
|
+
namespace mlx::core {
|
|
10
|
+
|
|
11
|
+
template <typename T1, typename T2, typename T3, typename U, typename Op, int D>
|
|
12
|
+
void ternary_op_dims(
|
|
13
|
+
const T1* a,
|
|
14
|
+
const T2* b,
|
|
15
|
+
const T3* c,
|
|
16
|
+
U* out,
|
|
17
|
+
Op op,
|
|
18
|
+
const Shape& shape,
|
|
19
|
+
const Strides& a_strides,
|
|
20
|
+
const Strides& b_strides,
|
|
21
|
+
const Strides& c_strides,
|
|
22
|
+
const Strides& out_strides,
|
|
23
|
+
int axis) {
|
|
24
|
+
auto stride_a = a_strides[axis];
|
|
25
|
+
auto stride_b = b_strides[axis];
|
|
26
|
+
auto stride_c = c_strides[axis];
|
|
27
|
+
auto stride_out = out_strides[axis];
|
|
28
|
+
auto N = shape[axis];
|
|
29
|
+
|
|
30
|
+
for (int i = 0; i < N; i++) {
|
|
31
|
+
if constexpr (D > 1) {
|
|
32
|
+
ternary_op_dims<T1, T2, T3, U, Op, D - 1>(
|
|
33
|
+
a,
|
|
34
|
+
b,
|
|
35
|
+
c,
|
|
36
|
+
out,
|
|
37
|
+
op,
|
|
38
|
+
shape,
|
|
39
|
+
a_strides,
|
|
40
|
+
b_strides,
|
|
41
|
+
c_strides,
|
|
42
|
+
out_strides,
|
|
43
|
+
axis + 1);
|
|
44
|
+
} else {
|
|
45
|
+
*out = op(*a, *b, *c);
|
|
46
|
+
}
|
|
47
|
+
a += stride_a;
|
|
48
|
+
b += stride_b;
|
|
49
|
+
c += stride_c;
|
|
50
|
+
out += stride_out;
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Apply `op` elementwise over inputs with already-collapsed shape/strides.
// 1-D and 2-D cases go straight to ternary_op_dims; higher ranks iterate the
// leading dimensions with ContiguousIterators and use the 2-D kernel for the
// two innermost dimensions.
// `strides` holds, in order: a, b, c and output strides.
template <typename T1, typename T2, typename T3, typename U, typename Op>
void ternary_op_dispatch_dims(
    const T1* a_ptr,
    const T2* b_ptr,
    const T3* c_ptr,
    U* out_ptr,
    Op op,
    size_t size,
    Shape& shape,
    std::vector<Strides>& strides) {
  const auto& a_strides = strides[0];
  const auto& b_strides = strides[1];
  const auto& c_strides = strides[2];
  const auto& out_strides = strides[3];
  int ndim = shape.size();
  switch (ndim) {
    case 1:
      ternary_op_dims<T1, T2, T3, U, Op, 1>(
          a_ptr,
          b_ptr,
          c_ptr,
          out_ptr,
          op,
          shape,
          a_strides,
          b_strides,
          c_strides,
          out_strides,
          0);
      return;
    case 2:
      ternary_op_dims<T1, T2, T3, U, Op, 2>(
          a_ptr,
          b_ptr,
          c_ptr,
          out_ptr,
          op,
          shape,
          a_strides,
          b_strides,
          c_strides,
          out_strides,
          0);
      return;
  }

  // ndim >= 3: step the inputs over the leading (ndim - 2) dimensions.
  // NOTE(review): the output is advanced linearly by out_strides[ndim - 3]
  // per outer step, which assumes it is contiguous over its last two
  // dimensions — confirm with the callers.
  ContiguousIterator a_it(shape, a_strides, ndim - 2);
  ContiguousIterator b_it(shape, b_strides, ndim - 2);
  ContiguousIterator c_it(shape, c_strides, ndim - 2);
  auto stride = out_strides[ndim - 3];
  for (size_t elem = 0; elem < size; elem += stride) {
    ternary_op_dims<T1, T2, T3, U, Op, 2>(
        a_ptr + a_it.loc,
        b_ptr + b_it.loc,
        c_ptr + c_it.loc,
        out_ptr + elem,
        op,
        shape,
        a_strides,
        b_strides,
        c_strides,
        out_strides,
        ndim - 2);
    a_it.step();
    b_it.step();
    c_it.step();
  }
}
|
|
122
|
+
|
|
123
|
+
template <typename T1, typename T2, typename T3, typename U, typename Op>
|
|
124
|
+
void ternary_op(
|
|
125
|
+
const array& a,
|
|
126
|
+
const array& b,
|
|
127
|
+
const array& c,
|
|
128
|
+
array& out,
|
|
129
|
+
Op op,
|
|
130
|
+
TernaryOpType topt) {
|
|
131
|
+
const T1* a_ptr = a.data<T1>();
|
|
132
|
+
const T2* b_ptr = b.data<T2>();
|
|
133
|
+
const T3* c_ptr = c.data<T3>();
|
|
134
|
+
U* out_ptr = out.data<U>();
|
|
135
|
+
|
|
136
|
+
if (topt == TernaryOpType::ScalarScalarScalar) {
|
|
137
|
+
*out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
|
|
138
|
+
} else if (topt == TernaryOpType::VectorVectorVector) {
|
|
139
|
+
for (size_t i = 0; i < out.size(); ++i) {
|
|
140
|
+
*out_ptr = op(*a_ptr, *b_ptr, *c_ptr);
|
|
141
|
+
a_ptr++;
|
|
142
|
+
b_ptr++;
|
|
143
|
+
c_ptr++;
|
|
144
|
+
out_ptr++;
|
|
145
|
+
}
|
|
146
|
+
} else {
|
|
147
|
+
auto [shape, strides] = collapse_contiguous_dims(
|
|
148
|
+
a.shape(), {a.strides(), b.strides(), c.strides(), out.strides()});
|
|
149
|
+
ternary_op_dispatch_dims<T1, T2, T3, U>(
|
|
150
|
+
a_ptr, b_ptr, c_ptr, out_ptr, op, out.size(), shape, strides);
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
// Copyright © 2023 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include <cstdint>
|
|
6
|
+
#include <utility>
|
|
7
|
+
|
|
8
|
+
namespace mlx::core::random {
|
|
9
|
+
|
|
10
|
+
/** Applies the Threefry 2x32 hash function.
|
|
11
|
+
* This code is based on the Jax counter-based and splittable PRNG
|
|
12
|
+
* https://github.com/google/jax/blob/main/docs/jep/263-prng.md
|
|
13
|
+
*
|
|
14
|
+
* Original Threefry reference:
|
|
15
|
+
* http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
|
|
16
|
+
*/
|
|
17
|
+
// `key`: the 64-bit PRNG key split into two 32-bit words.
// `count`: the counter block to hash; the returned pair holds the two
// 32-bit output words of the hash.
std::pair<uint32_t, uint32_t> threefry2x32_hash(
    const std::pair<uint32_t, uint32_t>& key,
    std::pair<uint32_t, uint32_t> count);
|
|
20
|
+
|
|
21
|
+
} // namespace mlx::core::random
|