PyPI - mlx-cpu - Versions diffs - 0.30.1__py3-none-manylinux_2_35_x86_64.whl - Mend

mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

mlx/__main__.py +27 -0
mlx/_reprlib_fix.py +16 -0
mlx/extension.py +88 -0
mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
mlx/include/mlx/allocator.h +73 -0
mlx/include/mlx/array.h +645 -0
mlx/include/mlx/backend/common/binary.h +97 -0
mlx/include/mlx/backend/common/broadcasting.h +11 -0
mlx/include/mlx/backend/common/buffer_cache.h +157 -0
mlx/include/mlx/backend/common/compiled.h +77 -0
mlx/include/mlx/backend/common/copy.h +50 -0
mlx/include/mlx/backend/common/hadamard.h +109 -0
mlx/include/mlx/backend/common/matmul.h +67 -0
mlx/include/mlx/backend/common/reduce.h +59 -0
mlx/include/mlx/backend/common/slicing.h +20 -0
mlx/include/mlx/backend/common/ternary.h +85 -0
mlx/include/mlx/backend/common/unary.h +29 -0
mlx/include/mlx/backend/common/utils.h +205 -0
mlx/include/mlx/backend/cpu/arange.h +28 -0
mlx/include/mlx/backend/cpu/available.h +9 -0
mlx/include/mlx/backend/cpu/binary.h +517 -0
mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
mlx/include/mlx/backend/cpu/binary_two.h +166 -0
mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
mlx/include/mlx/backend/cpu/copy.h +36 -0
mlx/include/mlx/backend/cpu/encoder.h +67 -0
mlx/include/mlx/backend/cpu/eval.h +12 -0
mlx/include/mlx/backend/cpu/gemm.h +26 -0
mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
mlx/include/mlx/backend/cpu/lapack.h +80 -0
mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
mlx/include/mlx/backend/cpu/simd/math.h +193 -0
mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
mlx/include/mlx/backend/cpu/simd/type.h +11 -0
mlx/include/mlx/backend/cpu/slicing.h +21 -0
mlx/include/mlx/backend/cpu/ternary.h +154 -0
mlx/include/mlx/backend/cpu/threefry.h +21 -0
mlx/include/mlx/backend/cpu/unary.h +281 -0
mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
mlx/include/mlx/backend/cuda/allocator.h +89 -0
mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
mlx/include/mlx/backend/cuda/cuda.h +10 -0
mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
mlx/include/mlx/backend/cuda/device/config.h +12 -0
mlx/include/mlx/backend/cuda/device.h +189 -0
mlx/include/mlx/backend/cuda/event.h +78 -0
mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
mlx/include/mlx/backend/cuda/jit_module.h +119 -0
mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
mlx/include/mlx/backend/cuda/utils.h +46 -0
mlx/include/mlx/backend/cuda/worker.h +55 -0
mlx/include/mlx/backend/gpu/available.h +9 -0
mlx/include/mlx/backend/gpu/copy.h +57 -0
mlx/include/mlx/backend/gpu/eval.h +18 -0
mlx/include/mlx/backend/gpu/slicing.h +36 -0
mlx/include/mlx/backend/metal/allocator.h +79 -0
mlx/include/mlx/backend/metal/binary.h +33 -0
mlx/include/mlx/backend/metal/device.h +283 -0
mlx/include/mlx/backend/metal/jit/includes.h +57 -0
mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
mlx/include/mlx/backend/metal/matmul.h +144 -0
mlx/include/mlx/backend/metal/metal.h +22 -0
mlx/include/mlx/backend/metal/reduce.h +41 -0
mlx/include/mlx/backend/metal/resident.h +32 -0
mlx/include/mlx/backend/metal/scan.h +17 -0
mlx/include/mlx/backend/metal/ternary.h +21 -0
mlx/include/mlx/backend/metal/unary.h +21 -0
mlx/include/mlx/backend/metal/utils.h +84 -0
mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
mlx/include/mlx/compile.h +44 -0
mlx/include/mlx/compile_impl.h +69 -0
mlx/include/mlx/device.h +31 -0
mlx/include/mlx/distributed/distributed.h +60 -0
mlx/include/mlx/distributed/distributed_impl.h +59 -0
mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
mlx/include/mlx/distributed/mpi/mpi.h +12 -0
mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
mlx/include/mlx/distributed/nccl/nccl.h +12 -0
mlx/include/mlx/distributed/ops.h +56 -0
mlx/include/mlx/distributed/primitives.h +156 -0
mlx/include/mlx/distributed/reduction_ops.h +38 -0
mlx/include/mlx/distributed/ring/ring.h +12 -0
mlx/include/mlx/distributed/utils.h +67 -0
mlx/include/mlx/dtype.h +115 -0
mlx/include/mlx/dtype_utils.h +119 -0
mlx/include/mlx/einsum.h +22 -0
mlx/include/mlx/event.h +58 -0
mlx/include/mlx/export.h +136 -0
mlx/include/mlx/export_impl.h +98 -0
mlx/include/mlx/fast.h +102 -0
mlx/include/mlx/fast_primitives.h +427 -0
mlx/include/mlx/fence.h +39 -0
mlx/include/mlx/fft.h +167 -0
mlx/include/mlx/graph_utils.h +66 -0
mlx/include/mlx/io/gguf.h +20 -0
mlx/include/mlx/io/load.h +175 -0
mlx/include/mlx/io.h +61 -0
mlx/include/mlx/linalg.h +111 -0
mlx/include/mlx/memory.h +78 -0
mlx/include/mlx/mlx.h +25 -0
mlx/include/mlx/ops.h +1627 -0
mlx/include/mlx/primitives.h +2524 -0
mlx/include/mlx/random.h +282 -0
mlx/include/mlx/scheduler.h +188 -0
mlx/include/mlx/small_vector.h +540 -0
mlx/include/mlx/stream.h +41 -0
mlx/include/mlx/threadpool.h +133 -0
mlx/include/mlx/transforms.h +229 -0
mlx/include/mlx/transforms_impl.h +86 -0
mlx/include/mlx/types/bf16.h +187 -0
mlx/include/mlx/types/complex.h +113 -0
mlx/include/mlx/types/fp16.h +234 -0
mlx/include/mlx/types/half_types.h +58 -0
mlx/include/mlx/types/limits.h +70 -0
mlx/include/mlx/utils.h +175 -0
mlx/include/mlx/version.h +20 -0
mlx/lib/libmlx.so +0 -0
mlx/py.typed +1 -0
mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
mlx/share/cmake/MLX/extension.cmake +50 -0
mlx/utils.py +325 -0
mlx_cpu-0.30.1.dist-info/METADATA +142 -0
mlx_cpu-0.30.1.dist-info/RECORD +231 -0
mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/backend/cpu/simd/accelerate_simd.h ADDED Viewed

@@ -0,0 +1,329 @@
+#pragma once
+#include <arm_neon.h>
+#include <simd/math.h>
+#include <simd/vector.h>
+#include <stdint.h>
+#include <cmath>
+#include <complex>
+#include "mlx/backend/cpu/simd/base_simd.h"
+// There seems to be a bug in simd/base_simd.h
+// __XROS_2_0 is not defined, the expression evaluates
+// to true instead of false setting the SIMD library
+// higher than it should be even on macOS < 15
+//
+// MLX_SIMD_LIBRARY_VERSION is therefore derived here from the minimum
+// deployment-target macros: 6 when any of them meets its threshold, else 5.
+// Each platform macro is tested exactly once.
+#if __MAC_OS_X_VERSION_MIN_REQUIRED >= 150000 ||  \
+    __IPHONE_OS_VERSION_MIN_REQUIRED >= 180000 || \
+    __WATCH_OS_VERSION_MIN_REQUIRED >= 110000 ||  \
+    __TV_OS_VERSION_MIN_REQUIRED >= 180000
+#define MLX_SIMD_LIBRARY_VERSION 6
+#else
+#define MLX_SIMD_LIBRARY_VERSION 5
+#endif
+namespace mlx::core::simd {
+// Apple simd namespace
+namespace asd = ::simd;
+// This indirection is needed to remap certain types to ones that accelerate
+// SIMD can handle
+template <typename T, int N>
+struct ScalarT { // Primary template: the element type T is used unchanged.
+  using v = T;
+};
+template <int N>
+struct ScalarT<bool, N> { // bool elements are mapped to char.
+  using v = char;
+};
+template <int N>
+struct ScalarT<int8_t, N> { // int8_t elements are mapped to char.
+  using v = char;
+};
+template <int N>
+struct ScalarT<uint64_t, N> { // uint64_t elements are mapped to unsigned long.
+  using v = unsigned long;
+};
+template <int N>
+struct ScalarT<int64_t, N> { // int64_t elements are mapped to long.
+  using v = long;
+};
+// N-element SIMD value with logical element type T. The data lives in
+// `value`, a packed vector from the Apple simd namespace (`asd`) whose element
+// type is ScalarT<T, N>::v.
+template <typename T, int N>
+struct Simd {
+  static constexpr int size = N;
+  // Element type actually stored in `value`; may differ from T (see ScalarT).
+  using scalar_t = typename ScalarT<T, N>::v;
+  // Constructors are declared with the injected-class-name `Simd` rather than
+  // the template-id `Simd<T, N>`, which C++20 no longer allows.
+  // The default constructor leaves `value` uninitialized.
+  Simd() {}
+  // Converts from a Simd with a different element type via asd::convert.
+  template <typename U>
+  Simd(Simd<U, N> other) : value(asd::convert<scalar_t>(other.value)) {}
+  // Initializes the underlying vector directly from v.
+  template <typename U>
+  Simd(U v) : value(v) {}
+  // Builds the vector from two half-width vectors, handed to asd::make in the
+  // order (x, y).
+  Simd(Simd<T, N / 2> x, Simd<T, N / 2> y) {
+    value = asd::make<typename asd::Vector<scalar_t, N>::packed_t>(
+        x.value, y.value);
+  }
+  // Element access reinterprets the storage of `value` as an array of T.
+  T operator[](int idx) const {
+    return reinterpret_cast<const T*>(&value)[idx];
+  }
+  T& operator[](int idx) {
+    return reinterpret_cast<T*>(&value)[idx];
+  }
+  typename asd::Vector<scalar_t, N>::packed_t value;
+};
+// Values chosen based on benchmarks on M3 Max
+// TODO: consider choosing these more optimally
+template <> // Explicit specializations of max_size<T>, one per element type.
+inline constexpr int max_size<int8_t> = 16;
+template <>
+inline constexpr int max_size<int16_t> = 16;
+template <>
+inline constexpr int max_size<int> = 8;
+template <>
+inline constexpr int max_size<int64_t> = 4;
+template <>
+inline constexpr int max_size<uint8_t> = 16;
+template <>
+inline constexpr int max_size<uint16_t> = 16;
+template <>
+inline constexpr int max_size<uint32_t> = 8;
+template <>
+inline constexpr int max_size<uint64_t> = 4;
+template <>
+inline constexpr int max_size<float> = 8;
+template <>
+inline constexpr int max_size<double> = 4;
+// Generates `Simd<T, N> fn(Simd<T, N>)`, which passes the wrapped vector to
+// `impl` and returns the result as a Simd<T, N>.
+#define SIMD_DEFAULT_UNARY(fn, impl)  \
+  template <typename T, int N>        \
+  Simd<T, N> fn(Simd<T, N> input) {   \
+    return impl(input.value);         \
+  }
+// One wrapper per asd routine, listed alphabetically.
+SIMD_DEFAULT_UNARY(abs, asd::abs)
+SIMD_DEFAULT_UNARY(acos, asd::acos)
+SIMD_DEFAULT_UNARY(acosh, asd::acosh)
+SIMD_DEFAULT_UNARY(asin, asd::asin)
+SIMD_DEFAULT_UNARY(asinh, asd::asinh)
+SIMD_DEFAULT_UNARY(atan, asd::atan)
+SIMD_DEFAULT_UNARY(atanh, asd::atanh)
+SIMD_DEFAULT_UNARY(ceil, asd::ceil)
+SIMD_DEFAULT_UNARY(cosh, asd::cosh)
+SIMD_DEFAULT_UNARY(expm1, asd::expm1)
+SIMD_DEFAULT_UNARY(floor, asd::floor)
+SIMD_DEFAULT_UNARY(log, asd::log)
+SIMD_DEFAULT_UNARY(log10, asd::log10)
+SIMD_DEFAULT_UNARY(log1p, asd::log1p)
+SIMD_DEFAULT_UNARY(log2, asd::log2)
+SIMD_DEFAULT_UNARY(recip, asd::recip)
+SIMD_DEFAULT_UNARY(rint, asd::rint)
+SIMD_DEFAULT_UNARY(rsqrt, asd::rsqrt)
+SIMD_DEFAULT_UNARY(sinh, asd::sinh)
+SIMD_DEFAULT_UNARY(sqrt, asd::sqrt)
+SIMD_DEFAULT_UNARY(tan, asd::tan)
+SIMD_DEFAULT_UNARY(tanh, asd::tanh)
+// Unary minus applied to the wrapped vector.
+template <typename T, int N>
+Simd<T, N> operator-(Simd<T, N> v) {
+  auto negated = -v.value;
+  return negated;
+}
+// Bitwise complement applied to the wrapped vector.
+template <typename T, int N>
+Simd<T, N> operator~(Simd<T, N> v) {
+  auto inverted = ~v.value;
+  return inverted;
+}
+// NaN test: compares the vector with itself using != and converts the
+// comparison result to a char vector.
+template <typename T, int N>
+Simd<bool, N> isnan(Simd<T, N> v) {
+  auto differs_from_self = v.value != v.value;
+  return asd::convert<char>(differs_from_self);
+}
+// Logical NOT of the wrapped vector. Accelerate has no simd_boolN, so the
+// result is carried as an int8_t (char) vector instead.
+template <typename T, int N>
+Simd<bool, N> operator!(Simd<T, N> v) {
+  auto flipped = !v.value;
+  return asd::convert<char>(flipped);
+}
+// Generates three overloads of binary operator OP: (Simd, other),
+// (other, Simd) and (Simd, Simd). OP is applied to the wrapped vectors and the
+// result is converted to the scalar_t of the Simd type being returned.
+#define SIMD_DEFAULT_BINARY(OP)                           \
+  template <typename T, typename U, int N>                \
+  Simd<T, N> operator OP(Simd<T, N> lhs, U rhs) {         \
+    using lane_t = typename Simd<T, N>::scalar_t;         \
+    return asd::convert<lane_t>(lhs.value OP rhs);        \
+  }                                                       \
+  template <typename T1, typename T2, int N>              \
+  Simd<T2, N> operator OP(T1 lhs, Simd<T2, N> rhs) {      \
+    using lane_t = typename Simd<T2, N>::scalar_t;        \
+    return asd::convert<lane_t>(lhs OP rhs.value);        \
+  }                                                       \
+  template <typename T1, typename T2, int N>              \
+  Simd<T1, N> operator OP(Simd<T1, N> lhs, Simd<T2, N> rhs) { \
+    using lane_t = typename Simd<T1, N>::scalar_t;        \
+    return asd::convert<lane_t>(lhs.value OP rhs.value);  \
+  }
+// Arithmetic
+SIMD_DEFAULT_BINARY(+)
+SIMD_DEFAULT_BINARY(-)
+SIMD_DEFAULT_BINARY(/)
+SIMD_DEFAULT_BINARY(*)
+// Shifts and bitwise
+SIMD_DEFAULT_BINARY(<<)
+SIMD_DEFAULT_BINARY(>>)
+SIMD_DEFAULT_BINARY(|)
+SIMD_DEFAULT_BINARY(^)
+SIMD_DEFAULT_BINARY(&)
+// Logical
+SIMD_DEFAULT_BINARY(&&)
+SIMD_DEFAULT_BINARY(||)
+// Generates three overloads of comparison operator OP: (Simd, other),
+// (other, Simd) and (Simd, Simd). Each returns Simd<bool, N> built from the
+// comparison result converted to a char vector.
+#define SIMD_DEFAULT_COMPARISONS(OP)                            \
+  template <int N, typename T, typename U>                      \
+  Simd<bool, N> operator OP(Simd<T, N> lhs, U rhs) {            \
+    auto mask = lhs.value OP rhs;                               \
+    return asd::convert<char>(mask);                            \
+  }                                                             \
+  template <int N, typename T, typename U>                      \
+  Simd<bool, N> operator OP(T lhs, Simd<U, N> rhs) {            \
+    auto mask = lhs OP rhs.value;                               \
+    return asd::convert<char>(mask);                            \
+  }                                                             \
+  template <int N, typename T1, typename T2>                    \
+  Simd<bool, N> operator OP(Simd<T1, N> lhs, Simd<T2, N> rhs) { \
+    auto mask = lhs.value OP rhs.value;                         \
+    return asd::convert<char>(mask);                            \
+  }
+SIMD_DEFAULT_COMPARISONS(>)
+SIMD_DEFAULT_COMPARISONS(<)
+SIMD_DEFAULT_COMPARISONS(>=)
+SIMD_DEFAULT_COMPARISONS(<=)
+SIMD_DEFAULT_COMPARISONS(==)
+SIMD_DEFAULT_COMPARISONS(!=)
+// Count leading zeros: views the argument as two consecutive uint32x4_t
+// blocks, runs vclzq_u32 on each and joins the results with asd::make_uint8.
+// NOTE(review): the body reads two 4 x 32-bit blocks whatever T and N are, so
+// it presumably expects Simd<uint32_t, 8> -- confirm against callers.
+template <typename T, int N>
+Simd<T, N> clz(Simd<T, N> x) {
+  auto* blocks = reinterpret_cast<uint32x4_t*>(&x);
+  auto first = vclzq_u32(blocks[0]);
+  auto second = vclzq_u32(blocks[1]);
+  return asd::make_uint8(first, second);
+}
+// Forwards to asd::atan2 on the wrapped vectors, in the order (a, b).
+template <typename T, int N>
+Simd<T, N> atan2(Simd<T, N> a, Simd<T, N> b) {
+  auto angle = asd::atan2(a.value, b.value);
+  return angle;
+}
+// Maximum via asd::max. For non-integral T, elements where a or b is NaN take
+// that NaN instead; b's NaN wins when both are NaN.
+template <typename T, int N>
+Simd<T, N> maximum(Simd<T, N> a, Simd<T, N> b) {
+  Simd<T, N> result(asd::max(a.value, b.value));
+  if constexpr (!std::is_integral_v<T>) {
+    auto a_nan_or_result = select(isnan(a), a, result);
+    result = select(isnan(b), b, a_nan_or_result);
+  }
+  return result;
+}
+// Minimum via asd::min. For non-integral T, elements where a or b is NaN take
+// that NaN instead; b's NaN wins when both are NaN.
+template <typename T, int N>
+Simd<T, N> minimum(Simd<T, N> a, Simd<T, N> b) {
+  Simd<T, N> result(asd::min(a.value, b.value));
+  if constexpr (!std::is_integral_v<T>) {
+    auto a_nan_or_result = select(isnan(a), a, result);
+    result = select(isnan(b), b, a_nan_or_result);
+  }
+  return result;
+}
+// Remainder of a by b. Integral types use a - b * (a / b); other types use
+// asd::remainder. For signed T, a non-zero result whose sign differs from b's
+// is shifted by b.
+template <typename T, int N>
+Simd<T, N> remainder(Simd<T, N> a, Simd<T, N> b) {
+  Simd<T, N> r;
+  if constexpr (std::is_integral_v<T>) {
+    r = a - b * (a / b);
+  } else {
+    r = asd::remainder(a.value, b.value);
+  }
+  if constexpr (std::is_signed_v<T>) {
+    auto needs_shift = r != 0 && ((r < 0) != (b < 0));
+    r = select(needs_shift, r + b, r);
+  }
+  return r;
+}
+// Chooses between x and y with asd::bitselect(y, x, mask). The mask must be a
+// Simd<bool, N>; it is converted to an integer vector whose element width
+// equals sizeof(T1) before being used.
+template <typename MaskT, typename T1, typename T2, int N>
+Simd<T1, N> select(Simd<MaskT, N> mask, Simd<T1, N> x, Simd<T2, N> y) {
+  static_assert(std::is_same_v<MaskT, bool>);
+  constexpr auto lane_bytes = sizeof(T1);
+  if constexpr (lane_bytes == 1) {
+    auto wide_mask = asd::convert<char>(mask.value);
+    return asd::bitselect(y.value, x.value, wide_mask);
+  } else if constexpr (lane_bytes == 2) {
+    auto wide_mask = asd::convert<short>(mask.value);
+    return asd::bitselect(y.value, x.value, wide_mask);
+  } else if constexpr (lane_bytes == 4) {
+    auto wide_mask = asd::convert<int>(mask.value);
+    return asd::bitselect(y.value, x.value, wide_mask);
+  } else {
+    auto wide_mask = asd::convert<long>(mask.value);
+    return asd::bitselect(y.value, x.value, wide_mask);
+  }
+}
+// Power. Non-integral types forward to asd::pow. Integral types use
+// square-and-multiply, looping until no element has a positive exponent left.
+template <typename T, int N>
+Simd<T, N> pow(Simd<T, N> base, Simd<T, N> exp) {
+  if constexpr (std::is_integral_v<T>) {
+    // Raising an integer to a negative power is undefined
+    if (any(exp < 0)) {
+      return 0;
+    }
+    Simd<T, N> res = 1;
+    while (any(exp > 0)) {
+      // Multiply in the current power of base where the low exponent bit is
+      // set.
+      auto low_bit_set = (exp & 1) != 0;
+      res = select(low_bit_set, res * base, res);
+      // Square base only where the exponent is still positive.
+      auto still_positive = exp > 0;
+      base = select(still_positive, base * base, base);
+      exp = exp >> 1;
+    }
+    return res;
+  } else {
+    return asd::pow(base.value, exp.value);
+  }
+}
+// Forwards to asd::clamp with (v, min, max).
+template <typename T, int N>
+Simd<T, N> clamp(Simd<T, N> v, Simd<T, N> min, Simd<T, N> max) {
+  auto bounded = asd::clamp(v.value, min.value, max.value);
+  return bounded;
+}
+// Forwards to asd::muladd(x, y, z); z is first converted to Simd<T, N>.
+template <typename T, typename U, int N>
+Simd<T, N> fma(Simd<T, N> x, Simd<T, N> y, U z) {
+  Simd<T, N> addend(z);
+  return asd::muladd(x.value, y.value, addend.value);
+}
+// Reductions
+// all: forwards the wrapped vector to asd::all.
+template <typename T, int N>
+bool all(Simd<T, N> x) {
+  const bool every = asd::all(x.value);
+  return every;
+}
+// any: forwards the wrapped vector to asd::any.
+template <typename T, int N>
+bool any(Simd<T, N> x) {
+  const bool some = asd::any(x.value);
+  return some;
+}
+// sum: reduces the wrapped vector with asd::reduce_add.
+template <typename T, int N>
+T sum(Simd<T, N> x) {
+  T total = asd::reduce_add(x.value);
+  return total;
+}
+// max: reduces the wrapped vector with asd::reduce_max.
+template <typename T, int N>
+T max(Simd<T, N> x) {
+  T largest = asd::reduce_max(x.value);
+  return largest;
+}
+// min: reduces the wrapped vector with asd::reduce_min.
+template <typename T, int N>
+T min(Simd<T, N> x) {
+  T smallest = asd::reduce_min(x.value);
+  return smallest;
+}
+// prod: multiplies the first N / 2 elements by the last N / 2 elements and
+// recurses on the half-width result.
+template <typename T, int N>
+T prod(Simd<T, N> x) {
+  constexpr int half = N / 2;
+  auto lanes = reinterpret_cast<T*>(&x);
+  auto front = load<T, half>(lanes);
+  auto back = load<T, half>(lanes + half);
+  return prod(front * back);
+}
+} // namespace mlx::core::simd
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "mlx/backend/cpu/simd/accelerate_fp16_simd.h"
+#endif

mlx/include/mlx/backend/cpu/simd/base_simd.h ADDED Viewed

@@ -0,0 +1,295 @@
+#pragma once
+#include <stdint.h>
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <functional>
+namespace mlx::core::simd {
+template <typename T, int N>
+struct Simd; // Declared only; Simd<T, 1> is specialized later in this header.
+template <typename T>
+static constexpr int max_size = 1; // Primary template: 1 unless specialized for T.
+template <typename T> // Single-element specialization: wraps one plain T.
+struct Simd<T, 1> {
+  static constexpr int size = 1;
+  T value;
+  Simd() {} // Leaves `value` uninitialized.
+  template <typename U>
+  Simd(Simd<U, 1> v) : value(v.value) {} // Converts from another element type.
+  template <typename U>
+  Simd(U v) : value(v) {} // Implicit construction from any value convertible to T.
+};
+// Reads a Simd<T, N> from the memory at x by reinterpreting the pointer.
+template <typename T, int N>
+Simd<T, N> load(const T* x) {
+  const auto* src = reinterpret_cast<const Simd<T, N>*>(x);
+  return *src;
+}
+// Writes x to the memory at dst by reinterpreting the pointer.
+template <typename T, int N>
+void store(T* dst, Simd<T, N> x) {
+  auto* out = reinterpret_cast<Simd<T, N>*>(dst);
+  // Maintain invariant that bool is either 0 or 1 as
+  // simd comparison ops set all bits in the result to 1
+  if constexpr (std::is_same_v<T, bool> && N > 1) {
+    x = x & 1;
+  }
+  *out = x;
+}
+template <typename, typename = void> // Detection trait; false by default.
+constexpr bool is_complex = false;
+template <typename T> // Chosen when `std::declval<T>().real()` is a valid expression.
+constexpr bool is_complex<T, std::void_t<decltype(std::declval<T>().real())>> =
+    true;
+// std::rint for a single element; complex values have their real and
+// imaginary parts rounded separately.
+template <typename T>
+Simd<T, 1> rint(Simd<T, 1> in) {
+  if constexpr (!is_complex<T>) {
+    return Simd<T, 1>{std::rint(in.value)};
+  } else {
+    auto re = std::rint(in.value.real());
+    auto im = std::rint(in.value.imag());
+    return Simd<T, 1>{T{re, im}};
+  }
+}
+// Reciprocal square root, computed as T(1.0) divided by sqrt(in).
+template <typename T>
+Simd<T, 1> rsqrt(Simd<T, 1> in) {
+  const T one(1.0);
+  auto root = sqrt(in);
+  return one / root;
+}
+// Reciprocal, computed as T(1.0) divided by in.
+template <typename T>
+Simd<T, 1> recip(Simd<T, 1> in) {
+  const T one(1.0);
+  return one / in;
+}
+// Generates `Simd<T, 1> fn(Simd<T, 1>)`, which applies `impl` to the wrapped
+// value and returns the result as a Simd<T, 1>.
+#define DEFAULT_UNARY(fn, impl)       \
+  template <typename T>               \
+  Simd<T, 1> fn(Simd<T, 1> arg) {     \
+    return impl(arg.value);           \
+  }
+// Operators, implemented with function objects from <functional>.
+DEFAULT_UNARY(operator-, std::negate{})
+DEFAULT_UNARY(operator!, std::logical_not{})
+// Named functions, implemented with their std:: counterparts.
+DEFAULT_UNARY(abs, std::abs)
+DEFAULT_UNARY(acos, std::acos)
+DEFAULT_UNARY(acosh, std::acosh)
+DEFAULT_UNARY(asin, std::asin)
+DEFAULT_UNARY(asinh, std::asinh)
+DEFAULT_UNARY(atan, std::atan)
+DEFAULT_UNARY(atanh, std::atanh)
+DEFAULT_UNARY(ceil, std::ceil)
+DEFAULT_UNARY(conj, std::conj)
+DEFAULT_UNARY(cosh, std::cosh)
+DEFAULT_UNARY(expm1, std::expm1)
+DEFAULT_UNARY(floor, std::floor)
+DEFAULT_UNARY(log, std::log)
+DEFAULT_UNARY(log10, std::log10)
+DEFAULT_UNARY(sinh, std::sinh)
+DEFAULT_UNARY(sqrt, std::sqrt)
+DEFAULT_UNARY(tan, std::tan)
+DEFAULT_UNARY(tanh, std::tanh)
+// log(1 + in) for a single element. Complex inputs take a dedicated path that
+// computes the real part and the angle separately; all other types forward to
+// std::log1p.
+template <typename T>
+Simd<T, 1> log1p(Simd<T, 1> in) {
+  if constexpr (is_complex<T>) {
+    auto x = in.value.real();
+    auto y = in.value.imag();
+    auto zabs = std::abs(in.value);
+    // Imaginary part of the result: the angle of (1 + x) + i*y.
+    auto theta = std::atan2(y, x + 1);
+    if (zabs < 0.5) {
+      // r = |1 + z|^2 - 1, so 0.5 * log1p(r) equals log|1 + z|.
+      auto r = x * (2 + x) + y * y;
+      if (r == 0) { // handle underflow
+        return Simd<T, 1>{T{x, theta}};
+      }
+      // Standard decltype is used here instead of the GNU-only `typeof`, so
+      // the header also compiles with extensions disabled.
+      return Simd<T, 1>{
+          T{static_cast<decltype(x)>(0.5) * std::log1p(r), theta}};
+    } else {
+      auto z0 = std::hypot(x + 1, y);
+      return Simd<T, 1>{T{std::log(z0), theta}};
+    }
+  } else {
+    return Simd<T, 1>{std::log1p(in.value)};
+  }
+}
+// Base-2 logarithm of a single element. Complex inputs divide both parts of
+// std::log(in) by M_LN2; other types forward to std::log2.
+template <typename T>
+Simd<T, 1> log2(Simd<T, 1> in) {
+  if constexpr (!is_complex<T>) {
+    return Simd<T, 1>{std::log2(in.value)};
+  } else {
+    auto natural = std::log(in.value);
+    auto ln2 = decltype(natural.real())(M_LN2);
+    return Simd<T, 1>{T{natural.real() / ln2, natural.imag() / ln2}};
+  }
+}
+// Bitwise complement of the wrapped value.
+template <typename T>
+Simd<T, 1> operator~(Simd<T, 1> in) {
+  auto inverted = ~in.value;
+  return inverted;
+}
+// Real part of the wrapped value, as returned by std::real.
+template <typename T>
+auto real(Simd<T, 1> in) -> Simd<decltype(std::real(in.value)), 1> {
+  auto re = std::real(in.value);
+  return re;
+}
+// Imaginary part of the wrapped value, as returned by std::imag.
+template <typename T>
+auto imag(Simd<T, 1> in) -> Simd<decltype(std::imag(in.value)), 1> {
+  auto im = std::imag(in.value);
+  return im;
+}
+// NaN test for a single element via std::isnan.
+template <typename T>
+Simd<bool, 1> isnan(Simd<T, 1> in) {
+  const bool is_nan = std::isnan(in.value);
+  return is_nan;
+}
+// Generates three overloads of binary operator OP for single-element Simd:
+// (Simd, Simd), (other, Simd) and (Simd, other). The result element type is
+// whatever the underlying expression yields.
+#define DEFAULT_BINARY(OP)                                        \
+  template <typename T1, typename T2>                             \
+  auto operator OP(Simd<T1, 1> lhs, Simd<T2, 1> rhs)              \
+      ->Simd<decltype(lhs.value OP rhs.value), 1> {               \
+    return lhs.value OP rhs.value;                                \
+  }                                                               \
+  template <typename T1, typename T2>                             \
+  auto operator OP(T1 lhs, Simd<T2, 1> rhs)                       \
+      ->Simd<decltype(lhs OP rhs.value), 1> {                     \
+    return lhs OP rhs.value;                                      \
+  }                                                               \
+  template <typename T1, typename T2>                             \
+  auto operator OP(Simd<T1, 1> lhs, T2 rhs)                       \
+      ->Simd<decltype(lhs.value OP rhs), 1> {                     \
+    return lhs.value OP rhs;                                      \
+  }
+// Arithmetic
+DEFAULT_BINARY(+)
+DEFAULT_BINARY(-)
+DEFAULT_BINARY(*)
+DEFAULT_BINARY(/)
+// Shifts and bitwise
+DEFAULT_BINARY(<<)
+DEFAULT_BINARY(>>)
+DEFAULT_BINARY(|)
+DEFAULT_BINARY(^)
+DEFAULT_BINARY(&)
+// Logical
+DEFAULT_BINARY(&&)
+DEFAULT_BINARY(||)
+// Count leading zeros of a single element.
+// - Types no wider than unsigned int are counted at unsigned-int width via
+//   __builtin_clz (unchanged for non-zero inputs).
+// - Wider types use __builtin_clzll so the high bits are not truncated.
+// - A zero input returns the full bit width being counted, because both
+//   builtins are undefined for 0.
+template <typename T>
+Simd<T, 1> clz(Simd<T, 1> x_) {
+  constexpr bool is_wide = sizeof(T) > sizeof(unsigned int);
+  if (x_.value == 0) {
+    return T(
+        8 * (is_wide ? sizeof(unsigned long long) : sizeof(unsigned int)));
+  }
+  if constexpr (is_wide) {
+    return __builtin_clzll(x_.value);
+  } else {
+    return __builtin_clz(x_.value);
+  }
+}
+// Remainder of a_ by b_. Integral types use %, other types std::remainder.
+// For signed T, a non-zero result whose sign differs from the divisor's is
+// shifted by the divisor.
+template <typename T>
+Simd<T, 1> remainder(Simd<T, 1> a_, Simd<T, 1> b_) {
+  const T dividend = a_.value;
+  const T divisor = b_.value;
+  T rem;
+  if constexpr (!std::is_integral_v<T>) {
+    rem = std::remainder(dividend, divisor);
+  } else {
+    rem = dividend % divisor;
+  }
+  if constexpr (std::is_signed_v<T>) {
+    const bool sign_differs = (rem < 0) != (divisor < 0);
+    if (rem != 0 && sign_differs) {
+      rem += divisor;
+    }
+  }
+  return rem;
+}
+// Larger of the two values. For non-integral T, a NaN first argument is
+// returned as-is.
+template <typename T>
+Simd<T, 1> maximum(Simd<T, 1> a_, Simd<T, 1> b_) {
+  const T lhs = a_.value;
+  const T rhs = b_.value;
+  if constexpr (!std::is_integral_v<T>) {
+    if (std::isnan(lhs)) {
+      return lhs;
+    }
+  }
+  if (lhs > rhs) {
+    return lhs;
+  }
+  return rhs;
+}
+// Smaller of the two values. For non-integral T, a NaN first argument is
+// returned as-is.
+template <typename T>
+Simd<T, 1> minimum(Simd<T, 1> a_, Simd<T, 1> b_) {
+  const T lhs = a_.value;
+  const T rhs = b_.value;
+  if constexpr (!std::is_integral_v<T>) {
+    if (std::isnan(lhs)) {
+      return lhs;
+    }
+  }
+  if (lhs < rhs) {
+    return lhs;
+  }
+  return rhs;
+}
+// Power for a single element. Non-integral types forward to std::pow.
+// Integral types use square-and-multiply; a negative exponent returns 0,
+// matching the multi-element implementation in accelerate_simd.h.
+template <typename T>
+Simd<T, 1> pow(Simd<T, 1> a, Simd<T, 1> b) {
+  T base = a.value;
+  T exp = b.value;
+  if constexpr (!std::is_integral_v<T>) {
+    return std::pow(base, exp);
+  } else {
+    // Raising an integer to a negative power is undefined. Without this
+    // guard the loop below never ends: right-shifting a negative value never
+    // reaches 0.
+    if constexpr (std::is_signed_v<T>) {
+      if (exp < 0) {
+        return 0;
+      }
+    }
+    T res = 1;
+    while (exp) {
+      if (exp & 1) {
+        res *= base;
+      }
+      exp >>= 1;
+      base *= base;
+    }
+    return res;
+  }
+}
+// std::atan2 on the wrapped values, in the order (a, b).
+template <typename T>
+Simd<T, 1> atan2(Simd<T, 1> a, Simd<T, 1> b) {
+  auto angle = std::atan2(a.value, b.value);
+  return angle;
+}
+// Generates three overloads of comparison operator OP for single-element
+// Simd: (Simd, Simd), (other, Simd) and (Simd, other). Each returns
+// Simd<bool, 1>.
+#define DEFAULT_COMPARISONS(OP)                                 \
+  template <typename T1, typename T2>                           \
+  Simd<bool, 1> operator OP(Simd<T1, 1> lhs, Simd<T2, 1> rhs) { \
+    return lhs.value OP rhs.value;                              \
+  }                                                             \
+  template <typename T1, typename T2>                           \
+  Simd<bool, 1> operator OP(T1 lhs, Simd<T2, 1> rhs) {          \
+    return lhs OP rhs.value;                                    \
+  }                                                             \
+  template <typename T1, typename T2>                           \
+  Simd<bool, 1> operator OP(Simd<T1, 1> lhs, T2 rhs) {          \
+    return lhs.value OP rhs;                                    \
+  }
+DEFAULT_COMPARISONS(>)
+DEFAULT_COMPARISONS(<)
+DEFAULT_COMPARISONS(>=)
+DEFAULT_COMPARISONS(<=)
+DEFAULT_COMPARISONS(==)
+DEFAULT_COMPARISONS(!=)
+// Returns x when the mask value is truthy, otherwise y.
+template <typename MaskT, typename T>
+Simd<T, 1> select(Simd<MaskT, 1> mask, Simd<T, 1> x, Simd<T, 1> y) {
+  if (mask.value) {
+    return x.value;
+  }
+  return y.value;
+}
+// std::clamp on the wrapped values with (v, min, max).
+template <typename T>
+Simd<T, 1> clamp(Simd<T, 1> v, Simd<T, 1> min, Simd<T, 1> max) {
+  T bounded = std::clamp(v.value, min.value, max.value);
+  return bounded;
+}
+// std::fma(x, y, z) on the wrapped values; z is first converted to
+// Simd<T, 1>.
+template <typename T, typename U>
+Simd<T, 1> fma(Simd<T, 1> x, Simd<T, 1> y, U z) {
+  Simd<T, 1> addend(z);
+  return std::fma(x.value, y.value, addend.value);
+}
+// Reductions
+// With a single element, every reduction just returns the wrapped value,
+// converted to the stated result type.
+#define DEFAULT_REDUCTION(fn, result_t) \
+  template <typename T>                 \
+  result_t fn(Simd<T, 1> arg) {         \
+    return arg.value;                   \
+  }
+// Reductions that return the element type.
+DEFAULT_REDUCTION(max, T)
+DEFAULT_REDUCTION(min, T)
+DEFAULT_REDUCTION(sum, T)
+DEFAULT_REDUCTION(prod, T)
+// Reductions that return bool.
+DEFAULT_REDUCTION(any, bool)
+DEFAULT_REDUCTION(all, bool)
+} // namespace mlx::core::simd