PyPI - mlx-cpu - Versions diffs - 0.30.1__py3-none-manylinux_2_35_x86_64.whl - Mend

mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

mlx/__main__.py +27 -0
mlx/_reprlib_fix.py +16 -0
mlx/extension.py +88 -0
mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
mlx/include/mlx/allocator.h +73 -0
mlx/include/mlx/array.h +645 -0
mlx/include/mlx/backend/common/binary.h +97 -0
mlx/include/mlx/backend/common/broadcasting.h +11 -0
mlx/include/mlx/backend/common/buffer_cache.h +157 -0
mlx/include/mlx/backend/common/compiled.h +77 -0
mlx/include/mlx/backend/common/copy.h +50 -0
mlx/include/mlx/backend/common/hadamard.h +109 -0
mlx/include/mlx/backend/common/matmul.h +67 -0
mlx/include/mlx/backend/common/reduce.h +59 -0
mlx/include/mlx/backend/common/slicing.h +20 -0
mlx/include/mlx/backend/common/ternary.h +85 -0
mlx/include/mlx/backend/common/unary.h +29 -0
mlx/include/mlx/backend/common/utils.h +205 -0
mlx/include/mlx/backend/cpu/arange.h +28 -0
mlx/include/mlx/backend/cpu/available.h +9 -0
mlx/include/mlx/backend/cpu/binary.h +517 -0
mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
mlx/include/mlx/backend/cpu/binary_two.h +166 -0
mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
mlx/include/mlx/backend/cpu/copy.h +36 -0
mlx/include/mlx/backend/cpu/encoder.h +67 -0
mlx/include/mlx/backend/cpu/eval.h +12 -0
mlx/include/mlx/backend/cpu/gemm.h +26 -0
mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
mlx/include/mlx/backend/cpu/lapack.h +80 -0
mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
mlx/include/mlx/backend/cpu/simd/math.h +193 -0
mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
mlx/include/mlx/backend/cpu/simd/type.h +11 -0
mlx/include/mlx/backend/cpu/slicing.h +21 -0
mlx/include/mlx/backend/cpu/ternary.h +154 -0
mlx/include/mlx/backend/cpu/threefry.h +21 -0
mlx/include/mlx/backend/cpu/unary.h +281 -0
mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
mlx/include/mlx/backend/cuda/allocator.h +89 -0
mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
mlx/include/mlx/backend/cuda/cuda.h +10 -0
mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
mlx/include/mlx/backend/cuda/device/config.h +12 -0
mlx/include/mlx/backend/cuda/device.h +189 -0
mlx/include/mlx/backend/cuda/event.h +78 -0
mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
mlx/include/mlx/backend/cuda/jit_module.h +119 -0
mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
mlx/include/mlx/backend/cuda/utils.h +46 -0
mlx/include/mlx/backend/cuda/worker.h +55 -0
mlx/include/mlx/backend/gpu/available.h +9 -0
mlx/include/mlx/backend/gpu/copy.h +57 -0
mlx/include/mlx/backend/gpu/eval.h +18 -0
mlx/include/mlx/backend/gpu/slicing.h +36 -0
mlx/include/mlx/backend/metal/allocator.h +79 -0
mlx/include/mlx/backend/metal/binary.h +33 -0
mlx/include/mlx/backend/metal/device.h +283 -0
mlx/include/mlx/backend/metal/jit/includes.h +57 -0
mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
mlx/include/mlx/backend/metal/matmul.h +144 -0
mlx/include/mlx/backend/metal/metal.h +22 -0
mlx/include/mlx/backend/metal/reduce.h +41 -0
mlx/include/mlx/backend/metal/resident.h +32 -0
mlx/include/mlx/backend/metal/scan.h +17 -0
mlx/include/mlx/backend/metal/ternary.h +21 -0
mlx/include/mlx/backend/metal/unary.h +21 -0
mlx/include/mlx/backend/metal/utils.h +84 -0
mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
mlx/include/mlx/compile.h +44 -0
mlx/include/mlx/compile_impl.h +69 -0
mlx/include/mlx/device.h +31 -0
mlx/include/mlx/distributed/distributed.h +60 -0
mlx/include/mlx/distributed/distributed_impl.h +59 -0
mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
mlx/include/mlx/distributed/mpi/mpi.h +12 -0
mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
mlx/include/mlx/distributed/nccl/nccl.h +12 -0
mlx/include/mlx/distributed/ops.h +56 -0
mlx/include/mlx/distributed/primitives.h +156 -0
mlx/include/mlx/distributed/reduction_ops.h +38 -0
mlx/include/mlx/distributed/ring/ring.h +12 -0
mlx/include/mlx/distributed/utils.h +67 -0
mlx/include/mlx/dtype.h +115 -0
mlx/include/mlx/dtype_utils.h +119 -0
mlx/include/mlx/einsum.h +22 -0
mlx/include/mlx/event.h +58 -0
mlx/include/mlx/export.h +136 -0
mlx/include/mlx/export_impl.h +98 -0
mlx/include/mlx/fast.h +102 -0
mlx/include/mlx/fast_primitives.h +427 -0
mlx/include/mlx/fence.h +39 -0
mlx/include/mlx/fft.h +167 -0
mlx/include/mlx/graph_utils.h +66 -0
mlx/include/mlx/io/gguf.h +20 -0
mlx/include/mlx/io/load.h +175 -0
mlx/include/mlx/io.h +61 -0
mlx/include/mlx/linalg.h +111 -0
mlx/include/mlx/memory.h +78 -0
mlx/include/mlx/mlx.h +25 -0
mlx/include/mlx/ops.h +1627 -0
mlx/include/mlx/primitives.h +2524 -0
mlx/include/mlx/random.h +282 -0
mlx/include/mlx/scheduler.h +188 -0
mlx/include/mlx/small_vector.h +540 -0
mlx/include/mlx/stream.h +41 -0
mlx/include/mlx/threadpool.h +133 -0
mlx/include/mlx/transforms.h +229 -0
mlx/include/mlx/transforms_impl.h +86 -0
mlx/include/mlx/types/bf16.h +187 -0
mlx/include/mlx/types/complex.h +113 -0
mlx/include/mlx/types/fp16.h +234 -0
mlx/include/mlx/types/half_types.h +58 -0
mlx/include/mlx/types/limits.h +70 -0
mlx/include/mlx/utils.h +175 -0
mlx/include/mlx/version.h +20 -0
mlx/lib/libmlx.so +0 -0
mlx/py.typed +1 -0
mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
mlx/share/cmake/MLX/extension.cmake +50 -0
mlx/utils.py +325 -0
mlx_cpu-0.30.1.dist-info/METADATA +142 -0
mlx_cpu-0.30.1.dist-info/RECORD +231 -0
mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/types/fp16.h ADDED Viewed

@@ -0,0 +1,234 @@
+// Copyright © 2023 Apple Inc.
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <vector>
+#define __MLX_HALF_NAN__ 0x7D00
+namespace mlx::core {
+namespace {
+union float_bits_fp16 { // Overlays a float and a uint32_t on the same storage
+  float f; // Floating-point view
+  uint32_t u; // Integer view of the same bytes, used for masking and shifting
+};
+} // namespace
+struct _MLX_Float16 {
+  uint16_t bits_; // Raw 16-bit pattern; the conversions below read and write this field
+  // Default constructor (defaulted; bits_ has no default member initializer)
+  _MLX_Float16() = default;
+  // Default copy constructor
+  _MLX_Float16(_MLX_Float16 const&) = default;
+  // Appease std::vector<bool> for being special. NOTE(review): the bool is stored directly in bits_ (true -> 0x0001) instead of going through the float conversion - confirm this is intended
+  _MLX_Float16& operator=(std::vector<bool>::reference x) {
+    bits_ = x;
+    return *this;
+  }
+  _MLX_Float16& operator=(const float& x) { // Assign from float by building a temporary with the converting constructor
+    return (*this = _MLX_Float16(x));
+  }
+  // From float32
+  _MLX_Float16(const float& x) : bits_(0) {
+    // Conversion following
+    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h
+    // Union
+    float_bits_fp16 in;
+    // Take fp32 bits
+    in.f = x;
+    // Find and take sign bit
+    uint32_t x_sign_32 = in.u & uint32_t(0x80000000);
+    uint16_t x_sign_16 = (x_sign_32 >> 16);
+    if (std::isnan(x)) {
+      bits_ = x_sign_16 | uint16_t(__MLX_HALF_NAN__); // NaN input: keep the sign bit and use the fixed NaN pattern
+    } else {
+      // Union
+      float_bits_fp16 inf_scale, zero_scale, magic_bits;
+      // Find exponent bits and take the max supported by half
+      uint32_t x_expo_32 = in.u & uint32_t(0x7f800000);
+      uint32_t max_expo_32 = uint32_t(0x38800000);
+      x_expo_32 = x_expo_32 < max_expo_32 ? max_expo_32 : x_expo_32; // Clamp the exponent field from below
+      x_expo_32 += uint32_t(15) << 23;
+      // Handle scaling to inf as needed
+      inf_scale.u = uint32_t(0x77800000);
+      zero_scale.u = uint32_t(0x08800000);
+      // Combine with magic and let addition do rounding
+      magic_bits.u = x_expo_32;
+      magic_bits.f += (std::abs(x) * inf_scale.f) * zero_scale.f;
+      // Take the lower 5 bits of the exponent
+      uint32_t x_expo_16 = ((magic_bits.u >> 13) & uint32_t(0x7c00));
+      // Collect the lower 12 bits which have the mantissa
+      uint32_t x_mant_16 = magic_bits.u & uint32_t(0x0fff);
+      // Combine sign, exp and mantissa
+      bits_ = (x_sign_16 | uint16_t(x_expo_16 + x_mant_16));
+    }
+  }
+  // To float32
+  operator float() const {
+    // Conversion following
+    // https://github.com/Maratyszcza/FP16/blob/master/include/fp16/fp16.h
+    // Union
+    float_bits_fp16 out;
+    uint32_t x_sign_32 = (bits_ << 16) & uint32_t(0x80000000); // Half sign bit (bit 15) moved to bit 31
+    uint32_t base = (bits_ << 16);
+    uint32_t two_base = base + base; // Doubling shifts left by one more bit, dropping the sign bit
+    uint32_t denorm_max = 1u << 27;
+    if (two_base < denorm_max) { // All five exponent bits of the half are zero: zero or subnormal input
+      out.u = uint32_t(126) << 23; // magic mask
+      out.u |= (two_base >> 17); // Bits from fp16
+      out.f -= 0.5f; // magic bias
+    } else {
+      out.u = uint32_t(0xE0) << 23; // exponent offset
+      out.u += (two_base >> 4); // Bits from fp16
+      float out_unscaled = out.f; // Store value
+      out.u = uint32_t(0x7800000); // exponent scale
+      out.f *= out_unscaled;
+    }
+    // Add sign
+    out.u |= x_sign_32;
+    return out.f;
+  }
+};
+#define half_binop_base(__op__, __operator__, otype, atype, btype, ctype) \
+  inline otype __operator__(atype lhs, btype rhs) {                       \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);        \
+  } // half_binop_base: one operator taking (atype, btype), computing in ctype and returning otype
+#define half_binop_helper(__op__, __operator__, otype, itype, ctype) \
+  inline otype __operator__(_MLX_Float16 lhs, itype rhs) {           \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
+  }                                                                  \
+  inline otype __operator__(itype lhs, _MLX_Float16 rhs) {           \
+    return static_cast<ctype>(lhs) __op__ static_cast<ctype>(rhs);   \
+  } // half_binop_helper: both argument orders of (_MLX_Float16, itype), computing in ctype
+// Arithmetic operators: half with half computes in float and returns _MLX_Float16; mixing with float or double returns that type; mixing with bool or an integer type computes in float and returns _MLX_Float16
+#define half_binop(__op__, __operator__)                                      \
+  half_binop_base(                                                            \
+      __op__, __operator__, _MLX_Float16, _MLX_Float16, _MLX_Float16, float); \
+  half_binop_helper(__op__, __operator__, float, float, float);               \
+  half_binop_helper(__op__, __operator__, double, double, double);            \
+  half_binop_helper(__op__, __operator__, _MLX_Float16, bool, float);         \
+  half_binop_helper(__op__, __operator__, _MLX_Float16, int32_t, float);      \
+  half_binop_helper(__op__, __operator__, _MLX_Float16, uint32_t, float);     \
+  half_binop_helper(__op__, __operator__, _MLX_Float16, int64_t, float);      \
+  half_binop_helper(__op__, __operator__, _MLX_Float16, uint64_t, float);
+half_binop(+, operator+);
+half_binop(-, operator-);
+half_binop(*, operator*);
+half_binop(/, operator/);
+#undef half_binop // Only half_binop is removed here; half_binop_base and half_binop_helper stay defined
+// Comparison ops: every overload returns bool; operands are compared as float, or as double when the other operand is a double. No overload is generated for bool operands here
+#define half_compop(__op__, __operator__)                             \
+  half_binop_base(                                                    \
+      __op__, __operator__, bool, _MLX_Float16, _MLX_Float16, float); \
+  half_binop_helper(__op__, __operator__, bool, float, float);        \
+  half_binop_helper(__op__, __operator__, bool, double, double);      \
+  half_binop_helper(__op__, __operator__, bool, int32_t, float);      \
+  half_binop_helper(__op__, __operator__, bool, uint32_t, float);     \
+  half_binop_helper(__op__, __operator__, bool, int64_t, float);      \
+  half_binop_helper(__op__, __operator__, bool, uint64_t, float);
+half_compop(>, operator>);
+half_compop(<, operator<);
+half_compop(>=, operator>=);
+half_compop(<=, operator<=);
+half_compop(==, operator==);
+half_compop(!=, operator!=);
+#undef half_compop
+// Unary minus: negate the float value, then convert the result back to half
+inline _MLX_Float16 operator-(_MLX_Float16 lhs) {
+  const float negated = -static_cast<float>(lhs);
+  return _MLX_Float16(negated);
+}
+// Inplace ops: `half op= float` and `float op= half`; each applies the matching binary operator and assigns the result back to lhs
+#define half_inplace_op(__op__, __operator__)                              \
+  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, const float& rhs) { \
+    lhs = lhs __op__ rhs;                                                  \
+    return lhs;                                                            \
+  }                                                                        \
+  inline float& __operator__(float& lhs, _MLX_Float16 rhs) {               \
+    lhs = lhs __op__ rhs;                                                  \
+    return lhs;                                                            \
+  }
+half_inplace_op(+, operator+=);
+half_inplace_op(-, operator-=);
+half_inplace_op(*, operator*=);
+half_inplace_op(/, operator/=);
+#undef half_inplace_op
+// Bitwise ops: applied to the raw bits_ pattern (no float conversion), for half/half, half/uint16_t and uint16_t/half operands
+#define half_bitop(__op__, __operator__)                                 \
+  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, _MLX_Float16 rhs) { \
+    _MLX_Float16 out;                                                    \
+    out.bits_ = lhs.bits_ __op__ rhs.bits_;                              \
+    return out;                                                          \
+  }                                                                      \
+  inline _MLX_Float16 __operator__(_MLX_Float16 lhs, uint16_t rhs) {     \
+    _MLX_Float16 out;                                                    \
+    out.bits_ = lhs.bits_ __op__ rhs;                                    \
+    return out;                                                          \
+  }                                                                      \
+  inline _MLX_Float16 __operator__(uint16_t lhs, _MLX_Float16 rhs) {     \
+    _MLX_Float16 out;                                                    \
+    out.bits_ = lhs __op__ rhs.bits_;                                    \
+    return out;                                                          \
+  }
+half_bitop(|, operator|);
+half_bitop(&, operator&);
+half_bitop(^, operator^);
+#undef half_bitop
+#define half_inplace_bitop(__op__, __operator__)                           \
+  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, _MLX_Float16 rhs) { \
+    lhs.bits_ = lhs.bits_ __op__ rhs.bits_;                                \
+    return lhs;                                                            \
+  }                                                                        \
+  inline _MLX_Float16& __operator__(_MLX_Float16& lhs, uint16_t rhs) {     \
+    lhs.bits_ = lhs.bits_ __op__ rhs;                                      \
+    return lhs;                                                            \
+  } // In-place bitwise operators: update lhs.bits_ directly from a half or a uint16_t right-hand side
+half_inplace_bitop(|, operator|=);
+half_inplace_bitop(&, operator&=);
+half_inplace_bitop(^, operator^=);
+#undef half_inplace_bitop
+} // namespace mlx::core

mlx/include/mlx/types/half_types.h ADDED Viewed

@@ -0,0 +1,58 @@
+// Copyright © 2023 Apple Inc.
+#pragma once
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC // Compiler-provided float16_t is available
+#include <arm_fp16.h>
+namespace mlx::core {
+using ::float16_t; // Re-export the global float16_t inside mlx::core
+} // namespace mlx::core
+#else // Otherwise fall back to the software struct from fp16.h
+#define ADD_HALF_BINOPS // Requests the mixed fp16/bf16 operators defined at the end of this header
+#include "mlx/types/fp16.h"
+namespace mlx::core {
+typedef struct _MLX_Float16 float16_t;
+} // namespace mlx::core
+#endif // __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+#ifdef __ARM_FEATURE_BF16 // Same selection for bfloat16_t
+#include <arm_bf16.h>
+namespace mlx::core {
+using ::bfloat16_t; // Re-export the global bfloat16_t inside mlx::core
+} // namespace mlx::core
+#else // Otherwise fall back to the software struct from bf16.h
+#define ADD_HALF_BINOPS // May repeat the definition above; both definitions are empty
+#include "mlx/types/bf16.h"
+namespace mlx::core {
+typedef struct _MLX_BFloat16 bfloat16_t;
+} // namespace mlx::core
+#endif // __ARM_FEATURE_BF16
+#ifdef ADD_HALF_BINOPS // Set when at least one of the two half types uses the software fallback
+namespace mlx::core {
+// clang-format off
+#define fp16_bf16_binop_helper(__op__, __operator__)               \
+  inline float __operator__(float16_t lhs, bfloat16_t rhs) {       \
+    return static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
+  }                                                                \
+  inline float __operator__(bfloat16_t lhs, float16_t rhs) {       \
+    return static_cast<float>(lhs) __op__ static_cast<float>(rhs); \
+  } // Mixed float16_t/bfloat16_t arithmetic: both operands are widened to float and the result is a float
+fp16_bf16_binop_helper(+, operator+)
+fp16_bf16_binop_helper(-, operator-)
+fp16_bf16_binop_helper(*, operator*)
+fp16_bf16_binop_helper(/, operator/)
+// clang-format on
+} // namespace mlx::core
+#endif // ADD_HALF_BINOPS

mlx/include/mlx/types/limits.h ADDED Viewed

@@ -0,0 +1,70 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include <limits>
+#include "mlx/types/half_types.h"
+namespace mlx::core {
+template <typename T>
+struct numeric_limits; // Primary template is declared but not defined; only the specializations below are usable
+template <>
+struct numeric_limits<float> : public std::numeric_limits<float> {}; // Inherits everything from the standard library limits
+template <>
+struct numeric_limits<double> : public std::numeric_limits<double> {}; // Inherits everything from the standard library limits
+template <>
+struct numeric_limits<float16_t> {
+ private:
+  // Lets a raw 16-bit pattern be read back as a float16_t.
+  union half_or_bits {
+    uint16_t bits;
+    float16_t value;
+  };
+  // Builds a float16_t from its raw bit pattern.
+  static constexpr float16_t bits_to_half(uint16_t pattern) {
+    return half_or_bits{pattern}.value;
+  }
+ public:
+  // Each limit below is spelled out as a fixed 16-bit pattern.
+  static constexpr float16_t lowest() {
+    return bits_to_half(0xFBFF);
+  }
+  static constexpr float16_t max() {
+    return bits_to_half(0x7BFF);
+  }
+  static constexpr float16_t epsilon() {
+    return bits_to_half(0x1400);
+  }
+  static constexpr float16_t infinity() {
+    return bits_to_half(0x7C00);
+  }
+};
+template <>
+struct numeric_limits<bfloat16_t> {
+ private:
+  // Lets a raw 16-bit pattern be read back as a bfloat16_t.
+  union bfloat_or_bits {
+    uint16_t bits;
+    bfloat16_t value;
+  };
+  // Builds a bfloat16_t from its raw bit pattern.
+  static constexpr bfloat16_t bits_to_bfloat(uint16_t pattern) {
+    return bfloat_or_bits{pattern}.value;
+  }
+ public:
+  // Each limit below is spelled out as a fixed 16-bit pattern.
+  static constexpr bfloat16_t lowest() {
+    return bits_to_bfloat(0xFF7F);
+  }
+  static constexpr bfloat16_t max() {
+    return bits_to_bfloat(0x7F7F);
+  }
+  static constexpr bfloat16_t epsilon() {
+    return bits_to_bfloat(0x3C00);
+  }
+  static constexpr bfloat16_t infinity() {
+    return bits_to_bfloat(0x7F80);
+  }
+};
+} // namespace mlx::core

mlx/include/mlx/utils.h ADDED Viewed

@@ -0,0 +1,175 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+#include <exception>
+#include <variant>
+#include "mlx/array.h"
+#include "mlx/device.h"
+#include "mlx/dtype.h"
+#include "mlx/stream.h"
+namespace mlx::core {
+using StreamOrDevice = std::variant<std::monostate, Stream, Device>; // Holds nothing (monostate), a Stream, or a Device
+Stream to_stream(StreamOrDevice s); // Declared here; not defined in this header
+Stream to_stream(StreamOrDevice s, Device default_); // Overload taking an extra Device; not defined in this header
+// Scoped override of the default device and stream: the constructor makes the
+// requested stream the default, and the destructor reinstates the stream that
+// was captured when the object was created.
+struct StreamContext {
+ public:
+  StreamContext(StreamOrDevice s) : _stream(default_stream(default_device())) {
+    // An empty variant gives nothing to switch to, so it is rejected.
+    const bool unspecified = std::holds_alternative<std::monostate>(s);
+    if (unspecified) {
+      throw std::runtime_error(
+          "[StreamContext] Invalid argument, please specify a stream or device.");
+    }
+    const auto requested = to_stream(s);
+    set_default_device(requested.device);
+    set_default_stream(requested);
+  }
+  ~StreamContext() {
+    // Put back the device and stream recorded by the constructor.
+    set_default_device(_stream.device);
+    set_default_stream(_stream);
+  }
+ private:
+  // Default stream captured at construction time.
+  Stream _stream;
+};
+struct PrintFormatter { // One print overload per scalar type listed below; the definitions are not in this part of the header
+  inline void print(std::ostream& os, bool val);
+  inline void print(std::ostream& os, int16_t val);
+  inline void print(std::ostream& os, uint16_t val);
+  inline void print(std::ostream& os, int32_t val);
+  inline void print(std::ostream& os, uint32_t val);
+  inline void print(std::ostream& os, int64_t val);
+  inline void print(std::ostream& os, uint64_t val);
+  inline void print(std::ostream& os, float16_t val);
+  inline void print(std::ostream& os, bfloat16_t val);
+  inline void print(std::ostream& os, float val);
+  inline void print(std::ostream& os, double val);
+  inline void print(std::ostream& os, complex64_t val);
+  bool capitalize_bool{false}; // Defaults to false. NOTE(review): presumably switches bool output between true/false and True/False - confirm in the print definitions
+};
+PrintFormatter& get_global_formatter(); // Returns a reference; declared here, defined elsewhere
+/** Print the exception and then abort. */
+void abort_with_exception(const std::exception& error);
+/** Holds information about floating-point types. */
+struct finfo {
+  explicit finfo(Dtype dtype); // explicit, so a Dtype never converts to finfo implicitly; defined elsewhere
+  Dtype dtype;
+  double min; // NOTE(review): presumably the lowest finite value of dtype - confirm in the constructor
+  double max;
+  double eps;
+};
+/** Holds information about integral types. */
+struct iinfo {
+  explicit iinfo(Dtype dtype); // explicit, so a Dtype never converts to iinfo implicitly; defined elsewhere
+  Dtype dtype;
+  int64_t min; // Signed 64-bit field
+  uint64_t max; // Unsigned 64-bit field, unlike min
+};
+/** Dtype obtained by promoting the dtypes of two arrays with one another. */
+inline Dtype result_type(const array& a, const array& b) {
+  const auto& first = a.dtype();
+  const auto& second = b.dtype();
+  return promote_types(first, second);
+}
+inline Dtype result_type(const array& a, const array& b, const array& c) {
+  // Promote the first two arrays, then fold in the dtype of the third.
+  const auto pairwise = result_type(a, b);
+  return promote_types(pairwise, c.dtype());
+}
+Dtype result_type(const std::vector<array>& arrays); // Overload taking a vector of arrays; not defined in this header
+Shape broadcast_shapes(const Shape& s1, const Shape& s2); // Declared here; not defined in this header
+/**
+ * Returns the axis normalized to be in the range [0, ndim).
+ */
+int normalize_axis_index(
+    int axis,
+    int ndim,
+    const std::string& msg_prefix = ""); // Optional. NOTE(review): presumably prepended to the error text for an invalid axis - confirm in the definition
+std::ostream& operator<<(std::ostream& os, const Device& d); // Stream-output overloads declared here and defined elsewhere
+std::ostream& operator<<(std::ostream& os, const Stream& s);
+std::ostream& operator<<(std::ostream& os, const Dtype& d);
+std::ostream& operator<<(std::ostream& os, const Dtype::Kind& k);
+std::ostream& operator<<(std::ostream& os, array a); // Takes the array by value, unlike the overloads above
+inline std::ostream& operator<<(std::ostream& os, const complex64_t& v) {
+  // Written as <real><imag>j, with an explicit "+" only when the imaginary
+  // part is non-negative.
+  const char* sign = "";
+  if (v.imag() >= 0) {
+    sign = "+";
+  }
+  os << v.real() << sign << v.imag() << "j";
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const float16_t& v) {
+  // Half values are printed through their float conversion.
+  const float widened = static_cast<float>(v);
+  os << widened;
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const bfloat16_t& v) {
+  // bfloat16 values are printed through their float conversion.
+  const float widened = static_cast<float>(v);
+  os << widened;
+  return os;
+}
+template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
+inline std::ostream& operator<<(std::ostream& os, const Vec& v) {
+  // Writes the elements as "(a,b,c)"; an empty container is written as "()".
+  os << "(";
+  auto it = v.begin();
+  const auto last = v.end();
+  if (it != last) {
+    os << *it;
+    for (++it; it != last; ++it) {
+      os << "," << *it;
+    }
+  }
+  os << ")";
+  return os;
+}
+// True when n is non-zero and shares no set bit with n - 1.
+inline bool is_power_of_2(int n) {
+  const bool no_shared_bits = (n & (n - 1)) == 0;
+  return no_shared_bits && n != 0;
+}
+// Smallest power of two that is >= n, computed with integer arithmetic only.
+// Returns n itself when n is already a power of two and 0 when n <= 0.
+// Throws std::runtime_error when the result does not fit in an int.
+inline int next_power_of_2(int n) {
+  if (n <= 0) {
+    // n == 0 keeps the value the previous pow/log2 version produced; negative
+    // n previously went through log2 of a negative number (NaN) and an
+    // undefined NaN-to-int conversion.
+    return 0;
+  }
+  const unsigned int target = static_cast<unsigned int>(n);
+  // Doubling in unsigned arithmetic cannot overflow here: target is at most
+  // INT_MAX, so the loop stops no later than the next power of two above it.
+  unsigned int result = 1;
+  while (result < target) {
+    result <<= 1;
+  }
+  const unsigned int int_max = ~0u >> 1;
+  if (result > int_max) {
+    throw std::runtime_error(
+        "[next_power_of_2] Result does not fit in an int.");
+  }
+  return static_cast<int>(result);
+}
+namespace env {
+// Integer lookup by name with a caller-supplied default; declared here and
+// defined elsewhere.
+int get_var(const char* name, int default_value);
+// Every accessor below queries get_var once, on its first call, and returns
+// that cached value afterwards. For the accessors that take default_value,
+// only the argument of the first call is ever used.
+inline int bfs_max_width() {
+  static const int cached = get_var("MLX_BFS_MAX_WIDTH", 20);
+  return cached;
+}
+inline int max_ops_per_buffer(int default_value) {
+  static const int cached = get_var("MLX_MAX_OPS_PER_BUFFER", default_value);
+  return cached;
+}
+inline int max_mb_per_buffer(int default_value) {
+  static const int cached = get_var("MLX_MAX_MB_PER_BUFFER", default_value);
+  return cached;
+}
+inline bool metal_fast_synch() {
+  // Any non-zero value enables the flag; the default is 0.
+  static const bool cached = get_var("MLX_METAL_FAST_SYNCH", 0) != 0;
+  return cached;
+}
+inline bool enable_tf32() {
+  // Any non-zero value enables the flag; the default is 1.
+  static const bool cached = get_var("MLX_ENABLE_TF32", 1) != 0;
+  return cached;
+}
+inline int nccl_timeout(int default_value) {
+  static const int cached = get_var("MLX_NCCL_TIMEOUT", default_value);
+  return cached;
+}
+} // namespace env
+} // namespace mlx::core

mlx/include/mlx/version.h ADDED Viewed

@@ -0,0 +1,20 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+#define MLX_VERSION_MAJOR 0 // Version components of this release: 0.30.1
+#define MLX_VERSION_MINOR 30
+#define MLX_VERSION_PATCH 1
+#define MLX_VERSION_NUMERIC \
+  (100000 * MLX_VERSION_MAJOR + 1000 * MLX_VERSION_MINOR + MLX_VERSION_PATCH) // Single comparable integer; evaluates to 30001 for 0.30.1
+namespace mlx::core {
+/* A string representation of the MLX version in the format
+ * "major.minor.patch".
+ *
+ * For dev builds, the version will include the suffix ".devYYYYMMDD+hash"
+ */
+const char* version(); // Declared here; defined in the library
+} // namespace mlx::core

mlx/lib/libmlx.so ADDED Viewed

Binary file

mlx/py.typed ADDED Viewed

	@@ -0,0 +1 @@
1	+

mlx/share/cmake/MLX/FindNCCL.cmake ADDED Viewed

@@ -0,0 +1,54 @@
+# FindNCCL.cmake: finds the NVIDIA NCCL library and its include directories.
+#
+# Optional inputs:
+#   NCCL_ROOT_DIR          cache path; defaults to the NCCL_ROOT_DIR environment variable
+#   NCCL_INCLUDE_DIR       extra hint for the directory containing nccl.h
+#   NCCL_LIB_DIR           extra hint for the directory containing the library
+#   CUDA_TOOLKIT_ROOT_DIR  its include/, lib/ and lib64/ directories are also searched
+#   ENV{USE_STATIC_NCCL}   a true value selects libnccl_static.a instead of nccl
+#
+# Results:
+#   NCCL_FOUND, NCCL_INCLUDE_DIRS, NCCL_LIBRARIES and, when nccl.h defines
+#   NCCL_MAJOR, NCCL_MAJOR_VERSION.
+# The environment value is quoted so a path containing spaces or semicolons
+# stays a single argument.
+set(NCCL_ROOT_DIR
+    "$ENV{NCCL_ROOT_DIR}"
+    CACHE PATH "Folder contains NVIDIA NCCL")
+find_path(
+  NCCL_INCLUDE_DIRS
+  NAMES nccl.h
+  HINTS ${NCCL_INCLUDE_DIR} ${NCCL_ROOT_DIR} ${NCCL_ROOT_DIR}/include
+        ${CUDA_TOOLKIT_ROOT_DIR}/include)
+# Quoted so an unset variable yields if("") (false) and an arbitrary value is
+# never dereferenced as a CMake variable name.
+if("$ENV{USE_STATIC_NCCL}")
+  message(
+    STATUS "USE_STATIC_NCCL detected. Linking against static NCCL library")
+  set(NCCL_LIBNAME "libnccl_static.a")
+else()
+  set(NCCL_LIBNAME "nccl")
+endif()
+find_library(
+  NCCL_LIBRARIES
+  NAMES "${NCCL_LIBNAME}"
+  HINTS ${NCCL_LIB_DIR}
+        ${NCCL_ROOT_DIR}
+        ${NCCL_ROOT_DIR}/lib
+        ${NCCL_ROOT_DIR}/lib/x86_64-linux-gnu
+        ${NCCL_ROOT_DIR}/lib64
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib
+        ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS
+                                  NCCL_LIBRARIES)
+if(NCCL_FOUND)
+  set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h")
+  message(
+    STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}")
+  # The header path is quoted so install prefixes with spaces still work.
+  file(
+    STRINGS "${NCCL_HEADER_FILE}" NCCL_MAJOR_VERSION_DEFINED
+    REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$"
+    LIMIT_COUNT 1)
+  if(NCCL_MAJOR_VERSION_DEFINED)
+    # Capture only the digits, so a trailing comment or whitespace after the
+    # number does not end up in NCCL_MAJOR_VERSION.
+    string(REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+([0-9]+).*$"
+                         "\\1" NCCL_MAJOR_VERSION
+                         "${NCCL_MAJOR_VERSION_DEFINED}")
+    message(STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}")
+  endif()
+  message(
+    STATUS
+      "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES)
+endif()

mlx/share/cmake/MLX/Findnvpl.cmake ADDED Viewed

@@ -0,0 +1,3 @@
+# This file intentionally does nothing. Its presence suppresses the CMake
+# warning "By not providing Findnvpl.cmake in CMAKE_MODULE_PATH...", which is
+# triggered by the find_package(nvpl) call in CMake's built-in FindLAPACK.cmake module.

mlx/share/cmake/MLX/MLXConfig.cmake ADDED Viewed

@@ -0,0 +1,66 @@
+# Find MLX
+#
+# Defines the following variables:
+#
+#   MLX_FOUND            : True if MLX is found
+#   MLX_INCLUDE_DIRS     : Include directory
+#   MLX_LIBRARIES        : Libraries to link against
+#   MLX_CXX_FLAGS        : Additional compiler flags
+#   MLX_BUILD_ACCELERATE : True if MLX was built with accelerate
+#   MLX_BUILD_METAL      : True if MLX was built with metal
+####### Expanded from @PACKAGE_INIT@ by configure_package_config_file() #######
+####### Any changes to this file will be overwritten by the next CMake run ####
+####### The input file was mlx.pc.in                            ########
+get_filename_component(PACKAGE_PREFIX_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) # Package root: three directories above the directory holding this file
+macro(set_and_check _var _file) # Sets the variable named by _var to _file, then aborts configuration if that path does not exist
+  set(${_var} "${_file}")
+  if(NOT EXISTS "${_file}")
+    message(FATAL_ERROR "File or directory ${_file} referenced by variable ${_var} does not exist !")
+  endif()
+endmacro()
+####################################################################################
+include(${PACKAGE_PREFIX_DIR}/share/cmake/MLX/MLXTargets.cmake) # Loaded before the set_target_properties(mlx ...) call below
+include(${PACKAGE_PREFIX_DIR}/share/cmake/MLX/extension.cmake)
+set_and_check(MLX_LIBRARY_DIRS ${PACKAGE_PREFIX_DIR}/lib) # Fails configuration if <prefix>/lib is missing
+set_and_check(MLX_INCLUDE_DIRS ${PACKAGE_PREFIX_DIR}/include) # Fails configuration if <prefix>/include is missing
+set(MLX_LIBRARIES mlx)
+find_library(MLX_LIBRARY mlx PATHS ${MLX_LIBRARY_DIRS}) # PATHS adds the package lib directory to the locations searched
+if (OFF) # Hard-coded OFF in this generated file, so the Accelerate settings in this branch are never applied
+    set(MLX_BUILD_ACCELERATE OFF)
+    set(MLX_CXX_FLAGS ${MLX_CXX_FLAGS} -DACCELERATE_NEW_LAPACK)
+endif()
+if (OFF) # Hard-coded OFF in this generated file, so the Metal settings in this branch are never applied
+    set(MLX_BUILD_METAL OFF)
+    set(MLX_CXX_FLAGS ${MLX_CXX_FLAGS} -D_METAL_)
+    set(MLX_INCLUDE_DIRS
+        "${MLX_INCLUDE_DIRS};"
+        ${PACKAGE_PREFIX_DIR}/include/metal_cpp
+    )
+    if( GREATER_EQUAL 310) # NOTE(review): the left operand is missing (looks like an empty template substitution); it goes unnoticed only because the enclosing if (OFF) is never entered
+      set(MLX_INCLUDE_DIRS
+        "${MLX_INCLUDE_DIRS};"
+        ${PACKAGE_PREFIX_DIR}/include/mlx/backend/metal/kernels/metal_3_1)
+    else()
+      set(MLX_INCLUDE_DIRS
+        "${MLX_INCLUDE_DIRS};"
+        ${PACKAGE_PREFIX_DIR}/include/mlx/backend/metal/kernels/metal_3_0)
+    endif()
+endif()
+set_target_properties(mlx PROPERTIES
+    CXX_STANDARD 17
+    INTERFACE_COMPILE_OPTIONS "${MLX_CXX_FLAGS}" # Empty in this build, since neither branch above appends to MLX_CXX_FLAGS
+)
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(MLX DEFAULT_MSG MLX_LIBRARY MLX_INCLUDE_DIRS) # Sets MLX_FOUND when both variables are non-empty