PyPI - mlx-cpu - Versions diffs - 0.30.1__py3-none-manylinux_2_35_x86_64.whl - Mend

mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (231) hide show

mlx/__main__.py +27 -0
mlx/_reprlib_fix.py +16 -0
mlx/extension.py +88 -0
mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
mlx/include/mlx/allocator.h +73 -0
mlx/include/mlx/array.h +645 -0
mlx/include/mlx/backend/common/binary.h +97 -0
mlx/include/mlx/backend/common/broadcasting.h +11 -0
mlx/include/mlx/backend/common/buffer_cache.h +157 -0
mlx/include/mlx/backend/common/compiled.h +77 -0
mlx/include/mlx/backend/common/copy.h +50 -0
mlx/include/mlx/backend/common/hadamard.h +109 -0
mlx/include/mlx/backend/common/matmul.h +67 -0
mlx/include/mlx/backend/common/reduce.h +59 -0
mlx/include/mlx/backend/common/slicing.h +20 -0
mlx/include/mlx/backend/common/ternary.h +85 -0
mlx/include/mlx/backend/common/unary.h +29 -0
mlx/include/mlx/backend/common/utils.h +205 -0
mlx/include/mlx/backend/cpu/arange.h +28 -0
mlx/include/mlx/backend/cpu/available.h +9 -0
mlx/include/mlx/backend/cpu/binary.h +517 -0
mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
mlx/include/mlx/backend/cpu/binary_two.h +166 -0
mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
mlx/include/mlx/backend/cpu/copy.h +36 -0
mlx/include/mlx/backend/cpu/encoder.h +67 -0
mlx/include/mlx/backend/cpu/eval.h +12 -0
mlx/include/mlx/backend/cpu/gemm.h +26 -0
mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
mlx/include/mlx/backend/cpu/lapack.h +80 -0
mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
mlx/include/mlx/backend/cpu/simd/math.h +193 -0
mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
mlx/include/mlx/backend/cpu/simd/type.h +11 -0
mlx/include/mlx/backend/cpu/slicing.h +21 -0
mlx/include/mlx/backend/cpu/ternary.h +154 -0
mlx/include/mlx/backend/cpu/threefry.h +21 -0
mlx/include/mlx/backend/cpu/unary.h +281 -0
mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
mlx/include/mlx/backend/cuda/allocator.h +89 -0
mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
mlx/include/mlx/backend/cuda/cuda.h +10 -0
mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
mlx/include/mlx/backend/cuda/device/config.h +12 -0
mlx/include/mlx/backend/cuda/device.h +189 -0
mlx/include/mlx/backend/cuda/event.h +78 -0
mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
mlx/include/mlx/backend/cuda/jit_module.h +119 -0
mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
mlx/include/mlx/backend/cuda/utils.h +46 -0
mlx/include/mlx/backend/cuda/worker.h +55 -0
mlx/include/mlx/backend/gpu/available.h +9 -0
mlx/include/mlx/backend/gpu/copy.h +57 -0
mlx/include/mlx/backend/gpu/eval.h +18 -0
mlx/include/mlx/backend/gpu/slicing.h +36 -0
mlx/include/mlx/backend/metal/allocator.h +79 -0
mlx/include/mlx/backend/metal/binary.h +33 -0
mlx/include/mlx/backend/metal/device.h +283 -0
mlx/include/mlx/backend/metal/jit/includes.h +57 -0
mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
mlx/include/mlx/backend/metal/matmul.h +144 -0
mlx/include/mlx/backend/metal/metal.h +22 -0
mlx/include/mlx/backend/metal/reduce.h +41 -0
mlx/include/mlx/backend/metal/resident.h +32 -0
mlx/include/mlx/backend/metal/scan.h +17 -0
mlx/include/mlx/backend/metal/ternary.h +21 -0
mlx/include/mlx/backend/metal/unary.h +21 -0
mlx/include/mlx/backend/metal/utils.h +84 -0
mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
mlx/include/mlx/compile.h +44 -0
mlx/include/mlx/compile_impl.h +69 -0
mlx/include/mlx/device.h +31 -0
mlx/include/mlx/distributed/distributed.h +60 -0
mlx/include/mlx/distributed/distributed_impl.h +59 -0
mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
mlx/include/mlx/distributed/mpi/mpi.h +12 -0
mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
mlx/include/mlx/distributed/nccl/nccl.h +12 -0
mlx/include/mlx/distributed/ops.h +56 -0
mlx/include/mlx/distributed/primitives.h +156 -0
mlx/include/mlx/distributed/reduction_ops.h +38 -0
mlx/include/mlx/distributed/ring/ring.h +12 -0
mlx/include/mlx/distributed/utils.h +67 -0
mlx/include/mlx/dtype.h +115 -0
mlx/include/mlx/dtype_utils.h +119 -0
mlx/include/mlx/einsum.h +22 -0
mlx/include/mlx/event.h +58 -0
mlx/include/mlx/export.h +136 -0
mlx/include/mlx/export_impl.h +98 -0
mlx/include/mlx/fast.h +102 -0
mlx/include/mlx/fast_primitives.h +427 -0
mlx/include/mlx/fence.h +39 -0
mlx/include/mlx/fft.h +167 -0
mlx/include/mlx/graph_utils.h +66 -0
mlx/include/mlx/io/gguf.h +20 -0
mlx/include/mlx/io/load.h +175 -0
mlx/include/mlx/io.h +61 -0
mlx/include/mlx/linalg.h +111 -0
mlx/include/mlx/memory.h +78 -0
mlx/include/mlx/mlx.h +25 -0
mlx/include/mlx/ops.h +1627 -0
mlx/include/mlx/primitives.h +2524 -0
mlx/include/mlx/random.h +282 -0
mlx/include/mlx/scheduler.h +188 -0
mlx/include/mlx/small_vector.h +540 -0
mlx/include/mlx/stream.h +41 -0
mlx/include/mlx/threadpool.h +133 -0
mlx/include/mlx/transforms.h +229 -0
mlx/include/mlx/transforms_impl.h +86 -0
mlx/include/mlx/types/bf16.h +187 -0
mlx/include/mlx/types/complex.h +113 -0
mlx/include/mlx/types/fp16.h +234 -0
mlx/include/mlx/types/half_types.h +58 -0
mlx/include/mlx/types/limits.h +70 -0
mlx/include/mlx/utils.h +175 -0
mlx/include/mlx/version.h +20 -0
mlx/lib/libmlx.so +0 -0
mlx/py.typed +1 -0
mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
mlx/share/cmake/MLX/extension.cmake +50 -0
mlx/utils.py +325 -0
mlx_cpu-0.30.1.dist-info/METADATA +142 -0
mlx_cpu-0.30.1.dist-info/RECORD +231 -0
mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0

mlx/include/mlx/backend/metal/scan.h ADDED Viewed

@@ -0,0 +1,17 @@
+#pragma once
+#include "mlx/array.h"
+#include "mlx/primitives.h"
+namespace mlx::core {
+// Declaration only; the definition is not part of this header.
+// `in` is passed by value and `out` by mutable reference. `reduce_type` is a
+// Scan::ReduceType value; `axis`, `reverse` and `inclusive` presumably carry
+// the same meaning as on the Scan primitive, and `s` is presumably the
+// stream the work is issued on -- TODO confirm against the implementation.
+void scan_gpu_inplace(
+    array in,
+    array& out,
+    Scan::ReduceType reduce_type,
+    int axis,
+    bool reverse,
+    bool inclusive,
+    const Stream& s);
+} // namespace mlx::core

mlx/include/mlx/backend/metal/ternary.h ADDED Viewed

@@ -0,0 +1,21 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/array.h"
+namespace mlx::core {
+// Declarations of the ternary-op GPU entry points (defined elsewhere).
+// `inputs` holds the operand arrays, `out` is written through a mutable
+// reference, `op` is a C string naming the operation and `s` is the stream.
+// NOTE(review): presumably `inputs` has exactly three entries -- confirm.
+void ternary_op_gpu(
+    const std::vector<array>& inputs,
+    array& out,
+    const char* op,
+    const Stream& s);
+// Same parameter list as ternary_op_gpu.
+// NOTE(review): the `_inplace` suffix suggests `out` is prepared by the
+// caller rather than allocated here -- confirm against the implementation.
+void ternary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    array& out,
+    const char* op,
+    const Stream& s);
+} // namespace mlx::core

mlx/include/mlx/backend/metal/unary.h ADDED Viewed

@@ -0,0 +1,21 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/array.h"
+namespace mlx::core {
+// Declarations of the unary-op GPU entry points (defined elsewhere).
+// `inputs` holds the operand arrays, `out` is written through a mutable
+// reference, `op` is a C string naming the operation and `s` is the stream.
+// NOTE(review): presumably `inputs` has exactly one entry -- confirm.
+void unary_op_gpu(
+    const std::vector<array>& inputs,
+    array& out,
+    const char* op,
+    const Stream& s);
+// Same parameter list as unary_op_gpu.
+// NOTE(review): the `_inplace` suffix suggests `out` is prepared by the
+// caller rather than allocated here -- confirm against the implementation.
+void unary_op_gpu_inplace(
+    const std::vector<array>& inputs,
+    array& out,
+    const char* op,
+    const Stream& s);
+} // namespace mlx::core

mlx/include/mlx/backend/metal/utils.h ADDED Viewed

@@ -0,0 +1,84 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+#include <type_traits>
+#include "mlx/array.h"
+#include "mlx/backend/metal/device.h"
+#include "mlx/primitives.h"
+namespace mlx::core {
+// Return a string name for a Dtype, or for an array (presumably the name of
+// the array's dtype -- TODO confirm). Defined elsewhere.
+std::string type_to_name(const Dtype& t);
+std::string type_to_name(const array& a);
+// Compute the grid and block dimensions, check backend/common/utils.h for docs.
+// These overloads return the result as an MTL::Size; `pow2` defaults to 10.
+MTL::Size get_block_dims(int dim0, int dim1, int dim2, int pow2 = 10);
+MTL::Size get_2d_grid_dims(const Shape& shape, const Strides& strides);
+MTL::Size
+get_2d_grid_dims(const Shape& shape, const Strides& strides, size_t divisor);
+// Converts the text accumulated in `os` into an NS::String, interpreting the
+// bytes as UTF-8.
+inline NS::String* make_string(std::ostringstream& os) {
+  const std::string text = os.str();
+  return NS::String::string(text.c_str(), NS::UTF8StringEncoding);
+}
+// When MLX_METAL_DEBUG is defined, labels `queue` with "Stream <index>".
+// Without that macro the body is empty and both arguments are unused.
+inline void debug_set_stream_queue_label(MTL::CommandQueue* queue, int index) {
+#ifdef MLX_METAL_DEBUG
+  std::ostringstream queue_name;
+  queue_name << "Stream " << index;
+  queue->setLabel(make_string(queue_name));
+#endif
+}
+// When MLX_METAL_DEBUG is defined, appends `primitive.name()` to the command
+// buffer's current label (if it has one) and stores the result as the new
+// label. Without that macro the body is empty.
+inline void debug_set_primitive_buffer_label(
+    MTL::CommandBuffer* command_buffer,
+    Primitive& primitive) {
+#ifdef MLX_METAL_DEBUG
+  std::ostringstream combined;
+  auto existing = command_buffer->label();
+  if (existing) {
+    // Keep whatever was already recorded on the buffer.
+    combined << existing->utf8String();
+  }
+  combined << primitive.name();
+  command_buffer->setLabel(make_string(combined));
+#endif
+}
+// True for arithmetic types other than char, signed char, unsigned char and
+// wchar_t. Used below to decide whether a value goes through std::to_string.
+template <typename T>
+constexpr bool is_numeric_except_char = std::is_arithmetic_v<T> &&
+    !(std::is_same_v<T, char> || std::is_same_v<T, signed char> ||
+      std::is_same_v<T, unsigned char> || std::is_same_v<T, wchar_t>);
+// Appends a single value to `acc`: numeric values are converted with
+// std::to_string, everything else is appended with operator+=.
+template <typename T>
+void concatenate(std::string& acc, T first) {
+  if constexpr (!is_numeric_except_char<T>) {
+    acc += first;
+  } else {
+    acc += std::to_string(first);
+  }
+}
+// Appends `first` and then each of `args`, left to right, to `acc`.
+template <typename T, typename... Args>
+void concatenate(std::string& acc, T first, Args... args) {
+  // The head goes through the single-value overload; the tail recurses.
+  concatenate(acc, std::move(first));
+  concatenate(acc, args...);
+}
+// Returns 8 / dtype.size() (integer division), but never less than 1.
+// NOTE(review): dtype.size() is presumably the element size in bytes --
+// confirm against Dtype.
+inline int get_work_per_thread(Dtype dtype) {
+  const int per_thread = 8 / dtype.size();
+  return std::max(1, per_thread);
+}
+// Same as the single-argument overload, except that any `size` below
+// 1 << 16 always yields 1.
+inline int get_work_per_thread(Dtype dtype, size_t size) {
+  constexpr size_t wpt_threshold = 1 << 16;
+  if (size < wpt_threshold) {
+    return 1;
+  }
+  return get_work_per_thread(dtype);
+}
+// Returns n divided by m, rounded up.
+// Built from the quotient and remainder so that, unlike (n + m - 1) / m, the
+// intermediate value cannot wrap around when n (or m) is close to the largest
+// size_t. m must be non-zero.
+inline size_t ceildiv(size_t n, size_t m) {
+  return n / m + (n % m != 0 ? 1 : 0);
+}
+} // namespace mlx::core

mlx/include/mlx/backend/no_gpu/apple_memory.h ADDED Viewed

@@ -0,0 +1,16 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+#include <sys/sysctl.h>
+namespace {
+// Returns the value of the "hw.memsize" sysctl, or 0 when the query fails.
+size_t get_memory_size() {
+  size_t memsize = 0;
+  size_t length = sizeof(memsize);
+  // sysctlbyname reports success with 0. On any failure return 0 explicitly
+  // instead of whatever happens to be in `memsize`.
+  if (sysctlbyname("hw.memsize", &memsize, &length, nullptr, 0) != 0) {
+    return 0;
+  }
+  return memsize;
+}
+} // namespace

mlx/include/mlx/backend/no_gpu/linux_memory.h ADDED Viewed

@@ -0,0 +1,22 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+#include <sys/sysinfo.h>
+namespace {
+// Returns totalram * mem_unit as reported by sysinfo(), or 0 when the
+// sysinfo() call fails.
+size_t get_memory_size() {
+  struct sysinfo mem_info;
+  const bool query_ok = sysinfo(&mem_info) == 0;
+  if (!query_ok) {
+    return 0;
+  }
+  // Widen to size_t before scaling, exactly as a size_t accumulator would.
+  return static_cast<size_t>(mem_info.totalram) * mem_info.mem_unit;
+}
+} // namespace

mlx/include/mlx/compile.h ADDED Viewed

@@ -0,0 +1,44 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+#include "mlx/array.h"
+namespace mlx::core {
+// Modes accepted by set_compile_mode().
+enum class CompileMode {
+  disabled,
+  no_simplify,
+  no_fuse,
+  enabled,
+};
+/** Compile takes a function and returns a compiled function.
+ *
+ * The first overload accepts a std::function, the second a plain function
+ * pointer with the same signature. `shapeless` defaults to false.
+ * NOTE(review): `shapeless` presumably allows reuse of the compiled function
+ * across input shapes -- confirm against the implementation.
+ */
+std::function<std::vector<array>(const std::vector<array>&)> compile(
+    std::function<std::vector<array>(const std::vector<array>&)> fun,
+    bool shapeless = false);
+std::function<std::vector<array>(const std::vector<array>&)> compile(
+    std::vector<array> (*fun)(const std::vector<array>&),
+    bool shapeless = false);
+// Overload for callables that support unary `+`, such as capture-less
+// lambdas: `+f` yields the converted value (a function pointer for a
+// capture-less lambda), which is then passed on to compile().
+// NOTE(review): the constraint only checks that `+f` is usable, not that its
+// type matches the function-pointer overload -- confirm that mismatched
+// callables cannot re-select this template.
+template <
+    typename F,
+    typename = std::enable_if_t<
+        std::is_convertible_v<F, decltype(+std::declval<F>())>>>
+std::function<std::vector<array>(const std::vector<array>&)> compile(
+    F&& f,
+    bool shapeless = false) {
+  const auto fn_ptr = +f;
+  return compile(fn_ptr, shapeless);
+}
+/** Globally disable compilation.
+ * Setting the environment variable ``MLX_DISABLE_COMPILE`` can also
+ * be used to disable compilation.
+ */
+void disable_compile();
+/** Globally enable compilation.
+ * This will override the environment variable ``MLX_DISABLE_COMPILE``.
+ */
+void enable_compile();
+/** Set the compile mode to the given CompileMode value (one of disabled,
+ * no_simplify, no_fuse or enabled).
+ */
+void set_compile_mode(CompileMode mode);
+} // namespace mlx::core

mlx/include/mlx/compile_impl.h ADDED Viewed

@@ -0,0 +1,69 @@
+// Copyright © 2023-2024 Apple Inc.
+#pragma once
+#include <unordered_map>
+#include "mlx/array.h"
+namespace mlx::core::detail {
+// A list of arrays paired with a type-erased shared pointer.
+// NOTE(review): what the shared_ptr<void> carries is not visible here.
+using ArraysAndExtra = std::pair<std::vector<array>, std::shared_ptr<void>>;
+// A function from input arrays to ArraysAndExtra.
+using ArrayFnWithExtra =
+    std::function<ArraysAndExtra(const std::vector<array>&)>;
+// This is not part of the general C++ API as calling with a bad id is a bad
+// idea.
+// `fun_id` identifies the function (compile_erase takes the same kind of
+// id); `shapeless` defaults to false and `constants` to an empty vector.
+std::function<std::vector<array>(const std::vector<array>&)> compile(
+    std::function<std::vector<array>(const std::vector<array>&)> fun,
+    std::uintptr_t fun_id,
+    bool shapeless = false,
+    std::vector<uint64_t> constants = {});
+// Variant for functions that also return the extra pointer. Note that this
+// overload has no default arguments.
+ArrayFnWithExtra compile(
+    ArrayFnWithExtra fun,
+    std::uintptr_t fun_id,
+    bool shapeless,
+    std::vector<uint64_t> constants);
+// Erase cached compile functions
+// (those associated with the given `fun_id`).
+void compile_erase(std::uintptr_t fun_id);
+// Clear the compiler cache causing a recompilation of all compiled functions
+// when called again.
+void compile_clear_cache();
+// NOTE(review): presumably reports whether compiled functions can run on
+// `device` -- confirm against the implementation.
+bool compile_available_for_device(const Device& device);
+// Takes the function and its inputs; returns two array lists plus a
+// type-erased shared pointer.
+// NOTE(review): presumably these are the traced inputs, the traced outputs
+// and the function's "extra" value, in that order -- confirm.
+std::tuple<std::vector<array>, std::vector<array>, std::shared_ptr<void>>
+compile_trace(
+    const ArrayFnWithExtra& fun,
+    const std::vector<array>& inputs,
+    bool shapeless);
+// Maps an id to a list of (array, int) pairs.
+// NOTE(review): the int is presumably the position of the keyed array among
+// the paired array's inputs -- confirm.
+using ParentsMap =
+    std::unordered_map<std::uintptr_t, std::vector<std::pair<array, int>>>;
+// Traverses the graph to build a tape and a map of array ids to their parents
+// `outputs` is taken by non-const reference, so it may be modified.
+std::pair<std::vector<array>, ParentsMap> compile_dfs(
+    const std::vector<array>& inputs,
+    std::vector<array>& outputs,
+    const std::vector<array>& original_inputs);
+// Simplify the tape.
+// `tape`, `parents_map` and `outputs` are all taken by non-const reference.
+// NOTE(review): `passes` is presumably the number of simplification passes
+// to run -- confirm.
+void compile_simplify(
+    std::vector<array>& tape,
+    ParentsMap& parents_map,
+    std::vector<array>& outputs,
+    int passes);
+// NOTE(review): presumably replays `tape` with `inputs` substituted for
+// `trace_inputs` and returns the arrays corresponding to `trace_outputs` --
+// confirm against the implementation.
+std::vector<array> compile_replace(
+    const std::vector<array>& tape,
+    const std::vector<array>& trace_inputs,
+    const std::vector<array>& trace_outputs,
+    const std::vector<array>& inputs,
+    bool shapeless);
+// NOTE(review): presumably checks that `tape` can be used in shapeless mode
+// -- how failure is reported is not visible here.
+void compile_validate_shapeless(const std::vector<array>& tape);
+} // namespace mlx::core::detail

mlx/include/mlx/device.h ADDED Viewed

@@ -0,0 +1,31 @@
+// Copyright © 2023 Apple Inc.
+#pragma once
+namespace mlx::core {
+// A device, described by its kind (cpu or gpu) and an integer index.
+struct Device {
+  enum class DeviceType {
+    cpu,
+    gpu,
+  };
+  // Aliases that let callers write Device::cpu / Device::gpu.
+  static constexpr DeviceType cpu = DeviceType::cpu;
+  static constexpr DeviceType gpu = DeviceType::gpu;
+  // Not explicit, so a DeviceType converts to a Device with index 0.
+  Device(DeviceType type, int index = 0) : type{type}, index{index} {}
+  DeviceType type;
+  int index;
+};
+// Accessor and setter for the default device (defined elsewhere).
+const Device& default_device();
+void set_default_device(const Device& d);
+// Equality / inequality of two devices.
+// NOTE(review): presumably compares both `type` and `index` -- confirm.
+bool operator==(const Device& lhs, const Device& rhs);
+bool operator!=(const Device& lhs, const Device& rhs);
+// NOTE(review): presumably reports whether device `d` can be used in this
+// build or on this machine -- confirm.
+bool is_available(const Device& d);
+} // namespace mlx::core

mlx/include/mlx/distributed/distributed.h ADDED Viewed

@@ -0,0 +1,60 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include <memory>
+#include "mlx/array.h"
+#include "mlx/utils.h"
+namespace mlx::core::distributed {
+// Forward declaration of the base group implementation.
+// Only the name is needed in this header: Group holds the implementation
+// through a std::shared_ptr.
+namespace detail {
+class GroupImpl;
+} // namespace detail
+/* Check if a communication backend is available */
+bool is_available();
+/* Same check for a backend selected by the string `bk`.
+ * NOTE(review): the accepted strings are not visible here; init() uses
+ * "any" as its default. */
+bool is_available(const std::string& bk);
+/**
+ * A distributed::Group is a set of independent mlx processes that are able
+ * to communicate with one another. Groups can be split into sub-groups to
+ * express more granular communication.
+ */
+struct Group {
+  Group(std::shared_ptr<detail::GroupImpl> group) : group_{std::move(group)} {}
+  int rank() const;
+  int size() const;
+  /**
+   * Split this group by color: processes passing the same color end up in
+   * the same new group.
+   *
+   * The key orders the processes inside the new group (smaller key, smaller
+   * rank). A negative key means the rank in the current group is used.
+   */
+  Group split(int color, int key = -1) const;
+  /** Access the underlying implementation pointer. */
+  const std::shared_ptr<detail::GroupImpl>& raw_group() const {
+    return group_;
+  }
+ private:
+  std::shared_ptr<detail::GroupImpl> group_;
+};
+/**
+ * Initialize the distributed backend and return the group containing all
+ * discoverable processes.
+ *
+ * If strict is true then throw an error if we couldn't initialize the
+ * distributed subsystem. Otherwise simply return a singleton group which will
+ * render communication operations as no-ops.
+ *
+ * `bk` names the backend to use and defaults to "any".
+ */
+Group init(bool strict = false, const std::string& bk = "any");
+} // namespace mlx::core::distributed

mlx/include/mlx/distributed/distributed_impl.h ADDED Viewed

@@ -0,0 +1,59 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include "mlx/distributed/distributed.h"
+namespace mlx::core::distributed::detail {
+/**
+ * Interface implemented by every distributed group backend. Apart from the
+ * destructor, every member is pure virtual.
+ */
+class GroupImpl {
+ public:
+  virtual ~GroupImpl() = default;
+  // Stream on which this communication group can operate.
+  virtual Stream communication_stream(StreamOrDevice s = {}) = 0;
+  // Group queries and sub-group creation.
+  virtual int rank() = 0;
+  virtual int size() = 0;
+  virtual std::shared_ptr<GroupImpl> split(int color, int key = -1) = 0;
+  // Communication primitives. The declaration order is kept as is because it
+  // determines the virtual table layout.
+  virtual void all_sum(const array& input, array& output, Stream stream) = 0;
+  virtual void all_gather(const array& input, array& output, Stream stream) = 0;
+  virtual void send(const array& input, int dst, Stream stream) = 0;
+  virtual void recv(array& out, int src, Stream stream) = 0;
+  virtual void all_max(const array& input, array& output, Stream stream) = 0;
+  virtual void all_min(const array& input, array& output, Stream stream) = 0;
+  virtual void
+  sum_scatter(const array& input, array& output, Stream stream) = 0;
+};
+/** Define the MLX stream that the communication should happen in. */
+Stream communication_stream(Group group, StreamOrDevice s = {});
+/** Perform an all reduce sum operation */
+void all_sum(Group group, const array& input, array& output, Stream stream);
+/** Perform an all gather operation */
+void all_gather(Group group, const array& input, array& output, Stream stream);
+/** Send an array to the dst rank */
+void send(Group group, const array& input, int dst, Stream stream);
+/** Recv an array from the src rank */
+void recv(Group group, array& out, int src, Stream stream);
+/** Max reduction */
+void all_max(Group group, const array& input, array& output, Stream stream);
+/** Min reduction */
+void all_min(Group group, const array& input, array& output, Stream stream);
+/** Reduce scatter.
+ * NOTE(review): this was previously described as using an "average"
+ * operation, but the function is named sum_scatter -- presumably the
+ * reduction is a sum; confirm against the backends. */
+void sum_scatter(Group group, const array& input, array& output, Stream stream);
+} // namespace mlx::core::distributed::detail

mlx/include/mlx/distributed/jaccl/jaccl.h ADDED Viewed

@@ -0,0 +1,12 @@
+// Copyright © 2025 Apple Inc.
+// Guard against repeated inclusion: init() below carries a default argument,
+// and repeating a default argument in a later declaration is ill-formed.
+#pragma once
+#include "mlx/distributed/distributed.h"
+namespace mlx::core::distributed::jaccl {
+// Shorthand for the backend-independent group implementation type.
+using GroupImpl = mlx::core::distributed::detail::GroupImpl;
+// NOTE(review): presumably reports whether the JACCL backend can be used --
+// confirm against the implementation.
+bool is_available();
+// Returns a group implementation for this backend. `strict` defaults to
+// false. NOTE(review): presumably strict = true makes a failed
+// initialization an error instead of an empty result -- confirm.
+std::shared_ptr<GroupImpl> init(bool strict = false);
+} // namespace mlx::core::distributed::jaccl

mlx/include/mlx/distributed/mpi/mpi.h ADDED Viewed

@@ -0,0 +1,12 @@
+// Copyright © 2024 Apple Inc.
+// Guard against repeated inclusion: init() below carries a default argument,
+// and repeating a default argument in a later declaration is ill-formed.
+#pragma once
+#include "mlx/distributed/distributed.h"
+namespace mlx::core::distributed::mpi {
+// Shorthand for the backend-independent group implementation type.
+using GroupImpl = mlx::core::distributed::detail::GroupImpl;
+// NOTE(review): presumably reports whether the MPI backend can be used --
+// confirm against the implementation.
+bool is_available();
+// Returns a group implementation for this backend. `strict` defaults to
+// false. NOTE(review): presumably strict = true makes a failed
+// initialization an error instead of an empty result -- confirm.
+std::shared_ptr<GroupImpl> init(bool strict = false);
+} // namespace mlx::core::distributed::mpi

mlx/include/mlx/distributed/mpi/mpi_declarations.h ADDED Viewed

@@ -0,0 +1,28 @@
+// Copyright © 2024 Apple Inc.
+// Guard against repeated inclusion: the struct defined below must not be
+// defined twice in one translation unit.
+#pragma once
+// size_t is used by MPI_Status below.
+#include <stddef.h>
+// Constants
+#define MPI_SUCCESS 0
+#define MPI_ANY_SOURCE -1
+#define MPI_ANY_TAG -1
+#define MPI_IN_PLACE ((void*)1)
+#define MPI_MAX_LIBRARY_VERSION_STRING 256
+// Define all the types that we use so that we don't include <mpi.h> which
+// causes linker errors on some platforms.
+//
+// NOTE: We define everything for openmpi.
+// Handles are declared as opaque pointers.
+typedef void* MPI_Comm;
+typedef void* MPI_Datatype;
+typedef void* MPI_Op;
+// Signature of a user-defined reduction function.
+typedef void(MPI_User_function)(void*, void*, int*, MPI_Datatype*);
+// Status record. NOTE(review): the layout is presumably meant to match
+// Open MPI's public status struct -- confirm against the targeted version.
+typedef struct ompi_status_public_t {
+  int MPI_SOURCE;
+  int MPI_TAG;
+  int MPI_ERROR;
+  int _cancelled;
+  size_t _ucount;
+} MPI_Status;

mlx/include/mlx/distributed/nccl/nccl.h ADDED Viewed

@@ -0,0 +1,12 @@
+// Copyright © 2024 Apple Inc.
+// Guard against repeated inclusion: init() below carries a default argument,
+// and repeating a default argument in a later declaration is ill-formed.
+#pragma once
+#include "mlx/distributed/distributed.h"
+namespace mlx::core::distributed::nccl {
+// Shorthand for the backend-independent group implementation type.
+using GroupImpl = mlx::core::distributed::detail::GroupImpl;
+// NOTE(review): presumably reports whether the NCCL backend can be used --
+// confirm against the implementation.
+bool is_available();
+// Returns a group implementation for this backend. `strict` defaults to
+// false. NOTE(review): presumably strict = true makes a failed
+// initialization an error instead of an empty result -- confirm.
+std::shared_ptr<GroupImpl> init(bool strict = false);
+} // namespace mlx::core::distributed::nccl

mlx/include/mlx/distributed/ops.h ADDED Viewed

@@ -0,0 +1,56 @@
+// Copyright © 2024 Apple Inc.
+#pragma once
+#include <optional>
+#include "mlx/distributed/distributed.h"
+#include "mlx/utils.h"
+namespace mlx::core::distributed {
+// Graph-level all_sum; returns the result as a new array.
+// `group` defaults to std::nullopt and `s` to a default-constructed
+// StreamOrDevice. NOTE(review): presumably sums `x` across the processes of
+// `group`, with a default group chosen when none is given -- confirm.
+array all_sum(
+    const array& x,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// Graph-level all_gather; returns the result as a new array.
+// `group` defaults to std::nullopt and `s` to a default-constructed
+// StreamOrDevice. NOTE(review): presumably gathers `x` from every process of
+// `group` -- confirm against the implementation.
+array all_gather(
+    const array& x,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// In all declarations below `group` defaults to std::nullopt and `s` to a
+// default-constructed StreamOrDevice.
+//
+// NOTE(review): presumably sends `x` to rank `dst`; what the returned array
+// represents is not visible here -- confirm.
+array send(
+    const array& x,
+    int dst,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// NOTE(review): presumably receives an array of the given shape and dtype
+// from rank `src` -- confirm.
+array recv(
+    Shape shape,
+    Dtype dtype,
+    int src,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// NOTE(review): presumably like recv, taking shape and dtype from `x` --
+// confirm.
+array recv_like(
+    const array& x,
+    int src,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// NOTE(review): presumably the max / min reductions of `x` across the group
+// -- confirm.
+array all_max(
+    const array& x,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+array all_min(
+    const array& x,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+// NOTE(review): presumably a reduce-scatter using a sum -- confirm.
+array sum_scatter(
+    const array& x,
+    std::optional<Group> group = std::nullopt,
+    StreamOrDevice s = {});
+} // namespace mlx::core::distributed