mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx/__main__.py +27 -0
- mlx/_reprlib_fix.py +16 -0
- mlx/extension.py +88 -0
- mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
- mlx/include/mlx/allocator.h +73 -0
- mlx/include/mlx/array.h +645 -0
- mlx/include/mlx/backend/common/binary.h +97 -0
- mlx/include/mlx/backend/common/broadcasting.h +11 -0
- mlx/include/mlx/backend/common/buffer_cache.h +157 -0
- mlx/include/mlx/backend/common/compiled.h +77 -0
- mlx/include/mlx/backend/common/copy.h +50 -0
- mlx/include/mlx/backend/common/hadamard.h +109 -0
- mlx/include/mlx/backend/common/matmul.h +67 -0
- mlx/include/mlx/backend/common/reduce.h +59 -0
- mlx/include/mlx/backend/common/slicing.h +20 -0
- mlx/include/mlx/backend/common/ternary.h +85 -0
- mlx/include/mlx/backend/common/unary.h +29 -0
- mlx/include/mlx/backend/common/utils.h +205 -0
- mlx/include/mlx/backend/cpu/arange.h +28 -0
- mlx/include/mlx/backend/cpu/available.h +9 -0
- mlx/include/mlx/backend/cpu/binary.h +517 -0
- mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
- mlx/include/mlx/backend/cpu/binary_two.h +166 -0
- mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
- mlx/include/mlx/backend/cpu/copy.h +36 -0
- mlx/include/mlx/backend/cpu/encoder.h +67 -0
- mlx/include/mlx/backend/cpu/eval.h +12 -0
- mlx/include/mlx/backend/cpu/gemm.h +26 -0
- mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
- mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
- mlx/include/mlx/backend/cpu/lapack.h +80 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
- mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
- mlx/include/mlx/backend/cpu/simd/math.h +193 -0
- mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
- mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
- mlx/include/mlx/backend/cpu/simd/type.h +11 -0
- mlx/include/mlx/backend/cpu/slicing.h +21 -0
- mlx/include/mlx/backend/cpu/ternary.h +154 -0
- mlx/include/mlx/backend/cpu/threefry.h +21 -0
- mlx/include/mlx/backend/cpu/unary.h +281 -0
- mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
- mlx/include/mlx/backend/cuda/allocator.h +89 -0
- mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
- mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
- mlx/include/mlx/backend/cuda/cuda.h +10 -0
- mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
- mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
- mlx/include/mlx/backend/cuda/device/config.h +12 -0
- mlx/include/mlx/backend/cuda/device.h +189 -0
- mlx/include/mlx/backend/cuda/event.h +78 -0
- mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
- mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
- mlx/include/mlx/backend/cuda/jit_module.h +119 -0
- mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
- mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
- mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
- mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
- mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
- mlx/include/mlx/backend/cuda/utils.h +46 -0
- mlx/include/mlx/backend/cuda/worker.h +55 -0
- mlx/include/mlx/backend/gpu/available.h +9 -0
- mlx/include/mlx/backend/gpu/copy.h +57 -0
- mlx/include/mlx/backend/gpu/eval.h +18 -0
- mlx/include/mlx/backend/gpu/slicing.h +36 -0
- mlx/include/mlx/backend/metal/allocator.h +79 -0
- mlx/include/mlx/backend/metal/binary.h +33 -0
- mlx/include/mlx/backend/metal/device.h +283 -0
- mlx/include/mlx/backend/metal/jit/includes.h +57 -0
- mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
- mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
- mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
- mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
- mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
- mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
- mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
- mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
- mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
- mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
- mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
- mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
- mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
- mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
- mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
- mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
- mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
- mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
- mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
- mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
- mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
- mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
- mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
- mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
- mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
- mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
- mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
- mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
- mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
- mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
- mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
- mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
- mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
- mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
- mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
- mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
- mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
- mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
- mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
- mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
- mlx/include/mlx/backend/metal/matmul.h +144 -0
- mlx/include/mlx/backend/metal/metal.h +22 -0
- mlx/include/mlx/backend/metal/reduce.h +41 -0
- mlx/include/mlx/backend/metal/resident.h +32 -0
- mlx/include/mlx/backend/metal/scan.h +17 -0
- mlx/include/mlx/backend/metal/ternary.h +21 -0
- mlx/include/mlx/backend/metal/unary.h +21 -0
- mlx/include/mlx/backend/metal/utils.h +84 -0
- mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
- mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
- mlx/include/mlx/compile.h +44 -0
- mlx/include/mlx/compile_impl.h +69 -0
- mlx/include/mlx/device.h +31 -0
- mlx/include/mlx/distributed/distributed.h +60 -0
- mlx/include/mlx/distributed/distributed_impl.h +59 -0
- mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
- mlx/include/mlx/distributed/nccl/nccl.h +12 -0
- mlx/include/mlx/distributed/ops.h +56 -0
- mlx/include/mlx/distributed/primitives.h +156 -0
- mlx/include/mlx/distributed/reduction_ops.h +38 -0
- mlx/include/mlx/distributed/ring/ring.h +12 -0
- mlx/include/mlx/distributed/utils.h +67 -0
- mlx/include/mlx/dtype.h +115 -0
- mlx/include/mlx/dtype_utils.h +119 -0
- mlx/include/mlx/einsum.h +22 -0
- mlx/include/mlx/event.h +58 -0
- mlx/include/mlx/export.h +136 -0
- mlx/include/mlx/export_impl.h +98 -0
- mlx/include/mlx/fast.h +102 -0
- mlx/include/mlx/fast_primitives.h +427 -0
- mlx/include/mlx/fence.h +39 -0
- mlx/include/mlx/fft.h +167 -0
- mlx/include/mlx/graph_utils.h +66 -0
- mlx/include/mlx/io/gguf.h +20 -0
- mlx/include/mlx/io/load.h +175 -0
- mlx/include/mlx/io.h +61 -0
- mlx/include/mlx/linalg.h +111 -0
- mlx/include/mlx/memory.h +78 -0
- mlx/include/mlx/mlx.h +25 -0
- mlx/include/mlx/ops.h +1627 -0
- mlx/include/mlx/primitives.h +2524 -0
- mlx/include/mlx/random.h +282 -0
- mlx/include/mlx/scheduler.h +188 -0
- mlx/include/mlx/small_vector.h +540 -0
- mlx/include/mlx/stream.h +41 -0
- mlx/include/mlx/threadpool.h +133 -0
- mlx/include/mlx/transforms.h +229 -0
- mlx/include/mlx/transforms_impl.h +86 -0
- mlx/include/mlx/types/bf16.h +187 -0
- mlx/include/mlx/types/complex.h +113 -0
- mlx/include/mlx/types/fp16.h +234 -0
- mlx/include/mlx/types/half_types.h +58 -0
- mlx/include/mlx/types/limits.h +70 -0
- mlx/include/mlx/utils.h +175 -0
- mlx/include/mlx/version.h +20 -0
- mlx/lib/libmlx.so +0 -0
- mlx/py.typed +1 -0
- mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
- mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
- mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
- mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
- mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
- mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
- mlx/share/cmake/MLX/extension.cmake +50 -0
- mlx/utils.py +325 -0
- mlx_cpu-0.30.1.dist-info/METADATA +142 -0
- mlx_cpu-0.30.1.dist-info/RECORD +231 -0
- mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
- mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
- mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
- mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
- mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
- mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
- mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
- mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
// Copyright © 2025 Apple Inc.
|
|
2
|
+
|
|
3
|
+
// This file include utilities that are used by C++ code (i.e. .cpp files).
|
|
4
|
+
|
|
5
|
+
#pragma once
|
|
6
|
+
|
|
7
|
+
#include "mlx/array.h"
|
|
8
|
+
#include "mlx/backend/cuda/allocator.h"
|
|
9
|
+
#include "mlx/backend/cuda/cuda_utils.h"
|
|
10
|
+
|
|
11
|
+
namespace mlx::core {
|
|
12
|
+
|
|
13
|
+
template <typename T>
|
|
14
|
+
inline uint max_occupancy_block_dim(T kernel) {
|
|
15
|
+
int _, block_dim;
|
|
16
|
+
if constexpr (std::is_same_v<T, CUfunction>) {
|
|
17
|
+
CHECK_CUDA_ERROR(
|
|
18
|
+
cuOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel, 0, 0, 0));
|
|
19
|
+
} else {
|
|
20
|
+
CHECK_CUDA_ERROR(
|
|
21
|
+
cudaOccupancyMaxPotentialBlockSize(&_, &block_dim, kernel));
|
|
22
|
+
}
|
|
23
|
+
return block_dim;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
template <typename T>
|
|
27
|
+
inline T* gpu_ptr(array& arr) {
|
|
28
|
+
return reinterpret_cast<T*>(
|
|
29
|
+
static_cast<char*>(
|
|
30
|
+
static_cast<cu::CudaBuffer*>(arr.buffer().ptr())->data) +
|
|
31
|
+
arr.offset());
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
// For const array, keep constness in pointer unless it is untyped.
|
|
35
|
+
template <typename T>
|
|
36
|
+
inline std::conditional_t<std::is_same_v<T, void>, void*, const T*> gpu_ptr(
|
|
37
|
+
const array& arr) {
|
|
38
|
+
return gpu_ptr<T>(const_cast<array&>(arr));
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
struct Dtype;
|
|
42
|
+
|
|
43
|
+
// Convert Dtype to CUDA C++ types.
|
|
44
|
+
const char* dtype_to_cuda_type(const Dtype& dtype);
|
|
45
|
+
|
|
46
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
// Copyright © 2025 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/backend/cuda/event.h"
|
|
6
|
+
|
|
7
|
+
#include <condition_variable>
|
|
8
|
+
#include <functional>
|
|
9
|
+
#include <map>
|
|
10
|
+
#include <mutex>
|
|
11
|
+
#include <thread>
|
|
12
|
+
|
|
13
|
+
namespace mlx::core::cu {
|
|
14
|
+
|
|
15
|
+
// Run tasks in worker thread, synchronized with cuda stream.
|
|
16
|
+
class Worker {
|
|
17
|
+
public:
|
|
18
|
+
explicit Worker(Device& d);
|
|
19
|
+
~Worker();
|
|
20
|
+
|
|
21
|
+
Worker(const Worker&) = delete;
|
|
22
|
+
Worker& operator=(const Worker&) = delete;
|
|
23
|
+
|
|
24
|
+
// Add a pending |task| that will run when consumed or commited.
|
|
25
|
+
void add_task(std::function<void()> task);
|
|
26
|
+
|
|
27
|
+
// Inform worker thread to run current batches after kernels in |stream|
|
|
28
|
+
// finish running.
|
|
29
|
+
void commit(cudaStream_t stream);
|
|
30
|
+
|
|
31
|
+
private:
|
|
32
|
+
static void signal(void*);
|
|
33
|
+
|
|
34
|
+
void thread_fn();
|
|
35
|
+
std::mutex mtx_;
|
|
36
|
+
std::condition_variable cond_;
|
|
37
|
+
|
|
38
|
+
uint64_t committed_batch_{0};
|
|
39
|
+
uint64_t signaled_batch_{0};
|
|
40
|
+
|
|
41
|
+
// Cuda stream and event for signaling kernel completion.
|
|
42
|
+
CudaStream signal_stream_;
|
|
43
|
+
CudaEvent signal_event_;
|
|
44
|
+
|
|
45
|
+
bool stop_{false};
|
|
46
|
+
|
|
47
|
+
// Tasks are put in |pending_tasks_| first, and then moved to
|
|
48
|
+
// |worker_tasks_| when end_batch() is called.
|
|
49
|
+
using Tasks = std::vector<std::function<void()>>;
|
|
50
|
+
Tasks pending_tasks_;
|
|
51
|
+
std::map<uint64_t, Tasks> worker_tasks_;
|
|
52
|
+
std::thread worker_;
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
} // namespace mlx::core::cu
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// Copyright © 2023-2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/backend/common/copy.h"
|
|
6
|
+
#include "mlx/stream.h"
|
|
7
|
+
|
|
8
|
+
#include <optional>
|
|
9
|
+
|
|
10
|
+
namespace mlx::core {
|
|
11
|
+
|
|
12
|
+
// Generic copy inplace
|
|
13
|
+
void copy_gpu_inplace(
|
|
14
|
+
const array& in,
|
|
15
|
+
array& out,
|
|
16
|
+
const Shape& data_shape,
|
|
17
|
+
const Strides& i_strides,
|
|
18
|
+
const Strides& o_strides,
|
|
19
|
+
int64_t i_offset,
|
|
20
|
+
int64_t o_offset,
|
|
21
|
+
CopyType ctype,
|
|
22
|
+
const Stream& s,
|
|
23
|
+
std::optional<array> dynamic_i_offset = std::nullopt,
|
|
24
|
+
std::optional<array> dynamic_o_offset = std::nullopt);
|
|
25
|
+
|
|
26
|
+
void copy_gpu(const array& src, array& out, CopyType ctype, const Stream& s);
|
|
27
|
+
void copy_gpu(const array& src, array& out, CopyType ctype);
|
|
28
|
+
|
|
29
|
+
void copy_gpu_inplace(
|
|
30
|
+
const array& in,
|
|
31
|
+
array& out,
|
|
32
|
+
CopyType ctype,
|
|
33
|
+
const Stream& s);
|
|
34
|
+
|
|
35
|
+
void copy_gpu_inplace(
|
|
36
|
+
const array& in,
|
|
37
|
+
array& out,
|
|
38
|
+
const Strides& i_strides,
|
|
39
|
+
int64_t i_offset,
|
|
40
|
+
CopyType ctype,
|
|
41
|
+
const Stream& s);
|
|
42
|
+
|
|
43
|
+
// Fill the output with the scalar val
|
|
44
|
+
void fill_gpu(const array& val, array& out, const Stream& s);
|
|
45
|
+
|
|
46
|
+
// Return a contiguous array with same shape that copies the data of |arr|.
|
|
47
|
+
array contiguous_copy_gpu(const array& arr, const Stream& s);
|
|
48
|
+
|
|
49
|
+
// Copy data from |in| and transpose to |out|'s shape.
|
|
50
|
+
void reshape_gpu(const array& in, array& out, Stream s);
|
|
51
|
+
|
|
52
|
+
// Like the normal ops but safe to call in eval_gpu.
|
|
53
|
+
array flatten_in_eval(const array& x, int start_axis, int end_axis, Stream s);
|
|
54
|
+
array reshape_in_eval(const array& x, Shape shape, Stream s);
|
|
55
|
+
array swapaxes_in_eval(const array& x, int axis1, int axis2);
|
|
56
|
+
|
|
57
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
// Copyright © 2023-2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include <future>
|
|
6
|
+
#include <memory>
|
|
7
|
+
|
|
8
|
+
#include "mlx/array.h"
|
|
9
|
+
#include "mlx/stream.h"
|
|
10
|
+
|
|
11
|
+
namespace mlx::core::gpu {
|
|
12
|
+
|
|
13
|
+
void new_stream(Stream stream);
|
|
14
|
+
void eval(array& arr);
|
|
15
|
+
void finalize(Stream s);
|
|
16
|
+
void synchronize(Stream s);
|
|
17
|
+
|
|
18
|
+
} // namespace mlx::core::gpu
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/array.h"
|
|
6
|
+
|
|
7
|
+
namespace mlx::core {
|
|
8
|
+
|
|
9
|
+
void slice_gpu(
|
|
10
|
+
const array& in,
|
|
11
|
+
array& out,
|
|
12
|
+
const Shape& start_indices,
|
|
13
|
+
const Shape& strides,
|
|
14
|
+
const Stream& s);
|
|
15
|
+
|
|
16
|
+
void concatenate_gpu(
|
|
17
|
+
const std::vector<array>& inputs,
|
|
18
|
+
array& out,
|
|
19
|
+
int axis,
|
|
20
|
+
const Stream& s);
|
|
21
|
+
|
|
22
|
+
void pad_gpu(
|
|
23
|
+
const array& in,
|
|
24
|
+
const array& val,
|
|
25
|
+
array& out,
|
|
26
|
+
const std::vector<int>& axes,
|
|
27
|
+
const Shape& low_pad_size,
|
|
28
|
+
const Stream& s);
|
|
29
|
+
|
|
30
|
+
array compute_dynamic_offset(
|
|
31
|
+
const array& indices,
|
|
32
|
+
const Strides& strides,
|
|
33
|
+
const std::vector<int>& axes,
|
|
34
|
+
const Stream& s);
|
|
35
|
+
|
|
36
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// Copyright © 2023-2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include <map>
|
|
6
|
+
#include <mutex>
|
|
7
|
+
#include <vector>
|
|
8
|
+
|
|
9
|
+
#include "mlx/allocator.h"
|
|
10
|
+
#include "mlx/backend/common/buffer_cache.h"
|
|
11
|
+
#include "mlx/backend/metal/device.h"
|
|
12
|
+
#include "mlx/backend/metal/resident.h"
|
|
13
|
+
|
|
14
|
+
namespace mlx::core::metal {
|
|
15
|
+
|
|
16
|
+
using allocator::Buffer;
|
|
17
|
+
|
|
18
|
+
class MetalAllocator : public allocator::Allocator {
|
|
19
|
+
/** Allocator for Metal GPUs. */
|
|
20
|
+
public:
|
|
21
|
+
virtual Buffer malloc(size_t size) override;
|
|
22
|
+
virtual void free(Buffer buffer) override;
|
|
23
|
+
virtual size_t size(Buffer buffer) const override;
|
|
24
|
+
virtual Buffer make_buffer(void* ptr, size_t size) override;
|
|
25
|
+
virtual void release(Buffer buffer) override;
|
|
26
|
+
|
|
27
|
+
size_t get_active_memory() {
|
|
28
|
+
return active_memory_;
|
|
29
|
+
};
|
|
30
|
+
size_t get_peak_memory() {
|
|
31
|
+
return peak_memory_;
|
|
32
|
+
};
|
|
33
|
+
void reset_peak_memory() {
|
|
34
|
+
std::unique_lock lk(mutex_);
|
|
35
|
+
peak_memory_ = 0;
|
|
36
|
+
};
|
|
37
|
+
size_t get_cache_memory() {
|
|
38
|
+
return buffer_cache_.cache_size();
|
|
39
|
+
};
|
|
40
|
+
size_t set_cache_limit(size_t limit);
|
|
41
|
+
size_t set_memory_limit(size_t limit);
|
|
42
|
+
size_t get_memory_limit();
|
|
43
|
+
size_t set_wired_limit(size_t limit);
|
|
44
|
+
void clear_cache();
|
|
45
|
+
|
|
46
|
+
private:
|
|
47
|
+
MTL::Device* device_;
|
|
48
|
+
|
|
49
|
+
// The size of allocations which go on the heap until it is full. This size
|
|
50
|
+
// is chosen because it is the actual minimum size of a buffer allocated from
|
|
51
|
+
// the heap, a heap can have at most heap.size() / 256 buffers.
|
|
52
|
+
static constexpr int small_size_ = 256;
|
|
53
|
+
static constexpr int heap_size_ = 1 << 20;
|
|
54
|
+
MTL::Heap* heap_;
|
|
55
|
+
MetalAllocator();
|
|
56
|
+
~MetalAllocator();
|
|
57
|
+
friend MetalAllocator& allocator();
|
|
58
|
+
|
|
59
|
+
// Caching allocator
|
|
60
|
+
BufferCache<MTL::Buffer> buffer_cache_;
|
|
61
|
+
|
|
62
|
+
ResidencySet residency_set_;
|
|
63
|
+
|
|
64
|
+
// Allocation stats
|
|
65
|
+
size_t block_limit_;
|
|
66
|
+
size_t gc_limit_;
|
|
67
|
+
size_t active_memory_{0};
|
|
68
|
+
size_t peak_memory_{0};
|
|
69
|
+
size_t max_pool_size_;
|
|
70
|
+
size_t wired_limit_{0};
|
|
71
|
+
size_t num_resources_{0};
|
|
72
|
+
size_t resource_limit_{0};
|
|
73
|
+
|
|
74
|
+
std::mutex mutex_;
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
MetalAllocator& allocator();
|
|
78
|
+
|
|
79
|
+
} // namespace mlx::core::metal
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
// Copyright © 2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include "mlx/array.h"
|
|
6
|
+
|
|
7
|
+
namespace mlx::core {
|
|
8
|
+
|
|
9
|
+
void binary_op_gpu(
|
|
10
|
+
const std::vector<array>& inputs,
|
|
11
|
+
std::vector<array>& outputs,
|
|
12
|
+
const char* op,
|
|
13
|
+
const Stream& s);
|
|
14
|
+
|
|
15
|
+
void binary_op_gpu(
|
|
16
|
+
const std::vector<array>& inputs,
|
|
17
|
+
array& out,
|
|
18
|
+
const char* op,
|
|
19
|
+
const Stream& s);
|
|
20
|
+
|
|
21
|
+
void binary_op_gpu_inplace(
|
|
22
|
+
const std::vector<array>& inputs,
|
|
23
|
+
std::vector<array>& outputs,
|
|
24
|
+
const char* op,
|
|
25
|
+
const Stream& s);
|
|
26
|
+
|
|
27
|
+
void binary_op_gpu_inplace(
|
|
28
|
+
const std::vector<array>& inputs,
|
|
29
|
+
array& out,
|
|
30
|
+
const char* op,
|
|
31
|
+
const Stream& s);
|
|
32
|
+
|
|
33
|
+
} // namespace mlx::core
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
// Copyright © 2023-2024 Apple Inc.
|
|
2
|
+
|
|
3
|
+
#pragma once
|
|
4
|
+
|
|
5
|
+
#include <Metal/Metal.hpp>
|
|
6
|
+
#include <functional>
|
|
7
|
+
#include <mutex>
|
|
8
|
+
#include <shared_mutex>
|
|
9
|
+
#include <string>
|
|
10
|
+
#include <unordered_map>
|
|
11
|
+
#include <unordered_set>
|
|
12
|
+
|
|
13
|
+
#include "mlx/array.h"
|
|
14
|
+
#include "mlx/device.h"
|
|
15
|
+
|
|
16
|
+
namespace mlx::core::metal {
|
|
17
|
+
|
|
18
|
+
using MTLFCList =
|
|
19
|
+
std::vector<std::tuple<const void*, MTL::DataType, NS::UInteger>>;
|
|
20
|
+
|
|
21
|
+
struct DeviceStream;
|
|
22
|
+
|
|
23
|
+
struct CommandEncoder {
|
|
24
|
+
explicit CommandEncoder(DeviceStream& stream);
|
|
25
|
+
CommandEncoder(const CommandEncoder&) = delete;
|
|
26
|
+
CommandEncoder& operator=(const CommandEncoder&) = delete;
|
|
27
|
+
|
|
28
|
+
struct ConcurrentContext {
|
|
29
|
+
ConcurrentContext(CommandEncoder& enc) : enc(enc) {
|
|
30
|
+
enc.concurrent_ = true;
|
|
31
|
+
}
|
|
32
|
+
~ConcurrentContext() {
|
|
33
|
+
enc.concurrent_ = false;
|
|
34
|
+
enc.prev_outputs_.insert(
|
|
35
|
+
enc.concurrent_outputs_.begin(), enc.concurrent_outputs_.end());
|
|
36
|
+
enc.concurrent_outputs_.clear();
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
private:
|
|
40
|
+
CommandEncoder& enc;
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
void set_input_array(const array& a, int idx, int64_t offset = 0);
|
|
44
|
+
void set_output_array(array& a, int idx, int64_t offset = 0);
|
|
45
|
+
void register_output_array(const array& a);
|
|
46
|
+
void dispatch_threadgroups(MTL::Size grid_dims, MTL::Size group_dims);
|
|
47
|
+
void dispatch_threads(MTL::Size grid_dims, MTL::Size group_dims);
|
|
48
|
+
void maybeInsertBarrier();
|
|
49
|
+
void set_buffer(const MTL::Buffer* buf, int idx, int64_t offset = 0);
|
|
50
|
+
|
|
51
|
+
void set_compute_pipeline_state(MTL::ComputePipelineState* kernel) {
|
|
52
|
+
enc_->setComputePipelineState(kernel);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
void wait_for_fence(MTL::Fence* fence) {
|
|
56
|
+
enc_->waitForFence(fence);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
void update_fence(MTL::Fence* fence) {
|
|
60
|
+
enc_->updateFence(fence);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
|
|
64
|
+
void set_vector_bytes(const Vec& vec, size_t nelems, int idx) {
|
|
65
|
+
enc_->setBytes(vec.data(), nelems * sizeof(typename Vec::value_type), idx);
|
|
66
|
+
}
|
|
67
|
+
template <typename Vec, typename = std::enable_if_t<is_vector_v<Vec>>>
|
|
68
|
+
void set_vector_bytes(const Vec& vec, int idx) {
|
|
69
|
+
return set_vector_bytes(vec, vec.size(), idx);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
template <typename T>
|
|
73
|
+
void set_bytes(const T* v, int n, int idx) {
|
|
74
|
+
return enc_->setBytes(v, n * sizeof(T), idx);
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
template <typename T>
|
|
78
|
+
void set_bytes(const T& v, int idx) {
|
|
79
|
+
return enc_->setBytes(&v, sizeof(T), idx);
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
void set_threadgroup_memory_length(size_t length, int idx) {
|
|
83
|
+
enc_->setThreadgroupMemoryLength(length, idx);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
ConcurrentContext start_concurrent() {
|
|
87
|
+
return ConcurrentContext(*this);
|
|
88
|
+
}
|
|
89
|
+
~CommandEncoder();
|
|
90
|
+
|
|
91
|
+
// Inputs to all kernels in the encoder including temporaries
|
|
92
|
+
std::unordered_set<const void*>& inputs() {
|
|
93
|
+
return all_inputs_;
|
|
94
|
+
};
|
|
95
|
+
|
|
96
|
+
// Outputs of all kernels in the encoder including temporaries
|
|
97
|
+
std::unordered_set<const void*>& outputs() {
|
|
98
|
+
return all_outputs_;
|
|
99
|
+
};
|
|
100
|
+
|
|
101
|
+
void barrier();
|
|
102
|
+
|
|
103
|
+
private:
|
|
104
|
+
DeviceStream& stream_;
|
|
105
|
+
MTL::ComputeCommandEncoder* enc_;
|
|
106
|
+
bool needs_barrier_{false};
|
|
107
|
+
bool concurrent_{false};
|
|
108
|
+
std::unordered_set<MTL::Resource*> prev_outputs_;
|
|
109
|
+
std::unordered_set<MTL::Resource*> next_outputs_;
|
|
110
|
+
std::unordered_set<MTL::Resource*> concurrent_outputs_;
|
|
111
|
+
std::unordered_set<const void*> all_inputs_;
|
|
112
|
+
std::unordered_set<const void*> all_outputs_;
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
struct Fence {
|
|
116
|
+
Fence(MTL::Fence* fence) : fence(fence) {}
|
|
117
|
+
~Fence() {
|
|
118
|
+
fence->release();
|
|
119
|
+
}
|
|
120
|
+
MTL::Fence* fence;
|
|
121
|
+
};
|
|
122
|
+
|
|
123
|
+
struct DeviceStream {
|
|
124
|
+
DeviceStream(MTL::CommandQueue* queue) : queue(queue) {};
|
|
125
|
+
~DeviceStream() {
|
|
126
|
+
queue->release();
|
|
127
|
+
if (buffer != nullptr) {
|
|
128
|
+
buffer->release();
|
|
129
|
+
}
|
|
130
|
+
};
|
|
131
|
+
MTL::CommandQueue* queue;
|
|
132
|
+
// A map of prior command encoder outputs to their corresponding fence
|
|
133
|
+
std::unordered_map<const void*, std::shared_ptr<Fence>> outputs;
|
|
134
|
+
// Used to allow thread-safe access to the outputs map
|
|
135
|
+
std::mutex fence_mtx;
|
|
136
|
+
|
|
137
|
+
// Data updated between command buffers
|
|
138
|
+
MTL::CommandBuffer* buffer{nullptr};
|
|
139
|
+
int buffer_ops{0};
|
|
140
|
+
size_t buffer_sizes{0};
|
|
141
|
+
|
|
142
|
+
// The command encoder, fence, and temporaries are updated between command
|
|
143
|
+
// encoders
|
|
144
|
+
std::unique_ptr<CommandEncoder> encoder{nullptr};
|
|
145
|
+
std::shared_ptr<Fence> fence;
|
|
146
|
+
std::vector<array> temporaries;
|
|
147
|
+
};
|
|
148
|
+
|
|
149
|
+
class Device {
|
|
150
|
+
public:
|
|
151
|
+
Device();
|
|
152
|
+
Device(const Device&) = delete;
|
|
153
|
+
Device& operator=(const Device&) = delete;
|
|
154
|
+
~Device();
|
|
155
|
+
|
|
156
|
+
MTL::Device* mtl_device() {
|
|
157
|
+
return device_;
|
|
158
|
+
};
|
|
159
|
+
|
|
160
|
+
const std::string& get_architecture() {
|
|
161
|
+
return arch_;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
int get_architecture_gen() const {
|
|
165
|
+
return arch_gen_;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
void new_queue(int index);
|
|
169
|
+
|
|
170
|
+
MTL::CommandQueue* get_queue(Stream stream);
|
|
171
|
+
|
|
172
|
+
MTL::CommandBuffer* get_command_buffer(int index);
|
|
173
|
+
bool command_buffer_needs_commit(int index);
|
|
174
|
+
void commit_command_buffer(int index);
|
|
175
|
+
CommandEncoder& get_command_encoder(int index);
|
|
176
|
+
void end_encoding(int index);
|
|
177
|
+
|
|
178
|
+
MTL::Library* get_library(
|
|
179
|
+
const std::string& name,
|
|
180
|
+
const std::string& path = "");
|
|
181
|
+
|
|
182
|
+
MTL::Library* get_library(
|
|
183
|
+
const std::string& name,
|
|
184
|
+
const std::function<std::string(void)>& builder);
|
|
185
|
+
|
|
186
|
+
void clear_library(const std::string& name);
|
|
187
|
+
|
|
188
|
+
MTL::ComputePipelineState* get_kernel(
|
|
189
|
+
const std::string& base_name,
|
|
190
|
+
MTL::Library* mtl_lib,
|
|
191
|
+
const std::string& hash_name = "",
|
|
192
|
+
const MTLFCList& func_consts = {},
|
|
193
|
+
const std::vector<MTL::Function*>& linked_functions = {});
|
|
194
|
+
|
|
195
|
+
MTL::ComputePipelineState* get_kernel(
|
|
196
|
+
const std::string& base_name,
|
|
197
|
+
const std::string& hash_name = "",
|
|
198
|
+
const MTLFCList& func_consts = {},
|
|
199
|
+
const std::vector<MTL::Function*>& linked_functions = {});
|
|
200
|
+
|
|
201
|
+
MTL::ArgumentEncoder* argument_encoder(
|
|
202
|
+
const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;
|
|
203
|
+
|
|
204
|
+
// Record temporary arrays for the given stream index
|
|
205
|
+
void add_temporary(array arr, int index);
|
|
206
|
+
void add_temporaries(std::vector<array> arrays, int index);
|
|
207
|
+
|
|
208
|
+
void set_residency_set(const MTL::ResidencySet* residency_set);
|
|
209
|
+
|
|
210
|
+
private:
|
|
211
|
+
DeviceStream& get_stream_(int index) {
|
|
212
|
+
return stream_map_.find(index)->second;
|
|
213
|
+
}
|
|
214
|
+
MTL::Library* get_library_cache_(const std::string& name);
|
|
215
|
+
|
|
216
|
+
MTL::Library* get_library_(const std::string& name);
|
|
217
|
+
MTL::Library* build_library_(const std::string& source_string);
|
|
218
|
+
|
|
219
|
+
MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);
|
|
220
|
+
|
|
221
|
+
MTL::Function* get_function_(
|
|
222
|
+
const std::string& name,
|
|
223
|
+
const std::string& specialized_name,
|
|
224
|
+
const MTLFCList& func_consts,
|
|
225
|
+
MTL::Library* mtl_lib);
|
|
226
|
+
|
|
227
|
+
MTL::LinkedFunctions* get_linked_functions_(
|
|
228
|
+
const std::vector<MTL::Function*>& funcs);
|
|
229
|
+
|
|
230
|
+
MTL::ComputePipelineState* get_kernel_(
|
|
231
|
+
const std::string& name,
|
|
232
|
+
const MTL::Function* mtl_function);
|
|
233
|
+
|
|
234
|
+
MTL::ComputePipelineState* get_kernel_(
|
|
235
|
+
const std::string& name,
|
|
236
|
+
const MTL::Function* mtl_function,
|
|
237
|
+
const MTL::LinkedFunctions* linked_functions);
|
|
238
|
+
|
|
239
|
+
MTL::ComputePipelineState* get_kernel_(
|
|
240
|
+
const std::string& base_name,
|
|
241
|
+
MTL::Library* mtl_lib,
|
|
242
|
+
const std::string& hash_name,
|
|
243
|
+
const MTLFCList& func_consts = {},
|
|
244
|
+
const std::vector<MTL::Function*>& linked_functions = {});
|
|
245
|
+
|
|
246
|
+
MTL::Device* device_;
|
|
247
|
+
std::unordered_map<int32_t, DeviceStream> stream_map_;
|
|
248
|
+
|
|
249
|
+
std::shared_mutex kernel_mtx_;
|
|
250
|
+
std::shared_mutex library_mtx_;
|
|
251
|
+
std::unordered_map<std::string, MTL::Library*> library_map_;
|
|
252
|
+
MTL::Library* default_library_;
|
|
253
|
+
std::unordered_map<
|
|
254
|
+
MTL::Library*,
|
|
255
|
+
std::unordered_map<std::string, MTL::ComputePipelineState*>>
|
|
256
|
+
library_kernels_;
|
|
257
|
+
const MTL::ResidencySet* residency_set_{nullptr};
|
|
258
|
+
std::string arch_;
|
|
259
|
+
int arch_gen_;
|
|
260
|
+
int max_ops_per_buffer_;
|
|
261
|
+
int max_mb_per_buffer_;
|
|
262
|
+
};
|
|
263
|
+
|
|
264
|
+
Device& device(mlx::core::Device);
|
|
265
|
+
|
|
266
|
+
std::unique_ptr<void, std::function<void(void*)>> new_scoped_memory_pool();
|
|
267
|
+
|
|
268
|
+
inline bool is_nax_available() {
|
|
269
|
+
auto _check_nax = []() {
|
|
270
|
+
bool can_use_nax = false;
|
|
271
|
+
if (__builtin_available(
|
|
272
|
+
macOS 26.2, iOS 26.2, tvOS 26.2, visionOS 26.2, *)) {
|
|
273
|
+
can_use_nax = true;
|
|
274
|
+
}
|
|
275
|
+
can_use_nax &=
|
|
276
|
+
metal::device(mlx::core::Device::gpu).get_architecture_gen() >= 17;
|
|
277
|
+
return can_use_nax;
|
|
278
|
+
};
|
|
279
|
+
static bool is_nax_available_ = _check_nax();
|
|
280
|
+
return is_nax_available_;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
} // namespace mlx::core::metal
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
// Copyright © 2023-2024 Apple Inc.

#pragma once

namespace mlx::core::metal {

// Accessors for kernel source strings used by the Metal JIT compilation
// path. Each function returns a C string of Metal shading-language
// source — presumably embedded at build time from the corresponding
// .metal kernel files; confirm against the JIT build step.

// Shared helper sources included by other kernels (ops tables, indexing
// utilities).
const char* utils();
const char* binary_ops();
const char* unary_ops();
const char* ternary_ops();
const char* reduce_utils();
const char* gather();
const char* scatter();
const char* masked_scatter();

// Element-wise, data-movement, and general compute kernels.
const char* arange();
const char* unary();
const char* binary();
const char* binary_two();
const char* copy();
const char* fft();
const char* gather_axis();
const char* gather_front();
const char* hadamard();
const char* logsumexp();
const char* quantized_utils();
const char* quantized();
const char* fp_quantized();
const char* ternary();
const char* scan();
const char* scatter_axis();
const char* softmax();
const char* sort();
const char* reduce();

// Matmul / convolution / attention ("steel") kernels.
const char* gemm();
const char* steel_gemm_fused();
const char* steel_gemm_masked();
const char* steel_gemm_splitk();
const char* steel_gemm_gather();
const char* steel_gemm_segmented();
const char* conv();
const char* steel_conv();
const char* steel_conv_general();
const char* gemv_masked();
const char* steel_attention();

// NAX variants — used only when is_nax_available() holds (newer OS and
// GPU architecture generation; see device header).
const char* gemm_nax();
const char* steel_gemm_fused_nax();
const char* steel_gemm_gather_nax();

const char* quantized_nax();
const char* fp_quantized_nax();

const char* steel_attention_nax();

} // namespace mlx::core::metal
|