mlx_cpu-0.30.1-py3-none-manylinux_2_35_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx/__main__.py +27 -0
- mlx/_reprlib_fix.py +16 -0
- mlx/extension.py +88 -0
- mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
- mlx/include/mlx/allocator.h +73 -0
- mlx/include/mlx/array.h +645 -0
- mlx/include/mlx/backend/common/binary.h +97 -0
- mlx/include/mlx/backend/common/broadcasting.h +11 -0
- mlx/include/mlx/backend/common/buffer_cache.h +157 -0
- mlx/include/mlx/backend/common/compiled.h +77 -0
- mlx/include/mlx/backend/common/copy.h +50 -0
- mlx/include/mlx/backend/common/hadamard.h +109 -0
- mlx/include/mlx/backend/common/matmul.h +67 -0
- mlx/include/mlx/backend/common/reduce.h +59 -0
- mlx/include/mlx/backend/common/slicing.h +20 -0
- mlx/include/mlx/backend/common/ternary.h +85 -0
- mlx/include/mlx/backend/common/unary.h +29 -0
- mlx/include/mlx/backend/common/utils.h +205 -0
- mlx/include/mlx/backend/cpu/arange.h +28 -0
- mlx/include/mlx/backend/cpu/available.h +9 -0
- mlx/include/mlx/backend/cpu/binary.h +517 -0
- mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
- mlx/include/mlx/backend/cpu/binary_two.h +166 -0
- mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
- mlx/include/mlx/backend/cpu/copy.h +36 -0
- mlx/include/mlx/backend/cpu/encoder.h +67 -0
- mlx/include/mlx/backend/cpu/eval.h +12 -0
- mlx/include/mlx/backend/cpu/gemm.h +26 -0
- mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
- mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
- mlx/include/mlx/backend/cpu/lapack.h +80 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
- mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
- mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
- mlx/include/mlx/backend/cpu/simd/math.h +193 -0
- mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
- mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
- mlx/include/mlx/backend/cpu/simd/type.h +11 -0
- mlx/include/mlx/backend/cpu/slicing.h +21 -0
- mlx/include/mlx/backend/cpu/ternary.h +154 -0
- mlx/include/mlx/backend/cpu/threefry.h +21 -0
- mlx/include/mlx/backend/cpu/unary.h +281 -0
- mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
- mlx/include/mlx/backend/cuda/allocator.h +89 -0
- mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
- mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
- mlx/include/mlx/backend/cuda/cuda.h +10 -0
- mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
- mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
- mlx/include/mlx/backend/cuda/device/config.h +12 -0
- mlx/include/mlx/backend/cuda/device.h +189 -0
- mlx/include/mlx/backend/cuda/event.h +78 -0
- mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
- mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
- mlx/include/mlx/backend/cuda/jit_module.h +119 -0
- mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
- mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
- mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
- mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
- mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
- mlx/include/mlx/backend/cuda/utils.h +46 -0
- mlx/include/mlx/backend/cuda/worker.h +55 -0
- mlx/include/mlx/backend/gpu/available.h +9 -0
- mlx/include/mlx/backend/gpu/copy.h +57 -0
- mlx/include/mlx/backend/gpu/eval.h +18 -0
- mlx/include/mlx/backend/gpu/slicing.h +36 -0
- mlx/include/mlx/backend/metal/allocator.h +79 -0
- mlx/include/mlx/backend/metal/binary.h +33 -0
- mlx/include/mlx/backend/metal/device.h +283 -0
- mlx/include/mlx/backend/metal/jit/includes.h +57 -0
- mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
- mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
- mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
- mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
- mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
- mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
- mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
- mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
- mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
- mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
- mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
- mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
- mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
- mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
- mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
- mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
- mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
- mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
- mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
- mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
- mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
- mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
- mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
- mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
- mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
- mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
- mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
- mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
- mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
- mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
- mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
- mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
- mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
- mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
- mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
- mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
- mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
- mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
- mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
- mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
- mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
- mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
- mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
- mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
- mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
- mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
- mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
- mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
- mlx/include/mlx/backend/metal/matmul.h +144 -0
- mlx/include/mlx/backend/metal/metal.h +22 -0
- mlx/include/mlx/backend/metal/reduce.h +41 -0
- mlx/include/mlx/backend/metal/resident.h +32 -0
- mlx/include/mlx/backend/metal/scan.h +17 -0
- mlx/include/mlx/backend/metal/ternary.h +21 -0
- mlx/include/mlx/backend/metal/unary.h +21 -0
- mlx/include/mlx/backend/metal/utils.h +84 -0
- mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
- mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
- mlx/include/mlx/compile.h +44 -0
- mlx/include/mlx/compile_impl.h +69 -0
- mlx/include/mlx/device.h +31 -0
- mlx/include/mlx/distributed/distributed.h +60 -0
- mlx/include/mlx/distributed/distributed_impl.h +59 -0
- mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi.h +12 -0
- mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
- mlx/include/mlx/distributed/nccl/nccl.h +12 -0
- mlx/include/mlx/distributed/ops.h +56 -0
- mlx/include/mlx/distributed/primitives.h +156 -0
- mlx/include/mlx/distributed/reduction_ops.h +38 -0
- mlx/include/mlx/distributed/ring/ring.h +12 -0
- mlx/include/mlx/distributed/utils.h +67 -0
- mlx/include/mlx/dtype.h +115 -0
- mlx/include/mlx/dtype_utils.h +119 -0
- mlx/include/mlx/einsum.h +22 -0
- mlx/include/mlx/event.h +58 -0
- mlx/include/mlx/export.h +136 -0
- mlx/include/mlx/export_impl.h +98 -0
- mlx/include/mlx/fast.h +102 -0
- mlx/include/mlx/fast_primitives.h +427 -0
- mlx/include/mlx/fence.h +39 -0
- mlx/include/mlx/fft.h +167 -0
- mlx/include/mlx/graph_utils.h +66 -0
- mlx/include/mlx/io/gguf.h +20 -0
- mlx/include/mlx/io/load.h +175 -0
- mlx/include/mlx/io.h +61 -0
- mlx/include/mlx/linalg.h +111 -0
- mlx/include/mlx/memory.h +78 -0
- mlx/include/mlx/mlx.h +25 -0
- mlx/include/mlx/ops.h +1627 -0
- mlx/include/mlx/primitives.h +2524 -0
- mlx/include/mlx/random.h +282 -0
- mlx/include/mlx/scheduler.h +188 -0
- mlx/include/mlx/small_vector.h +540 -0
- mlx/include/mlx/stream.h +41 -0
- mlx/include/mlx/threadpool.h +133 -0
- mlx/include/mlx/transforms.h +229 -0
- mlx/include/mlx/transforms_impl.h +86 -0
- mlx/include/mlx/types/bf16.h +187 -0
- mlx/include/mlx/types/complex.h +113 -0
- mlx/include/mlx/types/fp16.h +234 -0
- mlx/include/mlx/types/half_types.h +58 -0
- mlx/include/mlx/types/limits.h +70 -0
- mlx/include/mlx/utils.h +175 -0
- mlx/include/mlx/version.h +20 -0
- mlx/lib/libmlx.so +0 -0
- mlx/py.typed +1 -0
- mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
- mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
- mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
- mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
- mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
- mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
- mlx/share/cmake/MLX/extension.cmake +50 -0
- mlx/utils.py +325 -0
- mlx_cpu-0.30.1.dist-info/METADATA +142 -0
- mlx_cpu-0.30.1.dist-info/RECORD +231 -0
- mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
- mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
- mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
- mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
- mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
- mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
- mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
- mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
mlx/include/mlx/random.h
ADDED
@@ -0,0 +1,282 @@
// Copyright © 2023 Apple Inc.

#pragma once

#include <chrono>
#include <optional>

#include "mlx/array.h"
#include "mlx/stream.h"
#include "mlx/utils.h"

namespace mlx::core::random {

class KeySequence {
 public:
  explicit KeySequence(uint64_t seed);

  void seed(uint64_t seed);
  array next();

  // static default
  static KeySequence& default_() {
    static KeySequence ks(get_current_time_seed());
    return ks;
  }

 private:
  array key_;
  static uint64_t get_current_time_seed() {
    auto now = std::chrono::system_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(
               now.time_since_epoch())
        .count();
  }
};

/** Get a PRNG key from a seed. */
array key(uint64_t seed);

/** Seed the default PRNG key. */
void seed(uint64_t seed);

/** Generate an array with type uint32 filled with random bits. */
array bits(
    const Shape& shape,
    int width,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
inline array bits(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return bits(shape, 4, key, s);
}

/** Split the rng key into a pair of keys. */
std::pair<array, array> split(const array& key, StreamOrDevice s = {});

/** Split the rng key into `num` keys. */
array split(const array& key, int num, StreamOrDevice s = {});

/** Generate uniform random numbers between low and high. */
array uniform(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

template <typename T, typename U>
array uniform(
    T low,
    U high,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return uniform(array(low), array(high), shape, dtype, key, to_stream(s));
}

/** Generate uniform random numbers between 0 and 1. */
array uniform(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
inline array uniform(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return uniform(shape, float32, key);
}

/** Generate samples from the standard normal distribution. */
array normal(
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& loc,
    const std::optional<array>& scale,
    const std::optional<array>& key,
    StreamOrDevice s = {});
inline array normal(
    const Shape& shape,
    Dtype dtype,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  auto loc_ = loc == 0 ? std::nullopt : std::make_optional(array(loc, dtype));
  auto scale_ =
      scale == 1 ? std::nullopt : std::make_optional(array(scale, dtype));
  return normal(shape, dtype, loc_, scale_, key, s);
}
inline array normal(
    const Shape& shape,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return normal(shape, float32, loc, scale, key, s);
}
inline array normal(
    const Shape& shape,
    const Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return normal(shape, dtype, std::nullopt, std::nullopt, key, s);
}
inline array normal(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return normal(shape, float32, std::nullopt, std::nullopt, key, s);
}

/** Generate samples from a multivariate normal distribution. **/
array multivariate_normal(
    const array& mean,
    const array& cov,
    const Shape& shape,
    Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/** Generate integer samples uniformly at random */
array randint(
    const array& low,
    const array& high,
    const Shape& shape,
    Dtype dtype = int32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

template <typename T, typename U>
array randint(
    T low,
    U high,
    const Shape& shape,
    Dtype dtype = int32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return randint(array(low), array(high), shape, dtype, key, to_stream(s));
}

/** Generate binary variables with probability to be true equal to p */
array bernoulli(
    const array& p,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
array bernoulli(
    const array& p,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

template <typename T>
array bernoulli(
    T p,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return bernoulli(array(p), key, s);
}

template <typename T>
array bernoulli(
    T p,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return bernoulli(array(p), shape, key, s);
}

array bernoulli(
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array truncated_normal(
    const array& lower,
    const array& upper,
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array truncated_normal(
    const array& lower,
    const array& upper,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array gumbel(
    const Shape& shape,
    Dtype dtype = float32,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array categorical(
    const array& logits,
    int axis,
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array categorical(
    const array& logits_,
    int axis,
    int num_samples,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

array categorical(
    const array& logits,
    int axis = -1,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/** Generate samples from the laplace distribution. */
array laplace(
    const Shape& shape,
    Dtype dtype,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});
inline array laplace(
    const Shape& shape,
    const float loc,
    const float scale,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return laplace(shape, float32, loc, scale, key, s);
}
inline array laplace(
    const Shape& shape,
    const Dtype dtype,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return laplace(shape, dtype, 0.0, 1.0, key, s);
}
inline array laplace(
    const Shape& shape,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {}) {
  return laplace(shape, float32, 0.0, 1.0, key, s);
}

/* Randomly permute the elements of x along the given axis. */
array permutation(
    const array& x,
    int axis = 0,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

/* A random permutation of `arange(x)` */
array permutation(
    int x,
    const std::optional<array>& key = std::nullopt,
    StreamOrDevice s = {});

} // namespace mlx::core::random
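The header above declares MLX's splittable-key PRNG surface: key() and split() produce explicit keys, and each sampler (uniform, normal, randint, bernoulli, ...) accepts an optional key plus a StreamOrDevice. As a rough orientation, the sketch below shows how these declarations compose from C++. It is illustrative only and is not shipped in this wheel; it assumes the MLX C++ library is linked and that the umbrella header mlx/mlx.h, eval(), and the array stream-output operator behave as in upstream MLX.

// Hypothetical usage sketch for mlx::core::random (not part of the package).
#include <iostream>
#include "mlx/mlx.h"

using namespace mlx::core;

int main() {
  // Derive an explicit PRNG key instead of relying on the global default key.
  array k = random::key(42);

  // Split the key so each draw uses independent randomness.
  auto [k1, k2] = random::split(k);

  // 2x3 uniform samples in [0, 1) and 2x3 standard-normal samples.
  array u = random::uniform({2, 3}, float32, k1);
  array n = random::normal({2, 3}, float32, k2);

  // Computation is lazy; eval() forces materialization of both arrays.
  eval({u, n});
  std::cout << u << "\n" << n << std::endl;
  return 0;
}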
mlx/include/mlx/scheduler.h
ADDED
@@ -0,0 +1,188 @@
// Copyright © 2023 Apple Inc.

#pragma once

#include <atomic>
#include <future>
#include <queue>
#include <thread>
#include <unordered_map>

#include "mlx/backend/gpu/eval.h"
#include "mlx/device.h"
#include "mlx/stream.h"

namespace mlx::core::scheduler {

struct StreamThread {
  std::mutex mtx;
  std::queue<std::function<void()>> q;
  std::condition_variable cond;
  bool stop;
  std::thread thread;

  StreamThread() : stop(false), thread(&StreamThread::thread_fn, this) {}

  ~StreamThread() {
    {
      std::lock_guard<std::mutex> lk(mtx);
      stop = true;
    }
    cond.notify_one();
    thread.join();
  }

  void thread_fn() {
    while (true) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lk(mtx);
        cond.wait(lk, [this] { return !this->q.empty() || this->stop; });
        if (q.empty() && stop) {
          return;
        }
        task = std::move(q.front());
        q.pop();
      }

      task();
    }
  }

  template <typename F>
  void enqueue(F&& f) {
    {
      std::lock_guard<std::mutex> lk(mtx);
      if (stop) {
        throw std::runtime_error(
            "Cannot enqueue work after stream is stopped.");
      }
      q.emplace(std::forward<F>(f));
    }
    cond.notify_one();
  }
};

class Scheduler {
 public:
  Scheduler() : n_active_tasks_(0) {
    if (is_available(Device::gpu)) {
      default_streams_.insert({Device::gpu, new_stream(Device::gpu)});
    }
    default_streams_.insert({Device::cpu, new_stream(Device::cpu)});
  }

  // Not copyable or moveable
  Scheduler(const Scheduler&) = delete;
  Scheduler(Scheduler&&) = delete;
  Scheduler& operator=(const Scheduler&) = delete;
  Scheduler& operator=(Scheduler&&) = delete;

  Stream new_stream(const Device& d) {
    streams_.emplace_back(streams_.size(), d);
    if (d == Device::gpu) {
      threads_.push_back(nullptr);
      gpu::new_stream(streams_.back());
    } else {
      threads_.push_back(new StreamThread{});
    }
    return streams_.back();
  }

  template <typename F>
  void enqueue(const Stream& stream, F&& f);

  Stream get_default_stream(const Device& d) const {
    return default_streams_.at(d.type);
  }
  Stream get_stream(int index) const {
    return streams_.at(index);
  }

  void set_default_stream(const Stream& s) {
    default_streams_.at(s.device.type) = s;
  }

  void notify_new_task(const Stream& stream) {
    {
      std::lock_guard<std::mutex> lk(mtx);
      n_active_tasks_++;
    }
    completion_cv.notify_all();
  }

  void notify_task_completion(const Stream& stream) {
    {
      std::lock_guard<std::mutex> lk(mtx);
      n_active_tasks_--;
    }
    completion_cv.notify_all();
  }

  int n_active_tasks() const {
    return n_active_tasks_;
  }

  void wait_for_one() {
    std::unique_lock<std::mutex> lk(mtx);
    int n_tasks_old = n_active_tasks();
    if (n_tasks_old > 1) {
      completion_cv.wait(lk, [this, n_tasks_old] {
        return this->n_active_tasks() < n_tasks_old;
      });
    }
  }

  ~Scheduler() {
    for (auto s : streams_) {
      try {
        synchronize(s);
      } catch (const std::runtime_error&) {
        // ignore errors if synch fails
      }
    }
    for (auto t : threads_) {
      if (t != nullptr) {
        delete t;
      }
    }
  }

 private:
  int n_active_tasks_;
  std::vector<StreamThread*> threads_;
  std::vector<Stream> streams_;
  std::unordered_map<Device::DeviceType, Stream> default_streams_;
  std::condition_variable completion_cv;
  std::mutex mtx;
};

template <typename F>
void Scheduler::enqueue(const Stream& stream, F&& f) {
  threads_[stream.index]->enqueue(std::forward<F>(f));
}

Scheduler& scheduler();

template <typename F>
void enqueue(const Stream& stream, F&& f) {
  scheduler().enqueue(stream, std::forward<F>(f));
}

inline int n_active_tasks() {
  return scheduler().n_active_tasks();
}

inline void notify_new_task(const Stream& stream) {
  scheduler().notify_new_task(stream);
}

inline void notify_task_completion(const Stream& stream) {
  scheduler().notify_task_completion(stream);
}

inline void wait_for_one() {
  scheduler().wait_for_one();
}

} // namespace mlx::core::scheduler
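StreamThread above is the core of the CPU stream: a single worker thread drains a mutex-guarded queue of std::function tasks, is woken by a condition variable, and finishes any queued work before the destructor joins it. The standalone sketch below reproduces that pattern in plain C++ so it can be read in isolation; it is illustrative only and not part of the package (TaskThread and its members are hypothetical names).

// Self-contained sketch of the single-worker task-queue pattern used by
// StreamThread: lock, push, notify on the producer side; wait, pop, run
// outside the lock on the consumer side. Not part of mlx-cpu.
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>

class TaskThread {
 public:
  TaskThread() : stop_(false), worker_(&TaskThread::run, this) {}

  ~TaskThread() {
    {
      std::lock_guard<std::mutex> lk(mtx_);
      stop_ = true;  // already queued tasks still run; new ones are rejected
    }
    cond_.notify_one();
    worker_.join();
  }

  void enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lk(mtx_);
      if (stop_) {
        throw std::runtime_error("Cannot enqueue work after stop.");
      }
      q_.push(std::move(task));
    }
    cond_.notify_one();
  }

 private:
  void run() {
    while (true) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lk(mtx_);
        cond_.wait(lk, [this] { return stop_ || !q_.empty(); });
        if (q_.empty() && stop_) {
          return;  // queue drained and shutdown requested
        }
        task = std::move(q_.front());
        q_.pop();
      }
      task();  // run outside the lock so producers are never blocked by work
    }
  }

  std::mutex mtx_;
  std::condition_variable cond_;
  std::queue<std::function<void()>> q_;
  bool stop_;
  std::thread worker_;
};

int main() {
  TaskThread t;
  for (int i = 0; i < 3; ++i) {
    t.enqueue([i] { std::cout << "task " << i << "\n"; });
  }
  // Destructor flushes remaining tasks, then joins the worker thread.
}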