mlx-cpu 0.30.1 (mlx_cpu-0.30.1-py3-none-manylinux_2_35_x86_64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. mlx/__main__.py +27 -0
  2. mlx/_reprlib_fix.py +16 -0
  3. mlx/extension.py +88 -0
  4. mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
  5. mlx/include/mlx/allocator.h +73 -0
  6. mlx/include/mlx/array.h +645 -0
  7. mlx/include/mlx/backend/common/binary.h +97 -0
  8. mlx/include/mlx/backend/common/broadcasting.h +11 -0
  9. mlx/include/mlx/backend/common/buffer_cache.h +157 -0
  10. mlx/include/mlx/backend/common/compiled.h +77 -0
  11. mlx/include/mlx/backend/common/copy.h +50 -0
  12. mlx/include/mlx/backend/common/hadamard.h +109 -0
  13. mlx/include/mlx/backend/common/matmul.h +67 -0
  14. mlx/include/mlx/backend/common/reduce.h +59 -0
  15. mlx/include/mlx/backend/common/slicing.h +20 -0
  16. mlx/include/mlx/backend/common/ternary.h +85 -0
  17. mlx/include/mlx/backend/common/unary.h +29 -0
  18. mlx/include/mlx/backend/common/utils.h +205 -0
  19. mlx/include/mlx/backend/cpu/arange.h +28 -0
  20. mlx/include/mlx/backend/cpu/available.h +9 -0
  21. mlx/include/mlx/backend/cpu/binary.h +517 -0
  22. mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
  23. mlx/include/mlx/backend/cpu/binary_two.h +166 -0
  24. mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
  25. mlx/include/mlx/backend/cpu/copy.h +36 -0
  26. mlx/include/mlx/backend/cpu/encoder.h +67 -0
  27. mlx/include/mlx/backend/cpu/eval.h +12 -0
  28. mlx/include/mlx/backend/cpu/gemm.h +26 -0
  29. mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
  30. mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
  31. mlx/include/mlx/backend/cpu/lapack.h +80 -0
  32. mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
  33. mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
  34. mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
  35. mlx/include/mlx/backend/cpu/simd/math.h +193 -0
  36. mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
  37. mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
  38. mlx/include/mlx/backend/cpu/simd/type.h +11 -0
  39. mlx/include/mlx/backend/cpu/slicing.h +21 -0
  40. mlx/include/mlx/backend/cpu/ternary.h +154 -0
  41. mlx/include/mlx/backend/cpu/threefry.h +21 -0
  42. mlx/include/mlx/backend/cpu/unary.h +281 -0
  43. mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
  44. mlx/include/mlx/backend/cuda/allocator.h +89 -0
  45. mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
  46. mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
  47. mlx/include/mlx/backend/cuda/cuda.h +10 -0
  48. mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
  49. mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
  50. mlx/include/mlx/backend/cuda/device/config.h +12 -0
  51. mlx/include/mlx/backend/cuda/device.h +189 -0
  52. mlx/include/mlx/backend/cuda/event.h +78 -0
  53. mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
  54. mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
  55. mlx/include/mlx/backend/cuda/jit_module.h +119 -0
  56. mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
  57. mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
  58. mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
  59. mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
  60. mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
  61. mlx/include/mlx/backend/cuda/utils.h +46 -0
  62. mlx/include/mlx/backend/cuda/worker.h +55 -0
  63. mlx/include/mlx/backend/gpu/available.h +9 -0
  64. mlx/include/mlx/backend/gpu/copy.h +57 -0
  65. mlx/include/mlx/backend/gpu/eval.h +18 -0
  66. mlx/include/mlx/backend/gpu/slicing.h +36 -0
  67. mlx/include/mlx/backend/metal/allocator.h +79 -0
  68. mlx/include/mlx/backend/metal/binary.h +33 -0
  69. mlx/include/mlx/backend/metal/device.h +283 -0
  70. mlx/include/mlx/backend/metal/jit/includes.h +57 -0
  71. mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
  72. mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
  73. mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
  74. mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
  75. mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
  76. mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
  77. mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
  78. mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
  79. mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
  80. mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
  81. mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
  82. mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
  83. mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
  84. mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
  85. mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
  86. mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
  87. mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
  88. mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
  89. mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
  90. mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
  91. mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
  92. mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
  93. mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
  94. mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
  95. mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
  96. mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
  97. mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
  98. mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
  99. mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
  100. mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
  101. mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
  102. mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
  103. mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
  104. mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
  105. mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
  106. mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
  107. mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
  108. mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
  109. mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
  110. mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
  111. mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
  112. mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
  113. mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
  114. mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
  115. mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
  116. mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
  117. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
  118. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
  119. mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
  120. mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
  121. mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
  122. mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
  123. mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
  124. mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
  125. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
  126. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
  127. mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
  128. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
  129. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
  130. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
  131. mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
  132. mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
  133. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
  134. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
  135. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
  136. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
  137. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
  138. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
  139. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
  140. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
  141. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
  142. mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
  143. mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
  144. mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
  145. mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
  146. mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
  147. mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
  148. mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
  149. mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
  150. mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
  151. mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
  152. mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
  153. mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
  154. mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
  155. mlx/include/mlx/backend/metal/matmul.h +144 -0
  156. mlx/include/mlx/backend/metal/metal.h +22 -0
  157. mlx/include/mlx/backend/metal/reduce.h +41 -0
  158. mlx/include/mlx/backend/metal/resident.h +32 -0
  159. mlx/include/mlx/backend/metal/scan.h +17 -0
  160. mlx/include/mlx/backend/metal/ternary.h +21 -0
  161. mlx/include/mlx/backend/metal/unary.h +21 -0
  162. mlx/include/mlx/backend/metal/utils.h +84 -0
  163. mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
  164. mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
  165. mlx/include/mlx/compile.h +44 -0
  166. mlx/include/mlx/compile_impl.h +69 -0
  167. mlx/include/mlx/device.h +31 -0
  168. mlx/include/mlx/distributed/distributed.h +60 -0
  169. mlx/include/mlx/distributed/distributed_impl.h +59 -0
  170. mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
  171. mlx/include/mlx/distributed/mpi/mpi.h +12 -0
  172. mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
  173. mlx/include/mlx/distributed/nccl/nccl.h +12 -0
  174. mlx/include/mlx/distributed/ops.h +56 -0
  175. mlx/include/mlx/distributed/primitives.h +156 -0
  176. mlx/include/mlx/distributed/reduction_ops.h +38 -0
  177. mlx/include/mlx/distributed/ring/ring.h +12 -0
  178. mlx/include/mlx/distributed/utils.h +67 -0
  179. mlx/include/mlx/dtype.h +115 -0
  180. mlx/include/mlx/dtype_utils.h +119 -0
  181. mlx/include/mlx/einsum.h +22 -0
  182. mlx/include/mlx/event.h +58 -0
  183. mlx/include/mlx/export.h +136 -0
  184. mlx/include/mlx/export_impl.h +98 -0
  185. mlx/include/mlx/fast.h +102 -0
  186. mlx/include/mlx/fast_primitives.h +427 -0
  187. mlx/include/mlx/fence.h +39 -0
  188. mlx/include/mlx/fft.h +167 -0
  189. mlx/include/mlx/graph_utils.h +66 -0
  190. mlx/include/mlx/io/gguf.h +20 -0
  191. mlx/include/mlx/io/load.h +175 -0
  192. mlx/include/mlx/io.h +61 -0
  193. mlx/include/mlx/linalg.h +111 -0
  194. mlx/include/mlx/memory.h +78 -0
  195. mlx/include/mlx/mlx.h +25 -0
  196. mlx/include/mlx/ops.h +1627 -0
  197. mlx/include/mlx/primitives.h +2524 -0
  198. mlx/include/mlx/random.h +282 -0
  199. mlx/include/mlx/scheduler.h +188 -0
  200. mlx/include/mlx/small_vector.h +540 -0
  201. mlx/include/mlx/stream.h +41 -0
  202. mlx/include/mlx/threadpool.h +133 -0
  203. mlx/include/mlx/transforms.h +229 -0
  204. mlx/include/mlx/transforms_impl.h +86 -0
  205. mlx/include/mlx/types/bf16.h +187 -0
  206. mlx/include/mlx/types/complex.h +113 -0
  207. mlx/include/mlx/types/fp16.h +234 -0
  208. mlx/include/mlx/types/half_types.h +58 -0
  209. mlx/include/mlx/types/limits.h +70 -0
  210. mlx/include/mlx/utils.h +175 -0
  211. mlx/include/mlx/version.h +20 -0
  212. mlx/lib/libmlx.so +0 -0
  213. mlx/py.typed +1 -0
  214. mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
  215. mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
  216. mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
  217. mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
  218. mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
  219. mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
  220. mlx/share/cmake/MLX/extension.cmake +50 -0
  221. mlx/utils.py +325 -0
  222. mlx_cpu-0.30.1.dist-info/METADATA +142 -0
  223. mlx_cpu-0.30.1.dist-info/RECORD +231 -0
  224. mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
  225. mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
  226. mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
  227. mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
  228. mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
  229. mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
  230. mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
  231. mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
mlx/include/mlx/backend/metal/kernels/fft.h
@@ -0,0 +1,486 @@
+ // Copyright © 2024 Apple Inc.
+
+ // Metal FFT using Stockham's algorithm
+ //
+ // References:
+ // - VkFFT (https://github.com/DTolm/VkFFT)
+ // - Eric Bainville's excellent page (http://www.bealto.com/gpu-fft.html)
+
+ #include <metal_common>
+
+ #include "mlx/backend/metal/kernels/fft/radix.h"
+ #include "mlx/backend/metal/kernels/fft/readwrite.h"
+ #include "mlx/backend/metal/kernels/steel/defines.h"
+
+ using namespace metal;
+
+ #define MAX_RADIX 13
+ // Reached when elems_per_thread_ = 6, max_radix = 13
+ // and some threads have to do 3 radix 6s requiring 18 float2s.
+ #define MAX_OUTPUT_SIZE 18
+
+ // Specialize for a particular value of N at runtime
+ STEEL_CONST bool inv_ [[function_constant(0)]];
+ STEEL_CONST bool is_power_of_2_ [[function_constant(1)]];
+ STEEL_CONST int elems_per_thread_ [[function_constant(2)]];
+ // rader_m = n / rader_n
+ STEEL_CONST int rader_m_ [[function_constant(3)]];
+ // Stockham steps
+ STEEL_CONST int radix_13_steps_ [[function_constant(4)]];
+ STEEL_CONST int radix_11_steps_ [[function_constant(5)]];
+ STEEL_CONST int radix_8_steps_ [[function_constant(6)]];
+ STEEL_CONST int radix_7_steps_ [[function_constant(7)]];
+ STEEL_CONST int radix_6_steps_ [[function_constant(8)]];
+ STEEL_CONST int radix_5_steps_ [[function_constant(9)]];
+ STEEL_CONST int radix_4_steps_ [[function_constant(10)]];
+ STEEL_CONST int radix_3_steps_ [[function_constant(11)]];
+ STEEL_CONST int radix_2_steps_ [[function_constant(12)]];
+ // Rader steps
+ STEEL_CONST int rader_13_steps_ [[function_constant(13)]];
+ STEEL_CONST int rader_11_steps_ [[function_constant(14)]];
+ STEEL_CONST int rader_8_steps_ [[function_constant(15)]];
+ STEEL_CONST int rader_7_steps_ [[function_constant(16)]];
+ STEEL_CONST int rader_6_steps_ [[function_constant(17)]];
+ STEEL_CONST int rader_5_steps_ [[function_constant(18)]];
+ STEEL_CONST int rader_4_steps_ [[function_constant(19)]];
+ STEEL_CONST int rader_3_steps_ [[function_constant(20)]];
+ STEEL_CONST int rader_2_steps_ [[function_constant(21)]];
+
+ // See "radix.h" for radix codelets
+ typedef void (*RadixFunc)(thread float2*, thread float2*);
+
+ // Perform a single radix n butterfly with appropriate twiddles
+ template <int radix, RadixFunc radix_func>
+ METAL_FUNC void radix_butterfly(
+     int i,
+     int p,
+     thread float2* x,
+     thread short* indices,
+     thread float2* y) {
+   // i: the index in the overall DFT that we're processing.
+   // p: the size of the DFTs we're merging at this step.
+   // m: how many threads are working on this DFT.
+   int k, j;
+
+   // Use faster bitwise operations when working with powers of two
+   constexpr bool radix_p_2 = (radix & (radix - 1)) == 0;
+   if (radix_p_2 && is_power_of_2_) {
+     constexpr short power = __builtin_ctz(radix);
+     k = i & (p - 1);
+     j = ((i - k) << power) + k;
+   } else {
+     k = i % p;
+     j = (i / p) * radix * p + k;
+   }
+
+   // Apply twiddles
+   if (p > 1) {
+     float2 twiddle_1 = get_twiddle(k, radix * p);
+     float2 twiddle = twiddle_1;
+     x[1] = complex_mul(x[1], twiddle);
+
+     STEEL_PRAGMA_UNROLL
+     for (int t = 2; t < radix; t++) {
+       twiddle = complex_mul(twiddle, twiddle_1);
+       x[t] = complex_mul(x[t], twiddle);
+     }
+   }
+
+   radix_func(x, y);
+
+   STEEL_PRAGMA_UNROLL
+   for (int t = 0; t < radix; t++) {
+     indices[t] = j + t * p;
+   }
+ }
+
+ // Perform all the radix steps required for a
+ // particular radix size n.
+ template <int radix, RadixFunc radix_func>
+ METAL_FUNC void radix_n_steps(
+     int i,
+     thread int* p,
+     int m,
+     int n,
+     int num_steps,
+     thread float2* inputs,
+     thread short* indices,
+     thread float2* values,
+     threadgroup float2* buf) {
+   int m_r = n / radix;
+   // When combining different sized radices, we have to do
+   // multiple butterflies in a single thread.
+   // E.g. n = 28 = 4 * 7
+   // 4 threads, 7 elems_per_thread
+   // All threads do 1 radix7 butterfly.
+   // 3 threads do 2 radix4 butterflies.
+   // 1 thread does 1 radix4 butterfly.
+   int max_radices_per_thread = (elems_per_thread_ + radix - 1) / radix;
+
+   int index = 0;
+   int r_index = 0;
+   for (int s = 0; s < num_steps; s++) {
+     for (int t = 0; t < max_radices_per_thread; t++) {
+       index = i + t * m;
+       if (index < m_r) {
+         for (int r = 0; r < radix; r++) {
+           inputs[r] = buf[index + r * m_r];
+         }
+         radix_butterfly<radix, radix_func>(
+             index, *p, inputs, indices + t * radix, values + t * radix);
+       }
+     }
+
+     // Wait until all threads have read their inputs into thread local mem
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+
+     for (int t = 0; t < max_radices_per_thread; t++) {
+       index = i + t * m;
+       if (index < m_r) {
+         for (int r = 0; r < radix; r++) {
+           r_index = t * radix + r;
+           buf[indices[r_index]] = values[r_index];
+         }
+       }
+     }
+
+     // Wait until all threads have written back to threadgroup mem
+     threadgroup_barrier(mem_flags::mem_threadgroup);
+     *p *= radix;
+   }
+ }
+
+ #define RADIX_STEP(radix, radix_func, num_steps) \
+   radix_n_steps<radix, radix_func>(              \
+       fft_idx, p, m, n, num_steps, inputs, indices, values, buf);
+
+ template <bool rader = false>
+ METAL_FUNC void
+ perform_fft(int fft_idx, thread int* p, int m, int n, threadgroup float2* buf) {
+   float2 inputs[MAX_RADIX];
+   short indices[MAX_OUTPUT_SIZE];
+   float2 values[MAX_OUTPUT_SIZE];
+
+   RADIX_STEP(2, radix2, rader ? rader_2_steps_ : radix_2_steps_);
+   RADIX_STEP(3, radix3, rader ? rader_3_steps_ : radix_3_steps_);
+   RADIX_STEP(4, radix4, rader ? rader_4_steps_ : radix_4_steps_);
+   RADIX_STEP(5, radix5, rader ? rader_5_steps_ : radix_5_steps_);
+   RADIX_STEP(6, radix6, rader ? rader_6_steps_ : radix_6_steps_);
+   RADIX_STEP(7, radix7, rader ? rader_7_steps_ : radix_7_steps_);
+   RADIX_STEP(8, radix8, rader ? rader_8_steps_ : radix_8_steps_);
+   RADIX_STEP(11, radix11, rader ? rader_11_steps_ : radix_11_steps_);
+   RADIX_STEP(13, radix13, rader ? rader_13_steps_ : radix_13_steps_);
+ }
+
+ // Each FFT is computed entirely in shared GPU memory.
+ //
+ // N is decomposed into radix-n DFTs:
+ // e.g. 128 = 2 * 4 * 4 * 4
+ template <int tg_mem_size, typename in_T, typename out_T>
+ [[kernel]] void fft(
+     const device in_T* in [[buffer(0)]],
+     device out_T* out [[buffer(1)]],
+     constant const int& n,
+     constant const int& batch_size,
+     uint3 elem [[thread_position_in_grid]],
+     uint3 grid [[threads_per_grid]]) {
+   threadgroup float2 shared_in[tg_mem_size];
+
+   thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
+       in,
+       &shared_in[0],
+       out,
+       n,
+       batch_size,
+       elems_per_thread_,
+       elem,
+       grid,
+       inv_);
+
+   if (read_writer.out_of_bounds()) {
+     return;
+   };
+   read_writer.load();
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   int p = 1;
+   int fft_idx = elem.z; // Thread index in DFT
+   int m = grid.z; // Threads per DFT
+   int tg_idx = elem.y * n; // Index of this DFT in threadgroup
+   threadgroup float2* buf = &shared_in[tg_idx];
+
+   perform_fft(fft_idx, &p, m, n, buf);
+
+   read_writer.write();
+ }
+
+ template <int tg_mem_size, typename in_T, typename out_T>
+ [[kernel]] void rader_fft(
+     const device in_T* in [[buffer(0)]],
+     device out_T* out [[buffer(1)]],
+     const device float2* raders_b_q [[buffer(2)]],
+     const device short* raders_g_q [[buffer(3)]],
+     const device short* raders_g_minus_q [[buffer(4)]],
+     constant const int& n,
+     constant const int& batch_size,
+     constant const int& rader_n,
+     uint3 elem [[thread_position_in_grid]],
+     uint3 grid [[threads_per_grid]]) {
+   // Use Rader's algorithm to compute fast FFTs
+   // when a prime factor `p` of `n` is greater than 13 but
+   // `p - 1` is Stockham decomposable into prime factors <= 13.
+   //
+   // E.g. n = 102
+   //        = 2 * 3 * 17
+   //        = 2 * 3 * RADER(16)
+   //        = 2 * 3 * RADER(4 * 4)
+   //
+   // In numpy:
+   //   x_perm = x[g_q]
+   //   y = np.fft.fft(x_perm) * b_q
+   //   z = np.fft.ifft(y) + x[0]
+   //   out = z[g_minus_q]
+   //   out[0] = x[1:].sum()
+   //
+   // Where g_q and g_minus_q are permutations formed by
+   // the multiplicative group modulo N (generated by a
+   // primitive root of N) and b_q is a constant.
+   // See https://en.wikipedia.org/wiki/Rader%27s_FFT_algorithm
+   //
+   // Rader's uses fewer operations than Bluestein's and so
+   // is more accurate. It's also faster in most cases.
+   threadgroup float2 shared_in[tg_mem_size];
+
+   thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
+       in,
+       &shared_in[0],
+       out,
+       n,
+       batch_size,
+       elems_per_thread_,
+       elem,
+       grid,
+       inv_);
+
+   if (read_writer.out_of_bounds()) {
+     return;
+   };
+   read_writer.load();
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // The number of threads we're using for each DFT
+   int m = grid.z;
+
+   int fft_idx = elem.z;
+   int tg_idx = elem.y * n;
+   threadgroup float2* buf = &shared_in[tg_idx];
+
+   // rader_m = n / rader_n;
+   int rader_m = rader_m_;
+
+   // We have to load two x_0s for each thread since sometimes
+   // elems_per_thread_ crosses a boundary.
+   // E.g. with n = 34, rader_n = 17, elems_per_thread_ = 4
+   // 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 7 7 7 7 8 8
+   // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+   short x_0_index =
+       metal::min(fft_idx * elems_per_thread_ / (rader_n - 1), rader_m - 1);
+   float2 x_0[2] = {buf[x_0_index], buf[x_0_index + 1]};
+
+   // Do the Rader permutation in shared memory
+   float2 temp[MAX_RADIX];
+   int max_index = n - rader_m - 1;
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
+     short g_q = raders_g_q[index / rader_m];
+     temp[e] = buf[rader_m + (g_q - 1) * rader_m + index % rader_m];
+   }
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
+     buf[index + rader_m] = temp[e];
+   }
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // Rader FFT on x[rader_m:]
+   int p = 1;
+   perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);
+
+   // x_1 + ... + x_n is computed for us in the first FFT step so
+   // we save it in the first rader_m indices of the array for later.
+   int x_sum_index = metal::min(fft_idx, rader_m - 1);
+   buf[x_sum_index] = buf[rader_m + x_sum_index * (rader_n - 1)];
+
+   float2 inv = {1.0f, -1.0f};
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
+     short interleaved_index =
+         index / rader_m + (index % rader_m) * (rader_n - 1);
+     temp[e] = complex_mul(
+         buf[rader_m + interleaved_index],
+         raders_b_q[interleaved_index % (rader_n - 1)]);
+   }
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
+     buf[rader_m + index] = temp[e] * inv;
+   }
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   // Rader IFFT on x[rader_m:]
+   p = 1;
+   perform_fft</*rader=*/true>(fft_idx, &p, m, n - rader_m, buf + rader_m);
+
+   float2 rader_inv_factor = {1.0f / (rader_n - 1), -1.0f / (rader_n - 1)};
+
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, n - rader_m - 1);
+     short diff_index = index / (rader_n - 1) - x_0_index;
+     temp[e] = buf[rader_m + index] * rader_inv_factor + x_0[diff_index];
+   }
+
+   // Use the sum of elements that was computed in the first FFT
+   float2 x_sum = buf[x_0_index] + x_0[0];
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   for (int e = 0; e < elems_per_thread_; e++) {
+     short index = metal::min(fft_idx * elems_per_thread_ + e, max_index);
+     short g_q_index = index % (rader_n - 1);
+     short g_q = raders_g_minus_q[g_q_index];
+     short out_index = index - g_q_index + g_q + (index / (rader_n - 1));
+     buf[out_index] = temp[e];
+   }
+
+   buf[x_0_index * rader_n] = x_sum;
+
+   threadgroup_barrier(mem_flags::mem_threadgroup);
+
+   p = rader_n;
+   perform_fft(fft_idx, &p, m, n, buf);
+
+   read_writer.write();
+ }
+
372
+
373
+ template <int tg_mem_size, typename in_T, typename out_T>
374
+ [[kernel]] void bluestein_fft(
375
+ const device in_T* in [[buffer(0)]],
376
+ device out_T* out [[buffer(1)]],
377
+ const device float2* w_q [[buffer(2)]],
378
+ const device float2* w_k [[buffer(3)]],
379
+ constant const int& length,
380
+ constant const int& n,
381
+ constant const int& batch_size,
382
+ uint3 elem [[thread_position_in_grid]],
383
+ uint3 grid [[threads_per_grid]]) {
384
+ // Computes arbitrary length FFTs with Bluestein's algorithm
385
+ //
386
+ // In numpy:
387
+ // bluestein_n = next_power_of_2(2*n - 1)
388
+ // out = w_k * np.fft.ifft(np.fft.fft(w_k * in, bluestein_n) * w_q)
389
+ //
390
+ // Where w_k and w_q are precomputed on CPU in high precision as:
391
+ // w_k = np.exp(-1j * np.pi / n * (np.arange(-n + 1, n) ** 2))
392
+ // w_q = np.fft.fft(1/w_k[-n:])
393
+ threadgroup float2 shared_in[tg_mem_size];
394
+
395
+ thread ReadWriter<in_T, out_T> read_writer = ReadWriter<in_T, out_T>(
396
+ in,
397
+ &shared_in[0],
398
+ out,
399
+ n,
400
+ batch_size,
401
+ elems_per_thread_,
402
+ elem,
403
+ grid,
404
+ inv_);
405
+
406
+ if (read_writer.out_of_bounds()) {
407
+ return;
408
+ };
409
+ read_writer.load_padded(length, w_k);
410
+
411
+ threadgroup_barrier(mem_flags::mem_threadgroup);
412
+
413
+ int p = 1;
414
+ int fft_idx = elem.z; // Thread index in DFT
415
+ int m = grid.z; // Threads per DFT
416
+ int tg_idx = elem.y * n; // Index of this DFT in threadgroup
417
+ threadgroup float2* buf = &shared_in[tg_idx];
418
+
419
+ // fft
420
+ perform_fft(fft_idx, &p, m, n, buf);
421
+
422
+ float2 inv = float2(1.0f, -1.0f);
423
+ for (int t = 0; t < elems_per_thread_; t++) {
424
+ int index = fft_idx + t * m;
425
+ buf[index] = complex_mul(buf[index], w_q[index]) * inv;
426
+ }
427
+
428
+ threadgroup_barrier(mem_flags::mem_threadgroup);
429
+
430
+ // ifft
431
+ p = 1;
432
+ perform_fft(fft_idx, &p, m, n, buf);
433
+
434
+ read_writer.write_padded(length, w_k);
435
+ }
436
+
437
+ template <
438
+ int tg_mem_size,
439
+ typename in_T,
440
+ typename out_T,
441
+ int step,
442
+ bool real = false>
443
+ [[kernel]] void four_step_fft(
444
+ const device in_T* in [[buffer(0)]],
445
+ device out_T* out [[buffer(1)]],
446
+ constant const int& n1,
447
+ constant const int& n2,
448
+ constant const int& batch_size,
449
+ uint3 elem [[thread_position_in_grid]],
450
+ uint3 grid [[threads_per_grid]]) {
451
+ // Fast four step FFT implementation for powers of 2.
452
+ int overall_n = n1 * n2;
453
+ int n = step == 0 ? n1 : n2;
454
+ int stride = step == 0 ? n2 : n1;
455
+
456
+ // The number of the threads we're using for each DFT
457
+ int m = grid.z;
458
+ int fft_idx = elem.z;
459
+
460
+ threadgroup float2 shared_in[tg_mem_size];
461
+ threadgroup float2* buf = &shared_in[elem.y * n];
462
+
463
+ using read_writer_t = ReadWriter<in_T, out_T, step, real>;
464
+ read_writer_t read_writer = read_writer_t(
465
+ in,
466
+ &shared_in[0],
467
+ out,
468
+ n,
469
+ batch_size,
470
+ elems_per_thread_,
471
+ elem,
472
+ grid,
473
+ inv_);
474
+
475
+ if (read_writer.out_of_bounds()) {
476
+ return;
477
+ };
478
+ read_writer.load_strided(stride, overall_n);
479
+
480
+ threadgroup_barrier(mem_flags::mem_threadgroup);
481
+
482
+ int p = 1;
483
+ perform_fft(fft_idx, &p, m, n, buf);
484
+
485
+ read_writer.write_strided(stride, overall_n);
486
+ }
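
Note: the numpy recipes quoted in the kernel comments above can be written out as a small, runnable reference. The sketch below is ours, not code shipped in this wheel: `_primitive_root`, `rader_dft`, and `bluestein_dft` are hypothetical helper names, and the kernels additionally handle batching, padding, and the Stockham sub-FFTs. Both functions agree with `np.fft.fft` on small sizes.

import numpy as np

def _primitive_root(p):
    # Smallest generator of the multiplicative group mod an odd prime p.
    for g in range(2, p):
        if len({pow(g, k, p) for k in range(p - 1)}) == p - 1:
            return g

def rader_dft(x):
    # DFT of odd prime length p via Rader's algorithm:
    # X[g^-q mod p] = x[0] + cyclic_conv(x[g^q mod p], w^(g^-q mod p)).
    x = np.asarray(x, dtype=complex)
    p = len(x)
    g = _primitive_root(p)
    g_q = np.array([pow(g, q, p) for q in range(p - 1)])
    g_minus_q = np.array([pow(g, p - 1 - q, p) for q in range(p - 1)])
    b_q = np.fft.fft(np.exp(-2j * np.pi * g_minus_q / p))  # precomputed constant
    conv = np.fft.ifft(np.fft.fft(x[g_q]) * b_q)
    out = np.empty(p, dtype=complex)
    out[0] = x.sum()
    out[g_minus_q] = x[0] + conv
    return out

def bluestein_dft(x):
    # out = w_k * ifft(fft(w_k * x, N) * w_q), N = next_power_of_2(2n - 1):
    # the chirp w_k turns the DFT into a circular convolution of length N.
    x = np.asarray(x, dtype=complex)
    n = len(x)
    N = 1 << (2 * n - 2).bit_length()
    k = np.arange(n)
    w_k = np.exp(-1j * np.pi * k * k / n)  # chirp at non-negative indices
    a = np.zeros(N, dtype=complex)
    a[:n] = w_k * x
    b = np.zeros(N, dtype=complex)
    b[:n] = w_k.conj()
    b[N - n + 1:] = w_k.conj()[1:][::-1]   # chirp at negative indices, wrapped
    w_q = np.fft.fft(b)                    # precomputed constant
    return w_k * np.fft.ifft(np.fft.fft(a) * w_q)[:n]

# assert np.allclose(rader_dft(np.arange(17.0)), np.fft.fft(np.arange(17.0)))
# assert np.allclose(bluestein_dft(np.arange(12.0)), np.fft.fft(np.arange(12.0)))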
mlx/include/mlx/backend/metal/kernels/fp4.h
@@ -0,0 +1,59 @@
+ #pragma once
+
+ constexpr constant static float FP4_LUT[16] = {
+     +0.0f,
+     +0.5f,
+     +1.0f,
+     +1.5f,
+     +2.0f,
+     +3.0f,
+     +4.0f,
+     +6.0f,
+     -0.0f,
+     -0.5f,
+     -1.0f,
+     -1.5f,
+     -2.0f,
+     -3.0f,
+     -4.0f,
+     -6.0f};
+
+ struct fp4_e2m1 {
+   fp4_e2m1(float x) {
+     if (metal::isnan(x)) {
+       bits = 0x7;
+       return;
+     }
+
+     const uint8_t sign_bit = (metal::signbit(x)) ? 0x8 : 0x0;
+     x = metal::abs(x);
+
+     if (x > 5.0f) {
+       bits = 0x7;
+     } else if (x >= 3.5f) {
+       bits = 0x6;
+     } else if (x > 2.5f) {
+       bits = 0x5;
+     } else if (x >= 1.75f) {
+       bits = 0x4;
+     } else if (x > 1.25f) {
+       bits = 0x3;
+     } else if (x >= 0.75f) {
+       bits = 0x2;
+     } else if (x > 0.25f) {
+       bits = 0x1;
+     } else {
+       bits = 0x0;
+     }
+     bits |= sign_bit;
+   }
+
+   operator float() {
+     half converted = as_type<half>(ushort((bits & 7) << 9));
+     converted *= 16384.0;
+     converted = bits & 8 ? -converted : converted;
+     return converted;
+   }
+
+   uint8_t bits;
+ };
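
A note on the rounding above: the alternating `>` / `>=` comparisons implement round-to-nearest with ties going to the code whose mantissa bit is even, saturating at ±6, and the `float` conversion reinterprets the three magnitude bits as a (scaled) half. A hedged host-side Python sketch of the same mapping; the function names are ours, not the wheel's API:

import numpy as np

# The 16 e2m1 values, indexed by the 4-bit code (sign in bit 3); mirrors FP4_LUT.
FP4_E2M1_LUT = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
                         -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0])

def fp4_e2m1_encode(x):
    # Mirrors the constructor's threshold chain: round to nearest, ties to
    # even mantissa, saturate at +/-6; NaN -> 0x7 (e2m1 has no NaN encoding).
    if np.isnan(x):
        return 0x7
    sign = 0x8 if np.signbit(x) else 0x0
    x = abs(x)
    if x > 5.0:
        mag = 0x7  # 6.0
    elif x >= 3.5:
        mag = 0x6  # 4.0
    elif x > 2.5:
        mag = 0x5  # 3.0
    elif x >= 1.75:
        mag = 0x4  # 2.0
    elif x > 1.25:
        mag = 0x3  # 1.5
    elif x >= 0.75:
        mag = 0x2  # 1.0
    elif x > 0.25:
        mag = 0x1  # 0.5
    else:
        mag = 0x0  # 0.0
    return sign | mag

def fp4_e2m1_decode(bits):
    return float(FP4_E2M1_LUT[bits & 0xF])

# Round trip: fp4_e2m1_decode(fp4_e2m1_encode(v)) == v for every v in FP4_E2M1_LUT.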
mlx/include/mlx/backend/metal/kernels/fp8.h
@@ -0,0 +1,82 @@
+ #pragma once
+
+ struct fp8_e4m3 {
+   template <typename T>
+   fp8_e4m3(T f) {
+     // From PyTorch
+     // https://github.com/pytorch/pytorch/blob/e3643e1e0e923f0fc063dfab6f45c956d568919d/c10/util/Float8_e4m3fn.h#L148
+     uint32_t fp8_max = 543 << 21;
+     uint32_t denorm_mask = 141 << 23;
+     uint32_t f_bits = as_type<uint32_t>(static_cast<float>(f));
+     uint32_t sign = f_bits & 0x80000000;
+     f_bits ^= sign;
+     if (f_bits >= fp8_max) {
+       // Default behavior saturates to min/max
+       bits = 0x7E;
+     } else {
+       if (f_bits < (121 << 23)) {
+         f_bits = as_type<uint32_t>(
+             as_type<float>(f_bits) + as_type<float>(denorm_mask));
+         bits = static_cast<uint8_t>(f_bits - denorm_mask);
+       } else {
+         // resulting mantissa is odd
+         uint8_t mant_odd = (f_bits >> 20) & 1;
+         f_bits += ((uint32_t)(7 - 127) << 23) + 0x7FFFF;
+         f_bits += mant_odd;
+         bits = static_cast<uint8_t>(f_bits >> 20);
+       }
+     }
+     bits |= static_cast<uint8_t>(sign >> 24);
+   }
+
+   operator float() {
+     // From PyTorch:
+     // https://github.com/pytorch/pytorch/blob/e3643e1e0e923f0fc063dfab6f45c956d568919d/c10/util/Float8_e4m3fn.h#L46
+     uint32_t w = static_cast<uint32_t>(bits) << 24;
+     uint32_t sign = w & 0x80000000;
+     uint32_t nonsign = w & 0x7FFFFFFF;
+
+     uint32_t renorm_shift = metal::clz(nonsign);
+     renorm_shift = renorm_shift > 4 ? renorm_shift - 4 : 0;
+
+     int32_t inf_nan_mask =
+         (static_cast<int32_t>(nonsign + 0x01000000) >> 8) & 0x7F800000;
+     int32_t zero_mask = static_cast<int32_t>(nonsign - 1) >> 31;
+     uint32_t result = sign |
+         ((((nonsign << renorm_shift >> 4) + ((0x78 - renorm_shift) << 23)) |
+           inf_nan_mask) &
+          ~zero_mask);
+     return as_type<float>(result);
+   }
+
+   uint8_t bits;
+ };
+
+ struct fp8_e8m0 {
+   fp8_e8m0(float x) {
+     if (!metal::isfinite(x)) {
+       bits = 0xFF;
+       return;
+     }
+     if (x < 0.0f) {
+       bits = 0x00;
+       return;
+     }
+     float le = metal::log2(x);
+     int n = int(metal::round(le));
+
+     n = n < -127 ? -127 : n;
+     n = n > 127 ? 127 : n;
+     bits = static_cast<uint8_t>(n + 127);
+   }
+
+   operator bfloat16_t() {
+     uint16_t out = (bits == 0 ? 0x40 : (static_cast<uint16_t>(bits) << 7));
+     return as_type<bfloat16_t>(out);
+   }
+   operator float() {
+     return static_cast<float>(this->operator bfloat16_t());
+   }
+
+   uint8_t bits;
+ };
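
Field-wise, `fp8_e4m3` above is the "fn" variant (bias 7, no infinities, all-ones mantissa at the top exponent reserved for NaN, saturating on overflow), while `fp8_e8m0` is an exponent-only scale format, which is why its `bfloat16_t` conversion is just a shift into the exponent field, with a subnormal special case for `bits == 0` (i.e. 2^-127). A hedged Python sketch of these semantics; helper names are ours, not the wheel's API:

import math

def fp8_e4m3_decode(bits):
    # Field-wise inverse of the bit-twiddling float conversion above (e4m3fn).
    sign = -1.0 if bits & 0x80 else 1.0
    exp = (bits >> 3) & 0xF
    mant = bits & 0x7
    if exp == 0xF and mant == 0x7:
        return math.nan                           # only NaN, no infinities
    if exp == 0:
        return sign * (mant / 8.0) * 2.0 ** -6    # subnormals
    return sign * (1.0 + mant / 8.0) * 2.0 ** (exp - 7)

def fp8_e8m0_encode(x):
    # Mirrors the fp8_e8m0 constructor: nearest integer exponent in [-127, 127].
    if not math.isfinite(x):
        return 0xFF
    if x <= 0.0:
        return 0x00    # clamps to the smallest scale, 2**-127
    n = round(math.log2(x))
    return max(-127, min(127, n)) + 127

def fp8_e8m0_decode(bits):
    if bits == 0xFF:
        return math.inf    # 0xFF << 7 lands on the bfloat16 +inf bit pattern
    return 2.0 ** (bits - 127)   # bits == 0 is the bfloat16 subnormal 2**-127

# E.g. fp8_e4m3_decode(0x7E) == 448.0, the saturation value used above.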