RubyGems - mlx - Versions diffs - 0.30.7.3 → 0.30.7.6 - Mend

mlx 0.30.7.3 → 0.30.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (590) hide show

data/submodules/mlx/mlx/backend/cuda/quantized/qqmm.cpp ADDED Viewed

@@ -0,0 +1,193 @@
+// Copyright © 2025 Apple Inc.
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/quantized/qmv.h"
+#include "mlx/backend/cuda/quantized/qqmm_impl.h"
+#include "mlx/backend/cuda/quantized/qqmm_utils.h"
+#include "mlx/backend/cuda/quantized/quantized.h"
+#include "mlx/backend/cuda/quantized/quantized_utils.h"
+#include "mlx/primitives.h"
+#include <nvtx3/nvtx3.hpp>
+namespace mlx::core {
+namespace {
+// Quantizes `input` and returns the pair (packed values, scales).
+//
+// `input` is first passed through ensure_contiguous. Two device buffers are
+// allocated (a uint32 array for the packed values and a uint8 array for the
+// scales), both are registered with `encoder` as temporaries, and fp_quantize
+// is called to fill them. `mode` is accepted but not read in this body.
+// `global_scale` is forwarded unchanged to fp_quantize.
+std::tuple<array, array> quantize_input(
+    const array& input,
+    cu::CommandEncoder& encoder,
+    const Stream& s,
+    QuantizationMode mode,
+    int bits,
+    int group_size,
+    std::optional<array> global_scale = std::nullopt) {
+  const array x = ensure_contiguous(input, encoder, s);
+  // Compute output shapes
+  auto xq_shape = x.shape();
+  // Last dim is expressed in 32-bit words: x_q is created as uint32 below.
+  xq_shape.back() = x.shape(-1) * bits / 32;
+  // Last-dim length divided by group_size (integer division).
+  const int64_t scales_inner = x.shape(-1) / group_size;
+  auto [pad_outer, pad_inner] =
+      get_padded_scale_dims(x.shape(-2), scales_inner);
+  auto sshape = x.shape();
+  sshape[x.ndim() - 2] = pad_outer;
+  sshape[x.ndim() - 1] = pad_inner;
+  // NOTE(review): this replaces the pad_inner value stored on the previous
+  // line, so the scales shape ends in the unpadded scales_inner while the
+  // allocation below is still sized with pad_inner. Confirm which of the two
+  // assignments is intended.
+  sshape.back() = scales_inner;
+  // Allocate outputs
+  const int64_t xq_bytes = x.size() * bits / 8;
+  // Number of (rows x cols) matrices formed by the last two dims of x.
+  const int64_t batch = x.size() / (x.shape(-2) * x.shape(-1));
+  // Sized from the padded dims; the scales array is uint8, one byte each.
+  const int64_t scales_bytes = batch * (pad_outer * pad_inner);
+  array x_q(cu::malloc_async(xq_bytes, encoder), std::move(xq_shape), uint32);
+  array scales_x(
+      cu::malloc_async(scales_bytes, encoder), std::move(sshape), uint8);
+  encoder.add_temporary(x_q);
+  encoder.add_temporary(scales_x);
+  // global_scale is not nullopt only for NVFP4
+  fp_quantize(x, x_q, scales_x, group_size, bits, global_scale, encoder, s);
+  return {std::move(x_q), std::move(scales_x)};
+}
+// Builds the alpha/beta pair handed to qqmm_impl when global scales are used.
+//
+// Allocates two float32 arrays with shape {} (sizeof(float) bytes each), has
+// compute_qqmm_pointers fill them from the two global scales, registers both
+// with `encoder` as temporaries and returns them as {alpha, beta}.
+GemmScalars create_nvfp4_scalars(
+    const array& global_scale_x,
+    const array& global_scale_w,
+    cu::CommandEncoder& encoder) {
+  // NVFP4 requires alpha/beta as device pointers
+  // alpha = amax_x * amax_w / (448 * 6)^2
+  // beta = 0
+  array alpha(cu::malloc_async(sizeof(float), encoder), {}, float32);
+  array beta(cu::malloc_async(sizeof(float), encoder), {}, float32);
+  compute_qqmm_pointers(alpha, beta, global_scale_x, global_scale_w, encoder);
+  encoder.add_temporary(alpha);
+  encoder.add_temporary(beta);
+  return {alpha, beta};
+}
+} // namespace
+// GPU evaluation of QQMatmul.
+//
+// Inputs, in order: x, w, then scales_w when w is already quantized, then
+// optionally (global_scale_x, global_scale_w) for NVFP4.
+//
+// Two paths:
+//  * w is quantized and x.shape(-2) == 1: x is quantized-dequantized into
+//    xhat and fp_qmv is called. This path returns before the
+//    compute-capability check.
+//  * otherwise: both operands are quantized (or the pre-quantized w is used
+//    as is), the scales are padded and swizzled, and qqmm_impl is called.
+//    Throws std::runtime_error when the device compute capability is below
+//    10.0.
+void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("QQMatmul::eval_gpu");
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+  auto& device = encoder.device();
+  // w is treated as already quantized when its dtype is uint32.
+  bool w_quantized = (inputs[1].dtype() == uint32);
+  // Input count without global scales: (x, w, scales_w) or (x, w).
+  int base_size = w_quantized ? 3 : 2;
+  assert(
+      inputs.size() == base_size ||
+      (mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2));
+  if (w_quantized && inputs[0].shape(-2) == 1) {
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    // For nvfp4, get global scale for x from inputs if present
+    bool has_global_scale =
+        mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
+    std::optional<array> global_scale = std::nullopt;
+    if (has_global_scale) {
+      // Second-to-last input is the global scale for x; the last one (for w)
+      // is not read on this path.
+      global_scale = inputs[inputs.size() - 2];
+    }
+    bool donate_x = inputs[0].is_donatable();
+    array x = ensure_row_contiguous(inputs[0], encoder, s);
+    // If x is a copy it should be donatable
+    donate_x |= x.is_donatable();
+    // Reuse x as the destination when it can be donated, otherwise allocate
+    // a buffer of the same shape and dtype and track it as a temporary.
+    auto xhat = donate_x
+        ? x
+        : array(cu::malloc_async(x.nbytes(), encoder), x.shape(), x.dtype());
+    if (!donate_x) {
+      encoder.add_temporary(xhat);
+    }
+    fp_quantize_dequantize(
+        x, xhat, group_size_, bits_, global_scale, encoder, s);
+    // Make sure the last two dims of w and s are contiguous
+    array w = ensure_row_contiguous_matrix(inputs[1], encoder, s);
+    array scales = ensure_row_contiguous_matrix(inputs[2], encoder, s);
+    bool non_batched = w.ndim() == 2;
+    int K = x.shape(-1);
+    // With a 2-D w every leading dim of x is folded into M.
+    int M = non_batched ? x.size() / K : x.shape(-2);
+    int N = out.shape(-1);
+    fp_qmv(w, scales, xhat, out, bits_, group_size_, M, N, K, encoder);
+    return;
+  }
+  // major * 100 + minor * 10, so 1000 corresponds to compute capability 10.0.
+  auto cc = device.compute_capability_major() * 100 +
+      device.compute_capability_minor() * 10;
+  if (cc < 1000) {
+    throw std::runtime_error(
+        "[QQMatmul::eval_gpu] QQMM is only supported on GPUs with compute capability 10.0 or higher.");
+  }
+  // - 2 inputs: x, w (non-quantized w)
+  // - 3 inputs: x, w, scales_w (quantized w)
+  // For nvfp4, global scales are optional but must be both present or both
+  // absent. If present, they add 2 more inputs (global_scale_x,
+  // global_scale_w).
+  bool has_global_scales =
+      mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
+  // For nvfp4, get global scales from inputs if present
+  std::optional<array> global_scale_x = std::nullopt;
+  std::optional<array> global_scale_w = std::nullopt;
+  if (has_global_scales) {
+    global_scale_x = inputs[inputs.size() - 2];
+    global_scale_w = inputs[inputs.size() - 1];
+  }
+  // Quantize inputs (or use pre-quantized)
+  auto [x_q, scale_x_pre] = quantize_input(
+      inputs[0], encoder, s, mode_, bits_, group_size_, global_scale_x);
+  // When w is pre-quantized, inputs[1] / inputs[2] are only made contiguous.
+  auto [w_q, scale_w_pre] = !w_quantized
+      ? quantize_input(
+            inputs[1], encoder, s, mode_, bits_, group_size_, global_scale_w)
+      : std::make_tuple(
+            ensure_contiguous(inputs[1], encoder, s),
+            ensure_contiguous(inputs[2], encoder, s));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  int M = x_q.shape(-2);
+  int N = w_q.shape(-2); // transposed
+  // Last dim of x_q counts 32-bit words; each holds 32 / bits_ values.
+  int K = x_q.shape(-1) * (32 / bits_);
+  bool x_transposed = false;
+  bool w_transposed = true; // always transposed
+  int64_t lda = K;
+  int64_t ldb = K;
+  // Repack scales to tiled layout for tensor cores
+  array scale_x = pad_and_swizzle_scales(scale_x_pre, encoder, s);
+  array scale_w = pad_and_swizzle_scales(scale_w_pre, encoder, s);
+  // Left default-constructed unless global scales were supplied.
+  GemmScalars scalars;
+  if (has_global_scales) {
+    scalars = create_nvfp4_scalars(*global_scale_x, *global_scale_w, encoder);
+  }
+  qqmm_impl(
+      encoder,
+      M,
+      N,
+      K,
+      x_transposed,
+      lda,
+      w_transposed,
+      ldb,
+      out,
+      x_q,
+      w_q,
+      scale_x,
+      scale_w,
+      mode_,
+      scalars);
+}
+} // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.cpp RENAMED Viewed

@@ -19,15 +19,10 @@ void qqmm_impl(
     const array& b,
     const array& a_scale,
     const array& b_scale,
-    Dtype out_dtype,
     QuantizationMode mode,
-    float alpha) {
-  // Invoke CublasQQMM
+    const GemmScalars& scalars) {
   std::string qmode = quantization_mode_to_string(mode);
-  // Currently only supports non-batched QQMM operations
-  // that covers all use cases for training, we will just collapse (batch,
-  // seq_len) into (tokens)
   CublasQQMM qqmm(
       encoder.device(),
       a_transposed,
@@ -41,10 +36,22 @@ void qqmm_impl(
       1, // batch_count
       0, // a_batch_stride
       0, // b_batch_stride
-      out_dtype,
+      out.dtype(),
       qmode);
-  qqmm.run(encoder, out, a, b, a_scale, b_scale, alpha);
+  if (scalars.has_values()) {
+    qqmm.run(
+        encoder,
+        out,
+        a,
+        b,
+        a_scale,
+        b_scale,
+        *scalars.alpha_device,
+        *scalars.beta_device);
+  } else {
+    qqmm.run(encoder, out, a, b, a_scale, b_scale);
+  }
 }
 } // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.h RENAMED Viewed

@@ -1,10 +1,22 @@
-// Copyright © 2026 Apple Inc.
+// Copyright © 2025 Apple Inc.
 #pragma once
 #include "mlx/backend/cuda/device.h"
 #include "mlx/primitives.h"
+#include <optional>
 namespace mlx::core {
+// Optional alpha/beta arrays passed to qqmm_impl.
+// Both members are empty when default-constructed.
+struct GemmScalars {
+  std::optional<array> alpha_device;
+  std::optional<array> beta_device;
+  // True when alpha_device is set. beta_device is not checked here, so
+  // callers are expected to set both together.
+  bool has_values() const {
+    return alpha_device.has_value();
+  }
+};
 void qqmm_impl(
     cu::CommandEncoder& encoder,
     int M,
@@ -19,8 +31,7 @@ void qqmm_impl(
     const array& b,
     const array& a_scale,
     const array& b_scale,
-    Dtype out_dtype,
     QuantizationMode mode,
-    float alpha = 1.0f);
+    const GemmScalars& scalars = {});
 } // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_utils.cu RENAMED Viewed

@@ -70,6 +70,21 @@ inline std::tuple<dim3, dim3> get_swizzle_launch_args(
 namespace cu {
+// Constants used to normalise alpha below.
+// NOTE(review): presumably the largest finite magnitudes of the FP8 E4M3 and
+// FP4 E2M1 formats, going by the names - confirm.
+constexpr float F8E4M3_MAX = 448.0f;
+constexpr float F4E2M1_MAX = 6.0f;
+// Writes *alpha_out = (*tensor_amax_x) * (*tensor_amax_w) / (448 * 6)^2 and
+// *beta_out = 0. No thread or block index is used, so every launched thread
+// performs the same two stores.
+__global__ void compute_qqmm_pointers(
+    float* alpha_out,
+    float* beta_out,
+    const float* tensor_amax_x,
+    const float* tensor_amax_w) {
+  // Compute alpha = tensor_amax_x * tensor_amax_w / (448 * 6)^2
+  // The reciprocal is a compile-time constant; only two multiplies run here.
+  constexpr float inv_scale_sq =
+      1.0f / (F8E4M3_MAX * F4E2M1_MAX * F8E4M3_MAX * F4E2M1_MAX);
+  *alpha_out = (*tensor_amax_x) * (*tensor_amax_w) * inv_scale_sq;
+  *beta_out = 0.0f;
+}
 __global__ void swizzle_scales(
     const uint8_t* scales_linear,
     uint8_t* scales_swizzled,
@@ -224,4 +239,25 @@ void swizzle_scales(
       output_cols);
 }
+// Host-side wrapper: enqueues the cu::compute_qqmm_pointers kernel on `enc`.
+//
+// Registers tensor_amax_x / tensor_amax_w as inputs and alpha_out / beta_out
+// as outputs of the encoder, then adds a kernel node with a grid of one block
+// and a block of one thread.
+void compute_qqmm_pointers(
+    array& alpha_out,
+    array& beta_out,
+    const array& tensor_amax_x,
+    const array& tensor_amax_w,
+    cu::CommandEncoder& enc) {
+  enc.set_input_array(tensor_amax_x);
+  enc.set_input_array(tensor_amax_w);
+  enc.set_output_array(alpha_out);
+  enc.set_output_array(beta_out);
+  // Arguments after the kernel: grid, block, then 0 (presumably the dynamic
+  // shared-memory size - confirm against add_kernel_node), followed by the
+  // four kernel parameters in declaration order.
+  enc.add_kernel_node(
+      cu::compute_qqmm_pointers,
+      dim3(1),
+      dim3(1),
+      0,
+      gpu_ptr<void>(alpha_out),
+      gpu_ptr<void>(beta_out),
+      gpu_ptr<void>(tensor_amax_x),
+      gpu_ptr<void>(tensor_amax_w));
+}
 } // namespace mlx::core

data/submodules/mlx/mlx/backend/cuda/quantized/qqmm_utils.h ADDED Viewed

@@ -0,0 +1,62 @@
+// Copyright © 2025 Apple Inc.
+#pragma once
+#include "mlx/array.h"
+#include "mlx/backend/cuda/device.h"
+namespace mlx::core {
+// Compute padded dimensions for tiled layout
+// Tiles are 128 rows × 4 columns, must allocate full tiles
+//
+// Returns {padded_rows, padded_cols}: num_rows rounded up to a multiple of
+// 128 and num_cols rounded up to a multiple of 4. For example (1, 1) yields
+// (128, 4) and (128, 4) is returned unchanged. The arithmetic is done in
+// `int`.
+inline std::pair<int, int> get_padded_scale_dims(int num_rows, int num_cols) {
+  constexpr int rows_per_tile = 128;
+  constexpr int cols_per_tile = 4;
+  // Integer ceil-division followed by a multiply rounds up to a full tile.
+  int padded_rows =
+      ((num_rows + rows_per_tile - 1) / rows_per_tile) * rows_per_tile;
+  int padded_cols =
+      ((num_cols + cols_per_tile - 1) / cols_per_tile) * cols_per_tile;
+  return {padded_rows, padded_cols};
+}
+void swizzle_scales(
+    const array& scales,
+    array& scales_tiled,
+    cu::CommandEncoder& enc,
+    const Stream& s);
+// Allocates a tile-padded 2-D array and fills it via swizzle_scales.
+//
+// Only the last two dims of `scale` are used to size the result, whose shape
+// is {pad_outer, pad_inner} with the dtype of `scale`. The result is
+// registered with `encoder` as a temporary before being returned.
+inline array pad_and_swizzle_scales(
+    const array& scale,
+    cu::CommandEncoder& encoder,
+    const Stream& s) {
+  // Compute padded dimensions for full tiles (128 rows × 4 cols)
+  auto [pad_outer, pad_inner] =
+      get_padded_scale_dims(scale.shape(-2), scale.shape(-1));
+  // cuBLAS requirements for scale factor layout:
+  // 1. Dimensions must be padded to full tiles (128 rows × 4 cols)
+  // 2. Out-of-bounds values must be filled with zeros
+  // 3. Starting addresses must be 16-byte aligned
+  // https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
+  // Note: cu::malloc_async already provides 256-byte alignment
+  // NOTE(review): the allocation size is the element count, which assumes
+  // scale.dtype() is one byte wide - confirm against callers.
+  array scale_tiled(
+      cu::malloc_async(pad_outer * pad_inner, encoder),
+      Shape{pad_outer, pad_inner},
+      scale.dtype());
+  swizzle_scales(scale, scale_tiled, encoder, s);
+  encoder.add_temporary(scale_tiled);
+  return scale_tiled;
+}
+// Compute alpha = tensor_amax_x * tensor_amax_w / (448 * 6)^2
+// Allocate beta zero on device as well
+void compute_qqmm_pointers(
+    array& alpha_out,
+    array& beta_out,
+    const array& tensor_amax_x,
+    const array& tensor_amax_w,
+    cu::CommandEncoder& enc);
+} // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized.cpp RENAMED Viewed

@@ -51,7 +51,6 @@ void fast::Quantize::eval_gpu(
   auto& s = stream();
   auto& d = cu::device(s.device);
   auto& enc = d.get_command_encoder(s);
   if (dequantize_) {
     auto wq = ensure_row_contiguous(inputs[0], enc, s);
     auto scales = ensure_row_contiguous(inputs[1], enc, s);
@@ -63,7 +62,12 @@ void fast::Quantize::eval_gpu(
       auto biases = ensure_row_contiguous(inputs[2], enc, s);
       affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
     } else {
-      fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
+      // 0 -- xq, 1 -- scales, 2 -- could be global scale for nvfp4
+      bool use_global_scale =
+          mode_ == QuantizationMode::Nvfp4 && inputs.size() > 2;
+      std::optional<array> global_scale =
+          use_global_scale ? std::make_optional(inputs[2]) : std::nullopt;
+      fp_dequantize(wq, scales, w, group_size_, bits_, global_scale, enc, s);
     }
   } else {
     auto w = ensure_contiguous(inputs[0], enc, s);
@@ -72,12 +76,17 @@ void fast::Quantize::eval_gpu(
     wq.set_data(cu::malloc_async(wq.nbytes(), enc));
     scales.set_data(cu::malloc_async(scales.nbytes(), enc));
     if (mode_ == QuantizationMode::Affine) {
       auto& biases = outputs[2];
       biases.set_data(cu::malloc_async(biases.nbytes(), enc));
       affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
     } else {
-      fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
+      bool use_global_scale =
+          mode_ == QuantizationMode::Nvfp4 && inputs.size() > 1;
+      std::optional<array> global_scale =
+          use_global_scale ? std::make_optional(inputs[1]) : std::nullopt;
+      fp_quantize(w, wq, scales, group_size_, bits_, global_scale, enc, s);
     }
   }
 }

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized.h RENAMED Viewed

@@ -1,5 +1,6 @@
 // Copyright © 2025 Apple Inc.
+#include <optional>
 #include "mlx/backend/cuda/device.h"
 namespace mlx::core {
@@ -30,6 +31,7 @@ void fp_quantize(
     array& scales,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale,
     cu::CommandEncoder& enc,
     const Stream& s);
@@ -39,6 +41,7 @@ void fp_dequantize(
     array& w,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale,
     cu::CommandEncoder& enc,
     const Stream& s);
@@ -47,6 +50,7 @@ void fp_quantize_dequantize(
     array& what,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale,
     cu::CommandEncoder& enc,
     const Stream& s);

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized_utils.cuh RENAMED Viewed

@@ -29,7 +29,7 @@ inline constexpr __device__ short get_bytes_per_pack() {
 }
 template <typename T>
-__device__ __forceinline__ void abs_max_x2(T& out, const T& x1, const T& x2) {
+__device__ __forceinline__ void absmax_x2(T& out, const T& x1, const T& x2) {
   if constexpr (
       (std::is_same<T, __nv_bfloat162>::value) ||
       (std::is_same<T, __half2>::value)) {

data/{mlx → submodules/mlx}/mlx/backend/metal/device.cpp RENAMED Viewed

@@ -247,6 +247,10 @@ void CommandEncoder::set_buffer(
     const MTL::Buffer* buf,
     int idx,
     int64_t offset /* = 0 */) {
+  // Record as both input and output to ensure synchronization between command
+  // buffers
+  all_inputs_.insert((void*)buf);
+  all_outputs_.insert((void*)buf);
   enc_->setBuffer(buf, offset, idx);
 }

data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/conv.metal RENAMED Viewed

@@ -30,7 +30,7 @@ template <typename T, int N>
     out_pixels *= params->oS[i];
   // Set out
-  out += gid.z * filter_size + gid.y * (params->C);
+  out += (size_t)gid.z * filter_size + (size_t)gid.y * (params->C);
   // Coordinates in input
   int is[N] = {0};
@@ -93,7 +93,8 @@ template <typename T, int N>
     out_pixels *= params->oS[i];
   // Set out
-  out += gid.z * filter_size + gid.x * (filter_size / params->C);
+  out +=
+      (size_t)gid.z * filter_size + (size_t)gid.x * (filter_size / params->C);
   // Coordinates in input
   int is[N] = {0};

data/{mlx → submodules/mlx}/mlx/export.cpp RENAMED Viewed

@@ -279,6 +279,8 @@ void extract_state(const T state, std::vector<StateT>& unpacked_state) {
     unpacked_state.push_back(state);
   } else if constexpr (std::is_enum_v<T>) {
     unpacked_state.push_back(static_cast<int>(state));
+  } else if constexpr (std::is_same_v<T, Dtype>) {
+    unpacked_state.push_back(state);
   } else if constexpr (is_iterable<T>) {
     unpacked_state.push_back(state);
   } else if constexpr (is_pair<T> || is_tuple<T>) {
@@ -446,6 +448,7 @@ struct PrimitiveFactory {
       SERIALIZE_PRIMITIVE(ScaledDotProductAttention),
       SERIALIZE_PRIMITIVE(CustomKernel)};
   std::unordered_map<std::string, std::string> name_remap;
+  std::unordered_map<int, Stream> stream_map;
   PrimitiveFactory() {
     for (auto& [n, f] : factory) {
@@ -471,13 +474,25 @@ struct PrimitiveFactory {
     }
   };
-  std::shared_ptr<Primitive> load(Reader& is) {
-    auto stream = deserialize<Stream>(is);
-    if (get_stream(stream.index) != stream) {
-      std::ostringstream msg;
-      msg << "[import_function] Invalid stream encountered " << stream << ".";
-      throw std::invalid_argument(msg.str());
+  // Maps a deserialized stream onto a stream of the current process.
+  //
+  // Results are cached in stream_map keyed by stream.index, so a given index
+  // always resolves to the same stream. On a cache miss the first entry of
+  // get_streams() whose device equals stream.device is used; if there is
+  // none, a new stream is created on that device.
+  Stream resolve_stream(const Stream& stream) {
+    if (auto it = stream_map.find(stream.index); it != stream_map.end()) {
+      return it->second;
+    }
+    // Try to find an existing stream on the same device
+    for (auto& s : get_streams()) {
+      if (s.device == stream.device) {
+        stream_map.emplace(stream.index, s);
+        return s;
+      }
     }
+    // No stream on that device, make a new one
+    Stream s = new_stream(stream.device);
+    stream_map.emplace(stream.index, s);
+    return s;
+  }
+  std::shared_ptr<Primitive> load(Reader& is) {
+    auto stream = resolve_stream(deserialize<Stream>(is));
     auto name = deserialize<std::string>(is);
     if (auto it = factory.find(name); it != factory.end()) {
       return it->second.deserialize(is, stream);