mlx 0.30.7.2 → 0.30.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (605)
  1. checksums.yaml +4 -4
  2. data/ext/mlx/extconf.rb +267 -8
  3. data/ext/mlx/native.cpp +112 -58
  4. data/ext/mlx-onnx/native.cpp +1402 -0
  5. data/ext/mlx-onnx/native.hpp +19 -0
  6. data/lib/mlx/core.rb +342 -117
  7. data/lib/mlx/distributed_utils/common.rb +1 -1
  8. data/lib/mlx/distributed_utils/config.rb +7 -4
  9. data/lib/mlx/distributed_utils/launch.rb +2 -0
  10. data/lib/mlx/dsl/attention.rb +132 -0
  11. data/lib/mlx/dsl/builder.rb +8 -0
  12. data/lib/mlx/dsl/config_schema.rb +133 -0
  13. data/lib/mlx/dsl/generate.rb +193 -0
  14. data/lib/mlx/dsl/kv_cache.rb +96 -0
  15. data/lib/mlx/dsl/masks.rb +32 -0
  16. data/lib/mlx/dsl/positions.rb +35 -0
  17. data/lib/mlx/dsl/run_stack.rb +68 -0
  18. data/lib/mlx/dsl/tensor.rb +126 -0
  19. data/lib/mlx/dsl/transformer_block.rb +113 -0
  20. data/lib/mlx/dsl/weight_map.rb +140 -0
  21. data/lib/mlx/dsl.rb +10 -0
  22. data/lib/mlx/nn/base.rb +4 -0
  23. data/lib/mlx/nn/layers/linear.rb +2 -3
  24. data/lib/mlx/onnx.rb +250 -0
  25. data/lib/mlx/version.rb +1 -1
  26. data/lib/mlx-onnx/webgpu_harness.rb +289 -0
  27. data/{mlx → submodules/mlx}/mlx/backend/cuda/cublas_utils.cpp +0 -7
  28. data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/cublas_gemm.cpp +10 -2
  29. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/cublas_qqmm.cpp +97 -46
  30. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/cublas_qqmm.h +25 -13
  31. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/fp_quantize.cu +101 -38
  32. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/no_qqmm_impl.cpp +1 -2
  33. data/submodules/mlx/mlx/backend/cuda/quantized/qqmm.cpp +193 -0
  34. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.cpp +15 -8
  35. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.h +14 -3
  36. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_utils.cu +36 -0
  37. data/submodules/mlx/mlx/backend/cuda/quantized/qqmm_utils.h +62 -0
  38. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized.cpp +12 -3
  39. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized.h +4 -0
  40. data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized_utils.cuh +1 -1
  41. data/{mlx → submodules/mlx}/mlx/backend/metal/device.cpp +4 -0
  42. data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/conv.metal +3 -2
  43. data/{mlx → submodules/mlx}/mlx/export.cpp +21 -6
  44. data/{mlx → submodules/mlx}/mlx/ops.cpp +144 -13
  45. data/{mlx → submodules/mlx}/mlx/ops.h +12 -2
  46. data/{mlx → submodules/mlx}/mlx/primitives.cpp +22 -5
  47. data/{mlx → submodules/mlx}/mlx/scheduler.cpp +4 -0
  48. data/{mlx → submodules/mlx}/mlx/scheduler.h +3 -0
  49. data/{mlx → submodules/mlx}/mlx/stream.h +5 -0
  50. data/submodules/mlx-onnx/CMakeLists.txt +159 -0
  51. data/submodules/mlx-onnx/LICENSE +21 -0
  52. data/submodules/mlx-onnx/include/mlx/ir.hpp +88 -0
  53. data/submodules/mlx-onnx/src/api.cpp +81 -0
  54. data/submodules/mlx-onnx/src/compat.cpp +111 -0
  55. data/submodules/mlx-onnx/src/detail.hpp +69 -0
  56. data/submodules/mlx-onnx/src/export.cpp +653 -0
  57. data/submodules/mlx-onnx/src/io.cpp +61 -0
  58. data/submodules/mlx-onnx/src/json.hpp +25 -0
  59. data/submodules/mlx-onnx/src/lowering.cpp +6346 -0
  60. data/submodules/mlx-onnx/src/mappings.cpp +201 -0
  61. data/submodules/mlx-onnx/src/mappings.hpp +16 -0
  62. data/submodules/mlx-onnx/src/onnx.cpp +1029 -0
  63. data/submodules/mlx-onnx/src/shared.cpp +206 -0
  64. metadata +665 -567
  65. data/mlx/mlx/backend/cuda/quantized/qqmm.cpp +0 -158
  66. data/mlx/mlx/backend/cuda/quantized/qqmm_utils.h +0 -30
  67. /data/{mlx → submodules/mlx}/CMakeLists.txt +0 -0
  68. /data/{mlx → submodules/mlx}/cmake/FindCUDNN.cmake +0 -0
  69. /data/{mlx → submodules/mlx}/cmake/FindNCCL.cmake +0 -0
  70. /data/{mlx → submodules/mlx}/cmake/Findnvpl.cmake +0 -0
  71. /data/{mlx → submodules/mlx}/cmake/extension.cmake +0 -0
  72. /data/{mlx → submodules/mlx}/mlx/3rdparty/.clang-format +0 -0
  73. /data/{mlx → submodules/mlx}/mlx/3rdparty/pocketfft.h +0 -0
  74. /data/{mlx → submodules/mlx}/mlx/CMakeLists.txt +0 -0
  75. /data/{mlx → submodules/mlx}/mlx/allocator.h +0 -0
  76. /data/{mlx → submodules/mlx}/mlx/api.h +0 -0
  77. /data/{mlx → submodules/mlx}/mlx/array.cpp +0 -0
  78. /data/{mlx → submodules/mlx}/mlx/array.h +0 -0
  79. /data/{mlx → submodules/mlx}/mlx/backend/common/CMakeLists.txt +0 -0
  80. /data/{mlx → submodules/mlx}/mlx/backend/common/binary.h +0 -0
  81. /data/{mlx → submodules/mlx}/mlx/backend/common/broadcasting.cpp +0 -0
  82. /data/{mlx → submodules/mlx}/mlx/backend/common/broadcasting.h +0 -0
  83. /data/{mlx → submodules/mlx}/mlx/backend/common/buffer_cache.h +0 -0
  84. /data/{mlx → submodules/mlx}/mlx/backend/common/common.cpp +0 -0
  85. /data/{mlx → submodules/mlx}/mlx/backend/common/compiled.cpp +0 -0
  86. /data/{mlx → submodules/mlx}/mlx/backend/common/compiled.h +0 -0
  87. /data/{mlx → submodules/mlx}/mlx/backend/common/copy.h +0 -0
  88. /data/{mlx → submodules/mlx}/mlx/backend/common/hadamard.h +0 -0
  89. /data/{mlx → submodules/mlx}/mlx/backend/common/load.cpp +0 -0
  90. /data/{mlx → submodules/mlx}/mlx/backend/common/matmul.h +0 -0
  91. /data/{mlx → submodules/mlx}/mlx/backend/common/reduce.cpp +0 -0
  92. /data/{mlx → submodules/mlx}/mlx/backend/common/reduce.h +0 -0
  93. /data/{mlx → submodules/mlx}/mlx/backend/common/slicing.cpp +0 -0
  94. /data/{mlx → submodules/mlx}/mlx/backend/common/slicing.h +0 -0
  95. /data/{mlx → submodules/mlx}/mlx/backend/common/ternary.h +0 -0
  96. /data/{mlx → submodules/mlx}/mlx/backend/common/unary.h +0 -0
  97. /data/{mlx → submodules/mlx}/mlx/backend/common/utils.cpp +0 -0
  98. /data/{mlx → submodules/mlx}/mlx/backend/common/utils.h +0 -0
  99. /data/{mlx → submodules/mlx}/mlx/backend/cpu/CMakeLists.txt +0 -0
  100. /data/{mlx → submodules/mlx}/mlx/backend/cpu/arange.h +0 -0
  101. /data/{mlx → submodules/mlx}/mlx/backend/cpu/arg_reduce.cpp +0 -0
  102. /data/{mlx → submodules/mlx}/mlx/backend/cpu/binary.cpp +0 -0
  103. /data/{mlx → submodules/mlx}/mlx/backend/cpu/binary.h +0 -0
  104. /data/{mlx → submodules/mlx}/mlx/backend/cpu/binary_ops.h +0 -0
  105. /data/{mlx → submodules/mlx}/mlx/backend/cpu/binary_two.h +0 -0
  106. /data/{mlx → submodules/mlx}/mlx/backend/cpu/cholesky.cpp +0 -0
  107. /data/{mlx → submodules/mlx}/mlx/backend/cpu/compiled.cpp +0 -0
  108. /data/{mlx → submodules/mlx}/mlx/backend/cpu/compiled_preamble.h +0 -0
  109. /data/{mlx → submodules/mlx}/mlx/backend/cpu/conv.cpp +0 -0
  110. /data/{mlx → submodules/mlx}/mlx/backend/cpu/copy.cpp +0 -0
  111. /data/{mlx → submodules/mlx}/mlx/backend/cpu/copy.h +0 -0
  112. /data/{mlx → submodules/mlx}/mlx/backend/cpu/device_info.cpp +0 -0
  113. /data/{mlx → submodules/mlx}/mlx/backend/cpu/device_info.h +0 -0
  114. /data/{mlx → submodules/mlx}/mlx/backend/cpu/distributed.cpp +0 -0
  115. /data/{mlx → submodules/mlx}/mlx/backend/cpu/eig.cpp +0 -0
  116. /data/{mlx → submodules/mlx}/mlx/backend/cpu/eigh.cpp +0 -0
  117. /data/{mlx → submodules/mlx}/mlx/backend/cpu/encoder.cpp +0 -0
  118. /data/{mlx → submodules/mlx}/mlx/backend/cpu/encoder.h +0 -0
  119. /data/{mlx → submodules/mlx}/mlx/backend/cpu/eval.cpp +0 -0
  120. /data/{mlx → submodules/mlx}/mlx/backend/cpu/eval.h +0 -0
  121. /data/{mlx → submodules/mlx}/mlx/backend/cpu/fft.cpp +0 -0
  122. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemm.h +0 -0
  123. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemms/bnns.cpp +0 -0
  124. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemms/cblas.cpp +0 -0
  125. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemms/simd_bf16.cpp +0 -0
  126. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemms/simd_fp16.cpp +0 -0
  127. /data/{mlx → submodules/mlx}/mlx/backend/cpu/gemms/simd_gemm.h +0 -0
  128. /data/{mlx → submodules/mlx}/mlx/backend/cpu/hadamard.cpp +0 -0
  129. /data/{mlx → submodules/mlx}/mlx/backend/cpu/indexing.cpp +0 -0
  130. /data/{mlx → submodules/mlx}/mlx/backend/cpu/inverse.cpp +0 -0
  131. /data/{mlx → submodules/mlx}/mlx/backend/cpu/jit_compiler.cpp +0 -0
  132. /data/{mlx → submodules/mlx}/mlx/backend/cpu/jit_compiler.h +0 -0
  133. /data/{mlx → submodules/mlx}/mlx/backend/cpu/lapack.h +0 -0
  134. /data/{mlx → submodules/mlx}/mlx/backend/cpu/logsumexp.cpp +0 -0
  135. /data/{mlx → submodules/mlx}/mlx/backend/cpu/luf.cpp +0 -0
  136. /data/{mlx → submodules/mlx}/mlx/backend/cpu/make_compiled_preamble.ps1 +0 -0
  137. /data/{mlx → submodules/mlx}/mlx/backend/cpu/make_compiled_preamble.sh +0 -0
  138. /data/{mlx → submodules/mlx}/mlx/backend/cpu/masked_mm.cpp +0 -0
  139. /data/{mlx → submodules/mlx}/mlx/backend/cpu/matmul.cpp +0 -0
  140. /data/{mlx → submodules/mlx}/mlx/backend/cpu/primitives.cpp +0 -0
  141. /data/{mlx → submodules/mlx}/mlx/backend/cpu/qrf.cpp +0 -0
  142. /data/{mlx → submodules/mlx}/mlx/backend/cpu/quantized.cpp +0 -0
  143. /data/{mlx → submodules/mlx}/mlx/backend/cpu/reduce.cpp +0 -0
  144. /data/{mlx → submodules/mlx}/mlx/backend/cpu/scan.cpp +0 -0
  145. /data/{mlx → submodules/mlx}/mlx/backend/cpu/select.cpp +0 -0
  146. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/accelerate_fp16_simd.h +0 -0
  147. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/accelerate_simd.h +0 -0
  148. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/base_simd.h +0 -0
  149. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/math.h +0 -0
  150. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/neon_fp16_simd.h +0 -0
  151. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/simd.h +0 -0
  152. /data/{mlx → submodules/mlx}/mlx/backend/cpu/simd/type.h +0 -0
  153. /data/{mlx → submodules/mlx}/mlx/backend/cpu/slicing.h +0 -0
  154. /data/{mlx → submodules/mlx}/mlx/backend/cpu/softmax.cpp +0 -0
  155. /data/{mlx → submodules/mlx}/mlx/backend/cpu/sort.cpp +0 -0
  156. /data/{mlx → submodules/mlx}/mlx/backend/cpu/svd.cpp +0 -0
  157. /data/{mlx → submodules/mlx}/mlx/backend/cpu/ternary.h +0 -0
  158. /data/{mlx → submodules/mlx}/mlx/backend/cpu/threefry.cpp +0 -0
  159. /data/{mlx → submodules/mlx}/mlx/backend/cpu/threefry.h +0 -0
  160. /data/{mlx → submodules/mlx}/mlx/backend/cpu/unary.cpp +0 -0
  161. /data/{mlx → submodules/mlx}/mlx/backend/cpu/unary.h +0 -0
  162. /data/{mlx → submodules/mlx}/mlx/backend/cpu/unary_ops.h +0 -0
  163. /data/{mlx → submodules/mlx}/mlx/backend/cuda/CMakeLists.txt +0 -0
  164. /data/{mlx → submodules/mlx}/mlx/backend/cuda/allocator.cpp +0 -0
  165. /data/{mlx → submodules/mlx}/mlx/backend/cuda/allocator.h +0 -0
  166. /data/{mlx → submodules/mlx}/mlx/backend/cuda/arange.cu +0 -0
  167. /data/{mlx → submodules/mlx}/mlx/backend/cuda/arg_reduce.cu +0 -0
  168. /data/{mlx → submodules/mlx}/mlx/backend/cuda/bin2h.cmake +0 -0
  169. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/CMakeLists.txt +0 -0
  170. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/add.cu +0 -0
  171. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/arctan2.cu +0 -0
  172. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/binary.cuh +0 -0
  173. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/bitwise_binary.cu +0 -0
  174. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/divide.cu +0 -0
  175. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/equal.cu +0 -0
  176. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/greater.cu +0 -0
  177. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/greater_equal.cu +0 -0
  178. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/less.cu +0 -0
  179. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/less_equal.cu +0 -0
  180. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/log_add_exp.cu +0 -0
  181. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/logical_and.cu +0 -0
  182. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/logical_or.cu +0 -0
  183. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/maximum.cu +0 -0
  184. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/minimum.cu +0 -0
  185. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/multiply.cu +0 -0
  186. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/not_equal.cu +0 -0
  187. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/power.cu +0 -0
  188. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/remainder.cu +0 -0
  189. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary/subtract.cu +0 -0
  190. /data/{mlx → submodules/mlx}/mlx/backend/cuda/binary_two.cu +0 -0
  191. /data/{mlx → submodules/mlx}/mlx/backend/cuda/compiled.cpp +0 -0
  192. /data/{mlx → submodules/mlx}/mlx/backend/cuda/conv/conv.h +0 -0
  193. /data/{mlx → submodules/mlx}/mlx/backend/cuda/conv/gemm_conv.cu +0 -0
  194. /data/{mlx → submodules/mlx}/mlx/backend/cuda/conv/gemm_grouped_conv.cu +0 -0
  195. /data/{mlx → submodules/mlx}/mlx/backend/cuda/conv.cpp +0 -0
  196. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy/copy.cuh +0 -0
  197. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy/copy_contiguous.cu +0 -0
  198. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy/copy_general.cu +0 -0
  199. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy/copy_general_dynamic.cu +0 -0
  200. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy/copy_general_input.cu +0 -0
  201. /data/{mlx → submodules/mlx}/mlx/backend/cuda/copy.cu +0 -0
  202. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cublas_utils.h +0 -0
  203. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cuda.h +0 -0
  204. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cuda_utils.h +0 -0
  205. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cudnn_utils.cpp +0 -0
  206. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cudnn_utils.h +0 -0
  207. /data/{mlx → submodules/mlx}/mlx/backend/cuda/custom_kernel.cpp +0 -0
  208. /data/{mlx → submodules/mlx}/mlx/backend/cuda/cutlass_utils.cuh +0 -0
  209. /data/{mlx → submodules/mlx}/mlx/backend/cuda/delayload.cpp +0 -0
  210. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/atomic_ops.cuh +0 -0
  211. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/binary_ops.cuh +0 -0
  212. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/cast_op.cuh +0 -0
  213. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/complex.cuh +0 -0
  214. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/config.h +0 -0
  215. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/fp16_math.cuh +0 -0
  216. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/gather.cuh +0 -0
  217. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/gather_axis.cuh +0 -0
  218. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/indexing.cuh +0 -0
  219. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/scatter.cuh +0 -0
  220. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/scatter_axis.cuh +0 -0
  221. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/scatter_ops.cuh +0 -0
  222. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/ternary_ops.cuh +0 -0
  223. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/unary_ops.cuh +0 -0
  224. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device/utils.cuh +0 -0
  225. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device.cpp +0 -0
  226. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device.h +0 -0
  227. /data/{mlx → submodules/mlx}/mlx/backend/cuda/device_info.cpp +0 -0
  228. /data/{mlx → submodules/mlx}/mlx/backend/cuda/distributed.cu +0 -0
  229. /data/{mlx → submodules/mlx}/mlx/backend/cuda/eval.cpp +0 -0
  230. /data/{mlx → submodules/mlx}/mlx/backend/cuda/event.cu +0 -0
  231. /data/{mlx → submodules/mlx}/mlx/backend/cuda/event.h +0 -0
  232. /data/{mlx → submodules/mlx}/mlx/backend/cuda/fence.cpp +0 -0
  233. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/cublas_gemm.h +0 -0
  234. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/cublas_gemm_batched_12_0.cpp +0 -0
  235. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/cublas_gemm_batched_12_9.cu +0 -0
  236. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/gemv.cu +0 -0
  237. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/gemv.h +0 -0
  238. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/grouped_gemm.h +0 -0
  239. /data/{mlx → submodules/mlx}/mlx/backend/cuda/gemms/grouped_gemm_unaligned.cu +0 -0
  240. /data/{mlx → submodules/mlx}/mlx/backend/cuda/indexing.cpp +0 -0
  241. /data/{mlx → submodules/mlx}/mlx/backend/cuda/jit_module.cpp +0 -0
  242. /data/{mlx → submodules/mlx}/mlx/backend/cuda/jit_module.h +0 -0
  243. /data/{mlx → submodules/mlx}/mlx/backend/cuda/kernel_utils.cu +0 -0
  244. /data/{mlx → submodules/mlx}/mlx/backend/cuda/kernel_utils.cuh +0 -0
  245. /data/{mlx → submodules/mlx}/mlx/backend/cuda/layer_norm.cu +0 -0
  246. /data/{mlx → submodules/mlx}/mlx/backend/cuda/load.cpp +0 -0
  247. /data/{mlx → submodules/mlx}/mlx/backend/cuda/logsumexp.cu +0 -0
  248. /data/{mlx → submodules/mlx}/mlx/backend/cuda/lru_cache.h +0 -0
  249. /data/{mlx → submodules/mlx}/mlx/backend/cuda/matmul.cpp +0 -0
  250. /data/{mlx → submodules/mlx}/mlx/backend/cuda/no_cuda.cpp +0 -0
  251. /data/{mlx → submodules/mlx}/mlx/backend/cuda/primitives.cpp +0 -0
  252. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/affine_quantize.cu +0 -0
  253. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/convert_fp8.cu +0 -0
  254. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/cuda_fp4.h +0 -0
  255. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/mxfp8_quantize.cuh +0 -0
  256. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/nvfp4_quantize.cuh +0 -0
  257. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qmv.cu +0 -0
  258. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qmv.h +0 -0
  259. /data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/quantized_utils.h +0 -0
  260. /data/{mlx → submodules/mlx}/mlx/backend/cuda/random.cu +0 -0
  261. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/all_reduce.cu +0 -0
  262. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/col_reduce.cu +0 -0
  263. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/init_reduce.cu +0 -0
  264. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/reduce.cuh +0 -0
  265. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/reduce_ops.cuh +0 -0
  266. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/reduce_utils.cuh +0 -0
  267. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce/row_reduce.cu +0 -0
  268. /data/{mlx → submodules/mlx}/mlx/backend/cuda/reduce.cu +0 -0
  269. /data/{mlx → submodules/mlx}/mlx/backend/cuda/rms_norm.cu +0 -0
  270. /data/{mlx → submodules/mlx}/mlx/backend/cuda/rope.cu +0 -0
  271. /data/{mlx → submodules/mlx}/mlx/backend/cuda/scaled_dot_product_attention.cpp +0 -0
  272. /data/{mlx → submodules/mlx}/mlx/backend/cuda/scaled_dot_product_attention.cu +0 -0
  273. /data/{mlx → submodules/mlx}/mlx/backend/cuda/scan.cu +0 -0
  274. /data/{mlx → submodules/mlx}/mlx/backend/cuda/slicing.cpp +0 -0
  275. /data/{mlx → submodules/mlx}/mlx/backend/cuda/softmax.cu +0 -0
  276. /data/{mlx → submodules/mlx}/mlx/backend/cuda/sort.cu +0 -0
  277. /data/{mlx → submodules/mlx}/mlx/backend/cuda/steel/defines.cuh +0 -0
  278. /data/{mlx → submodules/mlx}/mlx/backend/cuda/steel/gemm.cuh +0 -0
  279. /data/{mlx → submodules/mlx}/mlx/backend/cuda/steel/mma.cuh +0 -0
  280. /data/{mlx → submodules/mlx}/mlx/backend/cuda/steel/tiles.cuh +0 -0
  281. /data/{mlx → submodules/mlx}/mlx/backend/cuda/steel/utils.cuh +0 -0
  282. /data/{mlx → submodules/mlx}/mlx/backend/cuda/ternary.cu +0 -0
  283. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/CMakeLists.txt +0 -0
  284. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/abs.cu +0 -0
  285. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arccos.cu +0 -0
  286. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arccosh.cu +0 -0
  287. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arcsin.cu +0 -0
  288. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arcsinh.cu +0 -0
  289. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arctan.cu +0 -0
  290. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/arctanh.cu +0 -0
  291. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/bitwise_invert.cu +0 -0
  292. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/ceil.cu +0 -0
  293. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/conjugate.cu +0 -0
  294. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/cos.cu +0 -0
  295. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/cosh.cu +0 -0
  296. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/erf.cu +0 -0
  297. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/erf_inv.cu +0 -0
  298. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/exp.cu +0 -0
  299. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/expm1.cu +0 -0
  300. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/floor.cu +0 -0
  301. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/imag.cu +0 -0
  302. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/log.cu +0 -0
  303. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/log1p.cu +0 -0
  304. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/logical_not.cu +0 -0
  305. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/negative.cu +0 -0
  306. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/real.cu +0 -0
  307. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/round.cu +0 -0
  308. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/sigmoid.cu +0 -0
  309. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/sign.cu +0 -0
  310. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/sin.cu +0 -0
  311. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/sinh.cu +0 -0
  312. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/sqrt.cu +0 -0
  313. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/square.cu +0 -0
  314. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/tan.cu +0 -0
  315. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/tanh.cu +0 -0
  316. /data/{mlx → submodules/mlx}/mlx/backend/cuda/unary/unary.cuh +0 -0
  317. /data/{mlx → submodules/mlx}/mlx/backend/cuda/utils.cpp +0 -0
  318. /data/{mlx → submodules/mlx}/mlx/backend/cuda/utils.h +0 -0
  319. /data/{mlx → submodules/mlx}/mlx/backend/cuda/vector_types.cuh +0 -0
  320. /data/{mlx → submodules/mlx}/mlx/backend/cuda/worker.cpp +0 -0
  321. /data/{mlx → submodules/mlx}/mlx/backend/cuda/worker.h +0 -0
  322. /data/{mlx → submodules/mlx}/mlx/backend/gpu/CMakeLists.txt +0 -0
  323. /data/{mlx → submodules/mlx}/mlx/backend/gpu/copy.cpp +0 -0
  324. /data/{mlx → submodules/mlx}/mlx/backend/gpu/copy.h +0 -0
  325. /data/{mlx → submodules/mlx}/mlx/backend/gpu/device_info.h +0 -0
  326. /data/{mlx → submodules/mlx}/mlx/backend/gpu/eval.h +0 -0
  327. /data/{mlx → submodules/mlx}/mlx/backend/gpu/primitives.cpp +0 -0
  328. /data/{mlx → submodules/mlx}/mlx/backend/gpu/slicing.cpp +0 -0
  329. /data/{mlx → submodules/mlx}/mlx/backend/gpu/slicing.h +0 -0
  330. /data/{mlx → submodules/mlx}/mlx/backend/metal/CMakeLists.txt +0 -0
  331. /data/{mlx → submodules/mlx}/mlx/backend/metal/allocator.cpp +0 -0
  332. /data/{mlx → submodules/mlx}/mlx/backend/metal/allocator.h +0 -0
  333. /data/{mlx → submodules/mlx}/mlx/backend/metal/binary.cpp +0 -0
  334. /data/{mlx → submodules/mlx}/mlx/backend/metal/binary.h +0 -0
  335. /data/{mlx → submodules/mlx}/mlx/backend/metal/compiled.cpp +0 -0
  336. /data/{mlx → submodules/mlx}/mlx/backend/metal/conv.cpp +0 -0
  337. /data/{mlx → submodules/mlx}/mlx/backend/metal/copy.cpp +0 -0
  338. /data/{mlx → submodules/mlx}/mlx/backend/metal/custom_kernel.cpp +0 -0
  339. /data/{mlx → submodules/mlx}/mlx/backend/metal/device.h +0 -0
  340. /data/{mlx → submodules/mlx}/mlx/backend/metal/device_info.cpp +0 -0
  341. /data/{mlx → submodules/mlx}/mlx/backend/metal/distributed.cpp +0 -0
  342. /data/{mlx → submodules/mlx}/mlx/backend/metal/eval.cpp +0 -0
  343. /data/{mlx → submodules/mlx}/mlx/backend/metal/event.cpp +0 -0
  344. /data/{mlx → submodules/mlx}/mlx/backend/metal/fence.cpp +0 -0
  345. /data/{mlx → submodules/mlx}/mlx/backend/metal/fft.cpp +0 -0
  346. /data/{mlx → submodules/mlx}/mlx/backend/metal/hadamard.cpp +0 -0
  347. /data/{mlx → submodules/mlx}/mlx/backend/metal/indexing.cpp +0 -0
  348. /data/{mlx → submodules/mlx}/mlx/backend/metal/jit/includes.h +0 -0
  349. /data/{mlx → submodules/mlx}/mlx/backend/metal/jit/indexing.h +0 -0
  350. /data/{mlx → submodules/mlx}/mlx/backend/metal/jit_kernels.cpp +0 -0
  351. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/CMakeLists.txt +0 -0
  352. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/arange.h +0 -0
  353. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/arange.metal +0 -0
  354. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/arg_reduce.metal +0 -0
  355. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/atomic.h +0 -0
  356. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/bf16.h +0 -0
  357. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/bf16_math.h +0 -0
  358. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/binary.h +0 -0
  359. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/binary.metal +0 -0
  360. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/binary_ops.h +0 -0
  361. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/binary_two.h +0 -0
  362. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/binary_two.metal +0 -0
  363. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/cexpf.h +0 -0
  364. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/complex.h +0 -0
  365. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/copy.h +0 -0
  366. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/copy.metal +0 -0
  367. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/defines.h +0 -0
  368. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/erf.h +0 -0
  369. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/expm1f.h +0 -0
  370. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fence.metal +0 -0
  371. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fft/radix.h +0 -0
  372. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fft/readwrite.h +0 -0
  373. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fft.h +0 -0
  374. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fft.metal +0 -0
  375. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp4.h +0 -0
  376. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp8.h +0 -0
  377. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp_quantized.h +0 -0
  378. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp_quantized.metal +0 -0
  379. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp_quantized_nax.h +0 -0
  380. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/fp_quantized_nax.metal +0 -0
  381. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/gemv.metal +0 -0
  382. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/gemv_masked.h +0 -0
  383. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/gemv_masked.metal +0 -0
  384. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/hadamard.h +0 -0
  385. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/gather.h +0 -0
  386. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/gather_axis.h +0 -0
  387. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/gather_front.h +0 -0
  388. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/indexing.h +0 -0
  389. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/masked_scatter.h +0 -0
  390. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/scatter.h +0 -0
  391. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/indexing/scatter_axis.h +0 -0
  392. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/layer_norm.metal +0 -0
  393. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/logging.h +0 -0
  394. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/logsumexp.h +0 -0
  395. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/logsumexp.metal +0 -0
  396. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/quantized.h +0 -0
  397. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/quantized.metal +0 -0
  398. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/quantized_nax.h +0 -0
  399. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/quantized_nax.metal +0 -0
  400. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/quantized_utils.h +0 -0
  401. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/random.metal +0 -0
  402. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduce.h +0 -0
  403. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduce.metal +0 -0
  404. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduce_utils.h +0 -0
  405. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduction/ops.h +0 -0
  406. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduction/reduce_all.h +0 -0
  407. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduction/reduce_col.h +0 -0
  408. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduction/reduce_init.h +0 -0
  409. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/reduction/reduce_row.h +0 -0
  410. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/rms_norm.metal +0 -0
  411. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/rope.metal +0 -0
  412. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/scaled_dot_product_attention.metal +0 -0
  413. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/scan.h +0 -0
  414. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/scan.metal +0 -0
  415. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/sdpa_vector.h +0 -0
  416. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/softmax.h +0 -0
  417. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/softmax.metal +0 -0
  418. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/sort.h +0 -0
  419. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/sort.metal +0 -0
  420. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/attn.h +0 -0
  421. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +0 -0
  422. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.metal +0 -0
  423. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +0 -0
  424. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.metal +0 -0
  425. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/loader.h +0 -0
  426. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/mma.h +0 -0
  427. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/nax.h +0 -0
  428. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/params.h +0 -0
  429. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/attn/transforms.h +0 -0
  430. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/conv.h +0 -0
  431. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +0 -0
  432. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.metal +0 -0
  433. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +0 -0
  434. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.metal +0 -0
  435. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/loader.h +0 -0
  436. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +0 -0
  437. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +0 -0
  438. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +0 -0
  439. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/conv/params.h +0 -0
  440. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/defines.h +0 -0
  441. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/gemm.h +0 -0
  442. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +0 -0
  443. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +0 -0
  444. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.metal +0 -0
  445. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +0 -0
  446. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.metal +0 -0
  447. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +0 -0
  448. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.metal +0 -0
  449. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +0 -0
  450. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.metal +0 -0
  451. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +0 -0
  452. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.metal +0 -0
  453. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +0 -0
  454. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.metal +0 -0
  455. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +0 -0
  456. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.metal +0 -0
  457. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.h +0 -0
  458. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk_nax.metal +0 -0
  459. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/loader.h +0 -0
  460. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/mma.h +0 -0
  461. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/nax.h +0 -0
  462. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/params.h +0 -0
  463. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/gemm/transforms.h +0 -0
  464. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/utils/integral_constant.h +0 -0
  465. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/utils/type_traits.h +0 -0
  466. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/steel/utils.h +0 -0
  467. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/ternary.h +0 -0
  468. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/ternary.metal +0 -0
  469. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/ternary_ops.h +0 -0
  470. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/unary.h +0 -0
  471. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/unary.metal +0 -0
  472. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/unary_ops.h +0 -0
  473. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels/utils.h +0 -0
  474. /data/{mlx → submodules/mlx}/mlx/backend/metal/kernels.h +0 -0
  475. /data/{mlx → submodules/mlx}/mlx/backend/metal/logsumexp.cpp +0 -0
  476. /data/{mlx → submodules/mlx}/mlx/backend/metal/make_compiled_preamble.sh +0 -0
  477. /data/{mlx → submodules/mlx}/mlx/backend/metal/matmul.cpp +0 -0
  478. /data/{mlx → submodules/mlx}/mlx/backend/metal/matmul.h +0 -0
  479. /data/{mlx → submodules/mlx}/mlx/backend/metal/metal.cpp +0 -0
  480. /data/{mlx → submodules/mlx}/mlx/backend/metal/metal.h +0 -0
  481. /data/{mlx → submodules/mlx}/mlx/backend/metal/no_metal.cpp +0 -0
  482. /data/{mlx → submodules/mlx}/mlx/backend/metal/nojit_kernels.cpp +0 -0
  483. /data/{mlx → submodules/mlx}/mlx/backend/metal/normalization.cpp +0 -0
  484. /data/{mlx → submodules/mlx}/mlx/backend/metal/primitives.cpp +0 -0
  485. /data/{mlx → submodules/mlx}/mlx/backend/metal/quantized.cpp +0 -0
  486. /data/{mlx → submodules/mlx}/mlx/backend/metal/reduce.cpp +0 -0
  487. /data/{mlx → submodules/mlx}/mlx/backend/metal/reduce.h +0 -0
  488. /data/{mlx → submodules/mlx}/mlx/backend/metal/resident.cpp +0 -0
  489. /data/{mlx → submodules/mlx}/mlx/backend/metal/resident.h +0 -0
  490. /data/{mlx → submodules/mlx}/mlx/backend/metal/rope.cpp +0 -0
  491. /data/{mlx → submodules/mlx}/mlx/backend/metal/scaled_dot_product_attention.cpp +0 -0
  492. /data/{mlx → submodules/mlx}/mlx/backend/metal/scan.cpp +0 -0
  493. /data/{mlx → submodules/mlx}/mlx/backend/metal/scan.h +0 -0
  494. /data/{mlx → submodules/mlx}/mlx/backend/metal/slicing.cpp +0 -0
  495. /data/{mlx → submodules/mlx}/mlx/backend/metal/softmax.cpp +0 -0
  496. /data/{mlx → submodules/mlx}/mlx/backend/metal/sort.cpp +0 -0
  497. /data/{mlx → submodules/mlx}/mlx/backend/metal/ternary.cpp +0 -0
  498. /data/{mlx → submodules/mlx}/mlx/backend/metal/ternary.h +0 -0
  499. /data/{mlx → submodules/mlx}/mlx/backend/metal/unary.cpp +0 -0
  500. /data/{mlx → submodules/mlx}/mlx/backend/metal/unary.h +0 -0
  501. /data/{mlx → submodules/mlx}/mlx/backend/metal/utils.cpp +0 -0
  502. /data/{mlx → submodules/mlx}/mlx/backend/metal/utils.h +0 -0
  503. /data/{mlx → submodules/mlx}/mlx/backend/no_cpu/CMakeLists.txt +0 -0
  504. /data/{mlx → submodules/mlx}/mlx/backend/no_cpu/compiled.cpp +0 -0
  505. /data/{mlx → submodules/mlx}/mlx/backend/no_cpu/device_info.cpp +0 -0
  506. /data/{mlx → submodules/mlx}/mlx/backend/no_cpu/primitives.cpp +0 -0
  507. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/CMakeLists.txt +0 -0
  508. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/allocator.cpp +0 -0
  509. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/apple_memory.h +0 -0
  510. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/device_info.cpp +0 -0
  511. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/eval.cpp +0 -0
  512. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/event.cpp +0 -0
  513. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/fence.cpp +0 -0
  514. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/linux_memory.h +0 -0
  515. /data/{mlx → submodules/mlx}/mlx/backend/no_gpu/primitives.cpp +0 -0
  516. /data/{mlx → submodules/mlx}/mlx/compile.cpp +0 -0
  517. /data/{mlx → submodules/mlx}/mlx/compile.h +0 -0
  518. /data/{mlx → submodules/mlx}/mlx/compile_impl.h +0 -0
  519. /data/{mlx → submodules/mlx}/mlx/device.cpp +0 -0
  520. /data/{mlx → submodules/mlx}/mlx/device.h +0 -0
  521. /data/{mlx → submodules/mlx}/mlx/distributed/CMakeLists.txt +0 -0
  522. /data/{mlx → submodules/mlx}/mlx/distributed/distributed.cpp +0 -0
  523. /data/{mlx → submodules/mlx}/mlx/distributed/distributed.h +0 -0
  524. /data/{mlx → submodules/mlx}/mlx/distributed/distributed_impl.h +0 -0
  525. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/CMakeLists.txt +0 -0
  526. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/jaccl.cpp +0 -0
  527. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/jaccl.h +0 -0
  528. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/mesh.cpp +0 -0
  529. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/mesh.h +0 -0
  530. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/no_jaccl.cpp +0 -0
  531. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/ring.cpp +0 -0
  532. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/ring.h +0 -0
  533. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/utils.cpp +0 -0
  534. /data/{mlx → submodules/mlx}/mlx/distributed/jaccl/utils.h +0 -0
  535. /data/{mlx → submodules/mlx}/mlx/distributed/mpi/CMakeLists.txt +0 -0
  536. /data/{mlx → submodules/mlx}/mlx/distributed/mpi/mpi.cpp +0 -0
  537. /data/{mlx → submodules/mlx}/mlx/distributed/mpi/mpi.h +0 -0
  538. /data/{mlx → submodules/mlx}/mlx/distributed/mpi/mpi_declarations.h +0 -0
  539. /data/{mlx → submodules/mlx}/mlx/distributed/mpi/no_mpi.cpp +0 -0
  540. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/CMakeLists.txt +0 -0
  541. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/nccl.cpp +0 -0
  542. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/nccl.h +0 -0
  543. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/nccl_stub/CMakeLists.txt +0 -0
  544. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/nccl_stub/nccl_stubs.cpp +0 -0
  545. /data/{mlx → submodules/mlx}/mlx/distributed/nccl/no_nccl.cpp +0 -0
  546. /data/{mlx → submodules/mlx}/mlx/distributed/ops.cpp +0 -0
  547. /data/{mlx → submodules/mlx}/mlx/distributed/ops.h +0 -0
  548. /data/{mlx → submodules/mlx}/mlx/distributed/primitives.cpp +0 -0
  549. /data/{mlx → submodules/mlx}/mlx/distributed/primitives.h +0 -0
  550. /data/{mlx → submodules/mlx}/mlx/distributed/reduction_ops.h +0 -0
  551. /data/{mlx → submodules/mlx}/mlx/distributed/ring/CMakeLists.txt +0 -0
  552. /data/{mlx → submodules/mlx}/mlx/distributed/ring/no_ring.cpp +0 -0
  553. /data/{mlx → submodules/mlx}/mlx/distributed/ring/ring.cpp +0 -0
  554. /data/{mlx → submodules/mlx}/mlx/distributed/ring/ring.h +0 -0
  555. /data/{mlx → submodules/mlx}/mlx/distributed/utils.cpp +0 -0
  556. /data/{mlx → submodules/mlx}/mlx/distributed/utils.h +0 -0
  557. /data/{mlx → submodules/mlx}/mlx/dtype.cpp +0 -0
  558. /data/{mlx → submodules/mlx}/mlx/dtype.h +0 -0
  559. /data/{mlx → submodules/mlx}/mlx/dtype_utils.cpp +0 -0
  560. /data/{mlx → submodules/mlx}/mlx/dtype_utils.h +0 -0
  561. /data/{mlx → submodules/mlx}/mlx/einsum.cpp +0 -0
  562. /data/{mlx → submodules/mlx}/mlx/einsum.h +0 -0
  563. /data/{mlx → submodules/mlx}/mlx/event.h +0 -0
  564. /data/{mlx → submodules/mlx}/mlx/export.h +0 -0
  565. /data/{mlx → submodules/mlx}/mlx/export_impl.h +0 -0
  566. /data/{mlx → submodules/mlx}/mlx/fast.cpp +0 -0
  567. /data/{mlx → submodules/mlx}/mlx/fast.h +0 -0
  568. /data/{mlx → submodules/mlx}/mlx/fast_primitives.h +0 -0
  569. /data/{mlx → submodules/mlx}/mlx/fence.h +0 -0
  570. /data/{mlx → submodules/mlx}/mlx/fft.cpp +0 -0
  571. /data/{mlx → submodules/mlx}/mlx/fft.h +0 -0
  572. /data/{mlx → submodules/mlx}/mlx/graph_utils.cpp +0 -0
  573. /data/{mlx → submodules/mlx}/mlx/graph_utils.h +0 -0
  574. /data/{mlx → submodules/mlx}/mlx/io/CMakeLists.txt +0 -0
  575. /data/{mlx → submodules/mlx}/mlx/io/gguf.cpp +0 -0
  576. /data/{mlx → submodules/mlx}/mlx/io/gguf.h +0 -0
  577. /data/{mlx → submodules/mlx}/mlx/io/gguf_quants.cpp +0 -0
  578. /data/{mlx → submodules/mlx}/mlx/io/load.cpp +0 -0
  579. /data/{mlx → submodules/mlx}/mlx/io/load.h +0 -0
  580. /data/{mlx → submodules/mlx}/mlx/io/no_gguf.cpp +0 -0
  581. /data/{mlx → submodules/mlx}/mlx/io/no_safetensors.cpp +0 -0
  582. /data/{mlx → submodules/mlx}/mlx/io/safetensors.cpp +0 -0
  583. /data/{mlx → submodules/mlx}/mlx/io.h +0 -0
  584. /data/{mlx → submodules/mlx}/mlx/linalg.cpp +0 -0
  585. /data/{mlx → submodules/mlx}/mlx/linalg.h +0 -0
  586. /data/{mlx → submodules/mlx}/mlx/memory.h +0 -0
  587. /data/{mlx → submodules/mlx}/mlx/mlx.h +0 -0
  588. /data/{mlx → submodules/mlx}/mlx/primitives.h +0 -0
  589. /data/{mlx → submodules/mlx}/mlx/random.cpp +0 -0
  590. /data/{mlx → submodules/mlx}/mlx/random.h +0 -0
  591. /data/{mlx → submodules/mlx}/mlx/small_vector.h +0 -0
  592. /data/{mlx → submodules/mlx}/mlx/threadpool.h +0 -0
  593. /data/{mlx → submodules/mlx}/mlx/transforms.cpp +0 -0
  594. /data/{mlx → submodules/mlx}/mlx/transforms.h +0 -0
  595. /data/{mlx → submodules/mlx}/mlx/transforms_impl.h +0 -0
  596. /data/{mlx → submodules/mlx}/mlx/types/bf16.h +0 -0
  597. /data/{mlx → submodules/mlx}/mlx/types/complex.h +0 -0
  598. /data/{mlx → submodules/mlx}/mlx/types/fp16.h +0 -0
  599. /data/{mlx → submodules/mlx}/mlx/types/half_types.h +0 -0
  600. /data/{mlx → submodules/mlx}/mlx/types/limits.h +0 -0
  601. /data/{mlx → submodules/mlx}/mlx/utils.cpp +0 -0
  602. /data/{mlx → submodules/mlx}/mlx/utils.h +0 -0
  603. /data/{mlx → submodules/mlx}/mlx/version.cpp +0 -0
  604. /data/{mlx → submodules/mlx}/mlx/version.h +0 -0
  605. /data/{mlx → submodules/mlx}/mlx.pc.in +0 -0
@@ -0,0 +1,62 @@
1
+ // Copyright © 2025 Apple Inc.
2
+
3
+ #pragma once
4
+
5
+ #include "mlx/array.h"
6
+ #include "mlx/backend/cuda/device.h"
7
+
8
+ namespace mlx::core {
9
+
10
+ // Compute padded dimensions for tiled layout
11
+ // Tiles are 128 rows × 4 columns, must allocate full tiles
12
+ inline std::pair<int, int> get_padded_scale_dims(int num_rows, int num_cols) {
13
+ constexpr int rows_per_tile = 128;
14
+ constexpr int cols_per_tile = 4;
15
+
16
+ int padded_rows =
17
+ ((num_rows + rows_per_tile - 1) / rows_per_tile) * rows_per_tile;
18
+ int padded_cols =
19
+ ((num_cols + cols_per_tile - 1) / cols_per_tile) * cols_per_tile;
20
+
21
+ return {padded_rows, padded_cols};
22
+ }
23
+
24
+ void swizzle_scales(
25
+ const array& scales,
26
+ array& scales_tiled,
27
+ cu::CommandEncoder& enc,
28
+ const Stream& s);
29
+
30
+ inline array pad_and_swizzle_scales(
31
+ const array& scale,
32
+ cu::CommandEncoder& encoder,
33
+ const Stream& s) {
34
+ // Compute padded dimensions for full tiles (128 rows × 4 cols)
35
+ auto [pad_outer, pad_inner] =
36
+ get_padded_scale_dims(scale.shape(-2), scale.shape(-1));
37
+ // cuBLAS requirements for scale factor layout:
38
+ // 1. Dimensions must be padded to full tiles (128 rows × 4 cols)
39
+ // 2. Out-of-bounds values must be filled with zeros
40
+ // 3. Starting addresses must be 16-byte aligned
41
+ // https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
42
+ // Note: cu::malloc_async already provides 256-byte alignment
43
+ array scale_tiled(
44
+ cu::malloc_async(pad_outer * pad_inner, encoder),
45
+ Shape{pad_outer, pad_inner},
46
+ scale.dtype());
47
+ swizzle_scales(scale, scale_tiled, encoder, s);
48
+
49
+ encoder.add_temporary(scale_tiled);
50
+ return scale_tiled;
51
+ }
52
+
53
+ // Compute alpha = tensor_amax_x * tensor_amax_w / (448 * 6)^2
54
+ // Allocate beta zero on device as well
55
+ void compute_qqmm_pointers(
56
+ array& alpha_out,
57
+ array& beta_out,
58
+ const array& tensor_amax_x,
59
+ const array& tensor_amax_w,
60
+ cu::CommandEncoder& enc);
61
+
62
+ } // namespace mlx::core
@@ -51,7 +51,6 @@ void fast::Quantize::eval_gpu(
51
51
  auto& s = stream();
52
52
  auto& d = cu::device(s.device);
53
53
  auto& enc = d.get_command_encoder(s);
54
-
55
54
  if (dequantize_) {
56
55
  auto wq = ensure_row_contiguous(inputs[0], enc, s);
57
56
  auto scales = ensure_row_contiguous(inputs[1], enc, s);
@@ -63,7 +62,12 @@ void fast::Quantize::eval_gpu(
63
62
  auto biases = ensure_row_contiguous(inputs[2], enc, s);
64
63
  affine_dequantize(wq, scales, biases, w, group_size_, bits_, enc, s);
65
64
  } else {
66
- fp_dequantize(wq, scales, w, group_size_, bits_, enc, s);
65
+ // 0 -- xq, 1 -- scales, 2 -- could be global scale for nvfp4
66
+ bool use_global_scale =
67
+ mode_ == QuantizationMode::Nvfp4 && inputs.size() > 2;
68
+ std::optional<array> global_scale =
69
+ use_global_scale ? std::make_optional(inputs[2]) : std::nullopt;
70
+ fp_dequantize(wq, scales, w, group_size_, bits_, global_scale, enc, s);
67
71
  }
68
72
  } else {
69
73
  auto w = ensure_contiguous(inputs[0], enc, s);
@@ -72,12 +76,17 @@ void fast::Quantize::eval_gpu(
72
76
 
73
77
  wq.set_data(cu::malloc_async(wq.nbytes(), enc));
74
78
  scales.set_data(cu::malloc_async(scales.nbytes(), enc));
79
+
75
80
  if (mode_ == QuantizationMode::Affine) {
76
81
  auto& biases = outputs[2];
77
82
  biases.set_data(cu::malloc_async(biases.nbytes(), enc));
78
83
  affine_quantize(w, wq, scales, biases, group_size_, bits_, enc, s);
79
84
  } else {
80
- fp_quantize(w, wq, scales, group_size_, bits_, enc, s);
85
+ bool use_global_scale =
86
+ mode_ == QuantizationMode::Nvfp4 && inputs.size() > 1;
87
+ std::optional<array> global_scale =
88
+ use_global_scale ? std::make_optional(inputs[1]) : std::nullopt;
89
+ fp_quantize(w, wq, scales, group_size_, bits_, global_scale, enc, s);
81
90
  }
82
91
  }
83
92
  }
@@ -1,5 +1,6 @@
1
1
  // Copyright © 2025 Apple Inc.
2
2
 
3
+ #include <optional>
3
4
  #include "mlx/backend/cuda/device.h"
4
5
 
5
6
  namespace mlx::core {
@@ -30,6 +31,7 @@ void fp_quantize(
30
31
  array& scales,
31
32
  int group_size,
32
33
  int bits,
34
+ const std::optional<array>& global_scale,
33
35
  cu::CommandEncoder& enc,
34
36
  const Stream& s);
35
37
 
@@ -39,6 +41,7 @@ void fp_dequantize(
39
41
  array& w,
40
42
  int group_size,
41
43
  int bits,
44
+ const std::optional<array>& global_scale,
42
45
  cu::CommandEncoder& enc,
43
46
  const Stream& s);
44
47
 
@@ -47,6 +50,7 @@ void fp_quantize_dequantize(
47
50
  array& what,
48
51
  int group_size,
49
52
  int bits,
53
+ const std::optional<array>& global_scale,
50
54
  cu::CommandEncoder& enc,
51
55
  const Stream& s);
52
56
 
@@ -29,7 +29,7 @@ inline constexpr __device__ short get_bytes_per_pack() {
29
29
  }
30
30
 
31
31
  template <typename T>
32
- __device__ __forceinline__ void abs_max_x2(T& out, const T& x1, const T& x2) {
32
+ __device__ __forceinline__ void absmax_x2(T& out, const T& x1, const T& x2) {
33
33
  if constexpr (
34
34
  (std::is_same<T, __nv_bfloat162>::value) ||
35
35
  (std::is_same<T, __half2>::value)) {
@@ -247,6 +247,10 @@ void CommandEncoder::set_buffer(
247
247
  const MTL::Buffer* buf,
248
248
  int idx,
249
249
  int64_t offset /* = 0 */) {
250
+ // Record as both input and output to ensure synchronization between command
251
+ // buffers
252
+ all_inputs_.insert((void*)buf);
253
+ all_outputs_.insert((void*)buf);
250
254
  enc_->setBuffer(buf, offset, idx);
251
255
  }
252
256
 
@@ -30,7 +30,7 @@ template <typename T, int N>
30
30
  out_pixels *= params->oS[i];
31
31
 
32
32
  // Set out
33
- out += gid.z * filter_size + gid.y * (params->C);
33
+ out += (size_t)gid.z * filter_size + (size_t)gid.y * (params->C);
34
34
 
35
35
  // Coordinates in input
36
36
  int is[N] = {0};
@@ -93,7 +93,8 @@ template <typename T, int N>
93
93
  out_pixels *= params->oS[i];
94
94
 
95
95
  // Set out
96
- out += gid.z * filter_size + gid.x * (filter_size / params->C);
96
+ out +=
97
+ (size_t)gid.z * filter_size + (size_t)gid.x * (filter_size / params->C);
97
98
 
98
99
  // Coordinates in input
99
100
  int is[N] = {0};
@@ -279,6 +279,8 @@ void extract_state(const T state, std::vector<StateT>& unpacked_state) {
279
279
  unpacked_state.push_back(state);
280
280
  } else if constexpr (std::is_enum_v<T>) {
281
281
  unpacked_state.push_back(static_cast<int>(state));
282
+ } else if constexpr (std::is_same_v<T, Dtype>) {
283
+ unpacked_state.push_back(state);
282
284
  } else if constexpr (is_iterable<T>) {
283
285
  unpacked_state.push_back(state);
284
286
  } else if constexpr (is_pair<T> || is_tuple<T>) {
@@ -446,6 +448,7 @@ struct PrimitiveFactory {
446
448
  SERIALIZE_PRIMITIVE(ScaledDotProductAttention),
447
449
  SERIALIZE_PRIMITIVE(CustomKernel)};
448
450
  std::unordered_map<std::string, std::string> name_remap;
451
+ std::unordered_map<int, Stream> stream_map;
449
452
 
450
453
  PrimitiveFactory() {
451
454
  for (auto& [n, f] : factory) {
@@ -471,13 +474,25 @@ struct PrimitiveFactory {
471
474
  }
472
475
  };
473
476
 
474
- std::shared_ptr<Primitive> load(Reader& is) {
475
- auto stream = deserialize<Stream>(is);
476
- if (get_stream(stream.index) != stream) {
477
- std::ostringstream msg;
478
- msg << "[import_function] Invalid stream encountered " << stream << ".";
479
- throw std::invalid_argument(msg.str());
477
+ Stream resolve_stream(const Stream& stream) {
478
+ if (auto it = stream_map.find(stream.index); it != stream_map.end()) {
479
+ return it->second;
480
+ }
481
+ // Try to find an existing stream on the same device
482
+ for (auto& s : get_streams()) {
483
+ if (s.device == stream.device) {
484
+ stream_map.emplace(stream.index, s);
485
+ return s;
486
+ }
480
487
  }
488
+ // No stream on that device, make a new one
489
+ Stream s = new_stream(stream.device);
490
+ stream_map.emplace(stream.index, s);
491
+ return s;
492
+ }
493
+
494
+ std::shared_ptr<Primitive> load(Reader& is) {
495
+ auto stream = resolve_stream(deserialize<Stream>(is));
481
496
  auto name = deserialize<std::string>(is);
482
497
  if (auto it = factory.find(name); it != factory.end()) {
483
498
  return it->second.deserialize(is, stream);
@@ -10,6 +10,7 @@
10
10
  #include <sstream>
11
11
 
12
12
  #include "mlx/backend/cuda/cuda.h"
13
+ #include "mlx/backend/metal/metal.h"
13
14
  #include "mlx/fast_primitives.h"
14
15
  #include "mlx/ops.h"
15
16
  #include "mlx/primitives.h"
@@ -2311,6 +2312,40 @@ array argmax(
2311
2312
  return out;
2312
2313
  }
2313
2314
 
2315
+ array hanning(int M, StreamOrDevice s /* = {} */) {
2316
+ if (M < 1) {
2317
+ return array({});
2318
+ }
2319
+ if (M == 1) {
2320
+ return ones({1}, float32, s);
2321
+ }
2322
+
2323
+ auto n = arange(0, M, float32, s);
2324
+ array factor(M_PI / (M - 1), float32);
2325
+ return square(sin(multiply(factor, n, s), s), s);
2326
+ }
2327
+
2328
+ array hamming(int M, StreamOrDevice s /* = {} */) {
2329
+ if (M < 1) {
2330
+ return array({});
2331
+ }
2332
+ if (M == 1) {
2333
+ return ones({1}, float32, s);
2334
+ }
2335
+
2336
+ auto n = arange(0, M, float32, s);
2337
+ float factor_val = (2.0 * M_PI) / (M - 1);
2338
+ auto factor = array(factor_val, float32);
2339
+
2340
+ auto arg = multiply(factor, n, s);
2341
+ auto cos_vals = cos(arg, s);
2342
+
2343
+ auto left_coef = array(0.54f, float32);
2344
+ auto right_coef = array(0.46f, float32);
2345
+
2346
+ return subtract(left_coef, multiply(right_coef, cos_vals, s), s);
2347
+ }
2348
+
2314
2349
  /** Returns a sorted copy of the flattened array. */
2315
2350
  array sort(const array& a, StreamOrDevice s /* = {} */) {
2316
2351
  int size = a.size();
@@ -4209,6 +4244,34 @@ std::pair<Dtype, QuantizationMode> validate_mode_with_type(
4209
4244
  }
4210
4245
  }
4211
4246
 
4247
+ void validate_global_scale(
4248
+ std::string_view tag,
4249
+ QuantizationMode qmode,
4250
+ const std::optional<array>& global_scale) {
4251
+ if (global_scale.has_value()) {
4252
+ if (qmode != QuantizationMode::Nvfp4) {
4253
+ std::ostringstream msg;
4254
+ msg << "[" << tag << "] Global scale is only supported for 'nvfp4' "
4255
+ << "quantization mode.";
4256
+ throw std::invalid_argument(msg.str());
4257
+ } else {
4258
+ if (global_scale->size() != 1) {
4259
+ std::ostringstream msg;
4260
+ msg << "[" << tag << "] Global scale must be a scalar but got shape "
4261
+ << global_scale->shape() << ".";
4262
+ throw std::invalid_argument(msg.str());
4263
+ }
4264
+ // TODO: not sure if type should be restricted to float32
4265
+ if (global_scale->dtype() != float32) {
4266
+ std::ostringstream msg;
4267
+ msg << "[" << tag << "] Global scale must have dtype float32 but got "
4268
+ << global_scale->dtype() << ".";
4269
+ throw std::invalid_argument(msg.str());
4270
+ }
4271
+ }
4272
+ }
4273
+ }
4274
+
4212
4275
  array quantized_matmul(
4213
4276
  array x,
4214
4277
  array w,
@@ -4251,7 +4314,6 @@ array quantized_matmul(
4251
4314
  if (x.ndim() > 2 && w.ndim() > 2) {
4252
4315
  inputs = broadcast_arrays(inputs, {-2, -1}, s);
4253
4316
  }
4254
-
4255
4317
  auto out_shape = inputs[0].shape();
4256
4318
  out_shape.back() = w_outer_dims;
4257
4319
  return array(
@@ -4267,7 +4329,10 @@ void validate_qqmm_inputs(
4267
4329
  array w,
4268
4330
  std::optional<array> scales_w,
4269
4331
  int group_size,
4270
- int bits) {
4332
+ int bits,
4333
+ std::optional<array> global_scale_x,
4334
+ std::optional<array> global_scale_w,
4335
+ QuantizationMode qmode) {
4271
4336
  // check 2D (for now)
4272
4337
  if (x.ndim() > 2 || w.ndim() > 2) {
4273
4338
  std::ostringstream msg;
@@ -4304,6 +4369,19 @@ void validate_qqmm_inputs(
4304
4369
  << "first argument dtype == " << x.dtype() << ".";
4305
4370
  throw std::invalid_argument(msg.str());
4306
4371
  }
4372
+ // validate global scales
4373
+ validate_global_scale("qqmm", qmode, global_scale_x);
4374
+ validate_global_scale("qqmm", qmode, global_scale_w);
4375
+ // For nvfp4 mode, both global scales must be provided together or neither
4376
+ if (qmode == QuantizationMode::Nvfp4) {
4377
+ bool has_x = global_scale_x.has_value();
4378
+ bool has_w = global_scale_w.has_value();
4379
+ if (has_x != has_w) {
4380
+ throw std::invalid_argument(
4381
+ "[qqmm] For nvfp4 mode, either both global_scale_x and "
4382
+ "global_scale_w must be provided, or neither.");
4383
+ }
4384
+ }
4307
4385
  }
4308
4386
 
4309
4387
  std::pair<int, int> extract_qqmm_dims(
@@ -4343,6 +4421,8 @@ array qqmm(
4343
4421
  std::optional<int> group_size_ /* = std::nullopt */,
4344
4422
  std::optional<int> bits_ /* = std::nullopt */,
4345
4423
  const std::string& mode /* = "nvfp4" */,
4424
+ const std::optional<array> global_scale_x /* = std::nullopt */,
4425
+ const std::optional<array> global_scale_w /* = std::nullopt */,
4346
4426
  StreamOrDevice s /* = {} */) {
4347
4427
  auto stream = to_stream(s);
4348
4428
  auto qmode = string_to_quantization_mode(mode, "qqmm");
@@ -4369,7 +4449,8 @@ array qqmm(
4369
4449
  }
4370
4450
 
4371
4451
  // validate inputs
4372
- validate_qqmm_inputs(x, w, scales_w, group_size, bits);
4452
+ validate_qqmm_inputs(
4453
+ x, w, scales_w, group_size, bits, global_scale_x, global_scale_w, qmode);
4373
4454
  // validate and extract shapes
4374
4455
  auto [w_inner_dims, w_outer_dims] =
4375
4456
  extract_qqmm_dims(x, w, scales_w, group_size, bits);
@@ -4380,6 +4461,11 @@ array qqmm(
4380
4461
  if (scales_w.has_value()) {
4381
4462
  inputs.push_back(*scales_w);
4382
4463
  }
4464
+ if (global_scale_x.has_value() && global_scale_w.has_value()) {
4465
+ inputs.push_back(*global_scale_x);
4466
+ inputs.push_back(*global_scale_w);
4467
+ }
4468
+
4383
4469
  auto out_shape = inputs[0].shape();
4384
4470
  out_shape.back() = w_outer_dims;
4385
4471
  auto out = array(
@@ -4515,6 +4601,7 @@ std::vector<array> fp_quantize(
4515
4601
  int group_size,
4516
4602
  int bits,
4517
4603
  QuantizationMode mode,
4604
+ const std::optional<array>& global_scale /* = std::nullopt */,
4518
4605
  Stream s) {
4519
4606
  int expected_gs = mode == QuantizationMode::Nvfp4 ? 16 : 32;
4520
4607
  int expected_bits = mode == QuantizationMode::Mxfp8 ? 8 : 4;
@@ -4532,6 +4619,12 @@ std::vector<array> fp_quantize(
4532
4619
  << bits << ".";
4533
4620
  throw std::invalid_argument(msg.str());
4534
4621
  }
4622
+
4623
+ auto inputs = std::vector<array>{w};
4624
+ if (global_scale.has_value()) {
4625
+ inputs.push_back(global_scale.value());
4626
+ }
4627
+
4535
4628
  auto fallback = [bits = bits, group_size = group_size, s](
4536
4629
  const std::vector<array>& inputs) -> std::vector<array> {
4537
4630
  auto& w = inputs[0];
@@ -4543,8 +4636,13 @@ std::vector<array> fp_quantize(
4543
4636
  divide(max(abs(wq, s), -1, true, s), array(maxval, w.dtype()), s);
4544
4637
  if (group_size == 16) {
4545
4638
  // convert to e4m3
4639
+ auto scale_encode = inputs.size() > 1
4640
+ ? divide(array(448.0f * 6.0f, float32), inputs[1], s)
4641
+ : array(1.0f, float32);
4642
+ scales = multiply(scales, scale_encode, s);
4546
4643
  scales = to_fp8(scales, s);
4547
- wq = divide(wq, from_fp8(scales, w.dtype(), s), s);
4644
+ wq = multiply(
4645
+ divide(wq, from_fp8(scales, w.dtype(), s), s), scale_encode, s);
4548
4646
  } else {
4549
4647
  // convert to e8m0
4550
4648
  auto z = array(0, scales.dtype());
@@ -4600,9 +4698,9 @@ std::vector<array> fp_quantize(
4600
4698
  {uint32, uint8},
4601
4699
  std::make_shared<fast::Quantize>(
4602
4700
  s, fallback, group_size, bits, mode, false),
4603
- {w});
4701
+ inputs);
4604
4702
  }
4605
- return fallback({w});
4703
+ return fallback(inputs);
4606
4704
  }
4607
4705
 
4608
4706
  std::vector<array> quantize(
@@ -4610,6 +4708,7 @@ std::vector<array> quantize(
4610
4708
  std::optional<int> group_size_ /* = std::nullopt */,
4611
4709
  std::optional<int> bits_ /* = std::nullopt */,
4612
4710
  const std::string& mode /* = "affine" */,
4711
+ const std::optional<array>& global_scale /* = std::nullopt */,
4613
4712
  StreamOrDevice s /* = {} */) {
4614
4713
  auto qmode = string_to_quantization_mode(mode, "quantize");
4615
4714
  auto [group_size, bits] =
@@ -4636,11 +4735,17 @@ std::vector<array> quantize(
4636
4735
  << " matrix has shape " << w.shape();
4637
4736
  throw std::invalid_argument(msg.str());
4638
4737
  }
4639
-
4738
+ if (to_stream(s).device == Device::gpu && metal::is_available() &&
4739
+ global_scale.has_value()) {
4740
+ std::ostringstream msg;
4741
+ msg << "[quantize] Global scale is not supported on the Metal backend.";
4742
+ throw std::invalid_argument(msg.str());
4743
+ }
4744
+ validate_global_scale("quantize", qmode, global_scale);
4640
4745
  if (qmode == QuantizationMode::Affine) {
4641
4746
  return affine_quantize(w, group_size, bits, s);
4642
4747
  } else {
4643
- return fp_quantize(w, group_size, bits, qmode, to_stream(s));
4748
+ return fp_quantize(w, group_size, bits, qmode, global_scale, to_stream(s));
4644
4749
  }
4645
4750
  }
4646
4751
 
@@ -4745,6 +4850,7 @@ array fp_dequantize(
4745
4850
  int bits,
4746
4851
  Dtype out_type,
4747
4852
  QuantizationMode mode,
4853
+ const std::optional<array>& global_scale /* = std::nullopt */,
4748
4854
  Stream s) {
4749
4855
  int expected_gs = mode == QuantizationMode::Nvfp4 ? 16 : 32;
4750
4856
  int expected_bits = mode == QuantizationMode::Mxfp8 ? 8 : 4;
@@ -4789,6 +4895,11 @@ array fp_dequantize(
4789
4895
  throw std::invalid_argument(msg.str());
4790
4896
  }
4791
4897
 
4898
+ auto inputs = std::vector<array>{w, scales};
4899
+ if (global_scale.has_value()) {
4900
+ inputs.push_back(global_scale.value());
4901
+ }
4902
+
4792
4903
  auto fallback =
4793
4904
  [wshape = std::move(wshape),
4794
4905
  sshape = std::move(sshape),
@@ -4831,13 +4942,17 @@ array fp_dequantize(
4831
4942
  out = reshape(out, {-1, group_size}, s);
4832
4943
  scales = reshape(scales, {-1, 1}, s);
4833
4944
  if (group_size == 16) {
4834
- scales = from_fp8(scales, out_type, s);
4945
+ array inv_scale_enc = inputs.size() > 2
4946
+ ? divide(inputs[2], array(448.0f * 6.0f, out_type), s)
4947
+ : array(1.0f, out_type);
4948
+ scales = multiply(from_fp8(scales, out_type, s), inv_scale_enc, s);
4835
4949
  } else {
4836
4950
  scales = subtract(astype(scales, out_type, s), array(127, out_type), s);
4837
4951
  scales = power(array(2.0f, out_type), scales, s);
4838
4952
  }
4839
4953
  return {reshape(multiply(out, scales, s), wshape, s)};
4840
4954
  };
4955
+
4841
4956
  if (s.device == Device::gpu) {
4842
4957
  auto out_shape = w.shape();
4843
4958
  out_shape.back() = out_size;
@@ -4846,9 +4961,9 @@ array fp_dequantize(
4846
4961
  out_type,
4847
4962
  std::make_shared<fast::Quantize>(
4848
4963
  s, fallback, group_size, bits, mode, true),
4849
- {w, scales});
4964
+ inputs);
4850
4965
  }
4851
- return fallback({w, scales})[0];
4966
+ return fallback(inputs)[0];
4852
4967
  }
4853
4968
 
4854
4969
  array dequantize(
@@ -4858,6 +4973,7 @@ array dequantize(
4858
4973
  std::optional<int> group_size_ /* = std::nullopt */,
4859
4974
  std::optional<int> bits_ /* = std::nullopt */,
4860
4975
  const std::string& mode /* = "affine" */,
4976
+ const std::optional<array>& global_scale /* = std::nullopt */,
4861
4977
  std::optional<Dtype> dtype /* = std::nullopt */,
4862
4978
  StreamOrDevice s /* = {} */) {
4863
4979
  auto [out_type, qmode] =
@@ -4884,6 +5000,14 @@ array dequantize(
4884
5000
  << "but it has only " << w.ndim() << ".";
4885
5001
  throw std::invalid_argument(msg.str());
4886
5002
  }
5003
+ if (global_scale.has_value()) {
5004
+ if (to_stream(s).device == Device::gpu && metal::is_available()) {
5005
+ std::ostringstream msg;
5006
+ msg << "[dequantize] Global scale is not supported on the Metal backend.";
5007
+ throw std::invalid_argument(msg.str());
5008
+ }
5009
+ }
5010
+ validate_global_scale("dequantize", qmode, global_scale);
4887
5011
 
4888
5012
  if (qmode == QuantizationMode::Affine) {
4889
5013
  return astype(
@@ -4892,7 +5016,14 @@ array dequantize(
4892
5016
  s);
4893
5017
  } else {
4894
5018
  return fp_dequantize(
4895
- w, scales, group_size, bits, out_type, qmode, to_stream(s));
5019
+ w,
5020
+ scales,
5021
+ group_size,
5022
+ bits,
5023
+ out_type,
5024
+ qmode,
5025
+ global_scale,
5026
+ to_stream(s));
4896
5027
  }
4897
5028
  }
4898
5029
 
@@ -6091,4 +6222,4 @@ array contiguous(
6091
6222
  {a});
6092
6223
  }
6093
6224
 
6094
- } // namespace mlx::core
6225
+ } // namespace mlx::core
@@ -666,6 +666,12 @@ min(const array& a,
666
666
  MLX_API array
667
667
  min(const array& a, int axis, bool keepdims = false, StreamOrDevice s = {});
668
668
 
669
+ /** Returns the Hanning window of size M. */
670
+ MLX_API array hanning(int M, StreamOrDevice s = {});
671
+
672
+ /** Returns the Hamming window of size M. */
673
+ MLX_API array hamming(int M, StreamOrDevice s = {});
674
+
669
675
  /** Returns the index of the minimum value in the array. */
670
676
  MLX_API array argmin(const array& a, bool keepdims, StreamOrDevice s = {});
671
677
  inline array argmin(const array& a, StreamOrDevice s = {}) {
@@ -1391,6 +1397,7 @@ MLX_API std::vector<array> quantize(
1391
1397
  std::optional<int> group_size = std::nullopt,
1392
1398
  std::optional<int> bits = std::nullopt,
1393
1399
  const std::string& mode = "affine",
1400
+ const std::optional<array>& global_scale = std::nullopt,
1394
1401
  StreamOrDevice s = {});
1395
1402
 
1396
1403
  /** Dequantize a matrix produced by quantize() */
@@ -1401,17 +1408,20 @@ MLX_API array dequantize(
1401
1408
  std::optional<int> group_size = std::nullopt,
1402
1409
  std::optional<int> bits = std::nullopt,
1403
1410
  const std::string& mode = "affine",
1411
+ const std::optional<array>& global_scale = std::nullopt,
1404
1412
  std::optional<Dtype> dtype = std::nullopt,
1405
1413
  StreamOrDevice s = {});
1406
1414
 
1407
1415
  MLX_API array qqmm(
1408
1416
  array x, // input activations
1409
1417
  array w, // maybe quantized weights
1410
- std::optional<array> w_scales = std::nullopt, // optional scales if w is
1411
- // quantized
1418
+ const std::optional<array> w_scales = std::nullopt, // optional scales if w
1419
+ // is quantized
1412
1420
  std::optional<int> group_size = std::nullopt,
1413
1421
  std::optional<int> bits = std::nullopt,
1414
1422
  const std::string& mode = "nvfp4",
1423
+ const std::optional<array> global_scale_x = std::nullopt,
1424
+ const std::optional<array> global_scale_w = std::nullopt,
1415
1425
  StreamOrDevice s = {});
1416
1426
 
1417
1427
  /** Convert an E4M3 float8 to the given floating point dtype. */
@@ -3424,6 +3424,7 @@ std::vector<array> QuantizedMatmul::vjp(
3424
3424
  group_size_,
3425
3425
  bits_,
3426
3426
  quantization_mode_to_string(mode_),
3427
+ {}, // placeholder for amax
3427
3428
  std::nullopt,
3428
3429
  stream());
3429
3430
  wq = unflatten(wq, -1, {-1, group_size_}, stream());
@@ -3484,14 +3485,14 @@ std::vector<Shape> QQMatmul::output_shapes(const std::vector<array>& inputs) {
3484
3485
  }
3485
3486
 
3486
3487
  std::vector<array> QQMatmul::vjp(
3487
- const std::vector<array>& primals, // non quantized x, non quantized w
3488
+ const std::vector<array>& primals, // non quantized x, non quantized w, if
3489
+ // nvfp4 global_scale_x, global_scale_w
3488
3490
  const std::vector<array>& cotangents, // non quantized upstream grads
3489
3491
  const std::vector<int>& argnums,
3490
3492
  const std::vector<array>&) {
3491
- if (primals.size() != 2) {
3492
- throw std::runtime_error(
3493
- "[QQMatmul::vjp] Expected exactly 2 non-quantized primal inputs (x, w).");
3494
- }
3493
+ bool is_nvfp4 = mode_ == QuantizationMode::Nvfp4;
3494
+ assert(primals.size() == 2 || (is_nvfp4 && primals.size() == 4));
3495
+
3495
3496
  std::vector<array> vjps;
3496
3497
  auto& cotan = cotangents[0];
3497
3498
  auto& s = stream();
@@ -3499,6 +3500,15 @@ std::vector<array> QQMatmul::vjp(
3499
3500
  // primal[0] -- non quantized activations (M, K)
3500
3501
  // cotan -- non quantized grads (M, N)
3501
3502
  auto qmode = quantization_mode_to_string(mode_);
3503
+ std::optional<array> cotan_amax = (primals.size() == 4)
3504
+ ? std::make_optional(astype(max(abs(cotan, s), s), float32, s))
3505
+ : std::nullopt;
3506
+
3507
+ auto get_primal_scale = [&](int idx) {
3508
+ return (primals.size() == 4) ? std::make_optional(primals[idx])
3509
+ : std::nullopt;
3510
+ };
3511
+
3502
3512
  for (auto arg : argnums) {
3503
3513
  if (arg == 0) { // gradient wrt to x
3504
3514
  // We transpose weights -> quantize along N
@@ -3509,6 +3519,8 @@ std::vector<array> QQMatmul::vjp(
3509
3519
  group_size_,
3510
3520
  bits_,
3511
3521
  qmode,
3522
+ cotan_amax,
3523
+ get_primal_scale(3), // global_scale_w (for w.T)
3512
3524
  s));
3513
3525
  } else if (arg == 1) { // gradient wrt to weights
3514
3526
  vjps.push_back(qqmm(
@@ -3518,7 +3530,11 @@ std::vector<array> QQMatmul::vjp(
3518
3530
  group_size_,
3519
3531
  bits_,
3520
3532
  qmode,
3533
+ cotan_amax,
3534
+ get_primal_scale(2), // global_scale_x (for x.T)
3521
3535
  s));
3536
+ } else {
3537
+ vjps.push_back(zeros_like(primals[arg], s));
3522
3538
  }
3523
3539
  }
3524
3540
  return vjps;
@@ -3643,6 +3659,7 @@ std::vector<array> GatherQMM::vjp(
3643
3659
  bits_,
3644
3660
  quantization_mode_to_string(mode_),
3645
3661
  std::nullopt,
3662
+ std::nullopt, // amax placeholder
3646
3663
  stream()),
3647
3664
  -1,
3648
3665
  {-1, group_size_},
@@ -26,6 +26,10 @@ Stream get_stream(int index) {
26
26
  return scheduler::scheduler().get_stream(index);
27
27
  }
28
28
 
29
+ std::vector<Stream> get_streams() {
30
+ return scheduler::scheduler().get_streams();
31
+ }
32
+
29
33
  Stream new_stream(Device d) {
30
34
  if (!gpu::is_available() && d == Device::gpu) {
31
35
  throw std::invalid_argument(
@@ -99,6 +99,9 @@ class Scheduler {
99
99
  Stream get_stream(int index) const {
100
100
  return streams_.at(index);
101
101
  }
102
+ std::vector<Stream> get_streams() const {
103
+ return streams_;
104
+ }
102
105
 
103
106
  void set_default_stream(const Stream& s) {
104
107
  default_streams_.at(s.device.type) = s;