RubyGems - mlx - Versions diffs - 0.30.7.2 → 0.30.7.6 - Mend

mlx 0.30.7.2 → 0.30.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (605) hide show

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/fp_quantize.cu RENAMED Viewed

@@ -11,6 +11,11 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#include <cuda_fp4.h>
+#include <cuda_fp8.h>
+constexpr float F8E4M3_MAX = 448.0f;
+constexpr float F4E2M1_MAX = 6.0f;
 namespace mlx::core {
 namespace cu {
@@ -29,7 +34,16 @@ struct Dequantize {
 namespace cg = cooperative_groups;
 template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
-__global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
+__global__ void fp_quantize_dequantize(
+    T* w,
+    T* out,
+    size_t size,
+    float* global_scale = nullptr) {
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
+  const float inv_scale_enc = use_global_scale ? 1.0f / scale_enc : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
   uint32_t rbits = 0; // reserved bits for future use
@@ -48,26 +62,28 @@ __global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
   }
   auto w_tile = load_vector<group_size, T>(w, thread_idx);
-  float scale = 0.0f;
+  float scale_dec_b = 0.0f;
   Tx2 amax_2x = Tx2{0.0f, 0.0f};
 #pragma unroll
   for (int i = 0; i < group_size; i += 2) {
     auto pair = Tx2{w_tile[i], w_tile[i + 1]};
-    abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
   }
-  scale = static_cast<float>(
+  scale_dec_b = static_cast<float>(
       max(fabsf(static_cast<float>(amax_2x.x)),
           fabsf(static_cast<float>(amax_2x.y))));
-  scale /= bits == 4 ? 6.0f : 448.0f;
+  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+  scale_dec_b *= scale_enc;
   // Convert to mx scale or nv scale
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto s = ScaleType(scale);
-  scale = float(s);
+  auto s = ScaleType(scale_dec_b);
+  float scale_enc_b = scale_enc / float(s);
+  float scale_dec = float(s) * inv_scale_enc;
   AlignedVector<T, group_size> w_hat;
 #pragma unroll
@@ -76,24 +92,36 @@ __global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
     float4 dq;
     if constexpr (bits == 8) {
       uint32_t quantized_val =
-          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       dq = dequant_fp8(quantized_val);
     } else {
       uint16_t quantized_val =
-          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       dq = dequant_fp4(quantized_val);
     }
-    w_hat[i * 4] = static_cast<T>(dq.x * scale);
-    w_hat[i * 4 + 1] = static_cast<T>(dq.y * scale);
-    w_hat[i * 4 + 2] = static_cast<T>(dq.z * scale);
-    w_hat[i * 4 + 3] = static_cast<T>(dq.w * scale);
+    w_hat[i * 4] = static_cast<T>(dq.x * scale_dec);
+    w_hat[i * 4 + 1] = static_cast<T>(dq.y * scale_dec);
+    w_hat[i * 4 + 2] = static_cast<T>(dq.z * scale_dec);
+    w_hat[i * 4 + 3] = static_cast<T>(dq.w * scale_dec);
   }
   store_vector<group_size>(out, thread_idx, w_hat);
 }
 template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
-__global__ void
-fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
+__global__ void fp_quantize_rowwise(
+    T* w,
+    uint8_t* out,
+    uint8_t* scales,
+    size_t size,
+    float* global_scale = nullptr) {
+  // NVFP4 conversion:
+  // Global encode scale: S_enc = (448 × 6) / *global_scale
+  // Per-block decode scale: S_dec_b = (block_amax / 6) × S_enc
+  //   → stored as FP8 E4M3
+  // Per-block encode scale: S_enc_b = S_enc / S_dec_b
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
   uint32_t rbits = 0; // reserved bits for future use
@@ -112,27 +140,28 @@ fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
   }
   auto w_tile = load_vector<group_size, T>(w, thread_idx);
-  float scale = 0.0f;
+  float scale_dec_b = 0.0f;
   Tx2 amax_2x = Tx2{0.0f, 0.0f};
 #pragma unroll
   for (int i = 0; i < group_size; i += 2) {
     auto pair = Tx2{w_tile[i], w_tile[i + 1]};
-    abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
   }
-  scale = static_cast<float>(
+  scale_dec_b = static_cast<float>(
       max(fabsf(static_cast<float>(amax_2x.x)),
           fabsf(static_cast<float>(amax_2x.y))));
-  scale /= bits == 4 ? 6.0f : 448.0f;
+  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+  scale_dec_b *= scale_enc;
   // Convert to mx scale or nv scale
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto s = ScaleType(scale);
+  auto s = ScaleType(scale_dec_b);
   uint8_t q_scale = s.__x;
-  scale = float(s);
+  float scale_enc_b = scale_enc / float(s);
   scales[thread_idx] = q_scale;
   constexpr int elem_per_byte = bits == 8 ? 1 : 2;
@@ -143,11 +172,11 @@ fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
     Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&w_tile[i * 4]);
     if constexpr (bits == 8) {
       uint32_t quantized_val =
-          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       *reinterpret_cast<uint32_t*>(&quantized[i * 4]) = quantized_val;
     } else {
       uint16_t quantized_val =
-          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       *reinterpret_cast<uint16_t*>(&quantized[i * 2]) = quantized_val;
     }
   }
@@ -161,11 +190,15 @@ __global__ void fp_quantize_columnwise(
     uint8_t* scales,
     size_t size,
     int M,
-    int K) {
+    int K,
+    float* global_scale = nullptr) {
   // Input: [M, K] with strides [1, M] (M-major)
   // Quantized output: [M, K/elem_per_byte] row-major (K-major)
   // Scales: [M, K/group_size] row-major (K-major)
   // Quantize along K (last dimension, groups of group_size elements)
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
@@ -215,16 +248,18 @@ __global__ void fp_quantize_columnwise(
 #pragma unroll
     for (int r = 0; r < group_size; r += 2) {
       auto pair = Tx2{thread_data[r], thread_data[r + 1]};
-      abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+      absmax_x2<Tx2>(amax_2x, amax_2x, pair);
     }
-    float scale =
+    float scale_dec_b =
         max(fabsf(static_cast<float>(amax_2x.x)),
             fabsf(static_cast<float>(amax_2x.y)));
-    scale /= (bits == 4) ? 6.0f : 448.0f;
+    scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+    scale_dec_b *= scale_enc;
+    // Convert to mx scale or nv scale
     using ScaleType =
         std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-    auto s = ScaleType(scale);
-    scale = float(s);
+    auto s = ScaleType(scale_dec_b);
+    float scale_enc_b = scale_enc / float(s);
     scales_smem[tidx][tidy] = s.__x;
     int shared_idx = tidx * padded_local_cols + tidy * bytes_per_group;
@@ -234,12 +269,12 @@ __global__ void fp_quantize_columnwise(
       Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&thread_data[j * 4]);
       if constexpr (bits == 8) {
         uint32_t quantized_val =
-            scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+            scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
         *reinterpret_cast<uint32_t*>(&quantized_smem[shared_idx + j * 4]) =
             quantized_val;
       } else {
         uint16_t quantized_val =
-            scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+            scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
         *reinterpret_cast<uint16_t*>(&quantized_smem[shared_idx + j * 2]) =
             quantized_val;
       }
@@ -282,8 +317,12 @@ __global__ void fp_quantize_columnwise(
 }
 template <typename T, int group_size, int bits, bool use_mx_scale>
-__global__ void
-fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
+__global__ void fp_dequantize(
+    const uint8_t* w,
+    const uint8_t* scales,
+    T* out,
+    size_t size,
+    float* global_scale = nullptr) {
   auto block_size = cg::this_thread_block().dim_threads();
   auto block_idx = cg::this_thread_block().group_index();
   auto idx_in_block = cg::this_thread_block().thread_index();
@@ -294,6 +333,10 @@ fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
   auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
   constexpr int pack_factor = bits == 8 ? 1 : 2;
+  const bool use_global_scale = global_scale != nullptr;
+  const float inv_scale_enc = use_mx_scale
+      ? 1.0f
+      : (use_global_scale ? (*global_scale) / (F8E4M3_MAX * F4E2M1_MAX) : 1.0f);
   size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t oindex = offset * pack_factor;
@@ -304,7 +347,7 @@ fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
   size_t gindex = oindex / group_size;
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto scale = float(((ScaleType*)(scales))[gindex]);
+  auto scale = float(((ScaleType*)(scales))[gindex]) * inv_scale_enc;
   out += oindex;
@@ -346,9 +389,13 @@ void fp_quantize_dequantize(
     array& what,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   enc.set_input_array(w);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(what);
   dispatch_float_types(w.dtype(), "fp_quantize_dequantize", [&](auto type_tag) {
     using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -370,7 +417,9 @@ void fp_quantize_dequantize(
           0,
           gpu_ptr<T>(w),
           gpu_ptr<T>(what),
-          w.size());
+          w.size(),
+          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                   : nullptr);
     }
   });
 }
@@ -381,9 +430,13 @@ void fp_quantize(
     array& scales,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   enc.set_input_array(w);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(wq);
   enc.set_output_array(scales);
   if (w.strides().back() != 1) {
@@ -410,7 +463,9 @@ void fp_quantize(
             gpu_ptr<uint8_t>(scales),
             w.size(),
             M,
-            K);
+            K,
+            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                     : nullptr);
       } else {
         throw std::runtime_error(
             "[Quantize::eval_gpu] Can not quantize input with type float64.");
@@ -438,7 +493,9 @@ void fp_quantize(
             gpu_ptr<T>(w),
             gpu_ptr<uint8_t>(wq),
             gpu_ptr<uint8_t>(scales),
-            w.size());
+            w.size(),
+            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                     : nullptr);
       } else {
         throw std::runtime_error(
             "[Quantize::eval_gpu] Can not quantize input with type float64.");
@@ -453,6 +510,7 @@ void fp_dequantize(
     array& w,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   constexpr int uint8_per_uint32 = 4;
@@ -465,6 +523,9 @@ void fp_dequantize(
   enc.set_input_array(wq);
   enc.set_input_array(scales);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(w);
   dispatch_float_types(w.dtype(), "fp_dequantize", [&](auto type_tag) {
     using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -485,7 +546,9 @@ void fp_dequantize(
           gpu_ptr<uint8_t>(wq),
           gpu_ptr<uint8_t>(scales),
           gpu_ptr<T>(w),
-          w.size());
+          w.size(),
+          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                   : nullptr);
     } else {
       throw std::runtime_error(
           "[Quantize::eval_gpu] Can not dequantize to output with type float64.");

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/no_qqmm_impl.cpp RENAMED Viewed

@@ -17,9 +17,8 @@ void qqmm_impl(
     const array&,
     const array&,
     const array&,
-    Dtype,
     QuantizationMode,
-    float) {
+    const GemmScalars&) {
   throw std::runtime_error(
       "[QQMatmul::eval_gpu] QQMM is only supported with CUDA 12.8 or higher.");
 }

data/submodules/mlx/mlx/backend/cuda/quantized/qqmm.cpp ADDED Viewed

@@ -0,0 +1,193 @@
+// Copyright © 2025 Apple Inc.
+#include "mlx/backend/cuda/device.h"
+#include "mlx/backend/cuda/quantized/qmv.h"
+#include "mlx/backend/cuda/quantized/qqmm_impl.h"
+#include "mlx/backend/cuda/quantized/qqmm_utils.h"
+#include "mlx/backend/cuda/quantized/quantized.h"
+#include "mlx/backend/cuda/quantized/quantized_utils.h"
+#include "mlx/primitives.h"
+#include <nvtx3/nvtx3.hpp>
+namespace mlx::core {
+namespace {
+std::tuple<array, array> quantize_input(
+    const array& input,
+    cu::CommandEncoder& encoder,
+    const Stream& s,
+    QuantizationMode mode,
+    int bits,
+    int group_size,
+    std::optional<array> global_scale = std::nullopt) {
+  const array x = ensure_contiguous(input, encoder, s);
+  // Compute output shapes
+  auto xq_shape = x.shape();
+  xq_shape.back() = x.shape(-1) * bits / 32;
+  const int64_t scales_inner = x.shape(-1) / group_size;
+  auto [pad_outer, pad_inner] =
+      get_padded_scale_dims(x.shape(-2), scales_inner);
+  auto sshape = x.shape();
+  sshape[x.ndim() - 2] = pad_outer;
+  sshape[x.ndim() - 1] = pad_inner;
+  sshape.back() = scales_inner;
+  // Allocate outputs
+  const int64_t xq_bytes = x.size() * bits / 8;
+  const int64_t batch = x.size() / (x.shape(-2) * x.shape(-1));
+  const int64_t scales_bytes = batch * (pad_outer * pad_inner);
+  array x_q(cu::malloc_async(xq_bytes, encoder), std::move(xq_shape), uint32);
+  array scales_x(
+      cu::malloc_async(scales_bytes, encoder), std::move(sshape), uint8);
+  encoder.add_temporary(x_q);
+  encoder.add_temporary(scales_x);
+  // global_scale is not nullopt only for NVFP4
+  fp_quantize(x, x_q, scales_x, group_size, bits, global_scale, encoder, s);
+  return {std::move(x_q), std::move(scales_x)};
+}
+GemmScalars create_nvfp4_scalars(
+    const array& global_scale_x,
+    const array& global_scale_w,
+    cu::CommandEncoder& encoder) {
+  // NVFP4 requires alpha/beta as device pointers
+  // alpha = amax_x * amax_w / (448 * 6)^2
+  // beta = 0
+  array alpha(cu::malloc_async(sizeof(float), encoder), {}, float32);
+  array beta(cu::malloc_async(sizeof(float), encoder), {}, float32);
+  compute_qqmm_pointers(alpha, beta, global_scale_x, global_scale_w, encoder);
+  encoder.add_temporary(alpha);
+  encoder.add_temporary(beta);
+  return {alpha, beta};
+}
+} // namespace
+void QQMatmul::eval_gpu(const std::vector<array>& inputs, array& out) {
+  nvtx3::scoped_range r("QQMatmul::eval_gpu");
+  auto& s = stream();
+  auto& encoder = cu::get_command_encoder(s);
+  auto& device = encoder.device();
+  bool w_quantized = (inputs[1].dtype() == uint32);
+  int base_size = w_quantized ? 3 : 2;
+  assert(
+      inputs.size() == base_size ||
+      (mode_ == QuantizationMode::Nvfp4 && inputs.size() == base_size + 2));
+  if (w_quantized && inputs[0].shape(-2) == 1) {
+    out.set_data(cu::malloc_async(out.nbytes(), encoder));
+    // For nvfp4, get global scale for x from inputs if present
+    bool has_global_scale =
+        mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
+    std::optional<array> global_scale = std::nullopt;
+    if (has_global_scale) {
+      global_scale = inputs[inputs.size() - 2];
+    }
+    bool donate_x = inputs[0].is_donatable();
+    array x = ensure_row_contiguous(inputs[0], encoder, s);
+    // If x is a copy it should be donatable
+    donate_x |= x.is_donatable();
+    auto xhat = donate_x
+        ? x
+        : array(cu::malloc_async(x.nbytes(), encoder), x.shape(), x.dtype());
+    if (!donate_x) {
+      encoder.add_temporary(xhat);
+    }
+    fp_quantize_dequantize(
+        x, xhat, group_size_, bits_, global_scale, encoder, s);
+    // Make sure the last two dims of w and s are contiguous
+    array w = ensure_row_contiguous_matrix(inputs[1], encoder, s);
+    array scales = ensure_row_contiguous_matrix(inputs[2], encoder, s);
+    bool non_batched = w.ndim() == 2;
+    int K = x.shape(-1);
+    int M = non_batched ? x.size() / K : x.shape(-2);
+    int N = out.shape(-1);
+    fp_qmv(w, scales, xhat, out, bits_, group_size_, M, N, K, encoder);
+    return;
+  }
+  auto cc = device.compute_capability_major() * 100 +
+      device.compute_capability_minor() * 10;
+  if (cc < 1000) {
+    throw std::runtime_error(
+        "[QQMatmul::eval_gpu] QQMM is only supported on GPUs with compute capability 10.0 or higher.");
+  }
+  // - 2 inputs: x, w (non-quantized w)
+  // - 3 inputs: x, w, scales_w (quantized w)
+  // For nvfp4, global scales are optional but must be both present or both
+  // absent. If present, they add 2 more inputs (global_scale_x,
+  // global_scale_w).
+  bool has_global_scales =
+      mode_ == QuantizationMode::Nvfp4 && inputs.size() > base_size;
+  // For nvfp4, get global scales from inputs if present
+  std::optional<array> global_scale_x = std::nullopt;
+  std::optional<array> global_scale_w = std::nullopt;
+  if (has_global_scales) {
+    global_scale_x = inputs[inputs.size() - 2];
+    global_scale_w = inputs[inputs.size() - 1];
+  }
+  // Quantize inputs (or use pre-quantized)
+  auto [x_q, scale_x_pre] = quantize_input(
+      inputs[0], encoder, s, mode_, bits_, group_size_, global_scale_x);
+  auto [w_q, scale_w_pre] = !w_quantized
+      ? quantize_input(
+            inputs[1], encoder, s, mode_, bits_, group_size_, global_scale_w)
+      : std::make_tuple(
+            ensure_contiguous(inputs[1], encoder, s),
+            ensure_contiguous(inputs[2], encoder, s));
+  out.set_data(cu::malloc_async(out.nbytes(), encoder));
+  int M = x_q.shape(-2);
+  int N = w_q.shape(-2); // transposed
+  int K = x_q.shape(-1) * (32 / bits_);
+  bool x_transposed = false;
+  bool w_transposed = true; // always transposed
+  int64_t lda = K;
+  int64_t ldb = K;
+  // Repack scales to tiled layout for tensor cores
+  array scale_x = pad_and_swizzle_scales(scale_x_pre, encoder, s);
+  array scale_w = pad_and_swizzle_scales(scale_w_pre, encoder, s);
+  GemmScalars scalars;
+  if (has_global_scales) {
+    scalars = create_nvfp4_scalars(*global_scale_x, *global_scale_w, encoder);
+  }
+  qqmm_impl(
+      encoder,
+      M,
+      N,
+      K,
+      x_transposed,
+      lda,
+      w_transposed,
+      ldb,
+      out,
+      x_q,
+      w_q,
+      scale_x,
+      scale_w,
+      mode_,
+      scalars);
+}
+} // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.cpp RENAMED Viewed

@@ -19,15 +19,10 @@ void qqmm_impl(
     const array& b,
     const array& a_scale,
     const array& b_scale,
-    Dtype out_dtype,
     QuantizationMode mode,
-    float alpha) {
-  // Invoke CublasQQMM
+    const GemmScalars& scalars) {
   std::string qmode = quantization_mode_to_string(mode);
-  // Currently only supports non-batched QQMM operations
-  // that covers all use cases for training, we will just collapse (batch,
-  // seq_len) into (tokens)
   CublasQQMM qqmm(
       encoder.device(),
       a_transposed,
@@ -41,10 +36,22 @@ void qqmm_impl(
       1, // batch_count
       0, // a_batch_stride
       0, // b_batch_stride
-      out_dtype,
+      out.dtype(),
       qmode);
-  qqmm.run(encoder, out, a, b, a_scale, b_scale, alpha);
+  if (scalars.has_values()) {
+    qqmm.run(
+        encoder,
+        out,
+        a,
+        b,
+        a_scale,
+        b_scale,
+        *scalars.alpha_device,
+        *scalars.beta_device);
+  } else {
+    qqmm.run(encoder, out, a, b, a_scale, b_scale);
+  }
 }
 } // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_impl.h RENAMED Viewed

@@ -1,10 +1,22 @@
-// Copyright © 2026 Apple Inc.
+// Copyright © 2025 Apple Inc.
 #pragma once
 #include "mlx/backend/cuda/device.h"
 #include "mlx/primitives.h"
+#include <optional>
 namespace mlx::core {
+struct GemmScalars {
+  std::optional<array> alpha_device;
+  std::optional<array> beta_device;
+  bool has_values() const {
+    return alpha_device.has_value();
+  }
+};
 void qqmm_impl(
     cu::CommandEncoder& encoder,
     int M,
@@ -19,8 +31,7 @@ void qqmm_impl(
     const array& b,
     const array& a_scale,
     const array& b_scale,
-    Dtype out_dtype,
     QuantizationMode mode,
-    float alpha = 1.0f);
+    const GemmScalars& scalars = {});
 } // namespace mlx::core

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/qqmm_utils.cu RENAMED Viewed

@@ -70,6 +70,21 @@ inline std::tuple<dim3, dim3> get_swizzle_launch_args(
 namespace cu {
+constexpr float F8E4M3_MAX = 448.0f;
+constexpr float F4E2M1_MAX = 6.0f;
+__global__ void compute_qqmm_pointers(
+    float* alpha_out,
+    float* beta_out,
+    const float* tensor_amax_x,
+    const float* tensor_amax_w) {
+  // Compute alpha = tensor_amax_x * tensor_amax_w / (448 * 6)^2
+  constexpr float inv_scale_sq =
+      1.0f / (F8E4M3_MAX * F4E2M1_MAX * F8E4M3_MAX * F4E2M1_MAX);
+  *alpha_out = (*tensor_amax_x) * (*tensor_amax_w) * inv_scale_sq;
+  *beta_out = 0.0f;
+}
 __global__ void swizzle_scales(
     const uint8_t* scales_linear,
     uint8_t* scales_swizzled,
@@ -224,4 +239,25 @@ void swizzle_scales(
       output_cols);
 }
+void compute_qqmm_pointers(
+    array& alpha_out,
+    array& beta_out,
+    const array& tensor_amax_x,
+    const array& tensor_amax_w,
+    cu::CommandEncoder& enc) {
+  enc.set_input_array(tensor_amax_x);
+  enc.set_input_array(tensor_amax_w);
+  enc.set_output_array(alpha_out);
+  enc.set_output_array(beta_out);
+  enc.add_kernel_node(
+      cu::compute_qqmm_pointers,
+      dim3(1),
+      dim3(1),
+      0,
+      gpu_ptr<void>(alpha_out),
+      gpu_ptr<void>(beta_out),
+      gpu_ptr<void>(tensor_amax_x),
+      gpu_ptr<void>(tensor_amax_w));
+}
 } // namespace mlx::core