RubyGems - mlx - Versions diffs - 0.30.7.3 → 0.30.7.6 - Mend

mlx 0.30.7.3 → 0.30.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (590) hide show

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/cublas_qqmm.cpp RENAMED Viewed

@@ -13,39 +13,26 @@ namespace mlx::core {
 namespace {
-// Currently cublas supports only mxfp8 and nvfp4
-// quantization modes for block scaled quantization
-cudaDataType_t qmode_to_cublas_scale_dtype(std::string mode) {
-  if (mode == "mxfp8") {
-    return CUDA_R_8F_UE8M0;
-  } else if (mode == "nvfp4") {
-    return CUDA_R_8F_UE4M3;
-  } else {
-    throw std::runtime_error(
-        fmt::format("Unsupported quantization mode in CublasQQMM: {}.", mode));
-  }
-}
-cudaDataType_t qmode_to_cublas_dtype(std::string mode) {
-  if (mode == "mxfp8") {
-    return CUDA_R_8F_E4M3;
-  } else if (mode == "nvfp4") {
-    return CUDA_R_4F_E2M1;
-  } else {
-    throw std::runtime_error(
-        fmt::format("Unsupported quantization mode in CublasQQMM: {}.", mode));
-  }
-}
+struct QuantModeConfig {
+  cudaDataType_t data_type;
+  cudaDataType_t scale_dtype;
+  cublasLtMatmulMatrixScale_t scale_mode;
+};
-cublasLtMatmulMatrixScale_t qmode_to_cublas_scale_mode(std::string mode) {
+QuantModeConfig get_quant_mode_config(const std::string& mode) {
   if (mode == "mxfp8") {
-    return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
+    return {
+        CUDA_R_8F_E4M3,
+        CUDA_R_8F_UE8M0,
+        CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0};
   } else if (mode == "nvfp4") {
-    return CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3;
-  } else {
-    throw std::runtime_error(
-        fmt::format("Unsupported quantization mode in CublasQQMM: {}.", mode));
+    return {
+        CUDA_R_4F_E2M1,
+        CUDA_R_8F_UE4M3,
+        CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3};
   }
+  throw std::runtime_error(
+      fmt::format("Unsupported quantization mode in CublasQQMM: {}.", mode));
 }
 } // namespace
@@ -64,21 +51,21 @@ CublasQQMM::CublasQQMM(
     int64_t a_batch_stride,
     int64_t b_batch_stride,
     Dtype out_dtype,
-    std::string qmode) {
+    const std::string& qmode) {
+  auto config = get_quant_mode_config(qmode);
   // The compute type must be CUBLAS_COMPUTE_32F.
   // The scale type must be CUDA_R_32F.
   cudaDataType_t scale_type = CUDA_R_32F;
   cublasComputeType_t gemm_compute_type = CUBLAS_COMPUTE_32F;
   cudaDataType_t output_type =
       cublas_utils::dtype_to_cublas_type(out_dtype, "CublasQQMM");
-  cudaDataType_t data_type = qmode_to_cublas_dtype(qmode);
-  quantization_mode_ = std::string(qmode);
   init_base(
       device,
       scale_type,
       gemm_compute_type,
-      data_type,
+      config.data_type,
       output_type,
       a_transposed,
       a_rows,
@@ -92,8 +79,8 @@ CublasQQMM::CublasQQMM(
       a_batch_stride,
       b_batch_stride);
-  a_scale_mode_ = qmode_to_cublas_scale_mode(qmode);
-  b_scale_mode_ = qmode_to_cublas_scale_mode(qmode);
+  a_scale_mode_ = config.scale_mode;
+  b_scale_mode_ = config.scale_mode;
   CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
       matmul_desc_,
@@ -123,7 +110,7 @@ CublasQQMM::CublasQQMM(
     int64_t b_batch_stride,
     int64_t c_batch_stride,
     Dtype out_dtype,
-    std::string qmode)
+    const std::string& qmode)
     : CublasQQMM(
           device,
           a_transposed,
@@ -158,11 +145,14 @@ void CublasQQMM::run(
     const array& b,
     const array& a_scale,
     const array& b_scale,
-    float alpha) {
+    const array& alpha,
+    const array& beta) {
   encoder.set_input_array(a);
   encoder.set_input_array(b);
   encoder.set_input_array(a_scale);
   encoder.set_input_array(b_scale);
+  encoder.set_input_array(alpha);
+  encoder.set_input_array(beta);
   encoder.set_output_array(out);
   execute(
@@ -173,19 +163,37 @@ void CublasQQMM::run(
       gpu_ptr<void>(a_scale),
       gpu_ptr<void>(b_scale),
       nullptr,
-      alpha);
+      gpu_ptr<void>(alpha),
+      gpu_ptr<void>(beta));
 }
-void CublasQQMM::execute(
+void CublasQQMM::run(
+    cu::CommandEncoder& encoder,
+    array& out,
+    const array& a,
+    const array& b,
+    const array& a_scale,
+    const array& b_scale) {
+  encoder.set_input_array(a);
+  encoder.set_input_array(b);
+  encoder.set_input_array(a_scale);
+  encoder.set_input_array(b_scale);
+  encoder.set_output_array(out);
+  execute(
+      encoder,
+      gpu_ptr<void>(out),
+      gpu_ptr<void>(a),
+      gpu_ptr<void>(b),
+      gpu_ptr<void>(a_scale),
+      gpu_ptr<void>(b_scale),
+      nullptr);
+}
+void CublasQQMM::set_scales_ptrs(
     cu::CommandEncoder& encoder,
-    void* out,
-    const void* a,
-    const void* b,
     const void* a_scale,
-    const void* b_scale,
-    const void* c,
-    float alpha /* = 1 */,
-    float beta /* = 0 */) {
+    const void* b_scale) {
   CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
       matmul_desc_,
       CUBLASLT_MATMUL_DESC_A_SCALE_POINTER,
@@ -196,6 +204,49 @@ void CublasQQMM::execute(
       CUBLASLT_MATMUL_DESC_B_SCALE_POINTER,
       &a_scale,
       sizeof(a_scale)));
+}
+void CublasQQMM::execute(
+    cu::CommandEncoder& encoder,
+    void* out,
+    const void* a,
+    const void* b,
+    const void* a_scale,
+    const void* b_scale,
+    const void* c,
+    const void* alpha,
+    const void* beta) {
+  set_scales_ptrs(encoder, a_scale, b_scale);
+  // alpha and beta should both be device pointers for nvfp4;
+  // by default cublas uses host pointers
+  // https://docs.nvidia.com/cuda/cublas/#cublasltpointermode-t
+  cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_DEVICE;
+  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
+      matmul_desc_,
+      CUBLASLT_MATMUL_DESC_POINTER_MODE,
+      &pointer_mode,
+      sizeof(pointer_mode)));
+  execute_matmul(encoder, out, a, b, c, alpha, beta);
+}
+void CublasQQMM::execute(
+    cu::CommandEncoder& encoder,
+    void* out,
+    const void* a,
+    const void* b,
+    const void* a_scale,
+    const void* b_scale,
+    const void* c,
+    const float alpha /* = 1 */,
+    const float beta /* = 0 */) {
+  set_scales_ptrs(encoder, a_scale, b_scale);
+  // alpha and beta should both be host pointers
+  cublasLtPointerMode_t pointer_mode = CUBLASLT_POINTER_MODE_HOST;
+  CHECK_CUBLAS_ERROR(cublasLtMatmulDescSetAttribute(
+      matmul_desc_,
+      CUBLASLT_MATMUL_DESC_POINTER_MODE,
+      &pointer_mode,
+      sizeof(pointer_mode)));
   const void* alpha_ptr = &alpha;
   const void* beta_ptr = &beta;

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/cublas_qqmm.h RENAMED Viewed

@@ -25,7 +25,7 @@ class CublasQQMM : public CublasMatmulBase {
       int64_t a_batch_stride,
       int64_t b_batch_stride,
       Dtype out_dtype,
-      std::string quantization_mode);
+      const std::string& quantization_mode);
   CublasQQMM(
       cu::Device& device,
@@ -43,7 +43,7 @@ class CublasQQMM : public CublasMatmulBase {
       int64_t b_batch_stride,
       int64_t c_batch_stride,
       Dtype out_dtype,
-      std::string quantization_mode);
+      const std::string& quantization_mode);
   void run(
       cu::CommandEncoder& encoder,
@@ -52,20 +52,33 @@ class CublasQQMM : public CublasMatmulBase {
       const array& b,
       const array& a_scale,
       const array& b_scale,
-      float alpha = 1.0f);
+      const array& alpha,
+      const array& beta);
- private:
-  void run_batched(
+  void run(
       cu::CommandEncoder& encoder,
       array& out,
       const array& a,
       const array& b,
       const array& a_scale,
-      const array& b_scale,
-      const Shape& batch_shape,
-      const Strides& a_batch_strides,
-      const Strides& b_batch_strides,
-      float alpha);
+      const array& b_scale);
+ private:
+  void set_scales_ptrs(
+      cu::CommandEncoder& encoder,
+      const void* a_scale,
+      const void* b_scale);
+  void execute(
+      cu::CommandEncoder& encoder,
+      void* out,
+      const void* a,
+      const void* b,
+      const void* a_scale,
+      const void* b_scale,
+      const void* c,
+      const void* alpha,
+      const void* beta);
   void execute(
       cu::CommandEncoder& encoder,
@@ -75,10 +88,9 @@ class CublasQQMM : public CublasMatmulBase {
       const void* a_scale,
       const void* b_scale,
       const void* c,
-      float alpha = 1,
-      float beta = 0);
+      const float alpha = 1.0f,
+      const float beta = 0.0f);
-  std::string quantization_mode_;
   cublasLtMatmulMatrixScale_t a_scale_mode_;
   cublasLtMatmulMatrixScale_t b_scale_mode_;
   cublasLtMatmulMatrixScale_t c_scale_mode_;

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/fp_quantize.cu RENAMED Viewed

@@ -11,6 +11,11 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#include <cuda_fp4.h>
+#include <cuda_fp8.h>
+constexpr float F8E4M3_MAX = 448.0f;
+constexpr float F4E2M1_MAX = 6.0f;
 namespace mlx::core {
 namespace cu {
@@ -29,7 +34,16 @@ struct Dequantize {
 namespace cg = cooperative_groups;
 template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
-__global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
+__global__ void fp_quantize_dequantize(
+    T* w,
+    T* out,
+    size_t size,
+    float* global_scale = nullptr) {
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
+  const float inv_scale_enc = use_global_scale ? 1.0f / scale_enc : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
   uint32_t rbits = 0; // reserved bits for future use
@@ -48,26 +62,28 @@ __global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
   }
   auto w_tile = load_vector<group_size, T>(w, thread_idx);
-  float scale = 0.0f;
+  float scale_dec_b = 0.0f;
   Tx2 amax_2x = Tx2{0.0f, 0.0f};
 #pragma unroll
   for (int i = 0; i < group_size; i += 2) {
     auto pair = Tx2{w_tile[i], w_tile[i + 1]};
-    abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
   }
-  scale = static_cast<float>(
+  scale_dec_b = static_cast<float>(
       max(fabsf(static_cast<float>(amax_2x.x)),
           fabsf(static_cast<float>(amax_2x.y))));
-  scale /= bits == 4 ? 6.0f : 448.0f;
+  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+  scale_dec_b *= scale_enc;
   // Convert to mx scale or nv scale
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto s = ScaleType(scale);
-  scale = float(s);
+  auto s = ScaleType(scale_dec_b);
+  float scale_enc_b = scale_enc / float(s);
+  float scale_dec = float(s) * inv_scale_enc;
   AlignedVector<T, group_size> w_hat;
 #pragma unroll
@@ -76,24 +92,36 @@ __global__ void fp_quantize_dequantize(T* w, T* out, size_t size) {
     float4 dq;
     if constexpr (bits == 8) {
       uint32_t quantized_val =
-          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       dq = dequant_fp8(quantized_val);
     } else {
       uint16_t quantized_val =
-          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       dq = dequant_fp4(quantized_val);
     }
-    w_hat[i * 4] = static_cast<T>(dq.x * scale);
-    w_hat[i * 4 + 1] = static_cast<T>(dq.y * scale);
-    w_hat[i * 4 + 2] = static_cast<T>(dq.z * scale);
-    w_hat[i * 4 + 3] = static_cast<T>(dq.w * scale);
+    w_hat[i * 4] = static_cast<T>(dq.x * scale_dec);
+    w_hat[i * 4 + 1] = static_cast<T>(dq.y * scale_dec);
+    w_hat[i * 4 + 2] = static_cast<T>(dq.z * scale_dec);
+    w_hat[i * 4 + 3] = static_cast<T>(dq.w * scale_dec);
   }
   store_vector<group_size>(out, thread_idx, w_hat);
 }
 template <typename T, int group_size, int bits, bool use_mx_scale, bool USE_SR>
-__global__ void
-fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
+__global__ void fp_quantize_rowwise(
+    T* w,
+    uint8_t* out,
+    uint8_t* scales,
+    size_t size,
+    float* global_scale = nullptr) {
+  // NVFP4 conversion:
+  // Global encode scale: S_enc = (448 × 6) / *global_scale
+  // Per-block decode scale (stored as FP8 E4M3): S_dec_b = (block_amax / 6) × S_enc
+  // Per-block encode scale: S_enc_b = S_enc / S_dec_b
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
   uint32_t rbits = 0; // reserved bits for future use
@@ -112,27 +140,28 @@ fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
   }
   auto w_tile = load_vector<group_size, T>(w, thread_idx);
-  float scale = 0.0f;
+  float scale_dec_b = 0.0f;
   Tx2 amax_2x = Tx2{0.0f, 0.0f};
 #pragma unroll
   for (int i = 0; i < group_size; i += 2) {
     auto pair = Tx2{w_tile[i], w_tile[i + 1]};
-    abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+    absmax_x2<Tx2>(amax_2x, amax_2x, pair);
   }
-  scale = static_cast<float>(
+  scale_dec_b = static_cast<float>(
       max(fabsf(static_cast<float>(amax_2x.x)),
           fabsf(static_cast<float>(amax_2x.y))));
-  scale /= bits == 4 ? 6.0f : 448.0f;
+  scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+  scale_dec_b *= scale_enc;
   // Convert to mx scale or nv scale
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto s = ScaleType(scale);
+  auto s = ScaleType(scale_dec_b);
   uint8_t q_scale = s.__x;
-  scale = float(s);
+  float scale_enc_b = scale_enc / float(s);
   scales[thread_idx] = q_scale;
   constexpr int elem_per_byte = bits == 8 ? 1 : 2;
@@ -143,11 +172,11 @@ fp_quantize_rowwise(T* w, uint8_t* out, uint8_t* scales, size_t size) {
     Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&w_tile[i * 4]);
     if constexpr (bits == 8) {
       uint32_t quantized_val =
-          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       *reinterpret_cast<uint32_t*>(&quantized[i * 4]) = quantized_val;
     } else {
       uint16_t quantized_val =
-          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+          scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
       *reinterpret_cast<uint16_t*>(&quantized[i * 2]) = quantized_val;
     }
   }
@@ -161,11 +190,15 @@ __global__ void fp_quantize_columnwise(
     uint8_t* scales,
     size_t size,
     int M,
-    int K) {
+    int K,
+    float* global_scale = nullptr) {
   // Input: [M, K] with strides [1, M] (M-major)
   // Quantized output: [M, K/elem_per_byte] row-major (K-major)
   // Scales: [M, K/group_size] row-major (K-major)
   // Quantize along K (last dimension, groups of group_size elements)
+  const bool use_global_scale = global_scale != nullptr;
+  const float scale_enc =
+      use_global_scale ? (F8E4M3_MAX * F4E2M1_MAX) / *global_scale : 1.0f;
   using Tx2 = Vector2_t<T>;
   using Tx4 = Vector4_t<T>;
@@ -215,16 +248,18 @@ __global__ void fp_quantize_columnwise(
 #pragma unroll
     for (int r = 0; r < group_size; r += 2) {
       auto pair = Tx2{thread_data[r], thread_data[r + 1]};
-      abs_max_x2<Tx2>(amax_2x, amax_2x, pair);
+      absmax_x2<Tx2>(amax_2x, amax_2x, pair);
     }
-    float scale =
+    float scale_dec_b =
         max(fabsf(static_cast<float>(amax_2x.x)),
             fabsf(static_cast<float>(amax_2x.y)));
-    scale /= (bits == 4) ? 6.0f : 448.0f;
+    scale_dec_b /= bits == 4 ? F4E2M1_MAX : F8E4M3_MAX;
+    scale_dec_b *= scale_enc;
+    // Convert to mx scale or nv scale
     using ScaleType =
         std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-    auto s = ScaleType(scale);
-    scale = float(s);
+    auto s = ScaleType(scale_dec_b);
+    float scale_enc_b = scale_enc / float(s);
     scales_smem[tidx][tidy] = s.__x;
     int shared_idx = tidx * padded_local_cols + tidy * bytes_per_group;
@@ -234,12 +269,12 @@ __global__ void fp_quantize_columnwise(
       Tx4 w_Tx4 = *reinterpret_cast<Tx4*>(&thread_data[j * 4]);
       if constexpr (bits == 8) {
         uint32_t quantized_val =
-            scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+            scale_cvt_Tx4_to_fp8x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
         *reinterpret_cast<uint32_t*>(&quantized_smem[shared_idx + j * 4]) =
             quantized_val;
       } else {
         uint16_t quantized_val =
-            scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, 1.0f / scale, rbits);
+            scale_cvt_Tx4_to_fp4x4<T, USE_SR>(w_Tx4, scale_enc_b, rbits);
         *reinterpret_cast<uint16_t*>(&quantized_smem[shared_idx + j * 2]) =
             quantized_val;
       }
@@ -282,8 +317,12 @@ __global__ void fp_quantize_columnwise(
 }
 template <typename T, int group_size, int bits, bool use_mx_scale>
-__global__ void
-fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
+__global__ void fp_dequantize(
+    const uint8_t* w,
+    const uint8_t* scales,
+    T* out,
+    size_t size,
+    float* global_scale = nullptr) {
   auto block_size = cg::this_thread_block().dim_threads();
   auto block_idx = cg::this_thread_block().group_index();
   auto idx_in_block = cg::this_thread_block().thread_index();
@@ -294,6 +333,10 @@ fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
   auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x;
   constexpr int pack_factor = bits == 8 ? 1 : 2;
+  const bool use_global_scale = global_scale != nullptr;
+  const float inv_scale_enc = use_mx_scale
+      ? 1.0f
+      : (use_global_scale ? (*global_scale) / (F8E4M3_MAX * F4E2M1_MAX) : 1.0f);
   size_t offset = tidx + grid_dim_x * size_t(tidy);
   size_t oindex = offset * pack_factor;
@@ -304,7 +347,7 @@ fp_dequantize(const uint8_t* w, const uint8_t* scales, T* out, size_t size) {
   size_t gindex = oindex / group_size;
   using ScaleType =
       std::conditional_t<use_mx_scale, __nv_fp8_e8m0, __nv_fp8_e4m3>;
-  auto scale = float(((ScaleType*)(scales))[gindex]);
+  auto scale = float(((ScaleType*)(scales))[gindex]) * inv_scale_enc;
   out += oindex;
@@ -346,9 +389,13 @@ void fp_quantize_dequantize(
     array& what,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   enc.set_input_array(w);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(what);
   dispatch_float_types(w.dtype(), "fp_quantize_dequantize", [&](auto type_tag) {
     using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -370,7 +417,9 @@ void fp_quantize_dequantize(
           0,
           gpu_ptr<T>(w),
           gpu_ptr<T>(what),
-          w.size());
+          w.size(),
+          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                   : nullptr);
     }
   });
 }
@@ -381,9 +430,13 @@ void fp_quantize(
     array& scales,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   enc.set_input_array(w);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(wq);
   enc.set_output_array(scales);
   if (w.strides().back() != 1) {
@@ -410,7 +463,9 @@ void fp_quantize(
             gpu_ptr<uint8_t>(scales),
             w.size(),
             M,
-            K);
+            K,
+            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                     : nullptr);
       } else {
         throw std::runtime_error(
             "[Quantize::eval_gpu] Can not quantize input with type float64.");
@@ -438,7 +493,9 @@ void fp_quantize(
             gpu_ptr<T>(w),
             gpu_ptr<uint8_t>(wq),
             gpu_ptr<uint8_t>(scales),
-            w.size());
+            w.size(),
+            global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                     : nullptr);
       } else {
         throw std::runtime_error(
             "[Quantize::eval_gpu] Can not quantize input with type float64.");
@@ -453,6 +510,7 @@ void fp_dequantize(
     array& w,
     int group_size,
     int bits,
+    const std::optional<array>& global_scale /* = std::nullopt */,
     cu::CommandEncoder& enc,
     const Stream& s) {
   constexpr int uint8_per_uint32 = 4;
@@ -465,6 +523,9 @@ void fp_dequantize(
   enc.set_input_array(wq);
   enc.set_input_array(scales);
+  if (global_scale.has_value()) {
+    enc.set_input_array(global_scale.value());
+  }
   enc.set_output_array(w);
   dispatch_float_types(w.dtype(), "fp_dequantize", [&](auto type_tag) {
     using T = cuda_type_t<MLX_GET_TYPE(type_tag)>;
@@ -485,7 +546,9 @@ void fp_dequantize(
           gpu_ptr<uint8_t>(wq),
           gpu_ptr<uint8_t>(scales),
           gpu_ptr<T>(w),
-          w.size());
+          w.size(),
+          global_scale.has_value() ? gpu_ptr<float>(global_scale.value())
+                                   : nullptr);
     } else {
       throw std::runtime_error(
           "[Quantize::eval_gpu] Can not dequantize to output with type float64.");

data/{mlx → submodules/mlx}/mlx/backend/cuda/quantized/no_qqmm_impl.cpp RENAMED Viewed

@@ -17,9 +17,8 @@ void qqmm_impl(
     const array&,
     const array&,
     const array&,
-    Dtype,
     QuantizationMode,
-    float) {
+    const GemmScalars&) {
   throw std::runtime_error(
       "[QQMatmul::eval_gpu] QQMM is only supported with CUDA 12.8 or higher.");
 }