whisper.rn 0.5.0-rc.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/cpp/ggml-alloc.c +1 -15
  2. package/cpp/ggml-backend-reg.cpp +17 -8
  3. package/cpp/ggml-backend.cpp +15 -22
  4. package/cpp/ggml-common.h +17 -0
  5. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  6. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  7. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  8. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  9. package/cpp/ggml-cpu/arch-fallback.h +34 -0
  10. package/cpp/ggml-cpu/ggml-cpu.c +22 -1
  11. package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
  12. package/cpp/ggml-cpu/ops.cpp +870 -211
  13. package/cpp/ggml-cpu/ops.h +3 -8
  14. package/cpp/ggml-cpu/quants.c +35 -0
  15. package/cpp/ggml-cpu/quants.h +8 -0
  16. package/cpp/ggml-cpu/repack.cpp +458 -47
  17. package/cpp/ggml-cpu/repack.h +22 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +1 -1
  19. package/cpp/ggml-cpu/traits.cpp +2 -2
  20. package/cpp/ggml-cpu/traits.h +1 -1
  21. package/cpp/ggml-cpu/vec.cpp +12 -9
  22. package/cpp/ggml-cpu/vec.h +107 -13
  23. package/cpp/ggml-impl.h +77 -0
  24. package/cpp/ggml-metal-impl.h +51 -12
  25. package/cpp/ggml-metal.m +610 -115
  26. package/cpp/ggml-opt.cpp +97 -41
  27. package/cpp/ggml-opt.h +25 -6
  28. package/cpp/ggml-quants.c +110 -16
  29. package/cpp/ggml-quants.h +6 -0
  30. package/cpp/ggml-whisper-sim.metallib +0 -0
  31. package/cpp/ggml-whisper.metallib +0 -0
  32. package/cpp/ggml.c +314 -88
  33. package/cpp/ggml.h +137 -11
  34. package/cpp/gguf.cpp +8 -1
  35. package/cpp/jsi/RNWhisperJSI.cpp +23 -6
  36. package/cpp/whisper.cpp +15 -6
  37. package/ios/RNWhisper.mm +6 -6
  38. package/ios/RNWhisperContext.mm +2 -0
  39. package/ios/RNWhisperVadContext.mm +2 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  64. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  72. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +28 -2
  73. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  74. package/lib/module/realtime-transcription/RealtimeTranscriber.js +28 -2
  75. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  76. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts +1 -0
  77. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  78. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  79. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  80. package/package.json +1 -1
  81. package/src/realtime-transcription/RealtimeTranscriber.ts +32 -0
  82. package/src/realtime-transcription/types.ts +6 -0
package/cpp/ggml-opt.cpp CHANGED
@@ -64,9 +64,11 @@ struct wsp_ggml_opt_context {
  int32_t opt_i = 0;
  bool loss_per_datapoint = false;

- wsp_ggml_opt_get_optimizer_params get_opt_pars = nullptr;
- void * get_opt_pars_ud = nullptr;
- struct wsp_ggml_tensor * adamw_params = nullptr;
+ wsp_ggml_opt_get_optimizer_params get_opt_pars = nullptr;
+ void * get_opt_pars_ud = nullptr;
+ struct wsp_ggml_tensor * opt_step_params = nullptr; // Stores output of get_opt_pars.
+
+ enum wsp_ggml_opt_optimizer_type optimizer = WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW;
  };

  struct wsp_ggml_opt_result {
@@ -229,9 +231,13 @@ struct wsp_ggml_opt_optimizer_params wsp_ggml_opt_get_default_optimizer_params(v
  result.adamw.eps = 1e-8f;
  result.adamw.wd = 0.0f;

+ result.sgd.alpha = 1e-3f;
+ result.sgd.wd = 0.0f;
+
  return result;
  }

+
  struct wsp_ggml_opt_optimizer_params wsp_ggml_opt_get_constant_optimizer_params(void * userdata) {
  return *((struct wsp_ggml_opt_optimizer_params *) userdata);
  }
@@ -249,6 +255,7 @@ struct wsp_ggml_opt_params wsp_ggml_opt_default_params(
  /*opt_period =*/ 1,
  /*get_opt_pars =*/ wsp_ggml_opt_get_default_optimizer_params,
  /*get_opt_pars_ud =*/ nullptr,
+ /*optimizer =*/ WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW,
  };
  }

@@ -316,9 +323,14 @@ static void wsp_ggml_opt_build(wsp_ggml_opt_context_t opt_ctx) {
  WSP_GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with wsp_ggml_opt_prepare_alloc");
  WSP_GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");

+ const enum wsp_ggml_opt_optimizer_type optimizer = opt_ctx->optimizer;
+
  const bool accumulate = opt_ctx->build_type_alloc >= WSP_GGML_OPT_BUILD_TYPE_GRAD &&
  !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == WSP_GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);

+ const bool need_momenta = opt_ctx->build_type_alloc == WSP_GGML_OPT_BUILD_TYPE_OPT &&
+ opt_ctx->optimizer == WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+
  wsp_ggml_set_input(opt_ctx->inputs);
  wsp_ggml_set_output(opt_ctx->outputs);

@@ -340,8 +352,7 @@ static void wsp_ggml_opt_build(wsp_ggml_opt_context_t opt_ctx) {
  // - pred (if using static graphs)
  // - ncorrect (if using static graphs, 2 tensors).
  constexpr size_t n_loss = 1;
- const size_t tensors_per_param = (accumulate ? 1 : 0) +
- (opt_ctx->build_type_alloc == WSP_GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+ const size_t tensors_per_param = (accumulate ? 1 : 0) + (need_momenta ? 2 : 0);
  const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
  const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * wsp_ggml_tensor_overhead();
  struct wsp_ggml_init_params params = {
@@ -458,7 +469,7 @@ static void wsp_ggml_opt_build(wsp_ggml_opt_context_t opt_ctx) {
  }
  }

- if (opt_ctx->build_type_alloc >= WSP_GGML_OPT_BUILD_TYPE_OPT) {
+ if (need_momenta && opt_ctx->build_type_alloc >= WSP_GGML_OPT_BUILD_TYPE_OPT) {
  opt_ctx->grad_m.resize(n_nodes);
  opt_ctx->grad_v.resize(n_nodes);
  for (int i = 0; i < n_nodes; ++i) {
@@ -492,23 +503,36 @@ static void wsp_ggml_opt_build(wsp_ggml_opt_context_t opt_ctx) {
  // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
  opt_ctx->gb_opt = wsp_ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);

- opt_ctx->adamw_params = wsp_ggml_new_tensor_1d(opt_ctx->ctx_cpu, WSP_GGML_TYPE_F32, 7);
- wsp_ggml_set_input(opt_ctx->adamw_params);
- wsp_ggml_set_name(opt_ctx->adamw_params, "adamw_params");
-
+ opt_ctx->opt_step_params = wsp_ggml_new_tensor_1d(opt_ctx->ctx_cpu, WSP_GGML_TYPE_F32, need_momenta ? 7 : 2);
+ wsp_ggml_tensor * adamw_params = opt_ctx->opt_step_params;
+ wsp_ggml_set_input(adamw_params);
+ const char * optimizer_name = wsp_ggml_opt_optimizer_name(opt_ctx->optimizer);
+ wsp_ggml_format_name(adamw_params, "%s_params", optimizer_name);
  for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
  struct wsp_ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
  struct wsp_ggml_tensor * grad = wsp_ggml_graph_get_grad(opt_ctx->gb_opt, node);

  if (grad && (node->flags & WSP_GGML_TENSOR_FLAG_PARAM)) {
- struct wsp_ggml_tensor * m = opt_ctx->grad_m[i];
- struct wsp_ggml_tensor * v = opt_ctx->grad_v[i];
- struct wsp_ggml_tensor * opt_step = wsp_ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
-
- wsp_ggml_set_name(m, (std::string("AdamW m for ") + std::string(node->name)).c_str());
- wsp_ggml_set_name(v, (std::string("AdamW v for ") + std::string(node->name)).c_str());
- wsp_ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
-
+ struct wsp_ggml_tensor * m = nullptr;
+ struct wsp_ggml_tensor * v = nullptr;
+ if (need_momenta) {
+ m = opt_ctx->grad_m[i];
+ v = opt_ctx->grad_v[i];
+ wsp_ggml_format_name(m, "AdamW m for %s", node->name);
+ wsp_ggml_format_name(v, "AdamW v for %s", node->name);
+ }
+ struct wsp_ggml_tensor * opt_step;
+ switch (optimizer) {
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ opt_step = wsp_ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, adamw_params);
+ break;
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_SGD:
+ opt_step = wsp_ggml_opt_step_sgd(opt_ctx->ctx_compute, node, grad, adamw_params);
+ break;
+ default:
+ WSP_GGML_ABORT("fatal error");
+ }
+ wsp_ggml_format_name(opt_step, "%s step for %s", optimizer_name, node->name);
  wsp_ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
  }
  }
@@ -534,6 +558,7 @@ wsp_ggml_opt_context_t wsp_ggml_opt_init(struct wsp_ggml_opt_params params) {
  result->opt_period = params.opt_period;
  result->get_opt_pars = params.get_opt_pars;
  result->get_opt_pars_ud = params.get_opt_pars_ud;
+ result->optimizer = params.optimizer;

  WSP_GGML_ASSERT(result->opt_period >= 1);

@@ -756,29 +781,43 @@ void wsp_ggml_opt_alloc(wsp_ggml_opt_context_t opt_ctx, bool backward) {
  void wsp_ggml_opt_eval(wsp_ggml_opt_context_t opt_ctx, wsp_ggml_opt_result_t result) {
  WSP_GGML_ASSERT(opt_ctx->eval_ready);
  if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
- struct wsp_ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
-
- WSP_GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
- WSP_GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
-
- // beta1, beta2 after applying warmup
- const float beta1h = 1.0f/(1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
- const float beta2h = 1.0f/(1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
-
- float * adamw_par_data = wsp_ggml_get_data_f32(opt_ctx->adamw_params);
- adamw_par_data[0] = opt_pars.adamw.alpha;
- adamw_par_data[1] = opt_pars.adamw.beta1;
- adamw_par_data[2] = opt_pars.adamw.beta2;
- adamw_par_data[3] = opt_pars.adamw.eps;
- adamw_par_data[4] = opt_pars.adamw.wd;
- adamw_par_data[5] = beta1h;
- adamw_par_data[6] = beta2h;
+ const wsp_ggml_opt_optimizer_params & opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);
+
+ switch (opt_ctx->optimizer) {
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW: {
+ WSP_GGML_ASSERT(opt_pars.adamw.alpha > 0.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.beta1 >= 0.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.beta1 <= 1.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.beta2 >= 0.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.beta2 <= 1.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.eps >= 0.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.wd >= 0.0f);
+ WSP_GGML_ASSERT(opt_pars.adamw.wd <= 1.0f);
+
+ // beta1, beta2 after applying warmup
+ const float beta1h = 1.0f / (1.0f - powf(opt_pars.adamw.beta1, opt_ctx->iter));
+ const float beta2h = 1.0f / (1.0f - powf(opt_pars.adamw.beta2, opt_ctx->iter));
+
+ float * adamw_par_data = wsp_ggml_get_data_f32(opt_ctx->opt_step_params);
+ adamw_par_data[0] = opt_pars.adamw.alpha;
+ adamw_par_data[1] = opt_pars.adamw.beta1;
+ adamw_par_data[2] = opt_pars.adamw.beta2;
+ adamw_par_data[3] = opt_pars.adamw.eps;
+ adamw_par_data[4] = opt_pars.adamw.wd;
+ adamw_par_data[5] = beta1h;
+ adamw_par_data[6] = beta2h;
+ } break;
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_SGD: {
+ WSP_GGML_ASSERT(opt_pars.sgd.alpha > 0.0f);
+ WSP_GGML_ASSERT(opt_pars.sgd.wd >= 0.0f);
+ WSP_GGML_ASSERT(opt_pars.sgd.wd <= 1.0f);
+ float * sgd = wsp_ggml_get_data_f32(opt_ctx->opt_step_params);
+ sgd[0] = opt_pars.sgd.alpha;
+ sgd[1] = opt_pars.sgd.wd;
+ } break;
+ default:
+ WSP_GGML_ABORT("fatal error");
+ }
  }

  wsp_ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
@@ -963,6 +1002,7 @@ void wsp_ggml_opt_fit(
  wsp_ggml_tensor * outputs,
  wsp_ggml_opt_dataset_t dataset,
  enum wsp_ggml_opt_loss_type loss_type,
+ enum wsp_ggml_opt_optimizer_type optimizer,
  wsp_ggml_opt_get_optimizer_params get_opt_pars,
  int64_t nepoch,
  int64_t nbatch_logical,
@@ -993,6 +1033,7 @@ void wsp_ggml_opt_fit(
  params.opt_period = opt_period;
  params.get_opt_pars = get_opt_pars;
  params.get_opt_pars_ud = &epoch;
+ params.optimizer = optimizer;
  wsp_ggml_opt_context_t opt_ctx = wsp_ggml_opt_init(params);

  // Shuffling the data is generally useful but there is only a point if not all data is used in a single batch.
@@ -1035,3 +1076,18 @@ void wsp_ggml_opt_fit(
  wsp_ggml_opt_result_free(result_train);
  wsp_ggml_opt_result_free(result_val);
  }
+
+ enum wsp_ggml_opt_optimizer_type wsp_ggml_opt_context_optimizer_type(wsp_ggml_opt_context_t c) {
+ return c->optimizer;
+ }
+
+ WSP_GGML_API const char * wsp_ggml_opt_optimizer_name(enum wsp_ggml_opt_optimizer_type o) {
+ switch (o) {
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW:
+ return "adamw";
+ case WSP_GGML_OPT_OPTIMIZER_TYPE_SGD:
+ return "sgd";
+ default:
+ return "undefined";
+ };
+ }
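
Note on the hunks above: the training path is generalized so that plain SGD can be selected alongside AdamW. The per-step parameter tensor is renamed from adamw_params to opt_step_params and sized to 7 floats for AdamW (alpha, beta1, beta2, eps, wd, plus the warmup-corrected beta1h/beta2h) or 2 floats for SGD (alpha, wd). A standalone sketch of how the beta1h/beta2h slots evolve with the iteration counter, using the same formula as the diff; the beta values below are ordinary Adam defaults, not values read from this package:

    #include <math.h>
    #include <stdio.h>

    /* Illustrative only: the bias-correction factors packed into slots 5 and 6
     * of the AdamW parameter tensor, recomputed every optimizer step. */
    int main(void) {
        const float beta1 = 0.9f, beta2 = 0.999f; /* assumed typical values, not from the diff */
        for (int iter = 1; iter <= 3; ++iter) {
            const float beta1h = 1.0f / (1.0f - powf(beta1, (float) iter));
            const float beta2h = 1.0f / (1.0f - powf(beta2, (float) iter));
            printf("iter %d: beta1h = %.4f, beta2h = %.4f\n", iter, beta1h, beta2h);
        }
        return 0;
    }
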
package/cpp/ggml-opt.h CHANGED
@@ -74,16 +74,26 @@ extern "C" {
  WSP_GGML_OPT_BUILD_TYPE_OPT = 30,
  };

+ enum wsp_ggml_opt_optimizer_type {
+ WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW,
+ WSP_GGML_OPT_OPTIMIZER_TYPE_SGD,
+
+ WSP_GGML_OPT_OPTIMIZER_TYPE_COUNT
+ };
+
  // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
  struct wsp_ggml_opt_optimizer_params {
- // AdamW optimizer parameters
  struct {
  float alpha; // learning rate
- float beta1;
- float beta2;
+ float beta1; // first AdamW momentum
+ float beta2; // second AdamW momentum
  float eps; // epsilon for numerical stability
- float wd; // weight decay for AdamW, use 0.0f to disable
+ float wd; // weight decay - 0.0f to disable
  } adamw;
+ struct {
+ float alpha; // learning rate
+ float wd; // weight decay
+ } sgd;
  };

  // callback to calculate optimizer parameters prior to a backward pass
@@ -112,8 +122,11 @@ extern "C" {

  int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

- wsp_ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
- void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+ wsp_ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+ void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+ // only WSP_GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor
+ enum wsp_ggml_opt_optimizer_type optimizer;
  };

  // get parameters for an optimization context with defaults set where possible
@@ -142,6 +155,10 @@ extern "C" {
  // get the gradient accumulator for a node from the forward graph
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_opt_grad_acc(wsp_ggml_opt_context_t opt_ctx, struct wsp_ggml_tensor * node);

+ WSP_GGML_API enum wsp_ggml_opt_optimizer_type wsp_ggml_opt_context_optimizer_type(wsp_ggml_opt_context_t); //TODO consistent naming scheme
+
+ WSP_GGML_API const char * wsp_ggml_opt_optimizer_name(enum wsp_ggml_opt_optimizer_type);
+
  // ====== Optimization Result ======

  WSP_GGML_API wsp_ggml_opt_result_t wsp_ggml_opt_result_init(void);
@@ -226,12 +243,14 @@ extern "C" {
  struct wsp_ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
  wsp_ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
  enum wsp_ggml_opt_loss_type loss_type, // loss to minimize
+ enum wsp_ggml_opt_optimizer_type optimizer, // sgd or adamw
  wsp_ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
  int64_t nepoch, // how many times the dataset should be iterated over
  int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
  float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
  bool silent); // whether or not info prints to stderr should be suppressed

+
  #ifdef __cplusplus
  }
  #endif
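
With the header additions above, opting into SGD only requires setting the new optimizer field before wsp_ggml_opt_init. A minimal sketch; the wsp_ggml_backend_sched_t argument and the WSP_GGML_OPT_LOSS_TYPE_MEAN constant passed to wsp_ggml_opt_default_params are assumptions, since neither appears in this diff:

    #include <stdio.h>
    #include "ggml-opt.h"

    /* Hypothetical helper sketched against the header above; everything not shown in the
     * diff (default_params arguments, loss-type constant) is an assumption. */
    static wsp_ggml_opt_context_t init_sgd_opt(wsp_ggml_backend_sched_t sched) {
        struct wsp_ggml_opt_params params = wsp_ggml_opt_default_params(sched, WSP_GGML_OPT_LOSS_TYPE_MEAN);
        params.optimizer    = WSP_GGML_OPT_OPTIMIZER_TYPE_SGD;           /* new field in this release */
        params.get_opt_pars = wsp_ggml_opt_get_default_optimizer_params; /* defaults: sgd.alpha = 1e-3f, sgd.wd = 0.0f */

        wsp_ggml_opt_context_t opt_ctx = wsp_ggml_opt_init(params);
        /* prints "sgd" per the new name lookup added in ggml-opt.cpp */
        printf("optimizer: %s\n", wsp_ggml_opt_optimizer_name(wsp_ggml_opt_context_optimizer_type(opt_ctx)));
        return opt_ctx;
    }
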
package/cpp/ggml-quants.c CHANGED
@@ -21,6 +21,17 @@

  #define UNUSED WSP_GGML_UNUSED

+ static inline int best_index_int8(int n, const int8_t * val, float x) {
+ if (x <= val[0]) return 0;
+ if (x >= val[n-1]) return n-1;
+ int ml = 0, mu = n-1;
+ while (mu-ml > 1) {
+ int mav = (ml+mu)/2;
+ if (x < val[mav]) mu = mav; else ml = mav;
+ }
+ return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
+ }
+
  // reference implementation for deterministic creation of model files
  void wsp_quantize_row_q4_0_ref(const float * WSP_GGML_RESTRICT x, block_q4_0 * WSP_GGML_RESTRICT y, int64_t k) {
  static const int qk = QK4_0;
@@ -246,6 +257,53 @@ void wsp_quantize_row_q8_1_ref(const float * WSP_GGML_RESTRICT x, block_q8_1 * W
  }
  }

+ static inline int best_index_mxfp4(float x, float e) {
+ int best_index = 0;
+ float best_err = fabsf(kvalues_mxfp4[0]*e - x);
+ for (int i = 1; i < 16; i++) {
+ float err = fabsf(kvalues_mxfp4[i]*e - x);
+ if (err < best_err) {
+ best_index = i;
+ best_err = err;
+ }
+ }
+ return best_index;
+ }
+
+ void wsp_quantize_row_mxfp4_ref(const float * WSP_GGML_RESTRICT x, block_mxfp4 * WSP_GGML_RESTRICT y, int64_t k) {
+ static const int qk = QK_MXFP4;
+
+ assert(k % qk == 0);
+
+ const int nb = k / qk;
+
+ for (int i = 0; i < nb; i++) {
+ float amax = 0.0f; // absolute max
+
+ for (int j = 0; j < qk; j++) {
+ const float v = x[i*qk + j];
+
+ if (amax < fabsf(v)) {
+ amax = fabsf(v);
+ }
+ }
+
+ const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;
+
+ const float d = WSP_GGML_E8M0_TO_FP32_HALF(e);
+
+ y[i].e = e;
+
+ for (int j = 0; j < qk/2; ++j) {
+ const uint8_t x0 = best_index_mxfp4(x[i*qk + 0 + j], d);
+ const uint8_t x1 = best_index_mxfp4(x[i*qk + qk/2 + j], d);
+
+ y[i].qs[j] = x0;
+ y[i].qs[j] |= x1 << 4;
+ }
+ }
+ }
+
  void wsp_dewsp_quantize_row_q4_0(const block_q4_0 * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k) {
  static const int qk = QK4_0;

@@ -356,6 +414,26 @@ void wsp_dewsp_quantize_row_q8_0(const block_q8_0 * WSP_GGML_RESTRICT x, float *
  }
  }

+ void wsp_dewsp_quantize_row_mxfp4(const block_mxfp4 * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k) {
+ static const int qk = QK_MXFP4;
+
+ assert(k % qk == 0);
+
+ const int nb = k / qk;
+
+ for (int i = 0; i < nb; i++) {
+ const float d = WSP_GGML_E8M0_TO_FP32_HALF(x[i].e);
+
+ for (int j = 0; j < qk/2; ++j) {
+ const int8_t x0 = kvalues_mxfp4[x[i].qs[j] & 0x0F];
+ const int8_t x1 = kvalues_mxfp4[x[i].qs[j] >> 4];
+
+ y[i*qk + j + 0 ] = x0*d;
+ y[i*qk + j + qk/2] = x1*d;
+ }
+ }
+ }
+

  // 2-6 bit quantization in super-blocks
  //
@@ -488,7 +566,7 @@ static float make_q3_quants(int n, int nmax, const float * WSP_GGML_RESTRICT x,
  for (int i = 0; i < n; ++i) {
  L[i] += nmax;
  }
- return sumlx / suml2;
+ return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
  }
  for (int i = 0; i < n; ++i) {
  int l = nearest_int(iscale * x[i]);
@@ -823,7 +901,7 @@ static float make_qp_quants(int n, int nmax, const float * WSP_GGML_RESTRICT x,
  for (int i = 0; i < n; ++i) {
  max = MAX(max, x[i]);
  }
- if (!max) { // all zero
+ if (max < GROUP_MAX_EPS) { // all zero
  for (int i = 0; i < n; ++i) { L[i] = 0; }
  return 0.f;
  }
@@ -888,7 +966,7 @@ static float make_qp_quants(int n, int nmax, const float * WSP_GGML_RESTRICT x,
  break;
  }
  }
- return sumlx/suml2;
+ return suml2 > 0.0f ? sumlx / suml2 : 0.0f;
  }

  static void wsp_quantize_row_q2_K_impl(const float * WSP_GGML_RESTRICT x, block_q2_K * WSP_GGML_RESTRICT y, int k, const float * WSP_GGML_RESTRICT quant_weights) {
@@ -2014,6 +2092,12 @@ size_t wsp_quantize_q8_0(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_RE
  return nrow * row_size;
  }

+ size_t wsp_quantize_mxfp4(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
+ WSP_GGML_UNUSED(quant_weights);
+ wsp_quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
+ return nrow * wsp_ggml_row_size(WSP_GGML_TYPE_MXFP4, n_per_row);
+ }
+
  // ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)

  void wsp_quantize_row_tq1_0_ref(const float * WSP_GGML_RESTRICT x, block_tq1_0 * WSP_GGML_RESTRICT y, int64_t k) {
@@ -4182,7 +4266,7 @@ static void wsp_quantize_row_iq1_s_impl(const float * WSP_GGML_RESTRICT x, void
  sumw[j+1] = sumw[j] + weight[i];
  }
  }
- float best_score = -FLT_MIN, scale = max;
+ float best_score = -FLT_MAX, scale = max;
  int besti1 = -1, besti2 = -1, best_shift = 0;
  for (int i1 = 0; i1 <= block_size; ++i1) {
  for (int i2 = i1; i2 <= block_size; ++i2) {
@@ -4358,7 +4442,7 @@ static void wsp_quantize_row_iq1_m_impl(const float * WSP_GGML_RESTRICT x, void
  idx[2*j] = j;
  }
  qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
- float best_score = -FLT_MIN, scale = max;
+ float best_score = -FLT_MAX, scale = max;
  int besti1 = -1, besti2 = -1, best_k = -1;
  // 0: +, +
  // 1: +, -
@@ -4551,17 +4635,6 @@ size_t wsp_quantize_iq1_m(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_R

  // ============================ 4-bit non-linear quants

- static inline int best_index_int8(int n, const int8_t * val, float x) {
- if (x <= val[0]) return 0;
- if (x >= val[n-1]) return n-1;
- int ml = 0, mu = n-1;
- while (mu-ml > 1) {
- int mav = (ml+mu)/2;
- if (x < val[mav]) mu = mav; else ml = mav;
- }
- return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
- }
-
  static void wsp_quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * WSP_GGML_RESTRICT x,
  wsp_ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
  float * scales, float * weight, uint8_t * L,
@@ -4961,6 +5034,15 @@ static bool validate_fp16(wsp_ggml_fp16_t f, size_t i) {
  return true;
  }

+ static bool validate_e_e8m0(uint8_t e, size_t i) {
+ if (e == 0xff) {
+ fprintf(stderr, "wsp_ggml_validate_row_data: found invalid e value %d at block %zu\n", e, i);
+ return false;
+ }
+
+ return true;
+ }
+
  #define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
  const type * q = (const type *) (data); \
  for (size_t i = 0; i < (nb); ++i) { \
@@ -4977,6 +5059,14 @@ static bool validate_fp16(wsp_ggml_fp16_t f, size_t i) {
  } \
  }

+ #define VALIDATE_ROW_DATA_E_E8M0_IMPL(type, data, nb) \
+ const type * q = (const type *) (data); \
+ for (size_t i = 0; i < (nb); ++i) { \
+ if (!validate_e_e8m0(q[i].e, i)) { \
+ return false; \
+ } \
+ }
+
  #define VALIDATE_ROW_DATA_DVEC_F16_IMPL(type, data, nb, nr) \
  const type * q = (const type *) (data); \
  for (size_t i = 0; i < (nb); ++i) { \
@@ -5130,6 +5220,10 @@ bool wsp_ggml_validate_row_data(enum wsp_ggml_type type, const void * data, size
  {
  VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
  } break;
+ case WSP_GGML_TYPE_MXFP4:
+ {
+ VALIDATE_ROW_DATA_E_E8M0_IMPL(block_mxfp4, data, nb);
+ } break;
  case WSP_GGML_TYPE_Q2_K:
  {
  VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
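
The new MXFP4 routines above store one shared E8M0 exponent per 32-value block and pack two 4-bit indices per byte. A standalone sketch of just that arithmetic; the kvalues_mxfp4 table lives in ggml-common.h and WSP_GGML_E8M0_TO_FP32_HALF in ggml-impl.h, neither of which is reproduced in this diff, so the ldexpf decode below is an assumption rather than the package's macro:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Shared block scale, as in wsp_quantize_row_mxfp4_ref: biased E8M0 exponent from the block's abs-max. */
        const float amax = 3.0f;                                      /* illustrative value */
        const uint8_t e  = (uint8_t) (floorf(log2f(amax)) - 2 + 127); /* same expression as the diff */
        const float d    = ldexpf(1.0f, (int) e - 127 - 1);           /* assumed behaviour of WSP_GGML_E8M0_TO_FP32_HALF */

        /* Nibble packing, as in the quantize/dequantize pair above: two 4-bit indices per byte. */
        const uint8_t x0 = 5, x1 = 9;   /* illustrative indices into kvalues_mxfp4 */
        uint8_t qs = x0;
        qs |= x1 << 4;
        printf("e=%u d=%g low=%u high=%u\n", e, d, qs & 0x0F, qs >> 4); /* prints low=5 high=9 */
        return 0;
    }
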
package/cpp/ggml-quants.h CHANGED
@@ -21,6 +21,8 @@ WSP_GGML_API void wsp_quantize_row_q5_1_ref(const float * WSP_GGML_RESTRICT x, b
  WSP_GGML_API void wsp_quantize_row_q8_0_ref(const float * WSP_GGML_RESTRICT x, block_q8_0 * WSP_GGML_RESTRICT y, int64_t k);
  WSP_GGML_API void wsp_quantize_row_q8_1_ref(const float * WSP_GGML_RESTRICT x, block_q8_1 * WSP_GGML_RESTRICT y, int64_t k);

+ WSP_GGML_API void wsp_quantize_row_mxfp4_ref(const float * WSP_GGML_RESTRICT x, block_mxfp4 * WSP_GGML_RESTRICT y, int64_t k);
+
  WSP_GGML_API void wsp_quantize_row_q2_K_ref(const float * WSP_GGML_RESTRICT x, block_q2_K * WSP_GGML_RESTRICT y, int64_t k);
  WSP_GGML_API void wsp_quantize_row_q3_K_ref(const float * WSP_GGML_RESTRICT x, block_q3_K * WSP_GGML_RESTRICT y, int64_t k);
  WSP_GGML_API void wsp_quantize_row_q4_K_ref(const float * WSP_GGML_RESTRICT x, block_q4_K * WSP_GGML_RESTRICT y, int64_t k);
@@ -45,6 +47,8 @@ WSP_GGML_API void wsp_dewsp_quantize_row_q5_1(const block_q5_1 * WSP_GGML_RESTRI
  WSP_GGML_API void wsp_dewsp_quantize_row_q8_0(const block_q8_0 * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
  //WSP_GGML_API void wsp_dewsp_quantize_row_q8_1(const block_q8_1 * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);

+ WSP_GGML_API void wsp_dewsp_quantize_row_mxfp4(const block_mxfp4 * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
+
  WSP_GGML_API void wsp_dewsp_quantize_row_q2_K(const block_q2_K * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
  WSP_GGML_API void wsp_dewsp_quantize_row_q3_K(const block_q3_K * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
  WSP_GGML_API void wsp_dewsp_quantize_row_q4_K(const block_q4_K * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
@@ -90,6 +94,8 @@ WSP_GGML_API size_t wsp_quantize_q5_0(const float * WSP_GGML_RESTRICT src, void
  WSP_GGML_API size_t wsp_quantize_q5_1(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
  WSP_GGML_API size_t wsp_quantize_q8_0(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

+ WSP_GGML_API size_t wsp_quantize_mxfp4(const float * WSP_GGML_RESTRICT src, void * WSP_GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
  WSP_GGML_API void wsp_iq2xs_init_impl(enum wsp_ggml_type type);
  WSP_GGML_API void wsp_iq2xs_free_impl(enum wsp_ggml_type type);
  WSP_GGML_API void wsp_iq3xs_init_impl(int grid_size);
Binary files are not shown in this diff.