@fugood/llama.node 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +83 -22
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +70 -7
- package/src/llama.cpp/common/common.h +23 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +18 -27
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +54 -6
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
package/src/llama.cpp/src/llama-model.cpp

@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;
 
     uint64_t n_elements = 0;
 
@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
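Note on the `= default` changes in this file (and the matching one in llama-vocab.cpp further down): `llama_model` keeps its `impl` behind a `std::unique_ptr`, so the destructor still has to be defined in the .cpp file where `impl` is a complete type; defaulting it there is equivalent to the old empty body, just more idiomatic. A minimal sketch of the pattern, using hypothetical `widget`/`widget::impl` names rather than anything from llama.cpp:

```cpp
// widget.h
#include <memory>

struct widget {
    widget();
    ~widget();                   // declared only; impl is incomplete here
private:
    struct impl;                 // forward declaration (pimpl)
    std::unique_ptr<impl> pimpl;
};

// widget.cpp
struct widget::impl {
    int state = 0;
};

widget::widget() : pimpl(new impl()) {}
widget::~widget() = default;     // defaulted where impl is complete, so
                                 // unique_ptr<impl>'s deleter can be instantiated
```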
@@ -663,8 +663,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa    = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                 }
 
@@ -1626,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
@@ -2247,6 +2253,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+                //       but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
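For orientation on the `get_mscale` lambda added above: it is the YaRN magnitude-scaling term, `mscale(s, m) = 1` for `s <= 1` and `0.1 * m * ln(s) + 1` otherwise, and `yarn_attn_factor` is the ratio of that term evaluated at `mscale` versus `mscale_all_dims`. A small standalone sketch with made-up values (the numbers are illustrative only, not taken from any released model):

```cpp
#include <cmath>
#include <cstdio>

// Same shape as the lambda in the diff: YaRN magnitude scaling.
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    // Illustrative values: a 4x context extension (rope_freq_scale_train = 0.25)
    // and rope_yarn_log_mul = 0.5 standing in for mscale_all_dims.
    const float factor          = 1.0f / 0.25f; // = 4.0
    const float mscale          = 1.0f;
    const float mscale_all_dims = 0.5f;

    const float yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
    // get_mscale(4, 1.0) = 0.1 * 1.0 * ln(4) + 1 ≈ 1.1386
    // get_mscale(4, 0.5) = 0.1 * 0.5 * ln(4) + 1 ≈ 1.0693
    // ratio ≈ 1.065
    printf("yarn_attn_factor = %f\n", yarn_attn_factor);
    return 0;
}
```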
@@ -2560,6 +2602,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MISTRAL3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6487,7 +6530,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@@ -7522,6 +7565,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3next>(*this, params);
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7690,6 +7737,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
package/src/llama.cpp/src/llama-quant.cpp

@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
package/src/llama.cpp/src/llama-vocab.cpp

@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
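The `inp_attn_scale` tensor multiplied into `Qcur` above carries a per-position factor; `build_inp_attn_scale()` itself lives in llama-graph.cpp and is not part of this diff. As a rough sketch of the Llama-4-style rule it is assumed to implement (a logarithm of the position bucket controlled by the two hparams set earlier; the exact off-by-one details are unverified here):

```cpp
#include <cmath>
#include <cstdio>

// Hedged sketch, not the llama.cpp implementation: positions beyond the
// "floor scale" window get their query vectors scaled up slightly, which
// sharpens attention at long context.
static float attn_temp_scale(long long pos, unsigned n_attn_temp_floor_scale, float f_attn_temp_scale) {
    return 1.0f + f_attn_temp_scale * std::log(std::floor((float) pos / (float) n_attn_temp_floor_scale) + 1.0f);
}

int main() {
    const unsigned floor_scale = 8192; // value set for LLM_ARCH_LLAMA4 above
    const float    scale       = 0.1f; // value set for LLM_ARCH_LLAMA4 above

    const long long positions[] = {0, 8191, 8192, 65536};
    for (long long pos : positions) {
        printf("pos=%6lld scale=%.4f\n", pos, attn_temp_scale(pos, floor_scale, scale));
    }
    return 0;
}
```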
package/src/llama.cpp/src/models/mistral3.cpp

@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/models.h

@@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
package/src/llama.cpp/src/unicode.cpp

@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    size_t start = 0;