@fugood/llama.node 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/CMakeLists.txt +1 -1
  2. package/lib/binding.js +3 -0
  3. package/lib/binding.ts +10 -0
  4. package/lib/index.js +9 -0
  5. package/lib/index.ts +10 -0
  6. package/package.json +15 -15
  7. package/scripts/llama.cpp.patch +25 -11
  8. package/src/LlamaContext.cpp +24 -0
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/CMakeLists.txt +21 -6
  11. package/src/llama.cpp/common/CMakeLists.txt +6 -0
  12. package/src/llama.cpp/common/arg.cpp +83 -22
  13. package/src/llama.cpp/common/chat-parser.cpp +40 -0
  14. package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
  15. package/src/llama.cpp/common/chat-peg-parser.h +105 -0
  16. package/src/llama.cpp/common/chat.cpp +40 -29
  17. package/src/llama.cpp/common/chat.h +10 -1
  18. package/src/llama.cpp/common/common.cpp +70 -7
  19. package/src/llama.cpp/common/common.h +23 -5
  20. package/src/llama.cpp/common/download.cpp +18 -8
  21. package/src/llama.cpp/common/download.h +3 -1
  22. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  23. package/src/llama.cpp/common/log.cpp +18 -27
  24. package/src/llama.cpp/common/log.h +19 -12
  25. package/src/llama.cpp/common/peg-parser.cpp +1712 -0
  26. package/src/llama.cpp/common/peg-parser.h +459 -0
  27. package/src/llama.cpp/common/unicode.cpp +64 -0
  28. package/src/llama.cpp/common/unicode.h +22 -0
  29. package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
  30. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
  31. package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +29 -2
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
  34. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
  37. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
  39. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-arch.cpp +30 -1
  43. package/src/llama.cpp/src/llama-arch.h +3 -0
  44. package/src/llama.cpp/src/llama-graph.cpp +3 -6
  45. package/src/llama.cpp/src/llama-hparams.h +2 -2
  46. package/src/llama.cpp/src/llama-impl.h +1 -1
  47. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  48. package/src/llama.cpp/src/llama-model.cpp +54 -6
  49. package/src/llama.cpp/src/llama-quant.cpp +0 -29
  50. package/src/llama.cpp/src/llama-vocab.cpp +1 -2
  51. package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
  52. package/src/llama.cpp/src/models/mistral3.cpp +160 -0
  53. package/src/llama.cpp/src/models/models.h +4 -0
  54. package/src/llama.cpp/src/unicode.cpp +2 -2
package/src/llama.cpp/src/llama-model.cpp

@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;
 
     uint64_t n_elements = 0;
 

@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;

@@ -663,8 +663,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa    = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                 }
 

@@ -1626,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;

@@ -2247,6 +2253,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+                // but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
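A note on the LLM_ARCH_MISTRAL3 hunk above: the yarn_attn_factor correction reuses the DeepSeek-style YaRN mscale ratio flagged in the TODO. The standalone C++ sketch below reproduces just that arithmetic so the TODO is easier to reason about; the rope_freq_scale_train and rope_yarn_log_mul values are hypothetical and only the formula mirrors the diff.

// Standalone sketch of the yarn_attn_factor arithmetic from the hunk above.
// The numeric inputs are hypothetical; only the formula mirrors the diff.
#include <cmath>
#include <cstdio>

static float get_mscale(float scale, float mscale) {
    // YaRN magnitude scale: no correction for scale <= 1, logarithmic growth otherwise
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    const float rope_freq_scale_train = 0.25f; // hypothetical 4x context extension
    const float rope_yarn_log_mul     = 1.0f;  // plays the role of mscale_all_dims

    const float factor = 1.0f / rope_freq_scale_train;
    const float yarn_attn_factor = get_mscale(factor, 1.0f) / get_mscale(factor, rope_yarn_log_mul);

    // with mscale == mscale_all_dims == 1.0f the ratio is exactly 1.0, which is
    // the only case the TODO in the diff claims has been verified so far
    std::printf("yarn_attn_factor = %f\n", yarn_attn_factor);
    return 0;
}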
 
package/src/llama.cpp/src/llama-model.cpp (continued)

@@ -2560,6 +2602,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_MINICPM:
             case LLM_ARCH_GRANITE:
             case LLM_ARCH_GRANITE_MOE:
+            case LLM_ARCH_MISTRAL3:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 

@@ -6487,7 +6530,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);

@@ -7522,6 +7565,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3next>(*this, params);
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }

@@ -7690,6 +7737,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
package/src/llama.cpp/src/llama-quant.cpp

@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;

@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }

@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 

@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
package/src/llama.cpp/src/llama-vocab.cpp

@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 

@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,

@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
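For context on the Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale) insertions above: inp_attn_scale carries one multiplicative factor per position that is applied to the query before attention. A minimal sketch of that factor, assuming it follows the same Llama-4-style formula that the f_attn_temp_scale and n_attn_temp_floor_scale hparams above configure (positions chosen for illustration only):

// Sketch of the per-position attention-temperature scale that inp_attn_scale is
// expected to carry (assumption: mirrors the Llama 4 formula; values illustrative).
#include <cmath>
#include <cstdint>
#include <cstdio>

static float attn_temp_scale(int64_t pos, uint32_t n_floor_scale, float f_scale) {
    // the scale grows logarithmically with the "chunk" the position falls into;
    // positions below n_floor_scale keep a scale of exactly 1.0
    return std::log(std::floor((pos + 1.0f) / n_floor_scale) + 1.0f) * f_scale + 1.0f;
}

int main() {
    const uint32_t n_attn_temp_floor_scale = 8192; // value set in the chunked-SWA branch above
    const float    f_attn_temp_scale       = 0.1f;

    for (long long pos : {0LL, 8191LL, 8192LL, 65536LL}) {
        std::printf("pos=%6lld scale=%f\n", pos,
                    attn_temp_scale(pos, n_attn_temp_floor_scale, f_attn_temp_scale));
    }
    return 0;
}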
package/src/llama.cpp/src/models/mistral3.cpp (new file)

@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/models.h

@@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
package/src/llama.cpp/src/unicode.cpp

@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;

@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    size_t start = 0;
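The two unicode.cpp hunks above only change how the splitting regexes are constructed: std::regex_constants::nosubs treats every group as non-capturing, which is safe because unicode_regex_split_stl records only overall match offsets, and std::regex_constants::optimize favors matching speed over construction speed. A small self-contained sketch (not the llama.cpp tokenizer itself) using the same flag combination:

// Minimal sketch showing the flags added above: nosubs drops capture groups,
// optimize favors match speed over regex construction speed.
#include <cstdio>
#include <regex>
#include <string>

int main() {
    const std::string text = "hello,  world 42";
    // hypothetical pattern with a group; under nosubs it behaves as non-capturing
    const std::regex expr("([a-z]+|[0-9]+)",
                          std::regex_constants::optimize | std::regex_constants::nosubs);

    auto it  = std::sregex_iterator(text.begin(), text.end(), expr);
    auto end = std::sregex_iterator();
    for (; it != end; ++it) {
        // only the overall match position/length is used, mirroring how
        // unicode_regex_split_stl records offsets rather than sub-matches
        std::printf("match at %zu, len %zu\n", (size_t) it->position(), (size_t) it->length());
    }
    return 0;
}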