@fugood/llama.node 1.4.1 → 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/CMakeLists.txt +1 -1
  2. package/lib/binding.js +3 -0
  3. package/lib/binding.ts +2 -0
  4. package/package.json +16 -16
  5. package/scripts/llama.cpp.patch +25 -11
  6. package/src/LlamaContext.cpp +2 -2
  7. package/src/llama.cpp/CMakeLists.txt +21 -6
  8. package/src/llama.cpp/common/CMakeLists.txt +6 -0
  9. package/src/llama.cpp/common/arg.cpp +65 -16
  10. package/src/llama.cpp/common/chat-parser.cpp +40 -0
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
  12. package/src/llama.cpp/common/chat-peg-parser.h +105 -0
  13. package/src/llama.cpp/common/chat.cpp +40 -29
  14. package/src/llama.cpp/common/chat.h +10 -1
  15. package/src/llama.cpp/common/common.cpp +24 -5
  16. package/src/llama.cpp/common/common.h +16 -5
  17. package/src/llama.cpp/common/download.cpp +18 -8
  18. package/src/llama.cpp/common/download.h +3 -1
  19. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  20. package/src/llama.cpp/common/log.cpp +15 -1
  21. package/src/llama.cpp/common/log.h +19 -12
  22. package/src/llama.cpp/common/peg-parser.cpp +1712 -0
  23. package/src/llama.cpp/common/peg-parser.h +459 -0
  24. package/src/llama.cpp/common/unicode.cpp +64 -0
  25. package/src/llama.cpp/common/unicode.h +22 -0
  26. package/src/llama.cpp/ggml/CMakeLists.txt +48 -48
  27. package/src/llama.cpp/ggml/include/ggml.h +7 -2
  28. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -4
  29. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +60 -1
  32. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  33. package/src/llama.cpp/src/llama-arch.cpp +30 -1
  34. package/src/llama.cpp/src/llama-arch.h +3 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +3 -6
  36. package/src/llama.cpp/src/llama-hparams.h +2 -2
  37. package/src/llama.cpp/src/llama-impl.h +1 -1
  38. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  39. package/src/llama.cpp/src/llama-model.cpp +50 -6
  40. package/src/llama.cpp/src/llama-vocab.cpp +1 -2
  41. package/src/llama.cpp/src/models/mistral3.cpp +160 -0
  42. package/src/llama.cpp/src/models/models.h +4 -0
package/src/llama.cpp/ggml/include/ggml.h
@@ -204,6 +204,10 @@
 # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
+#if defined(_WIN32) && !defined(_WIN32_WINNT)
+# define _WIN32_WINNT 0x0A00
+#endif
+
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -2148,7 +2152,8 @@ extern "C" {
     };
 
     enum ggml_scale_flag {
-        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8),
+        GGML_SCALE_FLAG_ANTIALIAS = (1 << 9),
     };
 
     // interpolate
@@ -2278,7 +2283,7 @@ extern "C" {
            float stop,
            float step);
 
-#define GGML_KQ_MASK_PAD 64
+#define GGML_KQ_MASK_PAD 1
 
    // q: [n_embd_k, n_batch,     n_head,    ne3 ]
    // k: [n_embd_k, n_kv,        n_head_kv, ne3 ]
package/src/llama.cpp/ggml/CMakeLists.txt
@@ -127,10 +127,6 @@ if (NOT MSVC)
     endif()
 endif()
 
-if (MINGW)
-    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
-endif()
-
 #
 # POSIX conformance
 #
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp
@@ -8,6 +8,10 @@
 #include <sys/sysctl.h>
 #endif
 
+#if !defined(HWCAP2_SVE2)
+#define HWCAP2_SVE2 (1 << 1)
+#endif
+
 #if !defined(HWCAP2_I8MM)
 #define HWCAP2_I8MM (1 << 13)
 #endif
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -683,22 +683,14 @@ bool ggml_is_numa(void) {
 }
 
 #if defined(__ARM_ARCH)
-
-#if defined(__linux__) && defined(__aarch64__)
-#include <sys/auxv.h>
-#endif
-
-static void ggml_init_arm_arch_features(void) {
 #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
-#if defined(__linux__)
-    ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#include <arm_sve.h>
+static void ggml_init_arm_arch_features(void) {
+    ggml_arm_arch_features.sve_cnt = svcntb();
+}
 #else
-    // TODO: add support of SVE for non-linux systems
-#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+static void ggml_init_arm_arch_features(void) {}
 #endif
-#endif
-}
-
 #endif // __ARM_ARCH
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
@@ -2706,6 +2698,11 @@ struct ggml_cplan ggml_graph_plan(
         n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
     }
 
+#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
+    // Emscripten without pthreads support can only use a single thread
+    n_threads = 1;
+#endif
+
     size_t work_size = 0;
 
     struct ggml_cplan cplan;
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -6383,7 +6383,7 @@ static void ggml_compute_forward_im2col_3d_f16(
                const int64_t iih = ioh*s1 + ikh*d1 - p1;
                const int64_t iid = iod*s2 + ikd*d2 - p2;
 
-               if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+               if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
                    dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
                } else {
                    const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
@@ -7420,6 +7420,65 @@ static void ggml_compute_forward_upscale_f32(
                }
            }
        }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) {
+        // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True)
+        // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp
+        auto triangle_filter = [](float x) -> float {
+            return std::max(1.0f - fabsf(x), 0.0f);
+        };
+
+        // support and invscale, minimum 1 pixel for bilinear
+        const float support1 = std::max(1.0f, 1.0f / sf1);
+        const float invscale1 = 1.0f / support1;
+        const float support0 = std::max(1.0f, 1.0f / sf0);
+        const float invscale0 = 1.0f / support0;
+
+        for (int64_t i3 = 0; i3 < ne3; i3++) {
+            const int64_t i03 = i3 / sf3;
+            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+                const int64_t i02 = i2 / sf2;
+                for (int64_t i1 = 0; i1 < ne1; i1++) {
+                    const float y = ((float) i1 + pixel_offset) / sf1;
+                    for (int64_t i0 = 0; i0 < ne0; i0++) {
+                        const float x = ((float) i0 + pixel_offset) / sf0;
+
+                        // the range of source pixels that contribute
+                        const int64_t x_min = std::max<int64_t>(x - support0 + pixel_offset, 0);
+                        const int64_t x_max = std::min<int64_t>(x + support0 + pixel_offset, ne00);
+                        const int64_t y_min = std::max<int64_t>(y - support1 + pixel_offset, 0);
+                        const int64_t y_max = std::min<int64_t>(y + support1 + pixel_offset, ne01);
+
+                        // bilinear filter with antialiasing
+                        float val = 0.0f;
+                        float total_weight = 0.0f;
+
+                        for (int64_t sy = y_min; sy < y_max; sy++) {
+                            const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1);
+
+                            for (int64_t sx = x_min; sx < x_max; sx++) {
+                                const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0);
+                                const float weight = weight_x * weight_y;
+
+                                if (weight <= 0.0f) {
+                                    continue;
+                                }
+
+                                const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03);
+                                val += pixel * weight;
+                                total_weight += weight;
+                            }
+                        }
+
+                        if (total_weight > 0.0f) {
+                            val /= total_weight;
+                        }
+
+                        float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+                        *dst_ptr = val;
+                    }
+                }
+            }
+        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
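
Note: the GGML_SCALE_FLAG_ANTIALIAS path above weights every contributing source pixel with a triangle (tent) filter whose support widens to 1/scale when downscaling, then normalizes by the accumulated weight. The following standalone 1-D sketch illustrates the same weighting; it is not part of the package, and the function resample_1d_antialias and its variable names are illustrative only.

// Standalone sketch (not part of the package): 1-D version of the triangle-filter
// weighting used by the antialiased bilinear path in the diff above.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static std::vector<float> resample_1d_antialias(const std::vector<float> & src, int64_t n_dst) {
    const float sf           = (float) n_dst / (float) src.size(); // scale factor, like sf0/sf1
    const float support      = std::max(1.0f, 1.0f / sf);          // filter widens when downscaling
    const float invscale     = 1.0f / support;
    const float pixel_offset = 0.5f;                               // align_corners=False convention

    std::vector<float> dst(n_dst, 0.0f);
    for (int64_t i = 0; i < n_dst; i++) {
        const float x = ((float) i + pixel_offset) / sf;           // output center in source coordinates
        const int64_t x_min = std::max<int64_t>((int64_t) (x - support + pixel_offset), 0);
        const int64_t x_max = std::min<int64_t>((int64_t) (x + support + pixel_offset), (int64_t) src.size());

        float val = 0.0f, total_weight = 0.0f;
        for (int64_t sx = x_min; sx < x_max; sx++) {
            // triangle (tent) filter, same shape as triangle_filter() in the diff
            const float w = std::max(1.0f - std::fabs(((float) sx - x + pixel_offset) * invscale), 0.0f);
            val          += src[sx] * w;
            total_weight += w;
        }
        dst[i] = total_weight > 0.0f ? val / total_weight : 0.0f;
    }
    return dst;
}

int main() {
    // Downscale 8 samples to 4: each output blends a window of source samples
    // instead of only the two nearest, which is what suppresses aliasing.
    const std::vector<float> src = {0, 1, 2, 3, 4, 5, 6, 7};
    for (float v : resample_1d_antialias(src, 4)) {
        std::printf("%.3f ", v);
    }
    std::printf("\n");
    return 0;
}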
package/src/llama.cpp/src/CMakeLists.txt
@@ -132,6 +132,7 @@ add_library(llama
            models/t5-enc.cpp
            models/wavtokenizer-dec.cpp
            models/xverse.cpp
+           models/mistral3.cpp
            models/graph-context-mamba.cpp
            )
 
package/src/llama.cpp/src/llama-arch.cpp
@@ -111,6 +111,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_COGVLM, "cogvlm" },
    { LLM_ARCH_RND1, "rnd1" },
    { LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
+   { LLM_ARCH_MISTRAL3, "mistral3" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -204,6 +205,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
+   { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
@@ -853,7 +855,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-           { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+           { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
            { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@@ -2512,6 +2514,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
        },
    },
+   {
+       LLM_ARCH_MISTRAL3,
+       {
+           { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+           { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+           { LLM_TENSOR_OUTPUT, "output" },
+           { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+           { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+           { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+           { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+           { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+           { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+           { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+           { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+           { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+           { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+           { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+           { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+           { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
+           { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
+           { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
+           { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+           { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+           { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+       },
+   },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -2611,6 +2639,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
    {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
    {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
    {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+   {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
    {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
    {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
package/src/llama.cpp/src/llama-arch.h
@@ -115,6 +115,7 @@ enum llm_arch {
    LLM_ARCH_COGVLM,
    LLM_ARCH_RND1,
    LLM_ARCH_PANGU_EMBED,
+   LLM_ARCH_MISTRAL3,
    LLM_ARCH_UNKNOWN,
 };
 
@@ -208,6 +209,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+   LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -377,6 +379,7 @@ enum llm_tensor {
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_DT_NORM,
    LLM_TENSOR_SSM_A,
+   LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
    LLM_TENSOR_SSM_B_NORM,
    LLM_TENSOR_SSM_C_NORM,
    LLM_TENSOR_SSM_D,
package/src/llama.cpp/src/llama-graph.cpp
@@ -71,6 +71,9 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
    if (ubatch->pos && attn_scale) {
        const int64_t n_tokens = ubatch->n_tokens;
 
+       GGML_ASSERT(f_attn_temp_scale != 0.0f);
+       GGML_ASSERT(n_attn_temp_floor_scale != 0);
+
        std::vector<float> attn_scale_data(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const float pos = ubatch->pos[i];
@@ -810,9 +813,6 @@ ggml_tensor * llm_graph_context::build_ffn(
            GGML_ABORT("fatal error");
    }
 
-   //expand here so that we can fuse ffn gate
-   ggml_build_forward_expand(gf, cur);
-
    if (gate && type_gate == LLM_FFN_PAR) {
        cur = ggml_mul(ctx0, cur, tmp);
        cb(cur, "ffn_gate_par", il);
@@ -1093,9 +1093,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
            GGML_ABORT("fatal error");
    }
 
-   //expand here so that we can fuse ffn gate
-   ggml_build_forward_expand(gf, cur);
-
    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
    cb(experts, "ffn_moe_down", il);
 
package/src/llama.cpp/src/llama-hparams.h
@@ -162,8 +162,8 @@ struct llama_hparams {
    // llama4 smallthinker
    uint32_t n_moe_layer_step = 0;
    uint32_t n_no_rope_layer_step = 4;
-   uint32_t n_attn_temp_floor_scale = 8192;
-   float f_attn_temp_scale = 0.1;
+   uint32_t n_attn_temp_floor_scale = 0;
+   float f_attn_temp_scale = 0.0f;
 
    // gemma3n altup
    uint32_t n_altup = 4; // altup_num_inputs
package/src/llama.cpp/src/llama-impl.h
@@ -37,7 +37,7 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
 template <typename T>
 struct no_init {
    T value;
-   no_init() { /* do nothing */ }
+   no_init() = default;
 };
 
 struct time_meas {
package/src/llama.cpp/src/llama-mmap.cpp
@@ -485,7 +485,7 @@ struct llama_mlock::impl {
        if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
            suggest = false;
        }
-       if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
+       if (suggest && ((uint64_t)lock_limit.rlim_max > (uint64_t)lock_limit.rlim_cur + size)) {
            suggest = false;
        }
 #endif
package/src/llama.cpp/src/llama-model.cpp
@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-   impl() {}
-   ~impl() {}
+   impl() = default;
+   ~impl() = default;
 
    uint64_t n_elements = 0;
 
@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
    pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
@@ -663,8 +663,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
            } else {
-               hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-               hparams.n_swa = 8192;
+               hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+               hparams.n_swa                   = 8192;
+               hparams.n_attn_temp_floor_scale = 8192;
+               hparams.f_attn_temp_scale       = 0.1f;
                hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
            }
 
@@ -2247,6 +2249,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
+       case LLM_ARCH_MISTRAL3:
+           {
+               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+               ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+               ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+               ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+               ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+               // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+               if (hparams.f_attn_temp_scale != 0.0f) {
+                   hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                   if (hparams.n_attn_temp_floor_scale == 0) {
+                       throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                   }
+               }
+
+               // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+               // but may need further verification with other values
+               if (hparams.rope_yarn_log_mul != 0.0f) {
+                   float factor = 1.0f / hparams.rope_freq_scale_train;
+                   float mscale = 1.0f;
+                   float mscale_all_dims = hparams.rope_yarn_log_mul;
+                   static auto get_mscale = [](float scale, float mscale) {
+                       return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                   };
+                   hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+               }
+
+               switch (hparams.n_layer) {
+                   case 26: type = LLM_TYPE_3B; break;
+                   case 34: type = LLM_TYPE_8B; break;
+                   case 40: type = LLM_TYPE_14B; break;
+                   default: type = LLM_TYPE_UNKNOWN;
+               }
+           } break;
        default: throw std::runtime_error("unsupported model architecture");
 
 
@@ -2560,6 +2598,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
+       case LLM_ARCH_MISTRAL3:
            {
                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6487,7 +6526,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-               layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+               layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@@ -7522,6 +7561,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
+       case LLM_ARCH_MISTRAL3:
+           {
+               llm = std::make_unique<llm_build_mistral3>(*this, params);
+           } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -7690,6 +7733,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_ARCEE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
+       case LLM_ARCH_MISTRAL3:
            return LLAMA_ROPE_TYPE_NORM;
 
        // the pairs of head values are offset by n_rot/2
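
Note: for reference, a minimal standalone sketch of the yarn_attn_factor computation added in the LLM_ARCH_MISTRAL3 branch above; it is not part of the package, and the values for rope_freq_scale_train and rope_yarn_log_mul are assumed for illustration only.

// Standalone sketch (not part of the package): the mscale ratio from the diff above,
// evaluated with assumed sample values.
#include <cmath>
#include <cstdio>

static float get_mscale(float scale, float mscale) {
    // same shape as the lambda in the diff: 0.1 * mscale * log(scale) + 1 for scale > 1
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    const float rope_freq_scale_train = 0.25f; // assumed: 4x context extension
    const float rope_yarn_log_mul     = 1.0f;  // assumed: mscale_all_dims

    const float factor           = 1.0f / rope_freq_scale_train; // 4.0
    const float yarn_attn_factor = get_mscale(factor, 1.0f) / get_mscale(factor, rope_yarn_log_mul);

    // with mscale == mscale_all_dims == 1.0f the ratio is exactly 1.0,
    // which is the case the TODO in the diff calls verified
    std::printf("yarn_attn_factor = %f\n", yarn_attn_factor);
    return 0;
}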
package/src/llama.cpp/src/llama-vocab.cpp
@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
    pimpl->load(ml, kv);
package/src/llama.cpp/src/models/mistral3.cpp
@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                    );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/models.h
@@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
    llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
    llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };