@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -1,7 +1,5 @@
 #include "models.h"

-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));

     ggml_tensor * cur;
     ggml_tensor * inpL;
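A quick way to see what the new pre-scaling does: the builder now receives an attn_factor that has already been shrunk by 1 / (1 + 0.1 * log(1 / freq_scale)) (cf. llama_hparams::yarn_attn_factor_adjust), so it first multiplies that adjustment back out and only then folds the rope_yarn_log_mul term into kq_scale. The standalone sketch below just evaluates those expressions for made-up example values (freq_scale, rope_yarn_log_mul and n_embd_head_k are illustrative, not taken from any model) to show that attn_factor_org recovers the unadjusted factor.

// standalone sketch, not part of the patch
#include <cmath>
#include <cstdio>

int main() {
    const float freq_scale        = 0.25f;  // example: 4x context extension via YaRN
    const float rope_yarn_log_mul = 1.0f;   // stand-in for hparams.rope_yarn_log_mul
    const float n_embd_head_k     = 128.0f; // example head dimension

    // adjustment applied before the builder runs (cf. llama_hparams::yarn_attn_factor_adjust)
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

    // cancel that adjustment to recover the original attention factor (== 1.0 here)
    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));

    // fold the YaRN mscale term into the attention scale
    const float mscale   = attn_factor_org * (1.0f + 0.1f * rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf(n_embd_head_k);

    printf("attn_factor_org = %.4f  mscale = %.4f  kq_scale = %.6f\n",
           attn_factor_org, mscale, kq_scale);
    return 0;
}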
package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp}

@@ -1,6 +1,7 @@
 #include "models.h"

-llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;

     ggml_tensor * cur;
@@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     ggml_tensor * inp_pos = build_inp_pos();

     // TODO: is causal == true correct? might need some changes
-    auto * inp_attn = build_attn_inp_kv_iswa();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }

     ggml_tensor * inp_out_ids = build_inp_out_ids();

     for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }

         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
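The rewritten builder dispatches on the iswa template parameter at compile time: std::conditional_t picks the KV-input type and if constexpr instantiates only the matching build_attn_inp_* call, so the sliding-window and full-attention variants share one function body. A minimal, self-contained sketch of the same pattern follows; InputFull, InputSWA and the make_* helpers are hypothetical stand-ins, not llama.cpp types.

// generic sketch of the compile-time dispatch pattern
#include <cstdio>
#include <type_traits>

struct InputFull { const char * name = "full-attention KV input"; };
struct InputSWA  { const char * name = "sliding-window KV input"; };

InputFull * make_full_input() { static InputFull f; return &f; }
InputSWA  * make_swa_input()  { static InputSWA  s; return &s; }

template <bool iswa>
void build_graph() {
    // pick the input type at compile time; only one branch is instantiated
    using inp_type = std::conditional_t<iswa, InputSWA, InputFull>;
    inp_type * inp = nullptr;

    if constexpr (iswa) {
        inp = make_swa_input();
    } else {
        inp = make_full_input();
    }
    printf("built with %s\n", inp->name);
}

// explicit instantiations, mirroring `template struct llm_build_gemma3<false/true>;`
template void build_graph<false>();
template void build_graph<true>();

int main() {
    build_graph<false>();
    build_graph<true>();
    return 0;
}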
@@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
         cur = build_norm(cur,
                 model.layers[il].ffn_post_norm, NULL,
                 LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
+        cb(cur, "ffn_post_norm", il);

         cur = ggml_add(ctx0, cur, sa_out);

@@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     // lm_head
     cur = build_lora_mm(model.output, cur);

+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
     cb(cur, "result_output", -1);
     res->t_logits = cur;

     ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
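The added block applies final-logit softcapping: scale by 1/cap, tanh, scale back by cap, i.e. y = cap * tanh(x / cap), which smoothly bounds the logits to (-cap, cap) while leaving small values almost unchanged. A scalar sketch with a made-up cap value (the real one comes from hparams.f_final_logit_softcapping in the GGUF):

// scalar illustration of logit soft-capping
#include <cmath>
#include <cstdio>

static float softcap(float x, float cap) {
    return cap * tanhf(x / cap);
}

int main() {
    const float cap  = 30.0f; // illustrative value only
    const float xs[] = {1.0f, 10.0f, 100.0f};
    for (float x : xs) {
        printf("softcap(%6.1f) = %8.4f\n", x, softcap(x, cap));
    }
    return 0;
}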
package/src/llama.cpp/src/models/glm4-moe.cpp

@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;

     inpL = build_inp_embd(model.tok_embd);

+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                    ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Kcur = ggml_rope_ext(
-                    ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-            );
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
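When the GGUF carries m-RoPE metadata (hparams.use_mrope()), the rotary dimensions are partitioned by the four rope_sections values copied into sections[], and ggml_rope_multi rotates each partition with its own position stream; otherwise the builder falls back to plain ggml_rope_ext. The toy below only illustrates how such a sections array partitions rotary dimensions; the section sizes and the mapping helper are invented for illustration and are not ggml code.

// toy illustration of an m-RoPE `sections` partition (not ggml code)
#include <cstdio>

static int section_of_dim(const int sections[4], int dim) {
    int acc = 0;
    for (int s = 0; s < 4; ++s) {
        acc += sections[s];
        if (dim < acc) {
            return s;
        }
    }
    return 3; // in this toy, any leftover dims go to the last stream
}

int main() {
    const int sections[4] = {16, 24, 24, 0}; // made-up sizes, e.g. temporal / height / width / unused
    const int dims[]      = {0, 15, 16, 39, 40, 63};
    for (int dim : dims) {
        printf("rotary dim %2d -> position stream %d\n", dim, section_of_dim(sections, dim));
    }
    return 0;
}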
package/src/llama.cpp/src/models/glm4.cpp

@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params

     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;

     inpL = build_inp_embd(model.tok_embd);

+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                         cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);

-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
package/src/llama.cpp/src/models/models.h

@@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
     llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };

-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };

 struct llm_build_gemma3n_iswa : public llm_graph_context {
@@ -440,13 +441,14 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);

     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
             int il);

-    ggml_tensor * build_delta_net_recurrent(
+    ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
@@ -455,18 +457,17 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);

-    ggml_tensor * build_delta_net_chunking(
+    ggml_tensor * build_delta_net_autoregressive(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * g,
             ggml_tensor * beta,
             ggml_tensor * state,
-            ggml_tensor * causal_mask,
-            ggml_tensor * identity,
-            int il);
+            int il);

     ggml_tensor * build_norm_gated(
             ggml_tensor * input,
package/src/llama.cpp/src/models/nemotron-h.cpp

@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }

 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
-    cur = build_ffn(cur,
-            model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-            NULL, NULL, NULL,
-            model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-            NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
-    cb(cur, "ffn_out", il);
+    if (model.layers[il].ffn_gate_inp == nullptr) {
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr, // no gate
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                model.layers[il].ffn_up_shexp, NULL, NULL,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }

     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);
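Layers whose ffn_gate_inp tensor is present now run a routed MoE FFN with sigmoid gating plus an always-on shared expert, and the two contributions are summed (moe_out + ffn_shexp). The scalar toy below sketches that combination; the router logits, the per-expert "MLPs" (plain scales here) and the shared-expert scale are all invented numbers, not model weights.

// conceptual scalar sketch of MoE routing + shared expert (illustrative only)
#include <cmath>
#include <cstdio>

static float sigmoid(float x) { return 1.0f / (1.0f + expf(-x)); }

int main() {
    const float x = 0.5f;                                      // stand-in for the layer input
    const float router_logits[4] = {1.2f, -0.3f, 0.7f, -1.5f}; // from ffn_gate_inp in the real graph
    const float expert_scale[4]  = {0.9f, 1.1f, 1.3f, 0.7f};   // each stands in for a relu^2 MLP

    // route to the two highest-scoring experts (indices 0 and 2 for these logits)
    const int used[2] = {0, 2};

    float moe_out = 0.0f;
    for (int i = 0; i < 2; ++i) {
        const int e = used[i];
        moe_out += sigmoid(router_logits[e]) * (expert_scale[e] * x); // sigmoid gating
    }

    const float shared_out = 1.05f * x;              // always-on shared expert (ffn_*_shexp)
    printf("ffn_out = %f\n", moe_out + shared_out);  // moe_out + ffn_shexp, as in the patch
    return 0;
}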
package/src/llama.cpp/src/models/qwen2.cpp

@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
         {
             // compute Q and K and RoPE them
             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }

             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }

             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }

             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
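The Q/K/V biases are now optional: each ggml_add is guarded by a null check on the corresponding bias tensor, so checkpoints exported without attention biases still build a valid graph. A tiny sketch of the same guard pattern on plain floats (illustrative only, not llama.cpp code):

// minimal illustration of the optional-bias guard
#include <cstdio>

// add the bias only if the checkpoint actually provides one
static float apply_optional_bias(float x, const float * bias) {
    if (bias) {
        x += *bias;
    }
    return x;
}

int main() {
    const float bq = 0.1f;
    printf("with bias:    %f\n", apply_optional_bias(1.0f, &bq));
    printf("without bias: %f\n", apply_optional_bias(1.0f, nullptr));
    return 0;
}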