@fugood/llama.node 1.4.7 → 1.4.9

Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266

package/src/llama.cpp/src/models/deepseek2.cpp
@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
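
For reference, the new pre-scaling can be read as a standalone computation. The sketch below mirrors the formula in the hunk above; the helper name deepseek2_kq_scale is illustrative only, while the 0.1f constant and rope_yarn_log_mul are taken directly from the diff.

#include <cmath>

// Illustrative sketch (not llama.cpp API): recover the original attention factor
// from the adjusted one, derive the YaRN mscale, and fold it into the KQ scale.
static float deepseek2_kq_scale(float attn_factor, float freq_scale,
                                float rope_yarn_log_mul, int n_embd_head_k) {
    // undo the 1 / (1 + 0.1 * ln(1/freq_scale)) adjustment applied elsewhere
    const float attn_factor_org = attn_factor * (1.0f + 0.1f * std::log(1.0f / freq_scale));
    // YaRN magnitude scale, then combine with the usual 1/sqrt(d_k) attention scale
    const float mscale = attn_factor_org * (1.0f + 0.1f * rope_yarn_log_mul * std::log(1.0f / freq_scale));
    return mscale * mscale / std::sqrt(float(n_embd_head_k));
}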

package/src/llama.cpp/src/models/glm4-moe.cpp
@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
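
The same M-RoPE/plain-RoPE branch reappears in glm4.cpp below. As a rough reading of it, a wrapper along these lines (hypothetical, not part of the patch; the ggml_rope_multi and ggml_rope_ext call shapes are copied from the diff) captures the selection:

#include "ggml.h"

// Hypothetical wrapper illustrating the branch above: rotate a Q or K tensor with
// multi-section M-RoPE when the model declares rope sections, otherwise plain RoPE.
static ggml_tensor * rope_q_or_k(ggml_context * ctx0, ggml_tensor * t, ggml_tensor * inp_pos,
                                 bool use_mrope, int n_rot, int sections[4], int rope_type,
                                 int n_ctx_orig, float freq_base, float freq_scale,
                                 float ext_factor, float attn_factor, float beta_fast, float beta_slow) {
    if (use_mrope) {
        return ggml_rope_multi(ctx0, t, inp_pos, nullptr,
                               n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
                               ext_factor, attn_factor, beta_fast, beta_slow);
    }
    return ggml_rope_ext(ctx0, t, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
}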

package/src/llama.cpp/src/models/glm4.cpp
@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                     cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow);
 
-            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow);
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

package/src/llama.cpp/src/models/models.h
@@ -441,13 +441,14 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
             int il);
 
-    ggml_tensor * build_delta_net_recurrent(
+    ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
@@ -456,18 +457,17 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
-    ggml_tensor * build_delta_net_chunking(
+    ggml_tensor * build_delta_net_autoregressive(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * g,
             ggml_tensor * beta,
             ggml_tensor * state,
-            ggml_tensor * causal_mask,
-            ggml_tensor * identity,
-            int il);
+            int il);
 
     ggml_tensor * build_norm_gated(
             ggml_tensor * input,

package/src/llama.cpp/src/models/nemotron-h.cpp
@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }
 
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
-    cur = build_ffn(cur,
-        model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-        NULL, NULL, NULL,
-        model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-        NULL, LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
-    cb(cur, "ffn_out", il);
+    if (model.layers[il].ffn_gate_inp == nullptr) {
+        cur = build_ffn(cur,
+            model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+            NULL, NULL, NULL,
+            model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+            NULL,
+            LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                model.layers[il].ffn_gate_inp,
+                model.layers[il].ffn_up_exps,
+                nullptr, // no gate
+                model.layers[il].ffn_down_exps,
+                model.layers[il].ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                true, hparams.expert_weights_scale,
+                LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+            model.layers[il].ffn_up_shexp, NULL, NULL,
+            NULL /* no gate */ , NULL, NULL,
+            model.layers[il].ffn_down_shexp, NULL, NULL,
+            NULL,
+            LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }
 
     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);

package/src/llama.cpp/src/models/qwen2.cpp
@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
         {
             // compute Q and K and RoPE them
             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
 
             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
 
             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
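
The qwen2.cpp change makes the Q/K/V projection biases optional, presumably so checkpoints that omit the bias tensors still load and run. A minimal sketch of the pattern, assuming the same ggml_add call as the diff (the helper name is illustrative, not part of the patch):

#include "ggml.h"

// Illustrative helper: add a projection bias only when the corresponding tensor
// exists in the loaded model, otherwise pass the projection through unchanged.
static ggml_tensor * add_bias_if_present(ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * bias) {
    if (bias) {
        cur = ggml_add(ctx0, cur, bias);
    }
    return cur;
}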