@fugood/llama.node 1.4.7 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +22 -23
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +103 -44
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
     llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-
-
-
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
 
     ggml_tensor * cur;
     ggml_tensor * inpL;

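The new pre-scaling block above is a rearrangement of the YaRN scaling arithmetic: the attn_factor that reaches this builder has already been adjusted for YaRN elsewhere (per the comment, by llama_hparams::yarn_attn_factor_adjust), so the patch first multiplies it by 1 + 0.1 * log(1/freq_scale) to recover the original value, then applies the DeepSeek-2 mscale and folds it into kq_scale. Below is a minimal standalone sketch of that arithmetic; the concrete freq_scale, rope_yarn_log_mul and n_embd_head_k values, and the assumption that the incoming attn_factor was divided by the same 1 + 0.1 * log(1/freq_scale) term, are illustrative only and not taken from the package.

// Standalone sketch of the kq_scale pre-scaling introduced in the diff above.
// All concrete values below are illustrative assumptions.
#include <cmath>
#include <cstdio>

int main() {
    const float freq_scale        = 0.25f;  // assumed YaRN scaling factor (4x context extension)
    const float rope_yarn_log_mul = 0.1f;   // assumed hparams.rope_yarn_log_mul
    const int   n_embd_head_k     = 192;    // assumed head dimension

    // attn_factor as it arrives in the builder: assumed to have been divided by
    // the YaRN adjustment term (this is the adjustment the diff "cancels").
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

    // 1) undo the adjustment to recover the original attn_factor (== 1.0 here)
    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));

    // 2) DeepSeek-2 mscale, then fold it into the attention scale
    const float mscale   = attn_factor_org * (1.0f + 0.1f * rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f * mscale * mscale / sqrtf((float) n_embd_head_k);

    printf("attn_factor_org = %f\nmscale = %f\nkq_scale = %f\n", attn_factor_org, mscale, kq_scale);
    return 0;
}
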
package/src/llama.cpp/src/models/glm4-moe.cpp

@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-
-
-
-
-
-
-
-
-
-
-
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

package/src/llama.cpp/src/models/glm4.cpp

@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                         cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
 
-
-
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);

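The same change appears in both glm4-moe.cpp and glm4.cpp: the builders copy hparams.rope_sections into a local sections[4] array and, when hparams.use_mrope() reports a multimodal-rotary model, route Q/K through ggml_rope_multi instead of the plain ggml_rope_ext path; embedding inputs on a non-mrope GGUF are rejected outright. As a rough illustration of what "has M-RoPE sections" means, here is a hypothetical check; the helper name and the all-zero convention are assumptions, not the actual hparams.use_mrope() implementation.

#include <array>
#include <numeric>

// Hypothetical sketch: treat a model as M-RoPE capable when its GGUF metadata
// carries non-zero rope sections. M-RoPE splits the rotary dimensions into
// per-axis sections (e.g. temporal/height/width), while an all-zero array
// would mean ordinary 1-D RoPE. This mirrors the intent of hparams.use_mrope()
// above, not its exact implementation.
static bool has_mrope_sections(const std::array<int, 4> & rope_sections) {
    return std::accumulate(rope_sections.begin(), rope_sections.end(), 0) > 0;
}
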
package/src/llama.cpp/src/models/models.h

@@ -441,13 +441,14 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
             int il);
 
-    ggml_tensor *
+    ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
@@ -456,18 +457,17 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
-    ggml_tensor *
+    ggml_tensor * build_delta_net_autoregressive(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * g,
             ggml_tensor * beta,
             ggml_tensor * state,
-
-            ggml_tensor * identity,
-            int il);
+            int il);
 
     ggml_tensor * build_norm_gated(
             ggml_tensor * input,

package/src/llama.cpp/src/models/nemotron-h.cpp

@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }
 
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
-
-
-
-
-
-
+    if (model.layers[il].ffn_gate_inp == nullptr) {
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr, // no gate
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                model.layers[il].ffn_up_shexp, NULL, NULL,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }
 
     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);

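Read as a formula, the new MoE branch of build_ffn_layer adds a shared squared-ReLU FFN to the routed-expert output selected by a sigmoid router. Assuming the usual conventions (build_moe_ffn returns the weighted sum of the selected experts, with the selected sigmoid scores renormalized when expert_weights_norm is set and scaled by expert_weights_scale), the layer output is roughly:

$$
y \;=\; \mathrm{FFN}_{\text{shexp}}(x) \;+\; \sum_{i \in \mathrm{TopK}(x)} g_i(x)\, \mathrm{FFN}_i(x),
\qquad
g_i(x) \;=\; s \cdot \frac{\sigma(r_i(x))}{\sum_{j \in \mathrm{TopK}(x)} \sigma(r_j(x))}
$$

where $\sigma$ is the sigmoid gating function, $r_i$ are the router logits produced by ffn_gate_inp, $s$ stands for hparams.expert_weights_scale, and each $\mathrm{FFN}_i$ (and the shared expert) uses the same squared-ReLU activation (LLM_FFN_RELU_SQR) as the dense path.
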
package/src/llama.cpp/src/models/qwen2.cpp

@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
         {
             // compute Q and K and RoPE them
             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
 
             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
 
             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
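The qwen2.cpp change makes the Q/K/V biases optional: the unconditional ggml_add calls are replaced by guarded ones, so Qwen2-style checkpoints that ship without attention biases still build a valid graph. The pattern boils down to a one-line guard; a minimal sketch is below (the helper name is hypothetical, not part of llama.cpp).

#include "ggml.h"

// Hypothetical helper illustrating the guarded-bias pattern used above:
// only add the bias tensor when the loaded model actually provides one.
static ggml_tensor * add_bias_if_present(ggml_context * ctx, ggml_tensor * cur, ggml_tensor * bias) {
    return bias != nullptr ? ggml_add(ctx, cur, bias) : cur;
}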