@fugood/llama.node 1.4.6 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -26
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +3 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +272 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +680 -47
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +99 -29
- package/src/llama.cpp/src/llama-context.h +9 -3
- package/src/llama.cpp/src/llama-grammar.cpp +233 -33
- package/src/llama.cpp/src/llama-grammar.h +20 -1
- package/src/llama.cpp/src/llama-graph.cpp +85 -17
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +123 -52
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +8 -7
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -1,7 +1,5 @@
 #include "models.h"
 
-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
         llm_graph_context(params) {
     // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
 
     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-
-
-
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
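Read as equations (a restatement of the hunk above, not new logic), with $s$ = `freq_scale`, $m$ = `hparams.rope_yarn_log_mul`, $d_k$ = `n_embd_head_k`, and $a$ the incoming (already-adjusted) `attn_factor`:

$$
a_{\text{org}} = a\,\bigl(1 + 0.1\ln(1/s)\bigr), \qquad
\text{mscale} = a_{\text{org}}\,\bigl(1 + 0.1\,m\ln(1/s)\bigr), \qquad
\text{kq\_scale} = \frac{\text{mscale}^2}{\sqrt{d_k}}.
$$

The first factor cancels the scaling applied by `llama_hparams::yarn_attn_factor_adjust`, so the original attention factor feeds the pre-scaled `kq_scale`.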
package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp}

@@ -1,6 +1,7 @@
 #include "models.h"
 
-
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;
 
     ggml_tensor * cur;
@@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     ggml_tensor * inp_pos = build_inp_pos();
 
     // TODO: is causal == true correct? might need some changes
-
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
-
-
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }
 
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
@@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
         cur = build_norm(cur,
                 model.layers[il].ffn_post_norm, NULL,
                 LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
+        cb(cur, "ffn_post_norm", il);
 
         cur = ggml_add(ctx0, cur, sa_out);
 
@@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     // lm_head
     cur = build_lora_mm(model.output, cur);
 
+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
     ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
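The final hunk adds Gemma-style final-logit soft-capping, gated on `hparams.f_final_logit_softcapping`: logits are squashed as $y = c \cdot \tanh(x/c)$, approximately linear for $|x| \ll c$ and saturating toward $\pm c$. A minimal scalar sketch of the same transform; the cap value below is illustrative, not taken from this diff:

```cpp
#include <cmath>
#include <cstdio>

// y = cap * tanh(x / cap): near-identity for small |x|, bounded by +/-cap.
static float softcap(float x, float cap) {
    return cap * std::tanh(x / cap);
}

int main() {
    const float cap = 30.0f; // illustrative value only
    for (float x : {1.0f, 10.0f, 100.0f}) {
        std::printf("softcap(%6.1f) = %7.3f\n", x, softcap(x, cap));
    }
    return 0;
}
```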
package/src/llama.cpp/src/models/glm4-moe.cpp

@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-
-
-
-
-
-
-
-
-
-
-
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
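Both GLM4 builders (this file and glm4.cpp below) now fork on `hparams.use_mrope()`: when the GGUF carries M-RoPE metadata, `ggml_rope_multi` rotates the head dimensions in sections, each driven by its own position stream (e.g. text index, image row, image column); otherwise the classic `ggml_rope_ext` path is kept. A standalone illustration of how a `sections[4]` array partitions the rotary dimensions; the values are assumptions, not from this diff:

```cpp
#include <array>
#include <cstdio>

int main() {
    // hypothetical split: 16 temporal + 24 height + 24 width dims, last slot unused
    const std::array<int, 4> sections = {16, 24, 24, 0};
    int begin = 0;
    for (std::size_t i = 0; i < sections.size(); ++i) {
        std::printf("section %zu: rotary dims [%d, %d) use their own position stream\n",
                    i, begin, begin + sections[i]);
        begin += sections[i];
    }
    return 0;
}
```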
package/src/llama.cpp/src/models/glm4.cpp

@@ -8,11 +8,20 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -63,11 +72,25 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
                 Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
                         cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));
             }
-            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
 
-
-
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                        rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
package/src/llama.cpp/src/models/models.h

@@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
     llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };
 
-
-
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };
 
 struct llm_build_gemma3n_iswa : public llm_graph_context {
@@ -440,13 +441,14 @@ private:
             ggml_tensor * cur,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
     ggml_tensor * build_layer_ffn(
             ggml_tensor * cur,
             int il);
 
-    ggml_tensor *
+    ggml_tensor * build_delta_net_chunking(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
@@ -455,18 +457,17 @@ private:
             ggml_tensor * state,
             ggml_tensor * causal_mask,
             ggml_tensor * identity,
+            ggml_tensor * diag_mask,
             int il);
 
-    ggml_tensor *
+    ggml_tensor * build_delta_net_autoregressive(
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * g,
             ggml_tensor * beta,
             ggml_tensor * state,
-
-            ggml_tensor * identity,
-            int il);
+            int il);
 
     ggml_tensor * build_norm_gated(
             ggml_tensor * input,
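The first hunk above replaces the dedicated `llm_build_gemma3_iswa` struct with a single `template <bool iswa>` builder, explicitly instantiated for both variants in gemma3.cpp. A self-contained sketch of that pattern; all names here are invented for illustration, not llama.cpp types:

```cpp
#include <cstdio>
#include <type_traits>

// Stand-ins for the two attention-input types selected at compile time.
struct kv_plain { static constexpr const char * name = "kv";      };
struct kv_iswa  { static constexpr const char * name = "kv_iswa"; };

// One templated builder, branching on a bool parameter like llm_build_gemma3<iswa>.
template <bool iswa>
struct builder {
    using attn_input = std::conditional_t<iswa, kv_iswa, kv_plain>;
    void run() const { std::printf("attn input: %s\n", attn_input::name); }
};

// Explicit instantiation of both variants, mirroring the diff's trailing lines.
template struct builder<false>;
template struct builder<true>;

int main() {
    builder<false>{}.run();
    builder<true>{}.run();
    return 0;
}
```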
package/src/llama.cpp/src/models/nemotron-h.cpp

@@ -107,12 +107,41 @@ ggml_tensor * llm_build_nemotron_h::build_attention_layer(ggml_tensor *
 }
 
 ggml_tensor * llm_build_nemotron_h::build_ffn_layer(ggml_tensor * cur, const llama_model & model, const int il) {
-
-
-
-
-
-
+    if (model.layers[il].ffn_gate_inp == nullptr) {
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                NULL, NULL, NULL,
+                model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+    } else {
+        ggml_tensor * ffn_inp = cur;
+        ggml_tensor * moe_out =
+            build_moe_ffn(ffn_inp,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    nullptr, // no gate
+                    model.layers[il].ffn_down_exps,
+                    model.layers[il].ffn_exp_probs_b,
+                    n_expert, n_expert_used,
+                    LLM_FFN_RELU_SQR, hparams.expert_weights_norm,
+                    true, hparams.expert_weights_scale,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                    il);
+        cb(moe_out, "ffn_moe_out", il);
+
+        ggml_tensor * ffn_shexp = build_ffn(ffn_inp,
+                model.layers[il].ffn_up_shexp, NULL, NULL,
+                NULL /* no gate */ , NULL, NULL,
+                model.layers[il].ffn_down_shexp, NULL, NULL,
+                NULL,
+                LLM_FFN_RELU_SQR, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+    }
 
     cur = build_cvec(cur, il);
     cb(cur, "l_out", il);
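The rewritten `build_ffn_layer` dispatches on `ffn_gate_inp`: a dense FFN when no router is present, otherwise routed experts plus a shared expert summed together (`cur = moe_out + ffn_shexp`). Both paths use the `LLM_FFN_RELU_SQR` activation, i.e. squared ReLU. A minimal scalar sketch of that activation, not the ggml implementation:

```cpp
#include <algorithm>
#include <cstdio>

// Squared ReLU: f(x) = max(x, 0)^2
static float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}

int main() {
    for (float x : {-2.0f, 0.5f, 3.0f}) {
        std::printf("relu_sqr(%5.2f) = %5.2f\n", x, relu_sqr(x));
    }
    return 0;
}
```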
package/src/llama.cpp/src/models/qwen2.cpp

@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
         {
             // compute Q and K and RoPE them
             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
 
             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
 
             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);