@fugood/llama.node 1.4.11 → 1.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +30 -30
- package/src/llama.cpp/common/arg.cpp +29 -14
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +32 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +23 -23
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +13 -4
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +76 -0
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +22 -21
- package/src/llama.cpp/src/llama-hparams.h +4 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +287 -16
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +44 -33
- package/src/llama.cpp/src/llama-sampling.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +52 -37
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/unicode.cpp +23 -14
|
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
|
|
|
142
142
|
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
143
143
|
cb(cur, "ffn_out", il);
|
|
144
144
|
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
145
|
+
const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
|
|
146
|
+
auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
|
|
145
147
|
cur = build_ffn(cur,
|
|
146
|
-
model.layers[il].ffn_up,
|
|
148
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
147
149
|
model.layers[il].ffn_gate, NULL, NULL,
|
|
148
150
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
|
|
149
|
-
|
|
151
|
+
type_op, LLM_FFN_PAR, il);
|
|
150
152
|
cb(cur, "ffn_out", il);
|
|
151
153
|
} else {
|
|
152
154
|
cur = build_ffn(cur,
|
|
@@ -3,12 +3,14 @@
|
|
|
3
3
|
llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
|
|
4
4
|
llm_graph_context(params) {
|
|
5
5
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6
|
-
float
|
|
6
|
+
const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
|
|
7
7
|
|
|
8
8
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
9
9
|
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
10
10
|
|
|
11
|
-
ggml_tensor *inpL
|
|
11
|
+
ggml_tensor * inpL;
|
|
12
|
+
ggml_tensor * cur;
|
|
13
|
+
|
|
12
14
|
inpL = build_inp_embd(model.tok_embd);
|
|
13
15
|
|
|
14
16
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
|
|
44
46
|
}
|
|
45
47
|
|
|
46
48
|
ggml_tensor * inpSA = inpL;
|
|
47
|
-
cur
|
|
49
|
+
cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
48
50
|
|
|
49
51
|
// build self attention
|
|
50
52
|
{
|
|
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|
|
215
215
|
model.layers[il].ffn_exp_probs_b,
|
|
216
216
|
n_expert, n_expert_used,
|
|
217
217
|
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
218
|
-
|
|
218
|
+
hparams.expert_weights_scale, hparams.expert_weights_scale,
|
|
219
219
|
(llama_expert_gating_func_type) hparams.expert_gating_func,
|
|
220
220
|
il);
|
|
221
221
|
cb(moe_out, "ffn_moe_out", il);
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
3
|
llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
|
|
6
4
|
llm_graph_context(params) {
|
|
7
5
|
const int64_t n_embd_head = hparams.n_embd_head_k;
|
|
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
|
|
|
12
10
|
inpL = build_inp_embd(model.tok_embd);
|
|
13
11
|
|
|
14
12
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
cb(inpL, "inp_scaled", -1);
|
|
18
|
-
}
|
|
13
|
+
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
|
14
|
+
cb(inpL, "inp_scaled", -1);
|
|
19
15
|
|
|
20
16
|
// inp_pos - contains the positions
|
|
21
17
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
|
|
|
10
10
|
inpL = build_inp_embd(model.tok_embd);
|
|
11
11
|
|
|
12
12
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
}
|
|
13
|
+
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
|
14
|
+
cb(inpL, "inp_scaled", -1);
|
|
15
|
+
|
|
17
16
|
// inp_pos - contains the positions
|
|
18
17
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
19
18
|
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
3
|
llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
|
|
6
4
|
llm_graph_context(params),
|
|
7
5
|
model(model),
|
|
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
|
|
|
15
13
|
inpL = build_inp_embd(model.tok_embd);
|
|
16
14
|
|
|
17
15
|
// important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
}
|
|
16
|
+
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
|
|
17
|
+
cb(inpL, "inp_scaled", -1);
|
|
18
|
+
|
|
22
19
|
// inp_pos - contains the positions
|
|
23
20
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
24
21
|
|
|
@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
|
|
|
248
245
|
// equivalent to get_per_layer_inputs() in python code
|
|
249
246
|
// output shape: [n_embd_altup, n_layer, n_tokens]
|
|
250
247
|
ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
|
|
251
|
-
auto
|
|
248
|
+
auto inp = std::make_unique<llm_graph_input_embd>();
|
|
252
249
|
ggml_tensor * inp_per_layer;
|
|
253
250
|
if (ubatch.token) {
|
|
254
251
|
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "models.h"
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
template <bool embed>
|
|
4
|
+
llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
5
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
6
|
|
|
6
7
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
@@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
|
|
|
14
15
|
// inp_pos - contains the positions
|
|
15
16
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
|
|
19
|
+
|
|
20
|
+
inp_attn_type * inp_attn = nullptr;
|
|
21
|
+
if constexpr (embed) {
|
|
22
|
+
inp_attn = build_attn_inp_no_cache();
|
|
23
|
+
} else {
|
|
24
|
+
inp_attn = build_attn_inp_kv();
|
|
25
|
+
}
|
|
18
26
|
|
|
19
27
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
20
28
|
|
|
@@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
|
|
|
145
153
|
cb(cur, "result_norm", -1);
|
|
146
154
|
res->t_embd = cur;
|
|
147
155
|
|
|
148
|
-
|
|
149
|
-
|
|
156
|
+
if constexpr (!embed) {
|
|
157
|
+
// lm_head
|
|
158
|
+
cur = build_lora_mm(model.output, cur);
|
|
150
159
|
|
|
151
|
-
|
|
152
|
-
|
|
160
|
+
cb(cur, "result_output", -1);
|
|
161
|
+
res->t_logits = cur;
|
|
162
|
+
}
|
|
153
163
|
|
|
154
164
|
ggml_build_forward_expand(gf, cur);
|
|
155
165
|
}
|
|
166
|
+
|
|
167
|
+
template struct llm_build_llama<false>;
|
|
168
|
+
template struct llm_build_llama<true>;
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
|
|
6
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
8
|
+
|
|
9
|
+
ggml_tensor * cur;
|
|
10
|
+
ggml_tensor * inpL;
|
|
11
|
+
|
|
12
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
13
|
+
|
|
14
|
+
// inp_pos - contains the positions
|
|
15
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
16
|
+
|
|
17
|
+
auto * inp_attn = build_attn_inp_kv();
|
|
18
|
+
|
|
19
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
20
|
+
|
|
21
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
22
|
+
ggml_tensor * inpSA = inpL;
|
|
23
|
+
|
|
24
|
+
// norm
|
|
25
|
+
cur = build_norm(inpL,
|
|
26
|
+
model.layers[il].attn_norm, NULL,
|
|
27
|
+
LLM_NORM_RMS, il);
|
|
28
|
+
cb(cur, "attn_norm", il);
|
|
29
|
+
|
|
30
|
+
// self-attention
|
|
31
|
+
{
|
|
32
|
+
// compute Q and K and RoPE them
|
|
33
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
34
|
+
cb(Qcur, "Qcur", il);
|
|
35
|
+
|
|
36
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
37
|
+
cb(Kcur, "Kcur", il);
|
|
38
|
+
|
|
39
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
40
|
+
cb(Vcur, "Vcur", il);
|
|
41
|
+
|
|
42
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
43
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
44
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
45
|
+
|
|
46
|
+
Qcur = ggml_rope_ext(
|
|
47
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
48
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
49
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
50
|
+
);
|
|
51
|
+
|
|
52
|
+
Kcur = ggml_rope_ext(
|
|
53
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
54
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
55
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
56
|
+
);
|
|
57
|
+
|
|
58
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
59
|
+
cb(Qcur, "Qcur_normed", il);
|
|
60
|
+
|
|
61
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
62
|
+
cb(Kcur, "Kcur_normed", il);
|
|
63
|
+
|
|
64
|
+
cb(Qcur, "Qcur", il);
|
|
65
|
+
cb(Kcur, "Kcur", il);
|
|
66
|
+
cb(Vcur, "Vcur", il);
|
|
67
|
+
|
|
68
|
+
cur = build_attn(inp_attn,
|
|
69
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
70
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
71
|
+
}
|
|
72
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
73
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
74
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
75
|
+
}
|
|
76
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
77
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
78
|
+
|
|
79
|
+
// feed-forward network
|
|
80
|
+
cur = build_norm(ffn_inp,
|
|
81
|
+
model.layers[il].ffn_norm, NULL,
|
|
82
|
+
LLM_NORM_RMS, il);
|
|
83
|
+
cb(cur, "ffn_norm", il);
|
|
84
|
+
|
|
85
|
+
cur = build_ffn(cur,
|
|
86
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
87
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
88
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
89
|
+
NULL,
|
|
90
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
91
|
+
cb(cur, "ffn_out", il);
|
|
92
|
+
|
|
93
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
94
|
+
|
|
95
|
+
cur = build_cvec(cur, il);
|
|
96
|
+
cb(cur, "l_out", il);
|
|
97
|
+
|
|
98
|
+
// input for next layer
|
|
99
|
+
inpL = cur;
|
|
100
|
+
}
|
|
101
|
+
cur = inpL;
|
|
102
|
+
|
|
103
|
+
cur = build_norm(cur,
|
|
104
|
+
model.output_norm, NULL,
|
|
105
|
+
LLM_NORM_RMS, -1);
|
|
106
|
+
|
|
107
|
+
cb(cur, "result_norm", -1);
|
|
108
|
+
res->t_embd = cur;
|
|
109
|
+
|
|
110
|
+
// lm_head
|
|
111
|
+
cur = build_lora_mm(model.output, cur);
|
|
112
|
+
|
|
113
|
+
cb(cur, "result_output", -1);
|
|
114
|
+
res->t_logits = cur;
|
|
115
|
+
|
|
116
|
+
ggml_build_forward_expand(gf, cur);
|
|
117
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
|
|
2
|
+
#include "models.h"
|
|
3
|
+
|
|
4
|
+
llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
5
|
+
ggml_tensor * cur;
|
|
6
|
+
ggml_tensor * inpL;
|
|
7
|
+
|
|
8
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
9
|
+
|
|
10
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
11
|
+
auto * inp_attn = build_attn_inp_kv_iswa();
|
|
12
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
13
|
+
|
|
14
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
15
|
+
ggml_tensor * inpSA = inpL;
|
|
16
|
+
|
|
17
|
+
uint32_t n_head_l = hparams.n_head(il);
|
|
18
|
+
uint32_t n_head_kv_l = hparams.n_head_kv(il);
|
|
19
|
+
const float freq_base_l = model.get_rope_freq_base(cparams, il);
|
|
20
|
+
const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
|
|
21
|
+
|
|
22
|
+
cur = inpL;
|
|
23
|
+
|
|
24
|
+
// self_attention
|
|
25
|
+
{
|
|
26
|
+
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
|
|
27
|
+
cb(cur, "attn_norm", il);
|
|
28
|
+
|
|
29
|
+
// compute Q and K and RoPE them
|
|
30
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
31
|
+
cb(Qcur, "Qcur", il);
|
|
32
|
+
|
|
33
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
34
|
+
cb(Kcur, "Kcur", il);
|
|
35
|
+
|
|
36
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
37
|
+
cb(Vcur, "Vcur", il);
|
|
38
|
+
|
|
39
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
|
|
40
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
|
|
41
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
|
|
42
|
+
|
|
43
|
+
Qcur = ggml_rope_ext(
|
|
44
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
45
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
46
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
47
|
+
);
|
|
48
|
+
|
|
49
|
+
Kcur = ggml_rope_ext(
|
|
50
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
51
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
|
|
52
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
cb(Qcur, "Qcur", il);
|
|
56
|
+
cb(Kcur, "Kcur", il);
|
|
57
|
+
cb(Vcur, "Vcur", il);
|
|
58
|
+
|
|
59
|
+
ggml_tensor * sinks = model.layers[il].attn_sinks;
|
|
60
|
+
|
|
61
|
+
cur = build_attn(inp_attn,
|
|
62
|
+
model.layers[il].wo, NULL,
|
|
63
|
+
Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
67
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
68
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
72
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
73
|
+
|
|
74
|
+
cur = build_norm(ffn_inp,
|
|
75
|
+
model.layers[il].ffn_norm, NULL,
|
|
76
|
+
LLM_NORM_RMS, il);
|
|
77
|
+
cb(cur, "ffn_norm", il);
|
|
78
|
+
|
|
79
|
+
// feed-forward network
|
|
80
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
81
|
+
// dense branch
|
|
82
|
+
cur = build_ffn(cur,
|
|
83
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
84
|
+
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
|
|
85
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
86
|
+
NULL,
|
|
87
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
88
|
+
cb(cur, "ffn_out", il);
|
|
89
|
+
} else {
|
|
90
|
+
// MoE branch
|
|
91
|
+
cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
|
|
92
|
+
model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
|
|
93
|
+
model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
|
|
94
|
+
0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
|
|
95
|
+
cb(cur, "ffn_moe_out", il);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
99
|
+
|
|
100
|
+
cur = build_cvec(cur, il);
|
|
101
|
+
cb(cur, "l_out", il);
|
|
102
|
+
|
|
103
|
+
// input for next layer
|
|
104
|
+
inpL = cur;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
cur = inpL;
|
|
108
|
+
|
|
109
|
+
cur = build_norm(cur,
|
|
110
|
+
model.output_norm, NULL,
|
|
111
|
+
LLM_NORM_RMS, -1);
|
|
112
|
+
|
|
113
|
+
cb(cur, "result_norm", -1);
|
|
114
|
+
res->t_embd = cur;
|
|
115
|
+
|
|
116
|
+
// lm_head
|
|
117
|
+
cur = build_lora_mm(model.output, cur);
|
|
118
|
+
|
|
119
|
+
cb(cur, "result_output", -1);
|
|
120
|
+
res->t_logits = cur;
|
|
121
|
+
|
|
122
|
+
ggml_build_forward_expand(gf, cur);
|
|
123
|
+
}
|
|
@@ -303,6 +303,7 @@ struct llm_build_llada_moe : public llm_graph_context {
|
|
|
303
303
|
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
|
|
304
304
|
};
|
|
305
305
|
|
|
306
|
+
template <bool embed>
|
|
306
307
|
struct llm_build_llama : public llm_graph_context {
|
|
307
308
|
llm_build_llama(const llama_model & model, const llm_graph_params & params);
|
|
308
309
|
};
|
|
@@ -311,10 +312,18 @@ struct llm_build_llama_iswa : public llm_graph_context {
|
|
|
311
312
|
llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
|
|
312
313
|
};
|
|
313
314
|
|
|
315
|
+
struct llm_build_maincoder : public llm_graph_context {
|
|
316
|
+
llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
|
|
317
|
+
};
|
|
318
|
+
|
|
314
319
|
struct llm_build_mamba : public llm_graph_context_mamba {
|
|
315
320
|
llm_build_mamba(const llama_model & model, const llm_graph_params & params);
|
|
316
321
|
};
|
|
317
322
|
|
|
323
|
+
struct llm_build_mimo2_iswa : public llm_graph_context {
|
|
324
|
+
llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
|
|
325
|
+
};
|
|
326
|
+
|
|
318
327
|
struct llm_build_minicpm3 : public llm_graph_context {
|
|
319
328
|
llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
|
|
320
329
|
};
|
|
@@ -327,6 +336,10 @@ struct llm_build_mistral3 : public llm_graph_context {
|
|
|
327
336
|
llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
|
|
328
337
|
};
|
|
329
338
|
|
|
339
|
+
struct llm_build_modern_bert : public llm_graph_context {
|
|
340
|
+
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
|
|
341
|
+
};
|
|
342
|
+
|
|
330
343
|
struct llm_build_mpt : public llm_graph_context {
|
|
331
344
|
llm_build_mpt(const llama_model & model, const llm_graph_params & params);
|
|
332
345
|
};
|
|
@@ -396,6 +409,11 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
396
409
|
llm_build_plamo(const llama_model & model, const llm_graph_params & params);
|
|
397
410
|
};
|
|
398
411
|
|
|
412
|
+
template <bool iswa>
|
|
413
|
+
struct llm_build_plamo3 : public llm_graph_context {
|
|
414
|
+
llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
|
|
415
|
+
};
|
|
416
|
+
|
|
399
417
|
struct llm_build_plm : public llm_graph_context {
|
|
400
418
|
llm_build_plm(const llama_model & model, const llm_graph_params & params);
|
|
401
419
|
};
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
#include "models.h"
|
|
2
|
+
|
|
3
|
+
llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
|
|
4
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6
|
+
|
|
7
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
8
|
+
|
|
9
|
+
ggml_tensor * cur;
|
|
10
|
+
ggml_tensor * inpL;
|
|
11
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
12
|
+
|
|
13
|
+
// construct input embeddings (token, type, position)
|
|
14
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
15
|
+
cb(inpL, "inp_embd", -1);
|
|
16
|
+
|
|
17
|
+
// embed layer norm
|
|
18
|
+
inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
|
|
19
|
+
cb(inpL, "inp_norm", -1);
|
|
20
|
+
|
|
21
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
22
|
+
|
|
23
|
+
auto * inp_attn = build_attn_inp_no_cache();
|
|
24
|
+
|
|
25
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
26
|
+
float freq_base_l = model.get_rope_freq_base(cparams, il);
|
|
27
|
+
|
|
28
|
+
cur = inpL;
|
|
29
|
+
|
|
30
|
+
// attention layer norm
|
|
31
|
+
if (model.layers[il].attn_norm) {
|
|
32
|
+
cur = build_norm(inpL,
|
|
33
|
+
model.layers[il].attn_norm, NULL,
|
|
34
|
+
LLM_NORM, il);
|
|
35
|
+
cb(cur, "attn_norm", il);
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
// self attention
|
|
39
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
40
|
+
cb(cur, "wqkv", il);
|
|
41
|
+
|
|
42
|
+
const size_t type_size = ggml_type_size(cur->type);
|
|
43
|
+
|
|
44
|
+
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
|
|
45
|
+
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
|
|
46
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
|
|
47
|
+
|
|
48
|
+
// RoPE
|
|
49
|
+
Qcur = ggml_rope_ext(
|
|
50
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
51
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
|
|
52
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
53
|
+
);
|
|
54
|
+
|
|
55
|
+
Kcur = ggml_rope_ext(
|
|
56
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
57
|
+
n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
|
|
58
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
59
|
+
);
|
|
60
|
+
|
|
61
|
+
cb(Qcur, "Qcur", il);
|
|
62
|
+
cb(Kcur, "Kcur", il);
|
|
63
|
+
cb(Vcur, "Vcur", il);
|
|
64
|
+
|
|
65
|
+
cur = build_attn(inp_attn,
|
|
66
|
+
model.layers[il].wo, nullptr,
|
|
67
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
68
|
+
cb(cur, "kqv_out", il);
|
|
69
|
+
|
|
70
|
+
if (il == n_layer - 1 && inp_out_ids) {
|
|
71
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
72
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// re-add the layer input
|
|
76
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
77
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
78
|
+
|
|
79
|
+
// attention layer norm
|
|
80
|
+
cur = build_norm(ffn_inp,
|
|
81
|
+
model.layers[il].ffn_norm, NULL,
|
|
82
|
+
LLM_NORM, il);
|
|
83
|
+
cb(cur, "ffn_norm", il);
|
|
84
|
+
|
|
85
|
+
cur = build_ffn(cur,
|
|
86
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
87
|
+
NULL, NULL, NULL,
|
|
88
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
89
|
+
NULL,
|
|
90
|
+
LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
|
|
91
|
+
|
|
92
|
+
// attentions bypass the intermediate layer
|
|
93
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
94
|
+
|
|
95
|
+
// input for next layer
|
|
96
|
+
inpL = cur;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
cur = inpL;
|
|
100
|
+
|
|
101
|
+
cur = build_norm(cur,
|
|
102
|
+
model.output_norm, NULL,
|
|
103
|
+
LLM_NORM, -1);
|
|
104
|
+
cb(cur, "final_norm_out", -1);
|
|
105
|
+
|
|
106
|
+
if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
|
107
|
+
// extracting cls token
|
|
108
|
+
cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
|
|
109
|
+
cb(cur, "cls_pooled_embd", -1);
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
cb(cur, "res_embd", -1);
|
|
113
|
+
res->t_embd = cur;
|
|
114
|
+
ggml_build_forward_expand(gf, cur);
|
|
115
|
+
}
|