@fugood/llama.node 1.4.10 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/LlamaContext.cpp +1 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -14
  5. package/src/llama.cpp/common/arg.h +1 -0
  6. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  7. package/src/llama.cpp/common/chat.cpp +32 -3
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +23 -23
  10. package/src/llama.cpp/common/common.h +1 -1
  11. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  12. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  20. package/src/llama.cpp/include/llama.h +13 -4
  21. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  23. package/src/llama.cpp/src/llama-adapter.h +7 -1
  24. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  25. package/src/llama.cpp/src/llama-arch.h +7 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +22 -21
  29. package/src/llama.cpp/src/llama-hparams.h +4 -3
  30. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  31. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  32. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  33. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  34. package/src/llama.cpp/src/llama-model.cpp +287 -16
  35. package/src/llama.cpp/src/llama-model.h +13 -2
  36. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  37. package/src/llama.cpp/src/llama-sampling.h +3 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  39. package/src/llama.cpp/src/llama-vocab.h +2 -0
  40. package/src/llama.cpp/src/llama.cpp +52 -37
  41. package/src/llama.cpp/src/models/bert.cpp +4 -2
  42. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  43. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  44. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  45. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  46. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  47. package/src/llama.cpp/src/models/llama.cpp +19 -6
  48. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  49. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  50. package/src/llama.cpp/src/models/models.h +18 -0
  51. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  52. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  53. package/src/llama.cpp/src/unicode.cpp +23 -14
@@ -142,11 +142,13 @@ llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params
              LLM_FFN_GELU, LLM_FFN_SEQ, il);
      cb(cur, "ffn_out", il);
  } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
+     const bool up_contains_gate = !model.layers[il].ffn_gate && model.layers[il].ffn_up->ne[1] != hparams.n_ff();
+     auto type_op = up_contains_gate ? LLM_FFN_GEGLU : LLM_FFN_GELU;
      cur = build_ffn(cur,
-             model.layers[il].ffn_up, NULL, NULL,
+             model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
              model.layers[il].ffn_gate, NULL, NULL,
              model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL,
-             model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_GEGLU, LLM_FFN_PAR, il);
+             type_op, LLM_FFN_PAR, il);
      cb(cur, "ffn_out", il);
  } else {
      cur = build_ffn(cur,
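
The new JINA_BERT_V2 branch infers the activation from tensor shapes: when there is no separate ffn_gate tensor and the output dimension of ffn_up differs from n_ff, the gate is taken to be fused into the up projection, so the FFN must use the gated GEGLU path. A minimal standalone sketch of that decision, with hypothetical names (not code from the package):

    #include <cassert>
    #include <cstdint>

    // Sketch: choose GELU vs. GEGLU the way the JINA_BERT_V2 branch above does.
    // If no separate gate tensor exists and the up projection is wider than n_ff,
    // the gate is fused into ffn_up, so a gated activation splits it in two.
    enum ffn_op { FFN_GELU, FFN_GEGLU };

    static ffn_op pick_ffn_op(bool has_gate_tensor, int64_t up_out_dim, int64_t n_ff) {
        const bool up_contains_gate = !has_gate_tensor && up_out_dim != n_ff;
        return up_contains_gate ? FFN_GEGLU : FFN_GELU;
    }

    int main() {
        assert(pick_ffn_op(true,  4096, 4096) == FFN_GELU);  // separate gate tensor
        assert(pick_ffn_op(false, 8192, 4096) == FFN_GEGLU); // gate fused into up
        return 0;
    }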
@@ -3,12 +3,14 @@
  llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params) {
      const int64_t n_embd_head = hparams.n_embd_head_v;
-     float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+     const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

      GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
      GGML_ASSERT(n_embd_head == hparams.n_rot);

-     ggml_tensor *inpL, *cur;
+     ggml_tensor * inpL;
+     ggml_tensor * cur;
+
      inpL = build_inp_embd(model.tok_embd);

      ggml_tensor * inp_pos = build_inp_pos();
@@ -44,7 +46,7 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
      }

      ggml_tensor * inpSA = inpL;
-     cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+     cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);

      // build self attention
      {
@@ -215,7 +215,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
              model.layers[il].ffn_exp_probs_b,
              n_expert, n_expert_used,
              LLM_FFN_SILU, hparams.expert_weights_norm,
-             true, hparams.expert_weights_scale,
+             hparams.expert_weights_scale, hparams.expert_weights_scale,
              (llama_expert_gating_func_type) hparams.expert_gating_func,
              il);
      cb(moe_out, "ffn_moe_out", il);
@@ -1,7 +1,5 @@
  #include "models.h"

-
-
  llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params) {
      const int64_t n_embd_head = hparams.n_embd_head_k;
@@ -12,10 +10,8 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model,
      inpL = build_inp_embd(model.tok_embd);

      // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-     if (ubatch.token) {
-         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-         cb(inpL, "inp_scaled", -1);
-     }
+     inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+     cb(inpL, "inp_scaled", -1);

      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();
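
This hunk, and the matching gemma3 and gemma3n hunks below, fold a conditional scaling block into one unconditional ggml_scale whose factor degenerates to 1.0f (the identity) for non-token input, presumably so the graph topology no longer depends on the batch contents. The folded branch on plain floats, as a sketch with hypothetical names:

    #include <cmath>
    #include <cstdio>

    // Sketch: fold "scale embeddings by sqrt(n_embd) only for token input" into a
    // single unconditional multiply. Scaling by 1.0f is the identity, so the op
    // can stay in the graph for both token and raw-embedding inputs.
    static float embed_scale(bool is_token_input, int n_embd) {
        return is_token_input ? std::sqrt((float) n_embd) : 1.0f;
    }

    int main() {
        std::printf("token input: %.3f\n", embed_scale(true,  2048)); // ~45.255
        std::printf("image embd:  %.3f\n", embed_scale(false, 2048)); // 1.000
        return 0;
    }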
@@ -10,10 +10,9 @@ llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_gr
      inpL = build_inp_embd(model.tok_embd);

      // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-     if (ubatch.token) {
-         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-         cb(inpL, "inp_scaled", -1);
-     }
+     inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+     cb(inpL, "inp_scaled", -1);
+
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();

@@ -1,7 +1,5 @@
  #include "models.h"

-
-
  llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
      llm_graph_context(params),
      model(model),
@@ -15,10 +13,9 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const
      inpL = build_inp_embd(model.tok_embd);

      // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings)
-     if (ubatch.token) {
-         inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
-         cb(inpL, "inp_scaled", -1);
-     }
+     inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+     cb(inpL, "inp_scaled", -1);
+
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();

@@ -248,7 +245,7 @@ ggml_tensor * llm_build_gemma3n_iswa::view_2d_slice(ggml_tensor * x, int idx) {
  // equivalent to get_per_layer_inputs() in python code
  // output shape: [n_embd_altup, n_layer, n_tokens]
  ggml_tensor * llm_build_gemma3n_iswa::get_per_layer_inputs() {
-     auto inp = std::make_unique<llm_graph_input_embd>();
+     auto inp = std::make_unique<llm_graph_input_embd>();
      ggml_tensor * inp_per_layer;
      if (ubatch.token) {
          inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
@@ -1,6 +1,7 @@
  #include "models.h"

- llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ template <bool embed>
+ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
      const int64_t n_embd_head = hparams.n_embd_head_v;

      GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -14,7 +15,14 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
      // inp_pos - contains the positions
      ggml_tensor * inp_pos = build_inp_pos();

-     auto * inp_attn = build_attn_inp_kv();
+     using inp_attn_type = std::conditional_t<embed, llm_graph_input_attn_no_cache, llm_graph_input_attn_kv>;
+
+     inp_attn_type * inp_attn = nullptr;
+     if constexpr (embed) {
+         inp_attn = build_attn_inp_no_cache();
+     } else {
+         inp_attn = build_attn_inp_kv();
+     }

      const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -145,11 +153,16 @@ llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_para
      cb(cur, "result_norm", -1);
      res->t_embd = cur;

-     // lm_head
-     cur = build_lora_mm(model.output, cur);
+     if constexpr (!embed) {
+         // lm_head
+         cur = build_lora_mm(model.output, cur);

-     cb(cur, "result_output", -1);
-     res->t_logits = cur;
+         cb(cur, "result_output", -1);
+         res->t_logits = cur;
+     }

      ggml_build_forward_expand(gf, cur);
  }
+
+ template struct llm_build_llama<false>;
+ template struct llm_build_llama<true>;
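
The change above turns llm_build_llama into a template over a compile-time embed flag: std::conditional_t selects the attention-input type, if constexpr compiles out the lm_head for the embedding variant, and the two explicit instantiations keep the definition in the .cpp file. A self-contained sketch of the same pattern, with invented types standing in for the graph machinery:

    #include <iostream>
    #include <type_traits>

    struct attn_kv       { const char * name = "kv-cache attention"; };
    struct attn_no_cache { const char * name = "cache-less attention"; };

    // Sketch: one builder, two compile-time variants, mirroring llm_build_llama<embed>.
    template <bool embed>
    struct graph_builder {
        using inp_attn_type = std::conditional_t<embed, attn_no_cache, attn_kv>;

        void build() {
            inp_attn_type inp_attn;
            std::cout << "input: " << inp_attn.name << '\n';
            if constexpr (!embed) {
                std::cout << "building lm_head + logits\n"; // compiled out when embed
            }
        }
    };

    // explicit instantiations, as in `template struct llm_build_llama<false>;`
    template struct graph_builder<false>;
    template struct graph_builder<true>;

    int main() {
        graph_builder<false>{}.build(); // generative: kv cache, logits
        graph_builder<true>{}.build();  // embedding: no cache, embeddings only
        return 0;
    }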
@@ -0,0 +1,117 @@
+ #include "models.h"
+
+ llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL,
+                 model.layers[il].attn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self-attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = ggml_rope_ext(
+                     ctx0, Qcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Kcur = ggml_rope_ext(
+                     ctx0, Kcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+             cb(Qcur, "Qcur_normed", il);
+
+             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+             cb(Kcur, "Kcur_normed", il);
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // feed-forward network
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         cur = build_ffn(cur,
+                 model.layers[il].ffn_up, NULL, NULL,
+                 model.layers[il].ffn_gate, NULL, NULL,
+                 model.layers[il].ffn_down, NULL, NULL,
+                 NULL,
+                 LLM_FFN_SILU, LLM_FFN_PAR, il);
+         cb(cur, "ffn_out", il);
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
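
One detail worth noting in the new maincoder graph: attn_q_norm and attn_k_norm are applied to Q and K after RoPE, whereas many builders in this codebase normalize before rotating. For reference, a bare RMS-norm sketch (the learned per-channel weight that build_norm also applies is omitted; this is not the ggml implementation):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Sketch: RMS normalization of one attention head's vector, the operation
    // behind the LLM_NORM_RMS steps above (minus the learned weight multiply).
    static void rms_norm(std::vector<float> & x, float eps = 1e-6f) {
        float sum_sq = 0.0f;
        for (float v : x) sum_sq += v * v;
        const float inv_rms = 1.0f / std::sqrt(sum_sq / x.size() + eps);
        for (float & v : x) v *= inv_rms;
    }

    int main() {
        std::vector<float> head = {1.0f, 2.0f, 3.0f, 4.0f};
        rms_norm(head);
        for (float v : head) std::printf("%.4f ", v); // unit-RMS output
        std::printf("\n");
        return 0;
    }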
@@ -0,0 +1,123 @@
+
+ #include "models.h"
+
+ llm_build_mimo2_iswa::llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     ggml_tensor * inp_pos = build_inp_pos();
+     auto * inp_attn = build_attn_inp_kv_iswa();
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         uint32_t n_head_l    = hparams.n_head(il);
+         uint32_t n_head_kv_l = hparams.n_head_kv(il);
+         const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+         const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
+         cur = inpL;
+
+         // self_attention
+         {
+             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+             cb(cur, "attn_norm", il);
+
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+             Qcur = ggml_rope_ext(
+                     ctx0, Qcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Kcur = ggml_rope_ext(
+                     ctx0, Kcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             ggml_tensor * sinks = model.layers[il].attn_sinks;
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, NULL,
+                     Qcur, Kcur, Vcur, nullptr, sinks, nullptr, 1.0f/sqrtf(float(n_embd_head_k)), il);
+         }
+
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         // feed-forward network
+         if (model.layers[il].ffn_gate_inp == nullptr) {
+             // dense branch
+             cur = build_ffn(cur,
+                     model.layers[il].ffn_up,   model.layers[il].ffn_up_b,   NULL,
+                     model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                     NULL,
+                     LLM_FFN_SILU, LLM_FFN_PAR, il);
+             cb(cur, "ffn_out", il);
+         } else {
+             // MoE branch
+             cur = build_moe_ffn(cur, model.layers[il].ffn_gate_inp, model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+                     model.layers[il].ffn_exp_probs_b, n_expert, n_expert_used, LLM_FFN_SILU, true, false,
+                     0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, il);
+             cb(cur, "ffn_moe_out", il);
+         }
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
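
Because mimo2 interleaves sliding-window and full-attention layers (the _iswa suffix), the loop above looks up head counts and the RoPE frequency base and scale per layer rather than once. A sketch of why the per-layer lookup matters, with entirely invented values:

    #include <cstdint>
    #include <cstdio>

    // Sketch (hypothetical pattern and numbers): in an interleaved-SWA model,
    // sliding-window layers and full-attention layers can use different RoPE
    // frequency bases, so these hparams must be read per layer.
    struct hparams_sketch {
        uint32_t n_layer = 4;
        bool  is_swa(uint32_t il)         const { return il % 2 == 0; }
        float rope_freq_base(uint32_t il) const { return is_swa(il) ? 10000.0f : 1000000.0f; }
    };

    int main() {
        const hparams_sketch hp;
        for (uint32_t il = 0; il < hp.n_layer; ++il) {
            std::printf("layer %u: %s, freq_base = %.0f\n", il,
                        hp.is_swa(il) ? "sliding-window" : "full attention",
                        hp.rope_freq_base(il));
        }
        return 0;
    }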
@@ -303,6 +303,7 @@ struct llm_build_llada_moe : public llm_graph_context {
      llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
  };

+ template <bool embed>
  struct llm_build_llama : public llm_graph_context {
      llm_build_llama(const llama_model & model, const llm_graph_params & params);
  };
@@ -311,10 +312,18 @@ struct llm_build_llama_iswa : public llm_graph_context {
      llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params);
  };

+ struct llm_build_maincoder : public llm_graph_context {
+     llm_build_maincoder(const llama_model & model, const llm_graph_params & params);
+ };
+
  struct llm_build_mamba : public llm_graph_context_mamba {
      llm_build_mamba(const llama_model & model, const llm_graph_params & params);
  };

+ struct llm_build_mimo2_iswa : public llm_graph_context {
+     llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params);
+ };
+
  struct llm_build_minicpm3 : public llm_graph_context {
      llm_build_minicpm3(const llama_model & model, const llm_graph_params & params);
  };
@@ -327,6 +336,10 @@ struct llm_build_mistral3 : public llm_graph_context {
      llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
  };

+ struct llm_build_modern_bert : public llm_graph_context {
+     llm_build_modern_bert(const llama_model & model, const llm_graph_params & params);
+ };
+
  struct llm_build_mpt : public llm_graph_context {
      llm_build_mpt(const llama_model & model, const llm_graph_params & params);
  };
@@ -396,6 +409,11 @@ struct llm_build_plamo : public llm_graph_context {
      llm_build_plamo(const llama_model & model, const llm_graph_params & params);
  };

+ template <bool iswa>
+ struct llm_build_plamo3 : public llm_graph_context {
+     llm_build_plamo3(const llama_model & model, const llm_graph_params & params);
+ };
+
  struct llm_build_plm : public llm_graph_context {
      llm_build_plm(const llama_model & model, const llm_graph_params & params);
  };
@@ -0,0 +1,115 @@
+ #include "models.h"
+
+ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+     const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     // construct input embeddings (token, type, position)
+     inpL = build_inp_embd(model.tok_embd);
+     cb(inpL, "inp_embd", -1);
+
+     // embed layer norm
+     inpL = build_norm(inpL, model.tok_norm, nullptr, LLM_NORM, -1);
+     cb(inpL, "inp_norm", -1);
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     auto * inp_attn = build_attn_inp_no_cache();
+
+     for (int il = 0; il < n_layer; ++il) {
+         float freq_base_l = model.get_rope_freq_base(cparams, il);
+
+         cur = inpL;
+
+         // attention layer norm
+         if (model.layers[il].attn_norm) {
+             cur = build_norm(inpL,
+                     model.layers[il].attn_norm, NULL,
+                     LLM_NORM, il);
+             cb(cur, "attn_norm", il);
+         }
+
+         // self attention
+         cur = build_lora_mm(model.layers[il].wqkv, cur);
+         cb(cur, "wqkv", il);
+
+         const size_t type_size = ggml_type_size(cur->type);
+
+         ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*type_size, cur->nb[1], 0*type_size*(n_embd));
+         ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd));
+         ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*type_size, cur->nb[1], 1*type_size*(n_embd + n_embd_gqa));
+
+         // RoPE
+         Qcur = ggml_rope_ext(
+                 ctx0, Qcur, inp_pos, nullptr,
+                 n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+                 ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+
+         Kcur = ggml_rope_ext(
+                 ctx0, Kcur, inp_pos, nullptr,
+                 n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+                 ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+
+         cb(Qcur, "Qcur", il);
+         cb(Kcur, "Kcur", il);
+         cb(Vcur, "Vcur", il);
+
+         cur = build_attn(inp_attn,
+                 model.layers[il].wo, nullptr,
+                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+         cb(cur, "kqv_out", il);
+
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur  = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+         }
+
+         // re-add the layer input
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // attention layer norm
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM, il);
+         cb(cur, "ffn_norm", il);
+
+         cur = build_ffn(cur,
+                 model.layers[il].ffn_up, NULL, NULL,
+                 NULL, NULL, NULL,
+                 model.layers[il].ffn_down, NULL, NULL,
+                 NULL,
+                 LLM_FFN_GEGLU, LLM_FFN_SEQ, il);
+
+         // attentions bypass the intermediate layer
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         // input for next layer
+         inpL = cur;
+     }
+
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM, -1);
+     cb(cur, "final_norm_out", -1);
+
+     if (hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+         // extracting cls token
+         cur = ggml_view_1d(ctx0, cur, hparams.n_embd, 0);
+         cb(cur, "cls_pooled_embd", -1);
+     }
+
+     cb(cur, "res_embd", -1);
+     res->t_embd = cur;
+     ggml_build_forward_expand(gf, cur);
+ }
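
The builder above runs one fused wqkv matmul and then carves Q, K, and V out of its output with ggml_view_3d; the byte offsets are 0, n_embd, and n_embd + n_embd_gqa, each scaled by the element size. The same offset arithmetic in plain C++, with hypothetical dimensions:

    #include <cstddef>
    #include <cstdio>

    // Sketch: byte offsets for splitting one fused QKV row, as in the
    // ggml_view_3d calls above. Q spans the first n_embd elements, K the next
    // n_embd_gqa, and V the n_embd_gqa after that.
    int main() {
        const size_t n_embd     = 768;           // hypothetical model width
        const size_t n_embd_gqa = 768;           // K/V width (== n_embd without GQA)
        const size_t type_size  = sizeof(float); // ggml_type_size(cur->type) for F32

        const size_t q_off = 0 * type_size *  n_embd;
        const size_t k_off = 1 * type_size *  n_embd;
        const size_t v_off = 1 * type_size * (n_embd + n_embd_gqa);

        std::printf("Q at byte %zu, K at byte %zu, V at byte %zu\n", q_off, k_off, v_off);
        return 0;
    }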