@fugood/llama.node 1.3.0-rc.6 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. package/CMakeLists.txt +12 -2
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +8 -9
  4. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  5. package/src/llama.cpp/common/arg.cpp +39 -1001
  6. package/src/llama.cpp/common/arg.h +2 -2
  7. package/src/llama.cpp/common/chat.cpp +216 -2
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +33 -0
  10. package/src/llama.cpp/common/common.h +13 -0
  11. package/src/llama.cpp/common/download.cpp +1054 -0
  12. package/src/llama.cpp/common/download.h +55 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
  14. package/src/llama.cpp/ggml/CMakeLists.txt +3 -1
  15. package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
  16. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +7 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +10 -3
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +0 -5
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -35
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  26. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  27. package/src/llama.cpp/include/llama.h +7 -3
  28. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  29. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  30. package/src/llama.cpp/src/llama-arch.h +11 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  32. package/src/llama.cpp/src/llama-batch.h +12 -1
  33. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  34. package/src/llama.cpp/src/llama-chat.h +1 -0
  35. package/src/llama.cpp/src/llama-context.cpp +44 -16
  36. package/src/llama.cpp/src/llama-context.h +5 -5
  37. package/src/llama.cpp/src/llama-cparams.h +1 -0
  38. package/src/llama.cpp/src/llama-graph.cpp +12 -7
  39. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  40. package/src/llama.cpp/src/llama-hparams.h +6 -0
  41. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  42. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -21
  43. package/src/llama.cpp/src/llama-kv-cache.h +2 -4
  44. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-model.cpp +350 -13194
  48. package/src/llama.cpp/src/llama-model.h +9 -2
  49. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  50. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  51. package/src/llama.cpp/src/llama-vocab.h +1 -0
  52. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  53. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  54. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  55. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  56. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  57. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  58. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  59. package/src/llama.cpp/src/models/bert.cpp +176 -0
  60. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  61. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  62. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  63. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  64. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  65. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  66. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  67. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  68. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  69. package/src/llama.cpp/src/models/deci.cpp +135 -0
  70. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  71. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  72. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  73. package/src/llama.cpp/src/models/dream.cpp +105 -0
  74. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  75. package/src/llama.cpp/src/models/ernie4-5.cpp +111 -0
  76. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  77. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  78. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  79. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  80. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  81. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  82. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  83. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  84. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  85. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  86. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  87. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  88. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  89. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  90. package/src/llama.cpp/src/models/granite.cpp +211 -0
  91. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  92. package/src/llama.cpp/src/models/grok.cpp +159 -0
  93. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  94. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  95. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  96. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  97. package/src/llama.cpp/src/models/jais.cpp +86 -0
  98. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  99. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  100. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  101. package/src/llama.cpp/src/models/llada.cpp +99 -0
  102. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  103. package/src/llama.cpp/src/models/llama.cpp +155 -0
  104. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  105. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  106. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  107. package/src/llama.cpp/src/models/models.h +481 -0
  108. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  109. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  110. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  111. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  112. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  113. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  114. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  115. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +123 -0
  116. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  117. package/src/llama.cpp/src/models/orion.cpp +123 -0
  118. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  119. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  120. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  121. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  122. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  123. package/src/llama.cpp/src/models/plm.cpp +168 -0
  124. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  125. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  126. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  127. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  128. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  129. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  130. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  131. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  132. package/src/llama.cpp/src/models/refact.cpp +94 -0
  133. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  134. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  135. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  136. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  137. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  138. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  139. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  140. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  141. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  142. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  143. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  144. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  145. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  146. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  147. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/models/deepseek2.cpp
@@ -0,0 +1,236 @@
+ #include "models.h"
+
+
+
+ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
+     llm_graph_context(params) {
+     bool is_lite = (hparams.n_layer == 27);
+
+     const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+     // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+     const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+     const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
+     const int64_t n_embd_head_qk_rope = hparams.n_rot;
+     const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
+
+     const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+     // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+     // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+     const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+     const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
+     const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     // {n_embd, n_tokens}
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self_attention
+         {
+             ggml_tensor * q = NULL;
+             if (!is_lite) {
+                 q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                 cb(q, "q", il);
+
+                 q = build_norm(q, model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+                 cb(q, "q", il);
+
+                 q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                 cb(q, "q", il);
+             } else {
+                 q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                 cb(q, "q", il);
+             }
+             // split into {n_embd_head_qk_nope, n_head, n_tokens}
+             ggml_tensor * q_nope =
+                 ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                              ggml_row_size(q->type, n_embd_head_k) * n_head, 0);
+             cb(q_nope, "q_nope", il);
+
+             // and {n_embd_head_qk_rope, n_head, n_tokens}
+             ggml_tensor * q_pe = ggml_view_3d(
+                 ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k),
+                 ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
+             cb(q_pe, "q_pe", il);
+
+             ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+             cb(kv_cmpr_pe, "kv_cmpr_pe", il);
+
+             // split into {kv_lora_rank, n_tokens}
+             ggml_tensor * kv_cmpr =
+                 ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
+                              ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+             cb(kv_cmpr, "kv_cmpr", il);
+
+             // and {n_embd_head_qk_rope, 1, n_tokens}
+             ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
+                                               ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                               ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                                               ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+             cb(k_pe, "k_pe", il);
+
+             q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+             cb(q_pe, "q_pe", il);
+
+             k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+             cb(k_pe, "k_pe", il);
+
+             kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+             cb(kv_cmpr, "kv_cmpr", il);
+
+             if (is_mla) {
+                 // {n_embd_head_qk_nope, n_tokens, n_head}
+                 q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+                 cb(q_nope, "q_nope_perm", il);
+
+                 // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+                 ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+                 cb(q_nope_absorbed, "q_nope_absorbed", il);
+
+                 // {kv_lora_rank, n_head, n_tokens}
+                 q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+                 cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
+
+                 // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+                 // note: rope must go first for in-place context shifting in build_rope_shift()
+                 ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+                 cb(Qcur, "Qcur", il);
+
+                 kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+                 cb(kv_cmpr, "kv_cmpr_reshape", il);
+
+                 // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+                 ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+                 cb(Kcur, "Kcur", il);
+
+                 // {kv_lora_rank, 1, n_tokens}
+                 ggml_tensor * Vcur = kv_cmpr;
+                 cb(Vcur, "Vcur", il);
+
+                 // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
+                 cur = build_attn(inp_attn,
+                         model.layers[il].wo, NULL,
+                         Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
+             } else {
+                 ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+                 cb(kv, "kv", il);
+
+                 // split into {n_embd_head_qk_nope, n_head, n_tokens}
+                 ggml_tensor * k_nope =
+                     ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+                                  ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, 0);
+                 cb(k_nope, "k_nope_view", il);
+
+                 // and {n_embd_head_v, n_head, n_tokens}
+                 ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens,
+                                                   ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+                                                   ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+                                                   ggml_row_size(kv->type, n_embd_head_qk_nope));
+                 cb(Vcur, "Vcur_view", il);
+
+                 Vcur = ggml_cont(ctx0, Vcur);
+                 cb(Vcur, "Vcur_cont", il);
+
+                 // note: rope must go first for in-place context shifting in build_rope_shift()
+                 ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+                 cb(Qcur, "Qcur", il);
+
+                 ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+                 cb(Kcur, "Kcur", il);
+
+                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+                 cur = build_attn(inp_attn,
+                         model.layers[il].wo, NULL,
+                         Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+             }
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         if ((uint32_t) il < hparams.n_layer_dense_lead) {
+             cur = build_ffn(cur,
+                     model.layers[il].ffn_up, NULL, NULL,
+                     model.layers[il].ffn_gate, NULL, NULL,
+                     model.layers[il].ffn_down, NULL, NULL,
+                     NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+             cb(cur, "ffn_out", il);
+         } else {
+             // MoE branch
+             ggml_tensor * moe_out = build_moe_ffn(cur,
+                     model.layers[il].ffn_gate_inp,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     model.layers[il].ffn_exp_probs_b,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, hparams.expert_weights_norm,
+                     true, hparams.expert_weights_scale,
+                     (llama_expert_gating_func_type) hparams.expert_gating_func,
+                     il);
+             cb(moe_out, "ffn_moe_out", il);
+
+             // FFN shared expert
+             {
+                 ggml_tensor * ffn_shexp =
+                     build_ffn(cur,
+                             model.layers[il].ffn_up_shexp, NULL, NULL,
+                             model.layers[il].ffn_gate_shexp, NULL, NULL,
+                             model.layers[il].ffn_down_shexp, NULL, NULL,
+                             NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                 cb(ffn_shexp, "ffn_shexp", il);
+
+                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                 cb(cur, "ffn_out", il);
+             }
+         }
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = ggml_mul_mat(ctx0, model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
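The deepseek2 graph above pre-folds the YaRN magnitude correction (mscale) into kq_scale and then divides it back out of the attn_factor passed to ggml_rope_ext, per the comment and linked discussion. The standalone sketch below reproduces just that arithmetic; the hyperparameter values (freq_scale, rope_yarn_log_mul, n_embd_head_k) are illustrative placeholders, not values from any particular GGUF.

    // Sketch only: mirrors the mscale / kq_scale / attn_factor lines above.
    #include <cmath>
    #include <cstdio>

    int main() {
        const float attn_factor_in    = 1.0f;  // assumed graph-level attention factor
        const float freq_scale        = 0.25f; // assumed 4x context extension
        const float rope_yarn_log_mul = 0.1f;  // stands in for hparams.rope_yarn_log_mul
        const int   n_embd_head_k     = 192;   // illustrative head size

        // fold the YaRN magnitude scale into the KQ scale once ...
        const float mscale   = attn_factor_in * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
        const float kq_scale = 1.0f * mscale * mscale / sqrtf((float) n_embd_head_k);

        // ... and cancel it out of the factor handed to ggml_rope_ext,
        // so the correction is not applied a second time inside RoPE
        const float attn_factor_rope = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

        printf("mscale=%.4f kq_scale=%.6f attn_factor=%.4f\n", mscale, kq_scale, attn_factor_rope);
        return 0;
    }

The point, per the source comment, is that the correction is applied once via kq_scale rather than inside every RoPE call.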
package/src/llama.cpp/src/models/dots1.cpp
@@ -0,0 +1,134 @@
+ #include "models.h"
+
+
+
+ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
+     llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self_attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+             cb(Qcur, "Qcur_normed", il);
+
+             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+             cb(Kcur, "Kcur_normed", il);
+
+             Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // MoE branch
+         cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         if ((uint32_t) il < hparams.n_layer_dense_lead) {
+             cur = build_ffn(cur,
+                     model.layers[il].ffn_up, NULL, NULL,
+                     model.layers[il].ffn_gate, NULL, NULL,
+                     model.layers[il].ffn_down, NULL, NULL,
+                     NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+             cb(cur, "ffn_out", il);
+         } else {
+             ggml_tensor * moe_out = build_moe_ffn(cur,
+                     model.layers[il].ffn_gate_inp,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     model.layers[il].ffn_exp_probs_b,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, hparams.expert_weights_norm,
+                     true, hparams.expert_weights_scale,
+                     (llama_expert_gating_func_type) hparams.expert_gating_func,
+                     il);
+             cb(moe_out, "ffn_moe_out", il);
+
+             {
+                 ggml_tensor * ffn_shexp =
+                     build_ffn(cur,
+                             model.layers[il].ffn_up_shexp, NULL, NULL,
+                             model.layers[il].ffn_gate_shexp, NULL, NULL,
+                             model.layers[il].ffn_down_shexp, NULL, NULL,
+                             NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                 cb(ffn_shexp, "ffn_shexp", il);
+
+                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                 cb(cur, "ffn_out", il);
+             }
+         }
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
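In dots1, as in the other graphs here, the flat {n_embd, n_tokens} Q/K/V projections are reinterpreted with ggml_reshape_3d as {n_embd_head, n_head, n_tokens}; ggml lists dimensions innermost-first, so this is a zero-copy view in which each token's heads lie contiguously. A small sketch of the index arithmetic, with made-up sizes:

    // Sketch only: flat offset of element (i, h, t) in the 3-D view,
    // using ggml's innermost-first dimension order.
    #include <cstdio>

    int main() {
        const long n_embd_head = 128, n_head = 32, n_tokens = 4; // illustrative sizes
        const long n_embd = n_embd_head * n_head;

        auto flat = [&](long i, long h, long t) {
            return i + h * n_embd_head + t * n_embd; // contiguous, no data movement
        };

        // the first element of head 3 for token 1
        printf("offset(0, 3, 1) = %ld (expected %ld)\n", flat(0, 3, 1), 3 * n_embd_head + 1 * n_embd);
        return 0;
    }

The subsequent RMS norm of Qcur and Kcur (QK-norm) then operates along the first dimension of this view, i.e. per head and per token, before RoPE is applied.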
package/src/llama.cpp/src/models/dream.cpp
@@ -0,0 +1,105 @@
+ #include "models.h"
+
+
+
+ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
+     llm_graph_context(params) {
+     //copied from qwen2
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_no_cache();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self-attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // feed-forward network
+         cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         cur = build_ffn(cur,
+                 model.layers[il].ffn_up, NULL, NULL,
+                 model.layers[il].ffn_gate, NULL, NULL,
+                 model.layers[il].ffn_down, NULL, NULL,
+                 NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+         cb(cur, "ffn_out", il);
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
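Unlike the surrounding graphs, dream builds its attention input with build_attn_inp_no_cache() instead of build_attn_inp_kv(): the diffusion-style model attends over the whole token batch without a KV cache, which presumably also means no causal mask. A toy sketch of the two visibility patterns, purely illustrative and not llama.cpp's actual mask code:

    // Sketch only: causal mask (KV-cache path) vs. full attention (no-cache path).
    #include <cstdio>

    int main() {
        const int n = 4; // illustrative token count
        printf("q    causal  full\n");
        for (int q = 0; q < n; ++q) {
            printf("%d    ", q);
            for (int k = 0; k < n; ++k) printf("%d", k <= q ? 1 : 0); // keys up to q only
            printf("    ");
            for (int k = 0; k < n; ++k) printf("%d", 1);              // every key visible
            printf("\n");
        }
        return 0;
    }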
package/src/llama.cpp/src/models/ernie4-5-moe.cpp
@@ -0,0 +1,150 @@
+ #include "models.h"
+
+
+
+ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
+     llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Ernie 4.5 MoE requires n_moe_layer_step > 0");
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+         // norm
+         {
+             cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+             cb(cur, "attn_norm", il);
+         }
+         // self-attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+             if (model.layers[il].bq) {
+                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                 cb(Qcur, "Qcur", il);
+             }
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+             if (model.layers[il].bk) {
+                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                 cb(Kcur, "Kcur", il);
+             }
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+             if (model.layers[il].bv) {
+                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                 cb(Vcur, "Vcur", il);
+             }
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, NULL,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+             cb(cur, "attn_out", il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // feed-forward network
+         bool is_moe_layer =
+             static_cast<uint32_t>(il) >= hparams.n_layer_dense_lead && (il + 1) % hparams.n_moe_layer_step == 0;
+
+         if (!is_moe_layer) {
+             cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+             cb(cur, "ffn_norm", il);
+
+             cur = build_ffn(cur,
+                     model.layers[il].ffn_up, NULL, NULL,
+                     model.layers[il].ffn_gate, NULL, NULL,
+                     model.layers[il].ffn_down, NULL, NULL,
+                     NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+             cb(cur, "ffn_out", il);
+         } else {
+             // MoE branch
+             cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+             cb(cur, "ffn_norm", il);
+
+             ggml_tensor * moe_out = build_moe_ffn(cur,
+                     model.layers[il].ffn_gate_inp,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     model.layers[il].ffn_exp_probs_b,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, true,
+                     false, 0.0,
+                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                     il);
+             cb(moe_out, "ffn_moe_out", il);
+
+             // Shared expert (if present)
+             if (hparams.n_ff_shexp > 0) {
+                 ggml_tensor * ffn_shexp =
+                     build_ffn(cur,
+                             model.layers[il].ffn_up_shexp, NULL, NULL,
+                             model.layers[il].ffn_gate_shexp, NULL, NULL,
+                             model.layers[il].ffn_down_shexp, NULL, NULL,
+                             NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+                 cb(ffn_shexp, "ffn_shexp", il);
+
+                 cur = ggml_add(ctx0, moe_out, ffn_shexp);
+             } else {
+                 cur = moe_out;
+             }
+             cb(cur, "ffn_out", il);
+         }
+         cur = ggml_add(ctx0, cur, ffn_inp);
+         cb(cur, "ffn_out", il);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
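The is_moe_layer predicate above routes a layer to the MoE branch once it is past the dense lead and falls on an n_moe_layer_step boundary. A quick sketch of the schedule it produces; the three hyperparameter values are invented for illustration, not taken from an actual Ernie 4.5 GGUF:

    // Sketch only: which layers take the MoE branch under the predicate above.
    #include <cstdio>

    int main() {
        const unsigned n_layer = 12, n_layer_dense_lead = 1, n_moe_layer_step = 2; // invented
        for (unsigned il = 0; il < n_layer; ++il) {
            const bool is_moe = il >= n_layer_dense_lead && (il + 1) % n_moe_layer_step == 0;
            printf("layer %2u: %s\n", il, is_moe ? "MoE" : "dense FFN");
        }
        return 0;
    }

With these values, layer 0 stays dense and layers 1, 3, 5, and so on take the MoE branch.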