@fugood/llama.node 1.3.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +8 -8
  3. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  4. package/src/llama.cpp/common/arg.cpp +44 -999
  5. package/src/llama.cpp/common/arg.h +2 -2
  6. package/src/llama.cpp/common/chat.cpp +17 -2
  7. package/src/llama.cpp/common/common.cpp +33 -0
  8. package/src/llama.cpp/common/common.h +15 -1
  9. package/src/llama.cpp/common/download.cpp +1054 -0
  10. package/src/llama.cpp/common/download.h +55 -0
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  12. package/src/llama.cpp/ggml/include/ggml.h +2 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -11
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +4 -5
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +108 -49
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -1
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +21 -21
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +172 -75
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +0 -4
  23. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +82 -21
  24. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +25 -25
  25. package/src/llama.cpp/include/llama.h +7 -3
  26. package/src/llama.cpp/src/CMakeLists.txt +95 -0
  27. package/src/llama.cpp/src/llama-arch.cpp +108 -0
  28. package/src/llama.cpp/src/llama-arch.h +11 -0
  29. package/src/llama.cpp/src/llama-batch.cpp +63 -31
  30. package/src/llama.cpp/src/llama-batch.h +12 -1
  31. package/src/llama.cpp/src/llama-chat.cpp +32 -0
  32. package/src/llama.cpp/src/llama-chat.h +1 -0
  33. package/src/llama.cpp/src/llama-context.cpp +36 -13
  34. package/src/llama.cpp/src/llama-context.h +5 -5
  35. package/src/llama.cpp/src/llama-cparams.h +1 -0
  36. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  37. package/src/llama.cpp/src/llama-hparams.cpp +11 -1
  38. package/src/llama.cpp/src/llama-hparams.h +6 -0
  39. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +3 -1
  40. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -1
  41. package/src/llama.cpp/src/llama-kv-cells.h +44 -2
  42. package/src/llama.cpp/src/llama-memory-recurrent.cpp +4 -3
  43. package/src/llama.cpp/src/llama-model.cpp +320 -13171
  44. package/src/llama.cpp/src/llama-model.h +8 -0
  45. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  46. package/src/llama.cpp/src/llama-vocab.cpp +5 -0
  47. package/src/llama.cpp/src/llama-vocab.h +1 -0
  48. package/src/llama.cpp/src/models/apertus.cpp +125 -0
  49. package/src/llama.cpp/src/models/arcee.cpp +135 -0
  50. package/src/llama.cpp/src/models/arctic.cpp +138 -0
  51. package/src/llama.cpp/src/models/arwkv7.cpp +86 -0
  52. package/src/llama.cpp/src/models/baichuan.cpp +122 -0
  53. package/src/llama.cpp/src/models/bailingmoe.cpp +144 -0
  54. package/src/llama.cpp/src/models/bailingmoe2.cpp +135 -0
  55. package/src/llama.cpp/src/models/bert.cpp +176 -0
  56. package/src/llama.cpp/src/models/bitnet.cpp +160 -0
  57. package/src/llama.cpp/src/models/bloom.cpp +101 -0
  58. package/src/llama.cpp/src/models/chameleon.cpp +178 -0
  59. package/src/llama.cpp/src/models/chatglm.cpp +132 -0
  60. package/src/llama.cpp/src/models/codeshell.cpp +111 -0
  61. package/src/llama.cpp/src/models/cogvlm.cpp +100 -0
  62. package/src/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
  63. package/src/llama.cpp/src/models/command-r.cpp +122 -0
  64. package/src/llama.cpp/src/models/dbrx.cpp +123 -0
  65. package/src/llama.cpp/src/models/deci.cpp +135 -0
  66. package/src/llama.cpp/src/models/deepseek.cpp +144 -0
  67. package/src/llama.cpp/src/models/deepseek2.cpp +236 -0
  68. package/src/llama.cpp/src/models/dots1.cpp +134 -0
  69. package/src/llama.cpp/src/models/dream.cpp +105 -0
  70. package/src/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
  71. package/src/llama.cpp/src/models/ernie4-5.cpp +110 -0
  72. package/src/llama.cpp/src/models/exaone.cpp +114 -0
  73. package/src/llama.cpp/src/models/exaone4.cpp +123 -0
  74. package/src/llama.cpp/src/models/falcon-h1.cpp +113 -0
  75. package/src/llama.cpp/src/models/falcon.cpp +120 -0
  76. package/src/llama.cpp/src/models/gemma-embedding.cpp +120 -0
  77. package/src/llama.cpp/src/models/gemma.cpp +112 -0
  78. package/src/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
  79. package/src/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
  80. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
  81. package/src/llama.cpp/src/models/glm4-moe.cpp +153 -0
  82. package/src/llama.cpp/src/models/glm4.cpp +127 -0
  83. package/src/llama.cpp/src/models/gpt2.cpp +105 -0
  84. package/src/llama.cpp/src/models/gptneox.cpp +144 -0
  85. package/src/llama.cpp/src/models/granite-hybrid.cpp +196 -0
  86. package/src/llama.cpp/src/models/granite.cpp +211 -0
  87. package/src/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
  88. package/src/llama.cpp/src/models/grok.cpp +159 -0
  89. package/src/llama.cpp/src/models/grovemoe.cpp +141 -0
  90. package/src/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
  91. package/src/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
  92. package/src/llama.cpp/src/models/internlm2.cpp +120 -0
  93. package/src/llama.cpp/src/models/jais.cpp +86 -0
  94. package/src/llama.cpp/src/models/jamba.cpp +106 -0
  95. package/src/llama.cpp/src/models/lfm2.cpp +173 -0
  96. package/src/llama.cpp/src/models/llada-moe.cpp +122 -0
  97. package/src/llama.cpp/src/models/llada.cpp +99 -0
  98. package/src/llama.cpp/src/models/llama-iswa.cpp +174 -0
  99. package/src/llama.cpp/src/models/llama.cpp +155 -0
  100. package/src/llama.cpp/src/models/mamba.cpp +55 -0
  101. package/src/llama.cpp/src/models/minicpm3.cpp +199 -0
  102. package/src/llama.cpp/src/models/minimax-m2.cpp +124 -0
  103. package/src/llama.cpp/src/models/models.h +481 -0
  104. package/src/llama.cpp/src/models/mpt.cpp +126 -0
  105. package/src/llama.cpp/src/models/nemotron-h.cpp +121 -0
  106. package/src/llama.cpp/src/models/nemotron.cpp +122 -0
  107. package/src/llama.cpp/src/models/neo-bert.cpp +104 -0
  108. package/src/llama.cpp/src/models/olmo.cpp +121 -0
  109. package/src/llama.cpp/src/models/olmo2.cpp +150 -0
  110. package/src/llama.cpp/src/models/olmoe.cpp +124 -0
  111. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
  112. package/src/llama.cpp/src/models/openelm.cpp +124 -0
  113. package/src/llama.cpp/src/models/orion.cpp +123 -0
  114. package/src/llama.cpp/src/models/pangu-embedded.cpp +121 -0
  115. package/src/llama.cpp/src/models/phi2.cpp +121 -0
  116. package/src/llama.cpp/src/models/phi3.cpp +152 -0
  117. package/src/llama.cpp/src/models/plamo.cpp +110 -0
  118. package/src/llama.cpp/src/models/plamo2.cpp +316 -0
  119. package/src/llama.cpp/src/models/plm.cpp +168 -0
  120. package/src/llama.cpp/src/models/qwen.cpp +108 -0
  121. package/src/llama.cpp/src/models/qwen2.cpp +117 -0
  122. package/src/llama.cpp/src/models/qwen2moe.cpp +151 -0
  123. package/src/llama.cpp/src/models/qwen2vl.cpp +117 -0
  124. package/src/llama.cpp/src/models/qwen3.cpp +117 -0
  125. package/src/llama.cpp/src/models/qwen3moe.cpp +124 -0
  126. package/src/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
  127. package/src/llama.cpp/src/models/qwen3vl.cpp +141 -0
  128. package/src/llama.cpp/src/models/refact.cpp +94 -0
  129. package/src/llama.cpp/src/models/rwkv6-base.cpp +162 -0
  130. package/src/llama.cpp/src/models/rwkv6.cpp +94 -0
  131. package/src/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
  132. package/src/llama.cpp/src/models/rwkv7-base.cpp +135 -0
  133. package/src/llama.cpp/src/models/rwkv7.cpp +90 -0
  134. package/src/llama.cpp/src/models/seed-oss.cpp +124 -0
  135. package/src/llama.cpp/src/models/smallthinker.cpp +120 -0
  136. package/src/llama.cpp/src/models/smollm3.cpp +128 -0
  137. package/src/llama.cpp/src/models/stablelm.cpp +146 -0
  138. package/src/llama.cpp/src/models/starcoder.cpp +100 -0
  139. package/src/llama.cpp/src/models/starcoder2.cpp +121 -0
  140. package/src/llama.cpp/src/models/t5-dec.cpp +166 -0
  141. package/src/llama.cpp/src/models/t5-enc.cpp +96 -0
  142. package/src/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
  143. package/src/llama.cpp/src/models/xverse.cpp +108 -0
package/src/llama.cpp/src/models/graph-context-mamba.cpp
@@ -0,0 +1,283 @@
+ #include "models.h"
+
+ llm_graph_context_mamba::llm_graph_context_mamba(const llm_graph_params & params) : llm_graph_context(params) {}
+
+ ggml_tensor * llm_graph_context_mamba::build_mamba_layer(llm_graph_input_rs * inp,
+                                                          ggml_tensor * cur,
+                                                          const llama_model & model,
+                                                          const llama_ubatch & ubatch,
+                                                          int il) {
+     const auto * mctx_cur = inp->mctx;
+
+     const auto kv_head = mctx_cur->get_head();
+
+     const auto & layer = model.layers[il];
+
+     const int64_t d_conv   = hparams.ssm_d_conv;
+     const int64_t d_inner  = hparams.ssm_d_inner;
+     const int64_t d_state  = hparams.ssm_d_state;
+     const int64_t dt_rank  = hparams.ssm_dt_rank;
+     const int64_t n_head   = d_inner;
+     const int64_t head_dim = 1;
+     const int64_t n_seqs   = ubatch.n_seqs;
+     // Some variants of the Mamba arch (e.g. FalconMamba) do apply layer norm on the B and Dt layers
+     const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
+
+     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+     GGML_ASSERT(n_seqs != 0);
+     GGML_ASSERT(ubatch.equal_seqs());
+     GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+     ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+     ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
+
+     ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+     conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs);
+
+     // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+     // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
+     ggml_tensor * xz = build_lora_mm(layer.ssm_in, cur);
+     // split the above in two
+     // => {d_inner, n_seq_tokens, n_seqs}
+     ggml_tensor * x = ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
+     ggml_tensor * z =
+         ggml_view_3d(ctx0, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner * ggml_element_size(xz));
+
+     // conv
+     {
+         // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
+         ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, x), 0);
+
+         // copy last (d_conv - 1) columns back into the state cache
+         ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2],
+                                                n_seq_tokens * (conv_x->nb[0]));
+
+         ggml_build_forward_expand(
+             gf, ggml_cpy(ctx0, last_conv,
+                          ggml_view_1d(ctx0, conv_states_all, (d_conv - 1) * (d_inner) * (n_seqs),
+                                       kv_head * (d_conv - 1) * (d_inner) * ggml_element_size(conv_states_all))));
+
+         // 1D convolution
+         // The equivalent is to make a self-overlapping view of conv_x
+         // over d_conv columns at each stride in the 3rd dimension,
+         // then element-wise multiply that with the conv1d weight,
+         // then sum the elements of each row,
+         // (the last two steps are a dot product over rows (also doable with mul_mat))
+         // then permute away the ne[0] dimension,
+         // and then you're left with the resulting x tensor.
+         // For simultaneous sequences, all sequences need to have the same length.
+         x = ggml_ssm_conv(ctx0, conv_x, layer.ssm_conv1d);
+
+         // bias
+         x = ggml_add(ctx0, x, layer.ssm_conv1d_b);
+
+         x = ggml_silu(ctx0, x);
+     }
+
+     // ssm
+     {
+         // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
+         ggml_tensor * x_db = build_lora_mm(layer.ssm_x, x);
+         // split
+         ggml_tensor * dt = ggml_view_3d(ctx0, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
+         ggml_tensor * B =
+             ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+                          x_db->nb[2], ggml_element_size(x_db) * dt_rank);
+         ggml_tensor * C =
+             ggml_view_4d(ctx0, x_db, d_state, /* n_group */ 1, n_seq_tokens, n_seqs, d_state * x_db->nb[0], x_db->nb[1],
+                          x_db->nb[2], ggml_element_size(x_db) * (dt_rank + d_state));
+
+         // Some Mamba variants (e.g. FalconMamba, Jamba) apply RMS norm in B, C & Dt layers
+         if (ssm_dt_b_c_rms || (layer.ssm_dt_norm && layer.ssm_b_norm && layer.ssm_c_norm)) {
+             dt = build_norm(dt, layer.ssm_dt_norm, NULL, LLM_NORM_RMS, il);
+             B  = build_norm(B,  layer.ssm_b_norm,  NULL, LLM_NORM_RMS, il);
+             C  = build_norm(C,  layer.ssm_c_norm,  NULL, LLM_NORM_RMS, il);
+         }
+
+         // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
+         dt = build_lora_mm(layer.ssm_dt, dt);
+         dt = ggml_add(ctx0, dt, layer.ssm_dt_b);
+
+         cur = x;
+         x = ggml_reshape_4d(ctx0, x, head_dim, n_head, n_seq_tokens, n_seqs);
+
+         ggml_tensor * A = layer.ssm_a;
+
+         // use the states and the indices provided by build_recurrent_state
+         // (this is necessary in order to properly use the states before they are overwritten,
+         // while avoiding unnecessary copies of the states)
+         auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+             ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+             // Custom operator to optimize the parallel associative scan
+             // as described in Annex D of the Mamba paper.
+             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+         };
+
+         ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+         // store last states
+         ggml_build_forward_expand(
+             gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, x->nb[3] * x->ne[3]),
+                          ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+                                       kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+         ggml_tensor * y = ggml_view_3d(ctx0, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[2], x->nb[3], 0);
+
+         // TODO: skip computing output earlier for unused tokens
+
+         y = ggml_add(ctx0, y, ggml_mul(ctx0, cur, layer.ssm_d));
+         y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+         cur = build_lora_mm(layer.ssm_out, y);
+     }
+
+     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+     cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+
+     return cur;
+ }
+
+ ggml_tensor * llm_graph_context_mamba::build_mamba2_layer(llm_graph_input_rs * inp,
+                                                           ggml_tensor * cur,
+                                                           const llama_model & model,
+                                                           const llama_ubatch & ubatch,
+                                                           int il) const {
+     const auto * mctx_cur = inp->mctx;
+
+     const auto kv_head = mctx_cur->get_head();
+
+     const int64_t d_conv   = hparams.ssm_d_conv;
+     const int64_t d_inner  = hparams.ssm_d_inner;
+     const int64_t d_state  = hparams.ssm_d_state;
+     const int64_t n_head   = hparams.ssm_dt_rank;
+     const int64_t head_dim = d_inner / n_head;
+     const int64_t n_group  = hparams.ssm_n_group;
+     const int64_t n_seqs   = ubatch.n_seqs;
+
+     const int64_t n_seq_tokens = ubatch.n_seq_tokens;
+
+     GGML_ASSERT(n_seqs != 0);
+     GGML_ASSERT(ubatch.equal_seqs());
+     GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);
+
+     ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
+     ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
+
+     ggml_tensor * conv = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
+     conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs);
+
+     // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
+     cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+     // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
+
+     // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs}
+     ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur);
+
+     // split the above in three
+     ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * zxBCdt->nb[0],
+                                    zxBCdt->nb[1], zxBCdt->nb[2], 0);
+     ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2 * n_group * d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1],
+                                      zxBCdt->nb[2], d_inner * ggml_element_size(zxBCdt));
+     ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2],
+                                     (2 * d_inner + 2 * n_group * d_state) * ggml_element_size(zxBCdt));
+
+     // conv
+     {
+         // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs}
+         ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0);
+
+         // copy last (d_conv - 1) columns back into the state cache
+         ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2 * n_group * d_state, n_seqs,
+                                                conv_x->nb[1], conv_x->nb[2], n_seq_tokens * (conv_x->nb[0]));
+
+         ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv,
+                                                ggml_view_1d(ctx0, conv_states_all,
+                                                             (d_conv - 1) * (d_inner + 2 * n_group * d_state) * (n_seqs),
+                                                             kv_head * (d_conv - 1) * (d_inner + 2 * n_group * d_state) *
+                                                                 ggml_element_size(conv_states_all))));
+
+         // 1D convolution
+         // The equivalent is to make a self-overlapping view of conv_x
+         // over d_conv columns at each stride in the 3rd dimension,
+         // then element-wise multiply that with the conv1d weight,
+         // then sum the elements of each row,
+         // (the last two steps are a dot product over rows (also doable with mul_mat))
+         // then permute away the ne[0] dimension,
+         // and then you're left with the resulting x tensor.
+         // For simultaneous sequences, all sequences need to have the same length.
+         xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
+
+         // bias
+         xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b);
+
+         xBC = ggml_silu(ctx0, xBC);
+     }
+
+     // ssm
+     {
+         // These correspond to V K Q in SSM/attention duality
+         ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim * xBC->nb[0],
+                                        xBC->nb[1], xBC->nb[2], 0);
+         ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+                                        xBC->nb[1], xBC->nb[2], d_inner * ggml_element_size(xBC));
+         ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state * xBC->nb[0],
+                                        xBC->nb[1], xBC->nb[2], (d_inner + n_group * d_state) * ggml_element_size(xBC));
+
+         // {n_head, n_seq_tokens, n_seqs}
+         dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b);
+
+         ggml_tensor * A = model.layers[il].ssm_a;
+
+         // use the states and the indices provided by build_recurrent_state
+         // (this is necessary in order to properly use the states before they are overwritten,
+         // while avoiding unnecessary copies of the states)
+         auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
+             ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, mctx_cur->get_size());
+
+             // TODO: use semistructured matrices to implement state-space duality
+             // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
+             return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids);
+         };
+
+         ggml_tensor * y_ssm = build_rs(inp, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows);
+
+         // store last states
+         ggml_build_forward_expand(
+             gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state * d_inner * n_seqs, ggml_nelements(x) * x->nb[0]),
+                          ggml_view_1d(ctx0, ssm_states_all, d_state * d_inner * n_seqs,
+                                       kv_head * d_state * d_inner * ggml_element_size(ssm_states_all))));
+
+         ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head * x->nb[1],
+                                        n_seq_tokens * n_head * x->nb[1], 0);
+
+         // TODO: skip computing output earlier for unused tokens
+
+         y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d));
+         cb(y, "mamba2_y_add_d", il);
+         y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y);
+
+         // grouped RMS norm
+         if (model.layers[il].ssm_norm) {
+             y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs);
+             y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il);
+         }
+
+         y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs);
+
+         // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
+         cur = build_lora_mm(model.layers[il].ssm_out, y);
+     }
+
+     // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
+     cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs);
+     cb(cur, "mamba_out", il);
+
+     return cur;
+ }
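
Both builders above rely on the custom operators ggml_ssm_conv and ggml_ssm_scan. As a minimal sketch of what the "1D convolution" comment blocks describe — a depthwise causal convolution in which each channel keeps its last (d_conv - 1) inputs as rolling state and each output is a dot product of the conv weights with a self-overlapping window — the standalone C++ below is illustrative only; ssm_conv_ref and its nested-vector layout are hypothetical, not the ggml implementation, and the subsequent selective scan is not covered.

    // ssm_conv_ref: hypothetical reference for the depthwise causal conv1d.
    #include <cstddef>
    #include <vector>

    // conv_x holds, per channel, the (d_conv - 1) cached state columns followed
    // by the n_tokens new inputs; w holds the d_conv conv weights per channel.
    static std::vector<std::vector<float>> ssm_conv_ref(
            const std::vector<std::vector<float>> & conv_x,
            const std::vector<std::vector<float>> & w) {
        std::vector<std::vector<float>> out(conv_x.size());
        for (size_t c = 0; c < conv_x.size(); ++c) {
            const size_t d_conv   = w[c].size();
            const size_t n_tokens = conv_x[c].size() - (d_conv - 1);
            out[c].assign(n_tokens, 0.0f);
            for (size_t t = 0; t < n_tokens; ++t) {
                // dot product over a self-overlapping window of d_conv columns,
                // i.e. the "view + element-wise multiply + row sum" in the comments
                for (size_t k = 0; k < d_conv; ++k) {
                    out[c][t] += w[c][k] * conv_x[c][t + k];
                }
            }
        }
        return out;
    }
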
package/src/llama.cpp/src/models/grok.cpp
@@ -0,0 +1,159 @@
+ #include "models.h"
+
+ llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL,
+                 model.layers[il].attn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self-attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+             if (model.layers[il].bq) {
+                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                 cb(Qcur, "Qcur", il);
+             }
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+             if (model.layers[il].bk) {
+                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                 cb(Kcur, "Kcur", il);
+             }
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+             if (model.layers[il].bv) {
+                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                 cb(Vcur, "Vcur", il);
+             }
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = ggml_rope_ext(
+                     ctx0, Qcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Kcur = ggml_rope_ext(
+                     ctx0, Kcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f, il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         cur = build_norm(cur,
+                 model.layers[il].attn_out_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "attn_out_norm", il);
+
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // feed-forward network
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         // MoE branch
+         ggml_tensor * moe_out = build_moe_ffn(cur,
+                 model.layers[il].ffn_gate_inp,
+                 model.layers[il].ffn_up_exps,
+                 model.layers[il].ffn_gate_exps,
+                 model.layers[il].ffn_down_exps,
+                 nullptr,
+                 n_expert, n_expert_used,
+                 LLM_FFN_GELU, true,
+                 false, 0.0,
+                 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                 il);
+         cb(moe_out, "ffn_moe_out", il);
+
+         if (model.layers[il].ffn_up) {
+             ggml_tensor * ffn_out = build_ffn(cur,
+                     model.layers[il].ffn_up,   NULL, NULL,
+                     model.layers[il].ffn_gate, NULL, NULL,
+                     model.layers[il].ffn_down, NULL, NULL,
+                     NULL,
+                     LLM_FFN_GELU, LLM_FFN_PAR, il);
+             cb(ffn_out, "ffn_out", il);
+
+             cur = ggml_scale(ctx0, ggml_add(ctx0, ffn_out, moe_out), std::sqrt(2) / 2);
+             cb(cur, "ffn_out", il);
+         } else {
+             cur = moe_out;
+         }
+         cur = build_norm(cur,
+                 model.layers[il].ffn_post_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_post_norm", il);
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+         cb(cur, "ffn_out", il);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cur = ggml_scale(ctx0, cur, hparams.f_logit_scale);
+
+     // final logit soft-capping
+     if (hparams.f_final_logit_softcapping) {
+         cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+         cur = ggml_tanh(ctx0, cur);
+         cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+     }
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
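
The tail of this builder chains ggml_scale, ggml_tanh, ggml_scale to soft-cap the final logits, i.e. softcap(x) = cap * tanh(x / cap): approximately the identity for |x| much smaller than cap, and bounded in (-cap, cap) otherwise. A scalar sketch, assuming a made-up cap value (softcap_ref is hypothetical, not a package API):

    // softcap_ref: scalar view of the scale -> tanh -> scale chain above.
    #include <cmath>
    #include <cstdio>

    static float softcap_ref(float x, float cap) {
        return cap * std::tanh(x / cap); // in (-cap, cap), ~x for |x| << cap
    }

    int main() {
        const float cap = 30.0f; // hypothetical f_final_logit_softcapping
        std::printf("%.3f\n", softcap_ref(1.0f, cap));   // ~1.000: near-identity
        std::printf("%.3f\n", softcap_ref(100.0f, cap)); // ~29.924: squashed toward the cap
    }
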
package/src/llama.cpp/src/models/grovemoe.cpp
@@ -0,0 +1,141 @@
+ #include "models.h"
+
+
+
+ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
+     llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+     const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     auto * inp_attn = build_attn_inp_kv();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self_attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+             cb(Qcur, "Qcur_normed", il);
+
+             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+             cb(Kcur, "Kcur_normed", il);
+
+             Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                  ext_factor, attn_factor, beta_fast, beta_slow);
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+         }
+
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // MoE branch
+         cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, cur); // [n_expert, n_tokens]
+         cb(probs, "ffn_moe_logits", il);
+
+         ggml_tensor * moe_out =
+             build_moe_ffn(cur,
+                     nullptr,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     nullptr,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, true,
+                     false, 0.0,
+                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                     il,
+                     probs);
+         cb(moe_out, "ffn_moe_out", il);
+         cur = moe_out;
+
+         // TODO: Only do the expert selection and weights once
+         moe_out = build_moe_ffn(cur,
+                 nullptr,
+                 model.layers[il].ffn_up_chexps,
+                 model.layers[il].ffn_gate_chexps,
+                 model.layers[il].ffn_down_chexps,
+                 nullptr,
+                 n_chunk_expert, n_expert_used > n_chunk_expert ? n_chunk_expert : n_expert_used,
+                 LLM_FFN_SILU, true,
+                 false, 0.0,
+                 LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                 il,
+                 probs);
+         cb(moe_out, "ffn_adj_moe_out", il);
+
+         cur = ggml_add(ctx0, cur, ggml_scale(ctx0, moe_out, hparams.expert_group_scale));
+         cb(cur, "ffn_final_moe_out", il);
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+
+     cur = inpL;
+
+     cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
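
This builder routes twice with the same router logits (probs): once over the n_expert main experts, and once over n_chunk_expert = n_expert / n_group_experts chunk ("grove") experts, whose top-k is clamped to n_chunk_expert and whose output is folded in scaled by expert_group_scale. A sketch of that arithmetic with made-up hparam values (every name and number in the snippet is illustrative, not taken from the package):

    // grove_combine_ref: illustrative routing/combination arithmetic only.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical hparams for illustration
        const int64_t n_expert           = 128;
        const int64_t n_group_experts    = 8;
        const int64_t n_expert_used      = 8;
        const float   expert_group_scale = 0.05f;

        const int64_t n_chunk_expert = n_expert / n_group_experts;       // 16 chunk experts
        const int64_t k_chunk = std::min(n_expert_used, n_chunk_expert); // same clamp as the ternary above

        // per-element combination mirroring the two ggml_add calls:
        // cur = moe_out + expert_group_scale * adj_moe_out, then + ffn_inp (residual)
        const float moe_out = 1.0f, adj_moe_out = 0.5f, ffn_inp = 0.25f;
        const float out = moe_out + expert_group_scale * adj_moe_out + ffn_inp;

        std::printf("%lld chunk experts, top-%lld, out=%.4f\n",
                    (long long) n_chunk_expert, (long long) k_chunk, out);
    }
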