@fugood/llama.node 0.3.14 → 0.3.15

This diff represents the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +32 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +12 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -27
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +253 -2
  58. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  60. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  61. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  62. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  63. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -0
  64. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  65. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +66 -26
  66. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  67. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  68. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  69. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  70. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +103 -34
  71. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  72. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  73. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  74. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  75. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  76. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  78. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +352 -146
  79. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +3 -0
  81. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  82. package/src/llama.cpp/include/llama.h +86 -22
  83. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  84. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  85. package/src/llama.cpp/src/llama-adapter.h +11 -9
  86. package/src/llama.cpp/src/llama-arch.cpp +102 -16
  87. package/src/llama.cpp/src/llama-arch.h +18 -0
  88. package/src/llama.cpp/src/llama-batch.h +2 -2
  89. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  90. package/src/llama.cpp/src/llama-context.h +214 -77
  91. package/src/llama.cpp/src/llama-cparams.h +1 -0
  92. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  93. package/src/llama.cpp/src/llama-graph.h +574 -0
  94. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  95. package/src/llama.cpp/src/llama-hparams.h +9 -0
  96. package/src/llama.cpp/src/llama-io.cpp +15 -0
  97. package/src/llama.cpp/src/llama-io.h +35 -0
  98. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  99. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  100. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  101. package/src/llama.cpp/src/llama-memory.h +21 -0
  102. package/src/llama.cpp/src/llama-model.cpp +8207 -163
  103. package/src/llama.cpp/src/llama-model.h +34 -1
  104. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  105. package/src/llama.cpp/src/llama.cpp +51 -9984
  106. package/src/llama.cpp/tests/test-backend-ops.cpp +88 -9
  107. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  108. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
package/src/llama.cpp/src/llama-graph.cpp (new file)
@@ -0,0 +1,1662 @@
1
+ #include "llama-graph.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-kv-cache.h"
7
+
8
+ #include <cassert>
9
+ #include <cmath>
10
+ #include <cstring>
11
+
12
+ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
13
+ // TODO move to hparams if a T5 variant appears that uses a different value
14
+ const int64_t max_distance = 128;
15
+
16
+ if (bidirectional) {
17
+ n_buckets >>= 1;
18
+ }
19
+
20
+ const int64_t max_exact = n_buckets >> 1;
21
+
22
+ int32_t relative_position = x - y;
23
+ int32_t relative_bucket = 0;
24
+
25
+ if (bidirectional) {
26
+ relative_bucket += (relative_position > 0) * n_buckets;
27
+ relative_position = abs(relative_position);
28
+ } else {
29
+ relative_position = -std::min<int32_t>(relative_position, 0);
30
+ }
31
+
32
+ int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
33
+ relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
34
+ relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
35
+
36
+ return relative_bucket;
37
+ }
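The function above is T5-style relative position bucketing: nearby offsets get exact buckets, distant offsets share logarithmically spaced buckets up to max_distance. A minimal standalone sketch of the same scheme, for illustration only (not part of the package; rel_pos_bucket is a made-up name):

    // sketch: T5-style relative position bucketing (same scheme as above)
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    static int32_t rel_pos_bucket(int32_t x, int32_t y, int64_t n_buckets, bool bidirectional) {
        const int64_t max_distance = 128;
        if (bidirectional) {
            n_buckets >>= 1;
        }
        const int64_t max_exact = n_buckets >> 1;

        int32_t rel    = x - y;
        int32_t bucket = 0;
        if (bidirectional) {
            bucket += (rel > 0) * n_buckets; // second half of the buckets for positive offsets
            rel     = std::abs(rel);
        } else {
            rel = -std::min<int32_t>(rel, 0); // causal: only look backwards
        }

        if (rel < max_exact) {
            return bucket + rel; // exact buckets for nearby positions
        }
        // logarithmically spaced buckets for distant positions
        int32_t large = (int32_t) floorf(max_exact + logf(1.0f*rel/max_exact) * (n_buckets - max_exact) / logf(1.0f*max_distance/max_exact));
        return bucket + std::min<int32_t>(large, n_buckets - 1);
    }

    int main() {
        const int offs[] = {0, 1, 4, 8, 16, 64, 256};
        for (int d : offs) {
            printf("offset %3d -> bucket %d\n", d, rel_pos_bucket(d, 0, 32, /*bidirectional=*/true));
        }
        return 0;
    }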
38
+
39
+ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
40
+ if (ubatch->token) {
41
+ const int64_t n_tokens = ubatch->n_tokens;
42
+
43
+ ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
44
+ }
45
+
46
+ if (ubatch->embd) {
47
+ const int64_t n_embd = embd->ne[0];
48
+ const int64_t n_tokens = ubatch->n_tokens;
49
+
50
+ ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(embd));
51
+ }
52
+ }
53
+
54
+ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
55
+ if (ubatch->pos && pos) {
56
+ const int64_t n_tokens = ubatch->n_tokens;
57
+
58
+ ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
59
+ }
60
+ }
61
+
62
+ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
63
+ if (pos_bucket) {
64
+ const int64_t n_tokens = ubatch->n_tokens;
65
+
66
+ GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
67
+ GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
68
+
69
+ int32_t * data = (int32_t *) pos_bucket->data;
70
+
71
+ for (int h = 0; h < 1; ++h) {
72
+ for (int j = 0; j < n_tokens; ++j) {
73
+ for (int i = 0; i < n_tokens; ++i) {
74
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
75
+ }
76
+ }
77
+ }
78
+ }
79
+ }
80
+
81
+ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
82
+ if (pos_bucket) {
83
+ const int64_t n_tokens = ubatch->n_tokens;
84
+
85
+ GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
86
+ GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
87
+
88
+ int32_t * data = (int32_t *) pos_bucket->data;
89
+
90
+ const int64_t n_kv = kv_self->n;
91
+
92
+ for (int h = 0; h < 1; ++h) {
93
+ for (int j = 0; j < n_tokens; ++j) {
94
+ for (int i = 0; i < n_kv; ++i) {
95
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
96
+ }
97
+ }
98
+ }
99
+ }
100
+ }
101
+
102
+ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
103
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
104
+ //GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
105
+
106
+ if (!out_ids) {
107
+ LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
108
+ } else {
109
+ const int64_t n_tokens = ubatch->n_tokens;
110
+
111
+ GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer));
112
+ int32_t * data = (int32_t *) out_ids->data;
113
+
114
+ if (n_outputs == n_tokens) {
115
+ for (int i = 0; i < n_tokens; ++i) {
116
+ data[i] = i;
117
+ }
118
+ } else if (ubatch->output) {
119
+ int32_t n_outputs = 0;
120
+ for (int i = 0; i < n_tokens; ++i) {
121
+ if (ubatch->output[i]) {
122
+ data[n_outputs++] = i;
123
+ }
124
+ }
125
+ // the graph needs to have been passed the correct number of outputs
126
+ GGML_ASSERT(n_outputs == n_outputs);
127
+ } else if (n_outputs == 1) {
128
+ // only keep last output
129
+ data[0] = n_tokens - 1;
130
+ } else {
131
+ GGML_ASSERT(n_outputs == 0);
132
+ }
133
+ }
134
+ }
135
+ }
136
+
137
+ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
138
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
139
+ const int64_t n_tokens = ubatch->n_tokens;
140
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
141
+ const int64_t n_seqs = ubatch->n_seqs;
142
+
143
+ GGML_ASSERT(mean);
144
+ GGML_ASSERT(ggml_backend_buffer_is_host(mean->buffer));
145
+
146
+ float * data = (float *) mean->data;
147
+ memset(mean->data, 0, n_tokens * n_tokens * ggml_element_size(mean));
148
+
149
+ std::vector<uint64_t> sum(n_tokens, 0);
150
+
151
+ for (int s = 0; s < n_seqs; ++s) {
152
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
153
+
154
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
155
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
156
+
157
+ sum[seq_id] += ubatch->n_seq_tokens;
158
+ }
159
+
160
+ std::vector<float> div(n_tokens, 0.0f);
161
+ for (int i = 0; i < n_tokens; ++i) {
162
+ const uint64_t s = sum[i];
163
+ if (s > 0) {
164
+ div[i] = 1.0f/float(s);
165
+ }
166
+ }
167
+
168
+ for (int s = 0; s < n_seqs; ++s) {
169
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
170
+
171
+ for (int i = 0; i < n_seq_tokens; ++i) {
172
+ data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
173
+ }
174
+ }
175
+ }
176
+ }
177
+
178
+ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
179
+ if (cparams.embeddings && (
180
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
181
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
182
+ const int64_t n_tokens = ubatch->n_tokens;
183
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
184
+ const int64_t n_seqs = ubatch->n_seqs;
185
+
186
+ GGML_ASSERT(cls);
187
+ GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
188
+
189
+ uint32_t * data = (uint32_t *) cls->data;
190
+ memset(cls->data, 0, n_tokens * ggml_element_size(cls));
191
+
192
+ for (int s = 0; s < n_seqs; ++s) {
193
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
194
+
195
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
196
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
197
+
198
+ for (int i = 0; i < n_seq_tokens; ++i) {
199
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
200
+
201
+ if (pos == 0) {
202
+ data[seq_id] = s*n_seq_tokens + i;
203
+ }
204
+ }
205
+ }
206
+ }
207
+
208
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
209
+ const int64_t n_tokens = ubatch->n_tokens;
210
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
211
+ const int64_t n_seqs = ubatch->n_seqs;
212
+
213
+ GGML_ASSERT(cls);
214
+ GGML_ASSERT(ggml_backend_buffer_is_host(cls->buffer));
215
+
216
+ uint32_t * data = (uint32_t *) cls->data;
217
+ memset(cls->data, 0, n_tokens * ggml_element_size(cls));
218
+
219
+ std::vector<int> last_pos(n_tokens, -1);
220
+ std::vector<int> last_row(n_tokens, -1);
221
+
222
+ for (int s = 0; s < n_seqs; ++s) {
223
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
224
+
225
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
226
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
227
+
228
+ for (int i = 0; i < n_seq_tokens; ++i) {
229
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
230
+
231
+ if (pos >= last_pos[seq_id]) {
232
+ last_pos[seq_id] = pos;
233
+ last_row[seq_id] = s*n_seq_tokens + i;
234
+ }
235
+ }
236
+ }
237
+
238
+ for (int i = 0; i < n_tokens; ++i) {
239
+ if (last_row[i] >= 0) {
240
+ data[i] = last_row[i];
241
+ }
242
+ }
243
+ }
244
+ }
245
+
246
+ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
247
+ GGML_UNUSED(ubatch);
248
+
249
+ const int64_t n_kv = kv_self->n;
250
+
251
+ if (s_copy) {
252
+ GGML_ASSERT(ggml_backend_buffer_is_host(s_copy->buffer));
253
+ int32_t * data = (int32_t *) s_copy->data;
254
+
255
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
256
+ for (uint32_t i = 0; i < n_kv; ++i) {
257
+ const uint32_t cell_id = i + kv_self->head;
258
+
259
+ //////////////////////////////////////////////
260
+ // TODO: this should not mutate the KV cache !
261
+ llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
262
+
263
+ // prevent out-of-bound sources
264
+ if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
265
+ kv_cell.src = cell_id;
266
+ }
267
+
268
+ data[i] = kv_cell.src;
269
+
270
+ // TODO: do not mutate the KV cache
271
+ // ensure copy only happens once
272
+ if (kv_cell.src != (int32_t) cell_id) {
273
+ kv_cell.src = cell_id;
274
+ }
275
+ }
276
+ }
277
+ }
278
+
279
+ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
280
+ GGML_UNUSED(ubatch);
281
+
282
+ const int64_t n_kv = kv_self->n;
283
+
284
+ if (s_mask) {
285
+ GGML_ASSERT(ggml_backend_buffer_is_host(s_mask->buffer));
286
+ float * data = (float *) s_mask->data;
287
+
288
+ // clear unused states
289
+ for (int i = 0; i < n_kv; ++i) {
290
+ const uint32_t cell_id = i + kv_self->head;
291
+
292
+ //////////////////////////////////////////////
293
+ // TODO: this should not mutate the KV cache !
294
+ llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
295
+
296
+ data[i] = (float) (kv_cell.src >= 0);
297
+
298
+ // only clear once
299
+ if (kv_cell.src < 0) {
300
+ kv_cell.src = cell_id;
301
+ }
302
+ }
303
+ }
304
+ }
305
+
306
+ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
307
+ GGML_UNUSED(ubatch);
308
+
309
+ if (cross_embd && !cross->v_embd.empty()) {
310
+ assert(cross_embd->type == GGML_TYPE_F32);
311
+
312
+ ggml_backend_tensor_set(cross_embd, cross->v_embd.data(), 0, ggml_nbytes(cross_embd));
313
+ }
314
+ }
315
+
316
+ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
317
+ if (kq_mask) {
318
+ if (cparams.causal_attn) {
319
+ const int64_t n_kv = ubatch->n_tokens;
320
+ const int64_t n_tokens = ubatch->n_tokens;
321
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
322
+ const int64_t n_seqs = ubatch->n_seqs;
323
+
324
+ GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
325
+ float * data = (float *) kq_mask->data;
326
+
327
+ for (int h = 0; h < 1; ++h) {
328
+ for (int s1 = 0; s1 < n_seqs; ++s1) {
329
+ const llama_seq_id seq_id = ubatch->seq_id[s1][0];
330
+
331
+ for (int j = 0; j < n_seq_tokens; ++j) {
332
+ const int32_t tj = s1*n_seq_tokens + j;
333
+
334
+ for (int s0 = 0; s0 < n_seqs; ++s0) {
335
+ for (int i = 0; i < n_seq_tokens; ++i) {
336
+ const int32_t ti = s0*n_seq_tokens + i;
337
+ float f = -INFINITY;
338
+
339
+ for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
340
+ if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
341
+ if (hparams.use_alibi) {
342
+ f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
343
+ } else {
344
+ f = 0.0f;
345
+ }
346
+ break;
347
+ }
348
+ }
349
+
350
+ data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f;
351
+ }
352
+ }
353
+ }
354
+ }
355
+ }
356
+ } else {
357
+ const int64_t n_tokens = ubatch->n_tokens;
358
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
359
+ const int64_t n_seqs = ubatch->n_seqs;
360
+ const int64_t n_stride = ubatch->n_tokens;
361
+
362
+ GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
363
+
364
+ float * data = (float *) kq_mask->data;
365
+
366
+ for (int h = 0; h < 1; ++h) {
367
+ for (int s1 = 0; s1 < n_seqs; ++s1) {
368
+ const llama_seq_id seq_id = ubatch->seq_id[s1][0];
369
+
370
+ for (int j = 0; j < n_seq_tokens; ++j) {
371
+ const int32_t tj = s1*n_seq_tokens + j;
372
+
373
+ for (int s0 = 0; s0 < n_seqs; ++s0) {
374
+ for (int i = 0; i < n_seq_tokens; ++i) {
375
+ const int32_t ti = s0*n_seq_tokens + i;
376
+ float f = -INFINITY;
377
+
378
+ for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
379
+ if (ubatch->seq_id[s0][s] == seq_id) {
380
+ if (hparams.use_alibi) {
381
+ f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
382
+ } else {
383
+ f = 0.0f;
384
+ }
385
+ break;
386
+ }
387
+ }
388
+
389
+ data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
390
+ }
391
+ }
392
+
393
+ for (int i = n_tokens; i < n_stride; ++i) {
394
+ data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
395
+ }
396
+ }
397
+ }
398
+ }
399
+ }
400
+ }
401
+ }
402
+
403
+ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
404
+ if (self_kq_mask || self_kq_mask_swa) {
405
+ // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
406
+ if (cparams.causal_attn) {
407
+ const int64_t n_kv = kv_self->n;
408
+ const int64_t n_tokens = ubatch->n_tokens;
409
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
410
+ const int64_t n_seqs = ubatch->n_seqs;
411
+
412
+ float * data = nullptr;
413
+ float * data_swa = nullptr;
414
+
415
+ if (self_kq_mask) {
416
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
417
+ data = (float *) self_kq_mask->data;
418
+ }
419
+
420
+ if (self_kq_mask_swa) {
421
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
422
+ data_swa = (float *) self_kq_mask_swa->data;
423
+ }
424
+
425
+ // For causal attention, use only the previous KV cells
426
+ // of the correct sequence for each token of the ubatch.
427
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
428
+ for (int h = 0; h < 1; ++h) {
429
+ for (int s = 0; s < n_seqs; ++s) {
430
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
431
+
432
+ for (int j = 0; j < n_seq_tokens; ++j) {
433
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
434
+
435
+ for (int i = 0; i < n_kv; ++i) {
436
+ float f;
437
+ if (!kv_self->cells[i].has_seq_id(seq_id) || kv_self->cells[i].pos > pos) {
438
+ f = -INFINITY;
439
+ } else {
440
+ if (hparams.use_alibi) {
441
+ f = -std::abs(kv_self->cells[i].pos - pos);
442
+ } else {
443
+ f = 0.0f;
444
+ }
445
+ }
446
+
447
+ if (data) {
448
+ data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
449
+ }
450
+
451
+ // may need to cut off old tokens for sliding window
452
+ if (data_swa) {
453
+ if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
454
+ f = -INFINITY;
455
+ }
456
+ data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
457
+ }
458
+ }
459
+ }
460
+ }
461
+
462
+ if (data) {
463
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
464
+ for (int j = 0; j < n_kv; ++j) {
465
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
466
+ }
467
+ }
468
+ }
469
+
470
+ if (data_swa) {
471
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
472
+ for (int j = 0; j < n_kv; ++j) {
473
+ data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
474
+ }
475
+ }
476
+ }
477
+ }
478
+ } else {
479
+ const int64_t n_tokens = ubatch->n_tokens;
480
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
481
+ const int64_t n_seqs = ubatch->n_seqs;
482
+ // when using kv cache, the mask needs to match the kv cache size
483
+ const int64_t n_stride = n_tokens;
484
+
485
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
486
+
487
+ float * data = (float *) self_kq_mask->data;
488
+
489
+ for (int h = 0; h < 1; ++h) {
490
+ for (int s1 = 0; s1 < n_seqs; ++s1) {
491
+ const llama_seq_id seq_id = ubatch->seq_id[s1][0];
492
+
493
+ for (int j = 0; j < n_seq_tokens; ++j) {
494
+ const int32_t tj = s1*n_seq_tokens + j;
495
+
496
+ for (int s0 = 0; s0 < n_seqs; ++s0) {
497
+ for (int i = 0; i < n_seq_tokens; ++i) {
498
+ const int32_t ti = s0*n_seq_tokens + i;
499
+ float f = -INFINITY;
500
+
501
+ for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
502
+ if (ubatch->seq_id[s0][s] == seq_id) {
503
+ if (hparams.use_alibi) {
504
+ f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
505
+ } else {
506
+ f = 0.0f;
507
+ }
508
+ break;
509
+ }
510
+ }
511
+
512
+ data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
513
+ }
514
+ }
515
+
516
+ for (int i = n_tokens; i < n_stride; ++i) {
517
+ data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
518
+ }
519
+ }
520
+ }
521
+ }
522
+ }
523
+ }
524
+ }
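Reading the masks built above as a formula (a restatement, not new behavior): for a query token at position p and a KV cell at position q belonging to the same sequence,

    mask(p, q) = 0            if q <= p   (or -|p - q| when ALiBi is used)
    mask(p, q) = -INFINITY    otherwise

and the sliding-window (SWA) variant additionally sets mask(p, q) = -INFINITY whenever p - q >= hparams.n_swa.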
525
+
526
+ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
527
+ if (cross_kq_mask) {
528
+ const int64_t n_enc = cross_kq_mask->ne[0];
529
+ const int64_t n_tokens = ubatch->n_tokens;
530
+
531
+ GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
532
+ GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
533
+
534
+ float * data = (float *) cross_kq_mask->data;
535
+
536
+ for (int h = 0; h < 1; ++h) {
537
+ for (int j = 0; j < n_tokens; ++j) {
538
+ for (int i = 0; i < n_enc; ++i) {
539
+ float f = -INFINITY;
540
+ for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
541
+ const llama_seq_id seq_id = ubatch->seq_id[j][s];
542
+ if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
543
+ f = 0.0f;
544
+ }
545
+ }
546
+ data[h*(n_enc*n_tokens) + j*n_enc + i] = f;
547
+ }
548
+ }
549
+
550
+ for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
551
+ for (int j = 0; j < n_enc; ++j) {
552
+ data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
553
+ }
554
+ }
555
+ }
556
+ }
557
+ }
558
+
559
+ //
560
+ // llm_graph_context
561
+ //
562
+
563
+ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
564
+ arch (params.arch),
565
+ hparams (params.hparams),
566
+ cparams (params.cparams),
567
+ ubatch (params.ubatch),
568
+ n_embd (hparams.n_embd),
569
+ n_layer (hparams.n_layer),
570
+ n_rot (hparams.n_rot),
571
+ n_ctx (cparams.n_ctx),
572
+ n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
573
+ n_head (hparams.n_head()),
574
+ n_head_kv (hparams.n_head_kv()),
575
+ n_embd_head_k (hparams.n_embd_head_k),
576
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
577
+ n_embd_head_v (hparams.n_embd_head_v),
578
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
579
+ n_expert (hparams.n_expert),
580
+ n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
581
+ freq_base (cparams.rope_freq_base),
582
+ freq_scale (cparams.rope_freq_scale),
583
+ ext_factor (cparams.yarn_ext_factor),
584
+ attn_factor (cparams.yarn_attn_factor),
585
+ beta_fast (cparams.yarn_beta_fast),
586
+ beta_slow (cparams.yarn_beta_slow),
587
+ norm_eps (hparams.f_norm_eps),
588
+ norm_rms_eps (hparams.f_norm_rms_eps),
589
+ n_tokens (ubatch.n_tokens),
590
+ n_outputs (params.n_outputs),
591
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
592
+ pooling_type (cparams.pooling_type),
593
+ rope_type (hparams.rope_type),
594
+ ctx0 (params.ctx),
595
+ sched (params.sched),
596
+ backend_cpu (params.backend_cpu),
597
+ cvec (params.cvec),
598
+ loras (params.loras),
599
+ memory (params.memory),
600
+ cross (params.cross),
601
+ cb_func (params.cb),
602
+ res (std::make_unique<llm_graph_result>()) {
603
+ }
604
+
605
+ int64_t llm_graph_context::n_pos_per_token() const {
606
+ return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
607
+ }
608
+
609
+ void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
610
+ if (cb_func) {
611
+ cb_func(ubatch, cur, name, il);
612
+ }
613
+ }
614
+
615
+ ggml_tensor * llm_graph_context::build_cvec(
616
+ ggml_tensor * cur,
617
+ int il) const {
618
+ return cvec->apply_to(ctx0, cur, il);
619
+ }
620
+
621
+ ggml_tensor * llm_graph_context::build_lora_mm(
622
+ ggml_tensor * w,
623
+ ggml_tensor * cur) const {
624
+ ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
625
+
626
+ for (const auto & lora : *loras) {
627
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
628
+ if (lw == nullptr) {
629
+ continue;
630
+ }
631
+
632
+ const float adapter_scale = lora.second;
633
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
634
+
635
+ ggml_tensor * ab_cur = ggml_mul_mat(
636
+ ctx0, lw->b,
637
+ ggml_mul_mat(ctx0, lw->a, cur)
638
+ );
639
+
640
+ ab_cur = ggml_scale(ctx0, ab_cur, scale);
641
+ res = ggml_add(ctx0, res, ab_cur);
642
+ }
643
+
644
+ return res;
645
+ }
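build_lora_mm composes the base projection with each active adapter's low-rank update. Assuming get_scale folds alpha over the adapter rank (as build_lora_mm_id below computes explicitly), the result is approximately

    y = W x + sum_j s_j * (alpha_j / r_j) * B_j (A_j x)

where s_j is the per-adapter scale from the loras map and r_j is the rank of adapter j.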
646
+
647
+ ggml_tensor * llm_graph_context::build_lora_mm_id(
648
+ ggml_tensor * w, // ggml_tensor * as
649
+ ggml_tensor * cur, // ggml_tensor * b
650
+ ggml_tensor * ids) const {
651
+ ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
652
+ for (const auto & lora : *loras) {
653
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
654
+ if (lw == nullptr) {
655
+ continue;
656
+ }
657
+
658
+ const float alpha = lora.first->alpha;
659
+ const float rank = (float) lw->b->ne[0];
660
+ const float scale = alpha ? lora.second * alpha / rank : lora.second;
661
+
662
+ ggml_tensor * ab_cur = ggml_mul_mat_id(
663
+ ctx0, lw->b,
664
+ ggml_mul_mat_id(ctx0, lw->a, cur, ids),
665
+ ids
666
+ );
667
+
668
+ ab_cur = ggml_scale(ctx0, ab_cur, scale);
669
+ res = ggml_add(ctx0, res, ab_cur);
670
+ }
671
+
672
+ return res;
673
+ }
674
+
675
+ ggml_tensor * llm_graph_context::build_norm(
676
+ ggml_tensor * cur,
677
+ ggml_tensor * mw,
678
+ ggml_tensor * mb,
679
+ llm_norm_type type,
680
+ int il) const {
681
+ switch (type) {
682
+ case LLM_NORM: cur = ggml_norm (ctx0, cur, hparams.f_norm_eps); break;
683
+ case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
684
+ case LLM_NORM_GROUP:
685
+ {
686
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
687
+ cur = ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
688
+ cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]);
689
+ } break;
690
+ }
691
+
692
+ if (mw || mb) {
693
+ cb(cur, "norm", il);
694
+ }
695
+
696
+ if (mw) {
697
+ cur = ggml_mul(ctx0, cur, mw);
698
+ if (mb) {
699
+ cb(cur, "norm_w", il);
700
+ }
701
+ }
702
+
703
+ if (mb) {
704
+ cur = ggml_add(ctx0, cur, mb);
705
+ }
706
+
707
+ return cur;
708
+ }
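For reference, the two dense normalizations dispatched above correspond to the standard definitions (eps comes from hparams; the optional scale mw and bias mb are applied afterwards):

    LLM_NORM     (ggml_norm):     y = (x - mean(x)) / sqrt(var(x) + eps)
    LLM_NORM_RMS (ggml_rms_norm): y = x / sqrt(mean(x^2) + eps)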
709
+
710
+ ggml_tensor * llm_graph_context::build_ffn(
711
+ ggml_tensor * cur,
712
+ ggml_tensor * up,
713
+ ggml_tensor * up_b,
714
+ ggml_tensor * up_s,
715
+ ggml_tensor * gate,
716
+ ggml_tensor * gate_b,
717
+ ggml_tensor * gate_s,
718
+ ggml_tensor * down,
719
+ ggml_tensor * down_b,
720
+ ggml_tensor * down_s,
721
+ ggml_tensor * act_scales,
722
+ llm_ffn_op_type type_op,
723
+ llm_ffn_gate_type type_gate,
724
+ int il) const {
725
+ ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
726
+ cb(tmp, "ffn_up", il);
727
+
728
+ if (up_b) {
729
+ tmp = ggml_add(ctx0, tmp, up_b);
730
+ cb(tmp, "ffn_up_b", il);
731
+ }
732
+
733
+ if (up_s) {
734
+ tmp = ggml_mul(ctx0, tmp, up_s);
735
+ cb(tmp, "ffn_up_s", il);
736
+ }
737
+
738
+ if (gate) {
739
+ switch (type_gate) {
740
+ case LLM_FFN_SEQ:
741
+ {
742
+ cur = build_lora_mm(gate, tmp);
743
+ cb(cur, "ffn_gate", il);
744
+ } break;
745
+ case LLM_FFN_PAR:
746
+ {
747
+ cur = build_lora_mm(gate, cur);
748
+ cb(cur, "ffn_gate", il);
749
+ } break;
750
+ }
751
+
752
+ if (gate_b) {
753
+ cur = ggml_add(ctx0, cur, gate_b);
754
+ cb(cur, "ffn_gate_b", il);
755
+ }
756
+
757
+ if (gate_s) {
758
+ cur = ggml_mul(ctx0, cur, gate_s);
759
+ cb(cur, "ffn_gate_s", il);
760
+ }
761
+
762
+ } else {
763
+ cur = tmp;
764
+ }
765
+
766
+ switch (type_op) {
767
+ case LLM_FFN_SILU:
768
+ {
769
+ cur = ggml_silu(ctx0, cur);
770
+ cb(cur, "ffn_silu", il);
771
+ } break;
772
+ case LLM_FFN_GELU:
773
+ {
774
+ cur = ggml_gelu(ctx0, cur);
775
+ cb(cur, "ffn_gelu", il);
776
+ if (act_scales != NULL) {
777
+ cur = ggml_div(ctx0, cur, act_scales);
778
+ cb(cur, "ffn_act", il);
779
+ }
780
+ } break;
781
+ case LLM_FFN_RELU:
782
+ {
783
+ cur = ggml_relu(ctx0, cur);
784
+ cb(cur, "ffn_relu", il);
785
+ } break;
786
+ case LLM_FFN_RELU_SQR:
787
+ {
788
+ cur = ggml_relu(ctx0, cur);
789
+ cb(cur, "ffn_relu", il);
790
+
791
+ cur = ggml_sqr(ctx0, cur);
792
+ cb(cur, "ffn_sqr(relu)", il);
793
+ } break;
794
+ case LLM_FFN_SWIGLU:
795
+ {
796
+ // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
797
+ int64_t split_point = cur->ne[0] / 2;
798
+ ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
799
+ ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
800
+
801
+ x0 = ggml_silu(ctx0, x0);
802
+ cb(cur, "ffn_silu", il);
803
+
804
+ cur = ggml_mul(ctx0, x0, x1);
805
+ cb(cur, "ffn_mul", il);
806
+ } break;
807
+ }
808
+
809
+ if (type_gate == LLM_FFN_PAR) {
810
+ cur = ggml_mul(ctx0, cur, tmp);
811
+ cb(cur, "ffn_gate_par", il);
812
+ }
813
+
814
+ if (down) {
815
+ cur = build_lora_mm(down, cur);
816
+ }
817
+
818
+ if (down_b) {
819
+ cb(cur, "ffn_down", il);
820
+ }
821
+
822
+ if (down_b) {
823
+ cur = ggml_add(ctx0, cur, down_b);
824
+ }
825
+
826
+ if (down_s) {
827
+ cur = ggml_mul(ctx0, cur, down_s);
828
+ cb(cur, "ffn_down_s", il);
829
+ }
830
+
831
+ return cur;
832
+ }
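The LLM_FFN_SWIGLU branch above splits the up-projection output into two halves x0 and x1 along the feature dimension and gates one with the other, per the GLU-variants paper linked in the comment:

    swiglu(x) = silu(x0) * x1,    silu(t) = t * sigmoid(t)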
833
+
834
+ ggml_tensor * llm_graph_context::build_moe_ffn(
835
+ ggml_tensor * cur,
836
+ ggml_tensor * gate_inp,
837
+ ggml_tensor * up_exps,
838
+ ggml_tensor * gate_exps,
839
+ ggml_tensor * down_exps,
840
+ ggml_tensor * exp_probs_b,
841
+ int64_t n_expert,
842
+ int64_t n_expert_used,
843
+ llm_ffn_op_type type_op,
844
+ bool norm_w,
845
+ bool scale_w,
846
+ float w_scale,
847
+ llama_expert_gating_func_type gating_op,
848
+ int il) const {
849
+ int64_t n_embd = cur->ne[0];
850
+ int64_t n_tokens = cur->ne[1];
851
+
852
+ ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
853
+ cb(logits, "ffn_moe_logits", il);
854
+
855
+ ggml_tensor * probs = nullptr;
856
+ switch (gating_op) {
857
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
858
+ {
859
+ probs = ggml_soft_max(ctx0, logits); // [n_expert, n_tokens]
860
+ } break;
861
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
862
+ {
863
+ probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
864
+ } break;
865
+ default:
866
+ GGML_ABORT("fatal error");
867
+ }
868
+ cb(probs, "ffn_moe_probs", il);
869
+
870
+ // add experts selection bias - introduced in DeepSeek V3
871
+ // leave probs unbiased as it's later used to get expert weights
872
+ ggml_tensor * selection_probs = probs;
873
+ if (exp_probs_b != nullptr) {
874
+ selection_probs = ggml_add(ctx0, probs, exp_probs_b);
875
+ cb(selection_probs, "ffn_moe_probs_biased", il);
876
+ }
877
+
878
+ // select experts
879
+ ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
880
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
881
+ cb(selected_experts, "ffn_moe_topk", il);
882
+
883
+ ggml_tensor * weights = ggml_get_rows(ctx0,
884
+ ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
885
+ cb(weights, "ffn_moe_weights", il);
886
+
887
+ if (norm_w) {
888
+ weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
889
+
890
+ ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
891
+ cb(weights_sum, "ffn_moe_weights_sum", il);
892
+
893
+ weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
894
+ cb(weights, "ffn_moe_weights_norm", il);
895
+
896
+ weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
897
+ }
898
+ if (scale_w) {
899
+ weights = ggml_scale(ctx0, weights, w_scale);
900
+ cb(weights, "ffn_moe_weights_scaled", il);
901
+ }
902
+
903
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
904
+ ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
905
+ cb(up, "ffn_moe_up", il);
906
+
907
+ ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
908
+ cb(gate, "ffn_moe_gate", il);
909
+
910
+ switch (type_op) {
911
+ case LLM_FFN_SILU:
912
+ {
913
+ gate = ggml_silu(ctx0, gate);
914
+ cb(gate, "ffn_moe_silu", il);
915
+ } break;
916
+ case LLM_FFN_GELU:
917
+ {
918
+ gate = ggml_gelu(ctx0, gate);
919
+ cb(gate, "ffn_moe_gelu", il);
920
+ } break;
921
+ default:
922
+ GGML_ABORT("fatal error");
923
+ }
924
+
925
+ ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
926
+ cb(par, "ffn_moe_gate_par", il);
927
+
928
+ ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
929
+ cb(experts, "ffn_moe_down", il);
930
+
931
+ experts = ggml_mul(ctx0, experts, weights);
932
+
933
+ // aggregate experts
934
+ ggml_tensor * moe_out = nullptr;
935
+ for (int i = 0; i < n_expert_used; ++i) {
936
+ ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
937
+ experts->nb[2], i*experts->nb[1]);
938
+
939
+ if (i == 0) {
940
+ moe_out = cur_expert;
941
+ } else {
942
+ moe_out = ggml_add(ctx0, moe_out, cur_expert);
943
+ }
944
+ }
945
+
946
+ if (n_expert_used == 1) {
947
+ // avoid returning a non-contiguous tensor
948
+ moe_out = ggml_cont(ctx0, moe_out);
949
+ }
950
+
951
+ return moe_out;
952
+ }
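A scalar sketch of the softmax-gating path of the routing above: softmax over the expert logits, top-k selection, then renormalization of the selected weights (the norm_w branch). The logit values are made up and the snippet is illustrative only:

    // sketch: MoE routing as in build_moe_ffn (softmax gating + top-k + renormalization)
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        const std::vector<float> logits = {1.0f, 3.0f, 0.5f, 2.0f}; // one token, 4 experts (made up)
        const int n_expert_used = 2;

        // softmax -> probs
        std::vector<float> probs(logits.size());
        const float max_l = *std::max_element(logits.begin(), logits.end());
        float sum = 0.0f;
        for (size_t i = 0; i < logits.size(); ++i) { probs[i] = expf(logits[i] - max_l); sum += probs[i]; }
        for (auto & p : probs) { p /= sum; }

        // top-k expert indices (sort by probability, descending)
        std::vector<int> idx(probs.size());
        std::iota(idx.begin(), idx.end(), 0);
        std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                          [&](int a, int b) { return probs[a] > probs[b]; });

        // renormalize the selected weights so they sum to 1 (the norm_w branch)
        float wsum = 0.0f;
        for (int k = 0; k < n_expert_used; ++k) { wsum += probs[idx[k]]; }
        for (int k = 0; k < n_expert_used; ++k) {
            printf("expert %d: weight %.3f\n", idx[k], probs[idx[k]] / wsum);
        }
        return 0;
    }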
953
+
954
+ // input embeddings with optional lora
955
+ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
956
+ const int64_t n_embd = hparams.n_embd;
957
+
958
+ auto inp = std::make_unique<llm_graph_input_embd>();
959
+
960
+ ggml_tensor * cur = nullptr;
961
+
962
+ if (ubatch.token) {
963
+ inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
964
+ //cb(inp->tokens, "inp_tokens", -1);
965
+ ggml_set_input(inp->tokens);
966
+
967
+ cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);
968
+
969
+ // apply lora for embedding tokens if needed
970
+ for (const auto & lora : *loras) {
971
+ llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
972
+ if (lw == nullptr) {
973
+ continue;
974
+ }
975
+
976
+ const float adapter_scale = lora.second;
977
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
978
+
979
+ ggml_tensor * inpL_delta = ggml_scale(ctx0, ggml_mul_mat(
980
+ ctx0, lw->b, // non-transposed lora_b
981
+ ggml_get_rows(ctx0, lw->a, inp->tokens)
982
+ ), scale);
983
+
984
+ cur = ggml_add(ctx0, cur, inpL_delta);
985
+ }
986
+ } else {
987
+ inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ubatch.n_tokens);
988
+ ggml_set_input(inp->embd);
989
+
990
+ cur = inp->embd;
991
+ }
992
+
993
+ // For Granite architecture
994
+ if (hparams.f_embedding_scale != 0.0f) {
995
+ cur = ggml_scale(ctx0, cur, hparams.f_embedding_scale);
996
+ }
997
+
998
+ cb(cur, "inp_embd", -1);
999
+
1000
+ res->add_input(std::move(inp));
1001
+
1002
+ return cur;
1003
+ }
1004
+
1005
+ ggml_tensor * llm_graph_context::build_inp_pos() const {
1006
+ auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
1007
+
1008
+ auto & cur = inp->pos;
1009
+
1010
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
1011
+ ggml_set_input(cur);
1012
+
1013
+ res->add_input(std::move(inp));
1014
+
1015
+ return cur;
1016
+ }
1017
+
1018
+ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
1019
+ auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
1020
+
1021
+ auto & cur = inp->out_ids;
1022
+
1023
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
1024
+ ggml_set_input(cur);
1025
+
1026
+ res->add_input(std::move(inp));
1027
+
1028
+ return cur;
1029
+ }
1030
+
1031
+ ggml_tensor * llm_graph_context::build_inp_mean() const {
1032
+ auto inp = std::make_unique<llm_graph_input_mean>(cparams);
1033
+
1034
+ auto & cur = inp->mean;
1035
+
1036
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
1037
+ ggml_set_input(cur);
1038
+
1039
+ res->add_input(std::move(inp));
1040
+
1041
+ return cur;
1042
+ }
1043
+
1044
+ ggml_tensor * llm_graph_context::build_inp_cls() const {
1045
+ auto inp = std::make_unique<llm_graph_input_cls>(cparams);
1046
+
1047
+ auto & cur = inp->cls;
1048
+
1049
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
1050
+ ggml_set_input(cur);
1051
+
1052
+ res->add_input(std::move(inp));
1053
+
1054
+ return cur;
1055
+ }
1056
+
1057
+ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
1058
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1059
+
1060
+ auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
1061
+
1062
+ const auto n_kv = kv_self->n;
1063
+
1064
+ auto & cur = inp->s_copy;
1065
+
1066
+ cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
1067
+ ggml_set_input(cur);
1068
+
1069
+ res->add_input(std::move(inp));
1070
+
1071
+ return cur;
1072
+ }
1073
+
1074
+ ggml_tensor * llm_graph_context::build_inp_s_mask() const {
1075
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1076
+
1077
+ auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
1078
+
1079
+ const auto n_kv = kv_self->n;
1080
+
1081
+ auto & cur = inp->s_mask;
1082
+
1083
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
1084
+ ggml_set_input(cur);
1085
+
1086
+ res->add_input(std::move(inp));
1087
+
1088
+ return cur;
1089
+ }
1090
+
1091
+ ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
1092
+ auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
1093
+
1094
+ auto & cur = inp->cross_embd;
1095
+
1096
+ // if we have the output embeddings from the encoder, use them directly
1097
+ // TODO: needs more work to be correct, for now just use the tensor shape
1098
+ //if (cross->t_embd) {
1099
+ // cur = ggml_view_tensor(ctx0, cross->t_embd);
1100
+
1101
+ // return cur;
1102
+ //}
1103
+
1104
+ const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
1105
+ const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
1106
+
1107
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc);
1108
+ ggml_set_input(cur);
1109
+
1110
+ res->add_input(std::move(inp));
1111
+
1112
+ return cur;
1113
+ }
1114
+
1115
+ ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
1116
+ auto inp = std::make_unique<llm_graph_input_pos_bucket>(hparams);
1117
+
1118
+ auto & cur = inp->pos_bucket;
1119
+
1120
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
1121
+ ggml_set_input(cur);
1122
+
1123
+ res->add_input(std::move(inp));
1124
+
1125
+ return cur;
1126
+ }
1127
+
1128
+ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
1129
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1130
+
1131
+ auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
1132
+
1133
+ const auto n_kv = kv_self->n;
1134
+
1135
+ auto & cur = inp->pos_bucket;
1136
+
1137
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
1138
+ ggml_set_input(cur);
1139
+
1140
+ res->add_input(std::move(inp));
1141
+
1142
+ return cur;
1143
+ }
1144
+
1145
+ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const {
1146
+ ggml_tensor * pos_bucket_1d = ggml_reshape_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1]);
1147
+ cb(pos_bucket_1d, "pos_bucket_1d", -1);
1148
+
1149
+ ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
1150
+
1151
+ pos_bias = ggml_reshape_3d(ctx0, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1]);
1152
+ pos_bias = ggml_permute (ctx0, pos_bias, 2, 0, 1, 3);
1153
+ pos_bias = ggml_cont (ctx0, pos_bias);
1154
+
1155
+ cb(pos_bias, "pos_bias", -1);
1156
+
1157
+ return pos_bias;
1158
+ }
1159
+
1160
+ ggml_tensor * llm_graph_context::build_attn_mha(
1161
+ ggml_cgraph * gf,
1162
+ ggml_tensor * q,
1163
+ ggml_tensor * k,
1164
+ ggml_tensor * v,
1165
+ ggml_tensor * kq_b,
1166
+ ggml_tensor * kq_mask,
1167
+ bool v_trans,
1168
+ float kq_scale) const {
1169
+ //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1170
+ //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1171
+
1172
+ //const int64_t n_head = hparams.n_head(il);
1173
+ //const int64_t n_head_kv = hparams.n_head_kv(il);
1174
+
1175
+ //const auto & n_embd_head_k = hparams.n_embd_head_k;
1176
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
1177
+
1178
+ const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
1179
+
1180
+ const auto n_tokens = q->ne[1];
1181
+ const auto n_head = q->ne[2];
1182
+ const auto n_kv = k->ne[1];
1183
+
1184
+ ggml_tensor * cur;
1185
+
1186
+ // TODO: replace hardcoded padding with ggml-provided padding
1187
+ if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
1188
+ GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
1189
+
1190
+ if (v_trans) {
1191
+ v = ggml_transpose(ctx0, v);
1192
+ }
1193
+
1194
+ cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
1195
+ hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
1196
+
1197
+ ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
1198
+
1199
+ cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
1200
+ } else {
1201
+ ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
1202
+
1203
+ // note: this op tends to require high floating point range
1204
+ // while for some models F16 is enough, for others it is not, so we default to F32 here
1205
+ ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
1206
+
1207
+ if (arch == LLM_ARCH_GROK) {
1208
+ // need to do the following:
1209
+ // multiply by attn_output_multiplyer of 0.08838834764831845
1210
+ // and then :
1211
+ // kq = 30 * tanh(kq / 30)
1212
+ // before the softmax below
1213
+
1214
+ kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
1215
+ kq = ggml_scale(ctx0, kq, 30);
1216
+ }
1217
+
1218
+ if (hparams.attn_soft_cap) {
1219
+ kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
1220
+ kq = ggml_tanh (ctx0, kq);
1221
+ kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
1222
+ }
1223
+
1224
+ if (kq_b) {
1225
+ kq = ggml_add(ctx0, kq, kq_b);
1226
+ }
1227
+
1228
+ kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
1229
+
1230
+ if (!v_trans) {
1231
+ // note: avoid this branch
1232
+ v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
1233
+ }
1234
+
1235
+ ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
1236
+
1237
+ ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
1238
+
1239
+ cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
1240
+
1241
+ if (!cparams.offload_kqv) {
1242
+ // all nodes between the KV store and the attention output are run on the CPU
1243
+ ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
1244
+ }
1245
+ }
1246
+
1247
+ ggml_build_forward_expand(gf, cur);
1248
+
1249
+ return cur;
1250
+ }
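Both branches above compute standard scaled-dot-product attention; the non-flash path spells the steps out:

    Attn(Q, K, V) = softmax(kq_scale * Q K^T + kq_mask) V

with the optional Grok scaling, logit soft-capping (tanh) and ALiBi bias folded into the logits before the softmax.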
1251
+
1252
+ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
1253
+ auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
1254
+
1255
+ // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
1256
+ inp->kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
1257
+ //cb(inp_kq_mask, "KQ_mask", -1);
1258
+ ggml_set_input(inp->kq_mask);
1259
+
1260
+ inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
1261
+
1262
+ return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
1263
+ }
1264
+
1265
+ ggml_tensor * llm_graph_context::build_attn(
1266
+ llm_graph_input_attn_no_cache * inp,
1267
+ ggml_cgraph * gf,
1268
+ ggml_tensor * wo,
1269
+ ggml_tensor * wo_b,
1270
+ ggml_tensor * q_cur,
1271
+ ggml_tensor * k_cur,
1272
+ ggml_tensor * v_cur,
1273
+ ggml_tensor * kq_b,
1274
+ float kq_scale,
1275
+ int il) const {
1276
+ GGML_UNUSED(n_tokens);
1277
+
1278
+ // these nodes are added to the graph together so that they are not reordered
1279
+ // by doing so, the number of splits in the graph is reduced
1280
+ ggml_build_forward_expand(gf, q_cur);
1281
+ ggml_build_forward_expand(gf, k_cur);
1282
+ ggml_build_forward_expand(gf, v_cur);
1283
+
1284
+ const auto & kq_mask = inp->get_kq_mask();
1285
+
1286
+ ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1287
+ //cb(q, "q", il);
1288
+
1289
+ ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1290
+ //cb(k, "k", il);
1291
+
1292
+ ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
1293
+ //cb(k, "v", il);
1294
+
1295
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
1296
+
1297
+ cb(cur, "kqv_out", il);
1298
+
1299
+ if (wo) {
1300
+ cur = build_lora_mm(wo, cur);
1301
+ }
1302
+
1303
+ if (wo_b) {
1304
+ //cb(cur, "kqv_wo", il);
1305
+ }
1306
+
1307
+ if (wo_b) {
1308
+ cur = ggml_add(ctx0, cur, wo_b);
1309
+ }
1310
+
1311
+ return cur;
1312
+ }
1313
+
1314
+ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
1315
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1316
+
1317
+ auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
1318
+
1319
+ const auto n_kv = kv_self->n;
1320
+
1321
+ inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
1322
+ //cb(inp->self_kq_mask, "KQ_mask", -1);
1323
+ ggml_set_input(inp->self_kq_mask);
1324
+
1325
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
1326
+
1327
+ if (hparams.n_swa_pattern > 1) {
1328
+ GGML_ASSERT(hparams.n_swa > 0);
1329
+
1330
+ inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
1331
+ //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
1332
+ ggml_set_input(inp->self_kq_mask_swa);
1333
+
1334
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
1335
+ }
1336
+
1337
+ return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
1338
+ }
1339
+
1340
+ ggml_tensor * llm_graph_context::build_attn(
1341
+ llm_graph_input_attn_kv_unified * inp,
1342
+ ggml_cgraph * gf,
1343
+ ggml_tensor * wo,
1344
+ ggml_tensor * wo_b,
1345
+ ggml_tensor * q_cur,
1346
+ ggml_tensor * k_cur,
1347
+ ggml_tensor * v_cur,
1348
+ ggml_tensor * kq_b,
1349
+ float kq_scale,
1350
+ int il) const {
1351
+ // these nodes are added to the graph together so that they are not reordered
1352
+ // by doing so, the number of splits in the graph is reduced
1353
+ ggml_build_forward_expand(gf, q_cur);
1354
+ ggml_build_forward_expand(gf, k_cur);
1355
+ ggml_build_forward_expand(gf, v_cur);
1356
+
1357
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1358
+ const auto & n_ctx = cparams.n_ctx;
1359
+
1360
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1361
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1362
+
1363
+ const auto n_tokens = q_cur->ne[2];
1364
+
1365
+ const bool v_trans = !cparams.flash_attn;
1366
+
1367
+ // store to KV cache
1368
+ {
1369
+ GGML_ASSERT(!kv_self->recurrent);
1370
+
1371
+ const auto kv_head = kv_self->head;
1372
+
1373
+ GGML_ASSERT(kv_self->size == n_ctx);
1374
+
1375
+ ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
1376
+ //cb(k_cache_view, "k_cache_view", il);
1377
+
1378
+ // note: storing RoPE-ed version of K in the KV cache
1379
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
1380
+
1381
+ v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
1382
+
1383
+ ggml_tensor * v_cache_view = nullptr;
1384
+
1385
+ if (!v_trans) {
1386
+ v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
1387
+ } else {
1388
+ // note: the V cache is transposed when not using flash attention
1389
+ v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
1390
+ ( n_ctx)*ggml_element_size(kv_self->v_l[il]),
1391
+ (kv_head)*ggml_element_size(kv_self->v_l[il]));
1392
+
1393
+ v_cur = ggml_transpose(ctx0, v_cur);
1394
+ }
1395
+ //cb(v_cache_view, "v_cache_view", il);
1396
+
1397
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
1398
+ }
1399
+
1400
+ const bool is_swa = hparams.is_swa(il);
1401
+
1402
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
1403
+
1404
+ const auto n_kv = kv_self->n;
1405
+
1406
+ const int64_t n_head_kv = hparams.n_head_kv(il);
1407
+
1408
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
1409
+ const auto & n_embd_head_v = hparams.n_embd_head_v;
1410
+
1411
+ ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1412
+ //cb(q, "q", il);
1413
+
1414
+ ggml_tensor * k =
1415
+ ggml_view_3d(ctx0, kv_self->k_l[il],
1416
+ n_embd_head_k, n_kv, n_head_kv,
1417
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
1418
+ ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
1419
+ 0);
1420
+ //cb(k, "k", il);
1421
+
1422
+ ggml_tensor * v = !v_trans ?
1423
+ ggml_view_3d(ctx0, kv_self->v_l[il],
1424
+ n_embd_head_v, n_kv, n_head_kv,
1425
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
1426
+ ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
1427
+ 0) :
1428
+ ggml_view_3d(ctx0, kv_self->v_l[il],
1429
+ n_kv, n_embd_head_v, n_head_kv,
1430
+ ggml_element_size(kv_self->v_l[il])*n_ctx,
1431
+ ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
1432
+ 0);
1433
+
1434
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
1435
+ cb(cur, "kqv_out", il);
1436
+
1437
+ if (wo) {
1438
+ cur = build_lora_mm(wo, cur);
1439
+ }
1440
+
1441
+ if (wo_b) {
1442
+ //cb(cur, "kqv_wo", il);
1443
+ }
1444
+
1445
+ if (wo_b) {
1446
+ cur = ggml_add(ctx0, cur, wo_b);
1447
+ }
1448
+
1449
+ return cur;
1450
+ }
1451
+
1452
+ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
1453
+ auto inp = std::make_unique<llm_graph_input_attn_cross>(cross);
1454
+
1455
+ const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
1456
+
1457
+ inp->cross_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
1458
+ ggml_set_input(inp->cross_kq_mask);
1459
+
1460
+ inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
1461
+
1462
+ return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
1463
+ }
1464
+
1465
+ ggml_tensor * llm_graph_context::build_attn(
1466
+ llm_graph_input_attn_cross * inp,
1467
+ ggml_cgraph * gf,
1468
+ ggml_tensor * wo,
1469
+ ggml_tensor * wo_b,
1470
+ ggml_tensor * q_cur,
1471
+ ggml_tensor * k_cur,
1472
+ ggml_tensor * v_cur,
1473
+ ggml_tensor * kq_b,
1474
+ float kq_scale,
1475
+ int il) const {
1476
+ // these nodes are added to the graph together so that they are not reordered
1477
+ // by doing so, the number of splits in the graph is reduced
1478
+ ggml_build_forward_expand(gf, q_cur);
1479
+ ggml_build_forward_expand(gf, k_cur);
1480
+ ggml_build_forward_expand(gf, v_cur);
1481
+
1482
+ const auto & kq_mask = inp->get_kq_mask_cross();
1483
+
1484
+ ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1485
+ //cb(q, "q", il);
1486
+
1487
+ ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1488
+ //cb(k, "k", il);
1489
+
1490
+ ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
1491
+ //cb(k, "v", il);
1492
+
1493
+ ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
1494
+
1495
+ cb(cur, "kqv_out", il);
1496
+
1497
+ if (wo) {
1498
+ cur = build_lora_mm(wo, cur);
1499
+ }
1500
+
1501
+ if (wo_b) {
1502
+ //cb(cur, "kqv_wo", il);
1503
+ }
1504
+
1505
+ if (wo_b) {
1506
+ cur = ggml_add(ctx0, cur, wo_b);
1507
+ }
1508
+
1509
+ return cur;
1510
+ }
1511
+
1512
+ ggml_tensor * llm_graph_context::build_copy_mask_state(
1513
+ ggml_cgraph * gf,
1514
+ ggml_tensor * s,
1515
+ ggml_tensor * state_copy,
1516
+ ggml_tensor * state_mask,
1517
+ int32_t n_state,
1518
+ int32_t n_seqs) const {
1519
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1520
+
1521
+ const auto n_kv = kv_self->n;
1522
+ const auto kv_head = kv_self->head;
1523
+
1524
+ ggml_tensor * states = ggml_reshape_2d(ctx0, s, n_state, kv_self->size);
1525
+
1526
+ // copy states
1527
+ // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
1528
+ // this shrinks the tensors's ne[1] to n_kv
1529
+ states = ggml_get_rows(ctx0, states, state_copy);
1530
+
1531
+ // clear states of sequences which are starting at the beginning of this batch
1532
+ // FIXME: zero-out NANs?
1533
+ states = ggml_mul(ctx0, states, state_mask);
1534
+
1535
+ // copy states which won't be changed further (between n_seqs and n_kv)
1536
+ ggml_build_forward_expand(gf,
1537
+ ggml_cpy(ctx0,
1538
+ ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)),
1539
+ ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
1540
+
1541
+ // the part of the states that will be used and modified
1542
+ return ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
1543
+ }
1544
+
1545
+ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
1546
+ ggml_cgraph * gf,
1547
+ ggml_tensor * state_copy,
1548
+ ggml_tensor * state_mask,
1549
+ const llama_ubatch & ubatch,
1550
+ int il) const {
1551
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1552
+
1553
+ const auto token_shift_count = hparams.token_shift_count;
1554
+
1555
+ const int64_t n_seqs = ubatch.n_seqs;
1556
+
1557
+ ggml_tensor * token_shift_all = kv_self->k_l[il];
1558
+
1559
+ ggml_tensor * token_shift = build_copy_mask_state(
1560
+ gf, token_shift_all, state_copy, state_mask,
1561
+ hparams.n_embd_k_s(), n_seqs);
1562
+
1563
+ token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
1564
+
1565
+ return token_shift;
1566
+ }
1567
+
1568
+ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
1569
+ ggml_tensor * token_shift,
1570
+ const llama_ubatch & ubatch,
1571
+ int il) const {
1572
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1573
+
1574
+ const auto token_shift_count = hparams.token_shift_count;
1575
+ const auto n_embd = hparams.n_embd;
1576
+
1577
+ const int64_t n_seqs = ubatch.n_seqs;
1578
+
1579
+ const auto kv_head = kv_self->head;
1580
+
1581
+ return ggml_cpy(
1582
+ ctx0,
1583
+ ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0),
1584
+ ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self->k_l[il]))
1585
+ );
1586
+ }
1587
+
1588
+ void llm_graph_context::build_pooling(
1589
+ ggml_cgraph * gf,
1590
+ ggml_tensor * cls,
1591
+ ggml_tensor * cls_b,
1592
+ ggml_tensor * cls_out,
1593
+ ggml_tensor * cls_out_b) const {
1594
+ if (!cparams.embeddings) {
1595
+ return;
1596
+ }
1597
+
1598
+ ggml_tensor * inp = res->t_embd;
1599
+
1600
+ //// find result_norm tensor for input
1601
+ //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
1602
+ // inp = ggml_graph_node(gf, i);
1603
+ // if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
1604
+ // break;
1605
+ // }
1606
+
1607
+ // inp = nullptr;
1608
+ //}
1609
+
1610
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
1611
+
1612
+ ggml_tensor * cur;
1613
+
1614
+ switch (pooling_type) {
1615
+ case LLAMA_POOLING_TYPE_NONE:
1616
+ {
1617
+ cur = inp;
1618
+ } break;
1619
+ case LLAMA_POOLING_TYPE_MEAN:
1620
+ {
1621
+ ggml_tensor * inp_mean = build_inp_mean();
1622
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
1623
+ } break;
1624
+ case LLAMA_POOLING_TYPE_CLS:
1625
+ case LLAMA_POOLING_TYPE_LAST:
1626
+ {
1627
+ ggml_tensor * inp_cls = build_inp_cls();
1628
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
1629
+ } break;
1630
+ case LLAMA_POOLING_TYPE_RANK:
1631
+ {
1632
+ ggml_tensor * inp_cls = build_inp_cls();
1633
+ inp = ggml_get_rows(ctx0, inp, inp_cls);
1634
+
1635
+ // classification head
1636
+ // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
1637
+ GGML_ASSERT(cls != nullptr);
1638
+ GGML_ASSERT(cls_b != nullptr);
1639
+
1640
+ cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls, inp), cls_b);
1641
+ cur = ggml_tanh(ctx0, cur);
1642
+
1643
+ // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
1644
+ // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
1645
+ if (cls_out) {
1646
+ GGML_ASSERT(cls_out_b != nullptr);
1647
+
1648
+ cur = ggml_add (ctx0, ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
1649
+ }
1650
+ } break;
1651
+ default:
1652
+ {
1653
+ GGML_ABORT("unknown pooling type");
1654
+ }
1655
+ }
1656
+
1657
+ cb(cur, "result_embd_pooled", -1);
1658
+ res->t_embd_pooled = cur;
1659
+
1660
+ ggml_build_forward_expand(gf, cur);
1661
+ }
1662
+
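Finally, the LLAMA_POOLING_TYPE_MEAN branch of build_pooling reduces token embeddings to one vector per sequence by multiplying with the weight matrix prepared in llm_graph_input_mean::set_input (weight 1/len for each token of a sequence). A scalar sketch of the same reduction, with made-up values, illustrative only:

    // sketch: mean pooling as in LLAMA_POOLING_TYPE_MEAN above
    // each token contributes 1/len(seq) of its embedding to its sequence's pooled vector,
    // which is what the inp_mean matrix multiply computes in one step
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 2, n_tokens = 3, n_seqs = 2;
        // tokens 0,1 belong to sequence 0; token 2 to sequence 1 (made-up embeddings)
        const float embd[n_tokens][n_embd] = {{1, 2}, {3, 4}, {5, 6}};
        const int   seq_of_token[n_tokens] = {0, 0, 1};
        const float seq_len[n_seqs]        = {2, 1};

        std::vector<float> pooled(n_seqs*n_embd, 0.0f);
        for (int t = 0; t < n_tokens; ++t) {
            const int s = seq_of_token[t];
            for (int e = 0; e < n_embd; ++e) {
                pooled[s*n_embd + e] += embd[t][e] / seq_len[s];
            }
        }
        for (int s = 0; s < n_seqs; ++s) {
            printf("seq %d pooled: %.1f %.1f\n", s, pooled[s*n_embd + 0], pooled[s*n_embd + 1]);
        }
        return 0;
    }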