cui-llama.rn 1.4.6 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
@@ -0,0 +1,1695 @@
1
+ #include "llama-graph.h"
2
+
3
+ #include "llama-impl.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-kv-cache.h"
7
+
8
+ #include <cassert>
9
+ #include <cmath>
10
+ #include <cstring>
11
+
12
// Maps the signed distance (x - y) between two positions to a relative-attention bucket index.
//
//   x, y          - the two positions; the distance is x - y
//   n_buckets     - total number of buckets available
//   bidirectional - true:  the buckets are split in half; distances with x > y are offset into the upper half
//                   false: only distances with x <= y are distinguished, every x > y maps to distance 0
//
// Distances smaller than max_exact (half of the per-direction bucket count) get a bucket of their own.
// Larger distances are placed on a logarithmic scale and clamped to the last bucket.
//
// NOTE(review): assumes n_buckets is large enough that max_exact >= 1 - confirm against the callers.
static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    // TODO move to hparams if a T5 variant appears that uses a different value
    const int64_t max_distance = 128;

    if (bidirectional) {
        n_buckets >>= 1;
    }

    const int64_t max_exact = n_buckets >> 1;

    int32_t relative_position = x - y;
    int32_t relative_bucket   = 0;

    if (bidirectional) {
        relative_bucket += (relative_position > 0) * n_buckets;
        relative_position = std::abs(relative_position);
    } else {
        relative_position = -std::min<int32_t>(relative_position, 0);
    }

    // small distances map 1:1 to a bucket
    // returning here avoids evaluating logf(0) = -inf and converting it to an integer (undefined behaviour)
    if (relative_position < max_exact) {
        return relative_bucket + relative_position;
    }

    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);

    return relative_bucket + relative_position_if_large;
}
38
+
39
+ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
40
+ if (ubatch->token) {
41
+ const int64_t n_tokens = ubatch->n_tokens;
42
+
43
+ lm_ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*lm_ggml_element_size(tokens));
44
+ }
45
+
46
+ if (ubatch->embd) {
47
+ const int64_t n_embd = embd->ne[0];
48
+ const int64_t n_tokens = ubatch->n_tokens;
49
+
50
+ lm_ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*lm_ggml_element_size(embd));
51
+ }
52
+ }
53
+
54
+ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
55
+ if (ubatch->pos && pos) {
56
+ const int64_t n_tokens = ubatch->n_tokens;
57
+
58
+ lm_ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*lm_ggml_element_size(pos));
59
+ }
60
+ }
61
+
62
+ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
63
+ if (ubatch->pos && attn_scale) {
64
+ const int64_t n_tokens = ubatch->n_tokens;
65
+
66
+ std::vector<float> attn_scale_data(n_tokens, 0.0f);
67
+ for (int i = 0; i < n_tokens; ++i) {
68
+ const float pos = ubatch->pos[i];
69
+ attn_scale_data[i] = std::log(
70
+ std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
71
+ ) * f_attn_temp_scale + 1.0;
72
+ }
73
+
74
+ lm_ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*lm_ggml_element_size(attn_scale));
75
+ }
76
+ }
77
+
78
+ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
79
+ if (pos_bucket) {
80
+ const int64_t n_tokens = ubatch->n_tokens;
81
+
82
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(pos_bucket->buffer));
83
+ LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
84
+
85
+ int32_t * data = (int32_t *) pos_bucket->data;
86
+
87
+ for (int h = 0; h < 1; ++h) {
88
+ for (int j = 0; j < n_tokens; ++j) {
89
+ for (int i = 0; i < n_tokens; ++i) {
90
+ data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
91
+ }
92
+ }
93
+ }
94
+ }
95
+ }
96
+
97
+ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
98
+ if (pos_bucket) {
99
+ const int64_t n_tokens = ubatch->n_tokens;
100
+
101
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(pos_bucket->buffer));
102
+ LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
103
+
104
+ int32_t * data = (int32_t *) pos_bucket->data;
105
+
106
+ const int64_t n_kv = kv_self->n;
107
+
108
+ for (int h = 0; h < 1; ++h) {
109
+ for (int j = 0; j < n_tokens; ++j) {
110
+ for (int i = 0; i < n_kv; ++i) {
111
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
112
+ }
113
+ }
114
+ }
115
+ }
116
+ }
117
+
118
+ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
119
+ if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
120
+ //LM_GGML_ASSERT(out_ids && "every model that can must skip unused outputs");
121
+
122
+ if (!out_ids) {
123
+ LLAMA_LOG_WARN("%s: 'out_ids' is not created\n", __func__);
124
+ } else {
125
+ const int64_t n_tokens = ubatch->n_tokens;
126
+
127
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(out_ids->buffer));
128
+ int32_t * data = (int32_t *) out_ids->data;
129
+
130
+ if (n_outputs == n_tokens) {
131
+ for (int i = 0; i < n_tokens; ++i) {
132
+ data[i] = i;
133
+ }
134
+ } else if (ubatch->output) {
135
+ int32_t n_outputs = 0;
136
+ for (int i = 0; i < n_tokens; ++i) {
137
+ if (ubatch->output[i]) {
138
+ data[n_outputs++] = i;
139
+ }
140
+ }
141
+ // the graph needs to have been passed the correct number of outputs
142
+ LM_GGML_ASSERT(n_outputs == n_outputs);
143
+ } else if (n_outputs == 1) {
144
+ // only keep last output
145
+ data[0] = n_tokens - 1;
146
+ } else {
147
+ LM_GGML_ASSERT(n_outputs == 0);
148
+ }
149
+ }
150
+ }
151
+ }
152
+
153
+ void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
154
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
155
+ const int64_t n_tokens = ubatch->n_tokens;
156
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
157
+ const int64_t n_seqs = ubatch->n_seqs;
158
+
159
+ LM_GGML_ASSERT(mean);
160
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(mean->buffer));
161
+
162
+ float * data = (float *) mean->data;
163
+ memset(mean->data, 0, n_tokens * n_tokens * lm_ggml_element_size(mean));
164
+
165
+ std::vector<uint64_t> sum(n_tokens, 0);
166
+
167
+ for (int s = 0; s < n_seqs; ++s) {
168
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
169
+
170
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
171
+ LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
172
+
173
+ sum[seq_id] += ubatch->n_seq_tokens;
174
+ }
175
+
176
+ std::vector<float> div(n_tokens, 0.0f);
177
+ for (int i = 0; i < n_tokens; ++i) {
178
+ const uint64_t s = sum[i];
179
+ if (s > 0) {
180
+ div[i] = 1.0f/float(s);
181
+ }
182
+ }
183
+
184
+ for (int s = 0; s < n_seqs; ++s) {
185
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
186
+
187
+ for (int i = 0; i < n_seq_tokens; ++i) {
188
+ data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
189
+ }
190
+ }
191
+ }
192
+ }
193
+
194
+ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
195
+ if (cparams.embeddings && (
196
+ cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
197
+ cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
198
+ const int64_t n_tokens = ubatch->n_tokens;
199
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
200
+ const int64_t n_seqs = ubatch->n_seqs;
201
+
202
+ LM_GGML_ASSERT(cls);
203
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(cls->buffer));
204
+
205
+ uint32_t * data = (uint32_t *) cls->data;
206
+ memset(cls->data, 0, n_tokens * lm_ggml_element_size(cls));
207
+
208
+ for (int s = 0; s < n_seqs; ++s) {
209
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
210
+
211
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
212
+ LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");
213
+
214
+ for (int i = 0; i < n_seq_tokens; ++i) {
215
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
216
+
217
+ if (pos == 0) {
218
+ data[seq_id] = s*n_seq_tokens + i;
219
+ }
220
+ }
221
+ }
222
+ }
223
+
224
+ if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
225
+ const int64_t n_tokens = ubatch->n_tokens;
226
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
227
+ const int64_t n_seqs = ubatch->n_seqs;
228
+
229
+ LM_GGML_ASSERT(cls);
230
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(cls->buffer));
231
+
232
+ uint32_t * data = (uint32_t *) cls->data;
233
+ memset(cls->data, 0, n_tokens * lm_ggml_element_size(cls));
234
+
235
+ std::vector<int> last_pos(n_tokens, -1);
236
+ std::vector<int> last_row(n_tokens, -1);
237
+
238
+ for (int s = 0; s < n_seqs; ++s) {
239
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
240
+
241
+ // TODO: adapt limits to n_seqs when ubatch->equal_seqs is true
242
+ LM_GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
243
+
244
+ for (int i = 0; i < n_seq_tokens; ++i) {
245
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + i];
246
+
247
+ if (pos >= last_pos[seq_id]) {
248
+ last_pos[seq_id] = pos;
249
+ last_row[seq_id] = s*n_seq_tokens + i;
250
+ }
251
+ }
252
+ }
253
+
254
+ for (int i = 0; i < n_tokens; ++i) {
255
+ if (last_row[i] >= 0) {
256
+ data[i] = last_row[i];
257
+ }
258
+ }
259
+ }
260
+ }
261
+
262
+ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {
263
+ LM_GGML_UNUSED(ubatch);
264
+
265
+ const int64_t n_kv = kv_self->n;
266
+
267
+ if (s_copy) {
268
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(s_copy->buffer));
269
+ int32_t * data = (int32_t *) s_copy->data;
270
+
271
+ // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
272
+ for (uint32_t i = 0; i < n_kv; ++i) {
273
+ const uint32_t cell_id = i + kv_self->head;
274
+
275
+ //////////////////////////////////////////////
276
+ // TODO: this should not mutate the KV cache !
277
+ llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
278
+
279
+ // prevent out-of-bound sources
280
+ if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
281
+ kv_cell.src = cell_id;
282
+ }
283
+
284
+ data[i] = kv_cell.src;
285
+
286
+ // TODO: do not mutate the KV cache
287
+ // ensure copy only happens once
288
+ if (kv_cell.src != (int32_t) cell_id) {
289
+ kv_cell.src = cell_id;
290
+ }
291
+ }
292
+ }
293
+ }
294
+
295
+ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {
296
+ LM_GGML_UNUSED(ubatch);
297
+
298
+ const int64_t n_kv = kv_self->n;
299
+
300
+ if (s_mask) {
301
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(s_mask->buffer));
302
+ float * data = (float *) s_mask->data;
303
+
304
+ // clear unused states
305
+ for (int i = 0; i < n_kv; ++i) {
306
+ const uint32_t cell_id = i + kv_self->head;
307
+
308
+ //////////////////////////////////////////////
309
+ // TODO: this should not mutate the KV cache !
310
+ llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
311
+
312
+ data[i] = (float) (kv_cell.src >= 0);
313
+
314
+ // only clear once
315
+ if (kv_cell.src < 0) {
316
+ kv_cell.src = cell_id;
317
+ }
318
+ }
319
+ }
320
+ }
321
+
322
+ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
323
+ LM_GGML_UNUSED(ubatch);
324
+
325
+ if (cross_embd && !cross->v_embd.empty()) {
326
+ assert(cross_embd->type == LM_GGML_TYPE_F32);
327
+
328
+ lm_ggml_backend_tensor_set(cross_embd, cross->v_embd.data(), 0, lm_ggml_nbytes(cross_embd));
329
+ }
330
+ }
331
+
332
+ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
333
+ if (kq_mask) {
334
+ if (cparams.causal_attn) {
335
+ const int64_t n_kv = ubatch->n_tokens;
336
+ const int64_t n_tokens = ubatch->n_tokens;
337
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
338
+ const int64_t n_seqs = ubatch->n_seqs;
339
+
340
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(kq_mask->buffer));
341
+ float * data = (float *) kq_mask->data;
342
+
343
+ for (int h = 0; h < 1; ++h) {
344
+ for (int s1 = 0; s1 < n_seqs; ++s1) {
345
+ const llama_seq_id seq_id = ubatch->seq_id[s1][0];
346
+
347
+ for (int j = 0; j < n_seq_tokens; ++j) {
348
+ const int32_t tj = s1*n_seq_tokens + j;
349
+
350
+ for (int s0 = 0; s0 < n_seqs; ++s0) {
351
+ for (int i = 0; i < n_seq_tokens; ++i) {
352
+ const int32_t ti = s0*n_seq_tokens + i;
353
+ float f = -INFINITY;
354
+
355
+ for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
356
+ if (ubatch->seq_id[s0][s] == seq_id && ubatch->pos[ti] <= ubatch->pos[tj]) {
357
+ if (hparams.use_alibi) {
358
+ f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
359
+ } else {
360
+ f = 0.0f;
361
+ }
362
+ break;
363
+ }
364
+ }
365
+
366
+ data[h*(n_kv*n_tokens) + tj*n_kv + ti] = f;
367
+ }
368
+ }
369
+ }
370
+ }
371
+ }
372
+ } else {
373
+ const int64_t n_tokens = ubatch->n_tokens;
374
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
375
+ const int64_t n_seqs = ubatch->n_seqs;
376
+ const int64_t n_stride = ubatch->n_tokens;
377
+
378
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(kq_mask->buffer));
379
+
380
+ float * data = (float *) kq_mask->data;
381
+
382
+ for (int h = 0; h < 1; ++h) {
383
+ for (int s1 = 0; s1 < n_seqs; ++s1) {
384
+ const llama_seq_id seq_id = ubatch->seq_id[s1][0];
385
+
386
+ for (int j = 0; j < n_seq_tokens; ++j) {
387
+ const int32_t tj = s1*n_seq_tokens + j;
388
+
389
+ for (int s0 = 0; s0 < n_seqs; ++s0) {
390
+ for (int i = 0; i < n_seq_tokens; ++i) {
391
+ const int32_t ti = s0*n_seq_tokens + i;
392
+ float f = -INFINITY;
393
+
394
+ for (int s = 0; s < ubatch->n_seq_id[s0]; ++s) {
395
+ if (ubatch->seq_id[s0][s] == seq_id) {
396
+ if (hparams.use_alibi) {
397
+ f = -std::abs(ubatch->pos[ti] - ubatch->pos[tj]);
398
+ } else {
399
+ f = 0.0f;
400
+ }
401
+ break;
402
+ }
403
+ }
404
+
405
+ data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
406
+ }
407
+ }
408
+
409
+ for (int i = n_tokens; i < n_stride; ++i) {
410
+ data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
411
+ }
412
+ }
413
+ }
414
+ }
415
+ }
416
+ }
417
+ }
418
+
419
+ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
420
+ if (self_kq_mask || self_kq_mask_swa) {
421
+ const int64_t n_kv = kv_self->n;
422
+ const int64_t n_tokens = ubatch->n_tokens;
423
+ const int64_t n_seq_tokens = ubatch->n_seq_tokens;
424
+ const int64_t n_seqs = ubatch->n_seqs;
425
+
426
+ float * data = nullptr;
427
+ float * data_swa = nullptr;
428
+
429
+ if (self_kq_mask) {
430
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(self_kq_mask->buffer));
431
+ data = (float *) self_kq_mask->data;
432
+ }
433
+
434
+ if (self_kq_mask_swa) {
435
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
436
+ data_swa = (float *) self_kq_mask_swa->data;
437
+ }
438
+
439
+ // Use only the previous KV cells of the correct sequence for each token of the ubatch.
440
+ // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
441
+ // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
442
+ // Causal mask:
443
+ // xxx-------
444
+ // xxxx------
445
+ // xxxxx-----
446
+ // Non-causal mask:
447
+ // xxxxx-----
448
+ // xxxxx-----
449
+ // xxxxx-----
450
+ // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
451
+ for (int h = 0; h < 1; ++h) {
452
+ for (int s = 0; s < n_seqs; ++s) {
453
+ const llama_seq_id seq_id = ubatch->seq_id[s][0];
454
+
455
+ for (int j = 0; j < n_seq_tokens; ++j) {
456
+ const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
457
+ for (int i = 0; i < n_kv; ++i) {
458
+ float f;
459
+ // mask the token if:
460
+ if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
461
+ || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
462
+ ) {
463
+ f = -INFINITY;
464
+ } else {
465
+ if (hparams.use_alibi) {
466
+ f = -std::abs(kv_self->cells[i].pos - pos);
467
+ } else {
468
+ f = 0.0f;
469
+ }
470
+ }
471
+
472
+ if (data) {
473
+ data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
474
+ }
475
+
476
+ // may need to cut off old tokens for sliding window
477
+ // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
478
+ if (data_swa) {
479
+ if (hparams.n_attn_chunk) {
480
+ llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
481
+ if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
482
+ f = -INFINITY;
483
+ }
484
+ } else {
485
+ if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
486
+ f = -INFINITY;
487
+ }
488
+ }
489
+ data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
490
+ }
491
+ }
492
+ }
493
+ }
494
+
495
+ // mask padded tokens
496
+ if (data) {
497
+ for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
498
+ for (int j = 0; j < n_kv; ++j) {
499
+ data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
500
+ }
501
+ }
502
+ }
503
+
504
+ // mask padded tokens
505
+ if (data_swa) {
506
+ for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
507
+ for (int j = 0; j < n_kv; ++j) {
508
+ data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
509
+ }
510
+ }
511
+ }
512
+ }
513
+ }
514
+ }
515
+
516
+ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
517
+ if (cross_kq_mask) {
518
+ const int64_t n_enc = cross_kq_mask->ne[0];
519
+ const int64_t n_tokens = ubatch->n_tokens;
520
+
521
+ LM_GGML_ASSERT(lm_ggml_backend_buffer_is_host(cross_kq_mask->buffer));
522
+ LM_GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
523
+
524
+ float * data = (float *) cross_kq_mask->data;
525
+
526
+ for (int h = 0; h < 1; ++h) {
527
+ for (int j = 0; j < n_tokens; ++j) {
528
+ for (int i = 0; i < n_enc; ++i) {
529
+ float f = -INFINITY;
530
+ for (int s = 0; s < ubatch->n_seq_id[j]; ++s) {
531
+ const llama_seq_id seq_id = ubatch->seq_id[j][s];
532
+ if (cross->seq_ids_enc[i].find(seq_id) != cross->seq_ids_enc[i].end()) {
533
+ f = 0.0f;
534
+ }
535
+ }
536
+ data[h*(n_enc*n_tokens) + j*n_enc + i] = f;
537
+ }
538
+ }
539
+
540
+ for (int i = n_tokens; i < LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD); ++i) {
541
+ for (int j = 0; j < n_enc; ++j) {
542
+ data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
543
+ }
544
+ }
545
+ }
546
+ }
547
+ }
548
+
549
+ //
550
+ // llm_graph_context
551
+ //
552
+
553
+ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
554
+ arch (params.arch),
555
+ hparams (params.hparams),
556
+ cparams (params.cparams),
557
+ ubatch (params.ubatch),
558
+ n_embd (hparams.n_embd),
559
+ n_layer (hparams.n_layer),
560
+ n_rot (hparams.n_rot),
561
+ n_ctx (cparams.n_ctx),
562
+ n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
563
+ n_head (hparams.n_head()),
564
+ n_head_kv (hparams.n_head_kv()),
565
+ n_embd_head_k (hparams.n_embd_head_k),
566
+ n_embd_k_gqa (hparams.n_embd_k_gqa()),
567
+ n_embd_head_v (hparams.n_embd_head_v),
568
+ n_embd_v_gqa (hparams.n_embd_v_gqa()),
569
+ n_expert (hparams.n_expert),
570
+ n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
571
+ freq_base (cparams.rope_freq_base),
572
+ freq_scale (cparams.rope_freq_scale),
573
+ ext_factor (cparams.yarn_ext_factor),
574
+ attn_factor (cparams.yarn_attn_factor),
575
+ beta_fast (cparams.yarn_beta_fast),
576
+ beta_slow (cparams.yarn_beta_slow),
577
+ norm_eps (hparams.f_norm_eps),
578
+ norm_rms_eps (hparams.f_norm_rms_eps),
579
+ n_tokens (ubatch.n_tokens),
580
+ n_outputs (params.n_outputs),
581
+ n_ctx_orig (cparams.n_ctx_orig_yarn),
582
+ pooling_type (cparams.pooling_type),
583
+ rope_type (hparams.rope_type),
584
+ ctx0 (params.ctx),
585
+ sched (params.sched),
586
+ backend_cpu (params.backend_cpu),
587
+ cvec (params.cvec),
588
+ loras (params.loras),
589
+ memory (params.memory),
590
+ cross (params.cross),
591
+ cb_func (params.cb),
592
+ res (std::make_unique<llm_graph_result>()) {
593
+ }
594
+
595
+ int64_t llm_graph_context::n_pos_per_token() const {
596
+ return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
597
+ }
598
+
599
+ void llm_graph_context::cb(lm_ggml_tensor * cur, const char * name, int il) const {
600
+ if (cb_func) {
601
+ cb_func(ubatch, cur, name, il);
602
+ }
603
+ }
604
+
605
+ lm_ggml_tensor * llm_graph_context::build_cvec(
606
+ lm_ggml_tensor * cur,
607
+ int il) const {
608
+ return cvec->apply_to(ctx0, cur, il);
609
+ }
610
+
611
+ lm_ggml_tensor * llm_graph_context::build_lora_mm(
612
+ lm_ggml_tensor * w,
613
+ lm_ggml_tensor * cur) const {
614
+ lm_ggml_tensor * res = lm_ggml_mul_mat(ctx0, w, cur);
615
+
616
+ for (const auto & lora : *loras) {
617
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
618
+ if (lw == nullptr) {
619
+ continue;
620
+ }
621
+
622
+ const float adapter_scale = lora.second;
623
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
624
+
625
+ lm_ggml_tensor * ab_cur = lm_ggml_mul_mat(
626
+ ctx0, lw->b,
627
+ lm_ggml_mul_mat(ctx0, lw->a, cur)
628
+ );
629
+
630
+ ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
631
+ res = lm_ggml_add(ctx0, res, ab_cur);
632
+ }
633
+
634
+ return res;
635
+ }
636
+
637
+ lm_ggml_tensor * llm_graph_context::build_lora_mm_id(
638
+ lm_ggml_tensor * w, // lm_ggml_tensor * as
639
+ lm_ggml_tensor * cur, // lm_ggml_tensor * b
640
+ lm_ggml_tensor * ids) const {
641
+ lm_ggml_tensor * res = lm_ggml_mul_mat_id(ctx0, w, cur, ids);
642
+ for (const auto & lora : *loras) {
643
+ llama_adapter_lora_weight * lw = lora.first->get_weight(w);
644
+ if (lw == nullptr) {
645
+ continue;
646
+ }
647
+
648
+ const float alpha = lora.first->alpha;
649
+ const float rank = (float) lw->b->ne[0];
650
+ const float scale = alpha ? lora.second * alpha / rank : lora.second;
651
+
652
+ lm_ggml_tensor * ab_cur = lm_ggml_mul_mat_id(
653
+ ctx0, lw->b,
654
+ lm_ggml_mul_mat_id(ctx0, lw->a, cur, ids),
655
+ ids
656
+ );
657
+
658
+ ab_cur = lm_ggml_scale(ctx0, ab_cur, scale);
659
+ res = lm_ggml_add(ctx0, res, ab_cur);
660
+ }
661
+
662
+ return res;
663
+ }
664
+
665
+ lm_ggml_tensor * llm_graph_context::build_norm(
666
+ lm_ggml_tensor * cur,
667
+ lm_ggml_tensor * mw,
668
+ lm_ggml_tensor * mb,
669
+ llm_norm_type type,
670
+ int il) const {
671
+ switch (type) {
672
+ case LLM_NORM: cur = lm_ggml_norm (ctx0, cur, hparams.f_norm_eps); break;
673
+ case LLM_NORM_RMS: cur = lm_ggml_rms_norm(ctx0, cur, hparams.f_norm_rms_eps); break;
674
+ case LLM_NORM_GROUP:
675
+ {
676
+ cur = lm_ggml_reshape_3d(ctx0, cur, cur->ne[0], 1, cur->ne[1]);
677
+ cur = lm_ggml_group_norm(ctx0, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
678
+ cur = lm_ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[2]);
679
+ } break;
680
+ }
681
+
682
+ if (mw || mb) {
683
+ cb(cur, "norm", il);
684
+ }
685
+
686
+ if (mw) {
687
+ cur = lm_ggml_mul(ctx0, cur, mw);
688
+ if (mb) {
689
+ cb(cur, "norm_w", il);
690
+ }
691
+ }
692
+
693
+ if (mb) {
694
+ cur = lm_ggml_add(ctx0, cur, mb);
695
+ }
696
+
697
+ return cur;
698
+ }
699
+
700
+ lm_ggml_tensor * llm_graph_context::build_ffn(
701
+ lm_ggml_tensor * cur,
702
+ lm_ggml_tensor * up,
703
+ lm_ggml_tensor * up_b,
704
+ lm_ggml_tensor * up_s,
705
+ lm_ggml_tensor * gate,
706
+ lm_ggml_tensor * gate_b,
707
+ lm_ggml_tensor * gate_s,
708
+ lm_ggml_tensor * down,
709
+ lm_ggml_tensor * down_b,
710
+ lm_ggml_tensor * down_s,
711
+ lm_ggml_tensor * act_scales,
712
+ llm_ffn_op_type type_op,
713
+ llm_ffn_gate_type type_gate,
714
+ int il) const {
715
+ lm_ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
716
+ cb(tmp, "ffn_up", il);
717
+
718
+ if (up_b) {
719
+ tmp = lm_ggml_add(ctx0, tmp, up_b);
720
+ cb(tmp, "ffn_up_b", il);
721
+ }
722
+
723
+ if (up_s) {
724
+ tmp = lm_ggml_mul(ctx0, tmp, up_s);
725
+ cb(tmp, "ffn_up_s", il);
726
+ }
727
+
728
+ if (gate) {
729
+ switch (type_gate) {
730
+ case LLM_FFN_SEQ:
731
+ {
732
+ cur = build_lora_mm(gate, tmp);
733
+ cb(cur, "ffn_gate", il);
734
+ } break;
735
+ case LLM_FFN_PAR:
736
+ {
737
+ cur = build_lora_mm(gate, cur);
738
+ cb(cur, "ffn_gate", il);
739
+ } break;
740
+ }
741
+
742
+ if (gate_b) {
743
+ cur = lm_ggml_add(ctx0, cur, gate_b);
744
+ cb(cur, "ffn_gate_b", il);
745
+ }
746
+
747
+ if (gate_s) {
748
+ cur = lm_ggml_mul(ctx0, cur, gate_s);
749
+ cb(cur, "ffn_gate_s", il);
750
+ }
751
+
752
+ } else {
753
+ cur = tmp;
754
+ }
755
+
756
+ switch (type_op) {
757
+ case LLM_FFN_SILU:
758
+ {
759
+ cur = lm_ggml_silu(ctx0, cur);
760
+ cb(cur, "ffn_silu", il);
761
+ } break;
762
+ case LLM_FFN_GELU:
763
+ {
764
+ cur = lm_ggml_gelu(ctx0, cur);
765
+ cb(cur, "ffn_gelu", il);
766
+ if (act_scales != NULL) {
767
+ cur = lm_ggml_div(ctx0, cur, act_scales);
768
+ cb(cur, "ffn_act", il);
769
+ }
770
+ } break;
771
+ case LLM_FFN_RELU:
772
+ {
773
+ cur = lm_ggml_relu(ctx0, cur);
774
+ cb(cur, "ffn_relu", il);
775
+ } break;
776
+ case LLM_FFN_RELU_SQR:
777
+ {
778
+ cur = lm_ggml_relu(ctx0, cur);
779
+ cb(cur, "ffn_relu", il);
780
+
781
+ cur = lm_ggml_sqr(ctx0, cur);
782
+ cb(cur, "ffn_sqr(relu)", il);
783
+ } break;
784
+ case LLM_FFN_SWIGLU:
785
+ {
786
+ // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
787
+ int64_t split_point = cur->ne[0] / 2;
788
+ lm_ggml_tensor * x0 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
789
+ lm_ggml_tensor * x1 = lm_ggml_cont(ctx0, lm_ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * lm_ggml_element_size(cur)));
790
+
791
+ x0 = lm_ggml_silu(ctx0, x0);
792
+ cb(cur, "ffn_silu", il);
793
+
794
+ cur = lm_ggml_mul(ctx0, x0, x1);
795
+ cb(cur, "ffn_mul", il);
796
+ } break;
797
+ }
798
+
799
+ if (type_gate == LLM_FFN_PAR) {
800
+ cur = lm_ggml_mul(ctx0, cur, tmp);
801
+ cb(cur, "ffn_gate_par", il);
802
+ }
803
+
804
+ if (down) {
805
+ cur = build_lora_mm(down, cur);
806
+ }
807
+
808
+ if (down_b) {
809
+ cb(cur, "ffn_down", il);
810
+ }
811
+
812
+ if (down_b) {
813
+ cur = lm_ggml_add(ctx0, cur, down_b);
814
+ }
815
+
816
+ if (down_s) {
817
+ cur = lm_ggml_mul(ctx0, cur, down_s);
818
+ cb(cur, "ffn_down_s", il);
819
+ }
820
+
821
+ return cur;
822
+ }
823
+
824
+ lm_ggml_tensor * llm_graph_context::build_moe_ffn(
825
+ lm_ggml_tensor * cur,
826
+ lm_ggml_tensor * gate_inp,
827
+ lm_ggml_tensor * up_exps,
828
+ lm_ggml_tensor * gate_exps,
829
+ lm_ggml_tensor * down_exps,
830
+ lm_ggml_tensor * exp_probs_b,
831
+ int64_t n_expert,
832
+ int64_t n_expert_used,
833
+ llm_ffn_op_type type_op,
834
+ bool norm_w,
835
+ bool scale_w,
836
+ float w_scale,
837
+ llama_expert_gating_func_type gating_op,
838
+ int il) const {
839
+ const int64_t n_embd = cur->ne[0];
840
+ const int64_t n_tokens = cur->ne[1];
841
+ const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
842
+
843
+ lm_ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
844
+ cb(logits, "ffn_moe_logits", il);
845
+
846
+ lm_ggml_tensor * probs = nullptr;
847
+ switch (gating_op) {
848
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
849
+ {
850
+ probs = lm_ggml_soft_max(ctx0, logits); // [n_expert, n_tokens]
851
+ } break;
852
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
853
+ {
854
+ probs = lm_ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
855
+ } break;
856
+ default:
857
+ LM_GGML_ABORT("fatal error");
858
+ }
859
+ cb(probs, "ffn_moe_probs", il);
860
+
861
+ // add experts selection bias - introduced in DeepSeek V3
862
+ // leave probs unbiased as it's later used to get expert weights
863
+ lm_ggml_tensor * selection_probs = probs;
864
+ if (exp_probs_b != nullptr) {
865
+ selection_probs = lm_ggml_add(ctx0, probs, exp_probs_b);
866
+ cb(selection_probs, "ffn_moe_probs_biased", il);
867
+ }
868
+
869
+ // llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
870
+ // see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
871
+ if (arch == LLM_ARCH_LLAMA4) {
872
+ selection_probs = logits;
873
+ }
874
+
875
+ // select experts
876
+ lm_ggml_tensor * selected_experts = lm_ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
877
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
878
+ cb(selected_experts, "ffn_moe_topk", il);
879
+
880
+ lm_ggml_tensor * weights = lm_ggml_get_rows(ctx0,
881
+ lm_ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
882
+ cb(weights, "ffn_moe_weights", il);
883
+
884
+ if (norm_w) {
885
+ weights = lm_ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
886
+
887
+ lm_ggml_tensor * weights_sum = lm_ggml_sum_rows(ctx0, weights); // [1, n_tokens]
888
+ cb(weights_sum, "ffn_moe_weights_sum", il);
889
+
890
+ weights = lm_ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
891
+ cb(weights, "ffn_moe_weights_norm", il);
892
+
893
+ weights = lm_ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens);
894
+ }
895
+ if (scale_w) {
896
+ weights = lm_ggml_scale(ctx0, weights, w_scale);
897
+ cb(weights, "ffn_moe_weights_scaled", il);
898
+ }
899
+
900
+ cur = lm_ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
901
+
902
+ if (weight_before_ffn) {
903
+ // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (lm_ggml_repeat_4d)
904
+ lm_ggml_tensor * repeated = lm_ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
905
+ repeated = lm_ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
906
+ cur = lm_ggml_mul(ctx0, repeated, weights);
907
+ cb(cur, "ffn_moe_weighted", il);
908
+ }
909
+
910
+ lm_ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
911
+ cb(up, "ffn_moe_up", il);
912
+
913
+ lm_ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
914
+ cb(gate, "ffn_moe_gate", il);
915
+
916
+ switch (type_op) {
917
+ case LLM_FFN_SILU:
918
+ {
919
+ gate = lm_ggml_silu(ctx0, gate);
920
+ cb(gate, "ffn_moe_silu", il);
921
+ } break;
922
+ case LLM_FFN_GELU:
923
+ {
924
+ gate = lm_ggml_gelu(ctx0, gate);
925
+ cb(gate, "ffn_moe_gelu", il);
926
+ } break;
927
+ default:
928
+ LM_GGML_ABORT("fatal error");
929
+ }
930
+
931
+ lm_ggml_tensor * par = lm_ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
932
+ cb(par, "ffn_moe_gate_par", il);
933
+
934
+ lm_ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
935
+ cb(experts, "ffn_moe_down", il);
936
+
937
+ if (!weight_before_ffn) {
938
+ experts = lm_ggml_mul(ctx0, experts, weights);
939
+ cb(cur, "ffn_moe_weighted", il);
940
+ }
941
+
942
+ // aggregate experts
943
+ lm_ggml_tensor * moe_out = nullptr;
944
+ for (int i = 0; i < n_expert_used; ++i) {
945
+ lm_ggml_tensor * cur_expert = lm_ggml_view_2d(ctx0, experts, n_embd, n_tokens,
946
+ experts->nb[2], i*experts->nb[1]);
947
+
948
+ if (i == 0) {
949
+ moe_out = cur_expert;
950
+ } else {
951
+ moe_out = lm_ggml_add(ctx0, moe_out, cur_expert);
952
+ }
953
+ }
954
+
955
+ if (n_expert_used == 1) {
956
+ // avoid returning a non-contiguous tensor
957
+ moe_out = lm_ggml_cont(ctx0, moe_out);
958
+ }
959
+
960
+ cb(moe_out, "ffn_moe_out", il);
961
+
962
+ return moe_out;
963
+ }
964
+
965
+ // input embeddings with optional lora
966
+ lm_ggml_tensor * llm_graph_context::build_inp_embd(lm_ggml_tensor * tok_embd) const {
967
+ const int64_t n_embd = hparams.n_embd;
968
+
969
+ auto inp = std::make_unique<llm_graph_input_embd>();
970
+
971
+ lm_ggml_tensor * cur = nullptr;
972
+
973
+ if (ubatch.token) {
974
+ inp->tokens = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, ubatch.n_tokens);
975
+ //cb(inp->tokens, "inp_tokens", -1);
976
+ lm_ggml_set_input(inp->tokens);
977
+
978
+ cur = lm_ggml_get_rows(ctx0, tok_embd, inp->tokens);
979
+
980
+ // apply lora for embedding tokens if needed
981
+ for (const auto & lora : *loras) {
982
+ llama_adapter_lora_weight * lw = lora.first->get_weight(tok_embd);
983
+ if (lw == nullptr) {
984
+ continue;
985
+ }
986
+
987
+ const float adapter_scale = lora.second;
988
+ const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
989
+
990
+ lm_ggml_tensor * inpL_delta = lm_ggml_scale(ctx0, lm_ggml_mul_mat(
991
+ ctx0, lw->b, // non-transposed lora_b
992
+ lm_ggml_get_rows(ctx0, lw->a, inp->tokens)
993
+ ), scale);
994
+
995
+ cur = lm_ggml_add(ctx0, cur, inpL_delta);
996
+ }
997
+ } else {
998
+ inp->embd = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, ubatch.n_tokens);
999
+ lm_ggml_set_input(inp->embd);
1000
+
1001
+ cur = inp->embd;
1002
+ }
1003
+
1004
+ // For Granite architecture
1005
+ if (hparams.f_embedding_scale != 0.0f) {
1006
+ cur = lm_ggml_scale(ctx0, cur, hparams.f_embedding_scale);
1007
+ }
1008
+
1009
+ cb(cur, "inp_embd", -1);
1010
+
1011
+ res->add_input(std::move(inp));
1012
+
1013
+ return cur;
1014
+ }
1015
+
1016
+ lm_ggml_tensor * llm_graph_context::build_inp_pos() const {
1017
+ auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
1018
+
1019
+ auto & cur = inp->pos;
1020
+
1021
+ cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens*n_pos_per_token());
1022
+ lm_ggml_set_input(cur);
1023
+
1024
+ res->add_input(std::move(inp));
1025
+
1026
+ return cur;
1027
+ }
1028
+
1029
+ lm_ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
1030
+ auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
1031
+
1032
+ auto & cur = inp->attn_scale;
1033
+
1034
+ cur = lm_ggml_new_tensor_3d(ctx0, LM_GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
1035
+ lm_ggml_set_input(cur);
1036
+
1037
+ res->add_input(std::move(inp));
1038
+
1039
+ return cur;
1040
+ }
1041
+
1042
+ lm_ggml_tensor * llm_graph_context::build_inp_out_ids() const {
1043
+ auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
1044
+
1045
+ auto & cur = inp->out_ids;
1046
+
1047
+ cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_outputs);
1048
+ lm_ggml_set_input(cur);
1049
+
1050
+ res->add_input(std::move(inp));
1051
+
1052
+ return cur;
1053
+ }
1054
+
1055
+ lm_ggml_tensor * llm_graph_context::build_inp_mean() const {
1056
+ auto inp = std::make_unique<llm_graph_input_mean>(cparams);
1057
+
1058
+ auto & cur = inp->mean;
1059
+
1060
+ cur = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_tokens, n_tokens);
1061
+ lm_ggml_set_input(cur);
1062
+
1063
+ res->add_input(std::move(inp));
1064
+
1065
+ return cur;
1066
+ }
1067
+
1068
+ lm_ggml_tensor * llm_graph_context::build_inp_cls() const {
1069
+ auto inp = std::make_unique<llm_graph_input_cls>(cparams);
1070
+
1071
+ auto & cur = inp->cls;
1072
+
1073
+ cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_tokens);
1074
+ lm_ggml_set_input(cur);
1075
+
1076
+ res->add_input(std::move(inp));
1077
+
1078
+ return cur;
1079
+ }
1080
+
1081
+ lm_ggml_tensor * llm_graph_context::build_inp_s_copy() const {
1082
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1083
+
1084
+ auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);
1085
+
1086
+ const auto n_kv = kv_self->n;
1087
+
1088
+ auto & cur = inp->s_copy;
1089
+
1090
+ cur = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, n_kv);
1091
+ lm_ggml_set_input(cur);
1092
+
1093
+ res->add_input(std::move(inp));
1094
+
1095
+ return cur;
1096
+ }
1097
+
1098
+ lm_ggml_tensor * llm_graph_context::build_inp_s_mask() const {
1099
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1100
+
1101
+ auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);
1102
+
1103
+ const auto n_kv = kv_self->n;
1104
+
1105
+ auto & cur = inp->s_mask;
1106
+
1107
+ cur = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, 1, n_kv);
1108
+ lm_ggml_set_input(cur);
1109
+
1110
+ res->add_input(std::move(inp));
1111
+
1112
+ return cur;
1113
+ }
1114
+
1115
+ lm_ggml_tensor * llm_graph_context::build_inp_cross_embd() const {
1116
+ auto inp = std::make_unique<llm_graph_input_cross_embd>(cross);
1117
+
1118
+ auto & cur = inp->cross_embd;
1119
+
1120
+ // if we have the output embeddings from the encoder, use them directly
1121
+ // TODO: needs more work to be correct, for now just use the tensor shape
1122
+ //if (cross->t_embd) {
1123
+ // cur = lm_ggml_view_tensor(ctx0, cross->t_embd);
1124
+
1125
+ // return cur;
1126
+ //}
1127
+
1128
+ const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd;
1129
+ const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
1130
+
1131
+ cur = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_embd, n_enc);
1132
+ lm_ggml_set_input(cur);
1133
+
1134
+ res->add_input(std::move(inp));
1135
+
1136
+ return cur;
1137
+ }
1138
+
1139
+ lm_ggml_tensor * llm_graph_context::build_inp_pos_bucket_enc() const {
1140
+ auto inp = std::make_unique<llm_graph_input_pos_bucket>(hparams);
1141
+
1142
+ auto & cur = inp->pos_bucket;
1143
+
1144
+ cur = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, n_tokens, n_tokens);
1145
+ lm_ggml_set_input(cur);
1146
+
1147
+ res->add_input(std::move(inp));
1148
+
1149
+ return cur;
1150
+ }
1151
+
1152
+ lm_ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
1153
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1154
+
1155
+ auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
1156
+
1157
+ const auto n_kv = kv_self->n;
1158
+
1159
+ auto & cur = inp->pos_bucket;
1160
+
1161
+ cur = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_I32, n_kv, n_tokens);
1162
+ lm_ggml_set_input(cur);
1163
+
1164
+ res->add_input(std::move(inp));
1165
+
1166
+ return cur;
1167
+ }
1168
+
1169
+ lm_ggml_tensor * llm_graph_context::build_pos_bias(lm_ggml_tensor * pos_bucket, lm_ggml_tensor * attn_rel_b) const {
1170
+ lm_ggml_tensor * pos_bucket_1d = lm_ggml_reshape_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1]);
1171
+ cb(pos_bucket_1d, "pos_bucket_1d", -1);
1172
+
1173
+ lm_ggml_tensor * pos_bias = lm_ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
1174
+
1175
+ pos_bias = lm_ggml_reshape_3d(ctx0, pos_bias, pos_bias->ne[0], pos_bucket->ne[0], pos_bucket->ne[1]);
1176
+ pos_bias = lm_ggml_permute (ctx0, pos_bias, 2, 0, 1, 3);
1177
+ pos_bias = lm_ggml_cont (ctx0, pos_bias);
1178
+
1179
+ cb(pos_bias, "pos_bias", -1);
1180
+
1181
+ return pos_bias;
1182
+ }
1183
+
1184
+ lm_ggml_tensor * llm_graph_context::build_attn_mha(
1185
+ lm_ggml_cgraph * gf,
1186
+ lm_ggml_tensor * q,
1187
+ lm_ggml_tensor * k,
1188
+ lm_ggml_tensor * v,
1189
+ lm_ggml_tensor * kq_b,
1190
+ lm_ggml_tensor * kq_mask,
1191
+ bool v_trans,
1192
+ float kq_scale) const {
1193
+ //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1194
+ //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1195
+
1196
+ //const int64_t n_head = hparams.n_head(il);
1197
+ //const int64_t n_head_kv = hparams.n_head_kv(il);
1198
+
1199
+ //const auto & n_embd_head_k = hparams.n_embd_head_k;
1200
+ //const auto & n_embd_head_v = hparams.n_embd_head_v;
1201
+
1202
+ const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0];
1203
+
1204
+ const auto n_tokens = q->ne[1];
1205
+ const auto n_head = q->ne[2];
1206
+ const auto n_kv = k->ne[1];
1207
+
1208
+ lm_ggml_tensor * cur;
1209
+
1210
+ // TODO: replace hardcoded padding with ggml-provided padding
1211
+ if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
1212
+ LM_GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
1213
+
1214
+ if (v_trans) {
1215
+ v = lm_ggml_transpose(ctx0, v);
1216
+ }
1217
+
1218
+ // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn)
1219
+ if (k->type == LM_GGML_TYPE_F32) {
1220
+ k = lm_ggml_cast(ctx0, k, LM_GGML_TYPE_F16);
1221
+ }
1222
+
1223
+ if (v->type == LM_GGML_TYPE_F32) {
1224
+ v = lm_ggml_cast(ctx0, v, LM_GGML_TYPE_F16);
1225
+ }
1226
+
1227
+ cur = lm_ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
1228
+ hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
1229
+
1230
+ lm_ggml_flash_attn_ext_set_prec(cur, LM_GGML_PREC_F32);
1231
+
1232
+ cur = lm_ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens);
1233
+ } else {
1234
+ lm_ggml_tensor * kq = lm_ggml_mul_mat(ctx0, k, q);
1235
+
1236
+ // note: this op tends to require high floating point range
1237
+ // while for some models F16 is enough, for others it is not, so we default to F32 here
1238
+ lm_ggml_mul_mat_set_prec(kq, LM_GGML_PREC_F32);
1239
+
1240
+ if (arch == LLM_ARCH_GROK) {
1241
+ // need to do the following:
1242
+ // multiply by attn_output_multiplyer of 0.08838834764831845
1243
+ // and then :
1244
+ // kq = 30 * tanh(kq / 30)
1245
+ // before the softmax below
1246
+
1247
+ kq = lm_ggml_tanh(ctx0, lm_ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
1248
+ kq = lm_ggml_scale(ctx0, kq, 30);
1249
+ }
1250
+
1251
+ if (hparams.attn_soft_cap) {
1252
+ kq = lm_ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
1253
+ kq = lm_ggml_tanh (ctx0, kq);
1254
+ kq = lm_ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
1255
+ }
1256
+
1257
+ if (kq_b) {
1258
+ kq = lm_ggml_add(ctx0, kq, kq_b);
1259
+ }
1260
+
1261
+ kq = lm_ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
1262
+
1263
+ if (!v_trans) {
1264
+ // note: avoid this branch
1265
+ v = lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, v));
1266
+ }
1267
+
1268
+ lm_ggml_tensor * kqv = lm_ggml_mul_mat(ctx0, v, kq);
1269
+
1270
+ lm_ggml_tensor * kqv_merged = lm_ggml_permute(ctx0, kqv, 0, 2, 1, 3);
1271
+
1272
+ cur = lm_ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
1273
+
1274
+ if (!cparams.offload_kqv) {
1275
+ // all nodes between the KV store and the attention output are run on the CPU
1276
+ lm_ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu);
1277
+ }
1278
+ }
1279
+
1280
+ lm_ggml_build_forward_expand(gf, cur);
1281
+
1282
+ return cur;
1283
+ }
1284
+
1285
+ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() const {
1286
+ auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
1287
+
1288
+ // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
1289
+ inp->kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_tokens, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
1290
+ //cb(inp_kq_mask, "KQ_mask", -1);
1291
+ lm_ggml_set_input(inp->kq_mask);
1292
+
1293
+ inp->kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->kq_mask, LM_GGML_TYPE_F16) : inp->kq_mask;
1294
+
1295
+ return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
1296
+ }
1297
+
1298
+ lm_ggml_tensor * llm_graph_context::build_attn(
1299
+ llm_graph_input_attn_no_cache * inp,
1300
+ lm_ggml_cgraph * gf,
1301
+ lm_ggml_tensor * wo,
1302
+ lm_ggml_tensor * wo_b,
1303
+ lm_ggml_tensor * q_cur,
1304
+ lm_ggml_tensor * k_cur,
1305
+ lm_ggml_tensor * v_cur,
1306
+ lm_ggml_tensor * kq_b,
1307
+ float kq_scale,
1308
+ int il) const {
1309
+ LM_GGML_UNUSED(n_tokens);
1310
+
1311
+ // these nodes are added to the graph together so that they are not reordered
1312
+ // by doing so, the number of splits in the graph is reduced
1313
+ lm_ggml_build_forward_expand(gf, q_cur);
1314
+ lm_ggml_build_forward_expand(gf, k_cur);
1315
+ lm_ggml_build_forward_expand(gf, v_cur);
1316
+
1317
+ const auto & kq_mask = inp->get_kq_mask();
1318
+
1319
+ lm_ggml_tensor * q = lm_ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1320
+ //cb(q, "q", il);
1321
+
1322
+ lm_ggml_tensor * k = lm_ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1323
+ //cb(k, "k", il);
1324
+
1325
+ lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
1326
+ //cb(k, "v", il);
1327
+
1328
+ lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
1329
+
1330
+ cb(cur, "kqv_out", il);
1331
+
1332
+ if (wo) {
1333
+ cur = build_lora_mm(wo, cur);
1334
+ }
1335
+
1336
+ if (wo_b) {
1337
+ //cb(cur, "kqv_wo", il);
1338
+ }
1339
+
1340
+ if (wo_b) {
1341
+ cur = lm_ggml_add(ctx0, cur, wo_b);
1342
+ }
1343
+
1344
+ return cur;
1345
+ }
1346
+
1347
+ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const {
1348
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1349
+
1350
+ auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
1351
+
1352
+ const auto n_kv = kv_self->n;
1353
+
1354
+ inp->self_kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
1355
+ //cb(inp->self_kq_mask, "KQ_mask", -1);
1356
+ lm_ggml_set_input(inp->self_kq_mask);
1357
+
1358
+ inp->self_kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask, LM_GGML_TYPE_F16) : inp->self_kq_mask;
1359
+
1360
+ if (hparams.n_swa_pattern > 1) {
1361
+ LM_GGML_ASSERT(hparams.n_swa > 0);
1362
+
1363
+ inp->self_kq_mask_swa = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_kv, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
1364
+ //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
1365
+ lm_ggml_set_input(inp->self_kq_mask_swa);
1366
+
1367
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->self_kq_mask_swa, LM_GGML_TYPE_F16) : inp->self_kq_mask_swa;
1368
+ }
1369
+
1370
+ return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
1371
+ }
1372
+
1373
+ lm_ggml_tensor * llm_graph_context::build_attn(
1374
+ llm_graph_input_attn_kv_unified * inp,
1375
+ lm_ggml_cgraph * gf,
1376
+ lm_ggml_tensor * wo,
1377
+ lm_ggml_tensor * wo_b,
1378
+ lm_ggml_tensor * q_cur,
1379
+ lm_ggml_tensor * k_cur,
1380
+ lm_ggml_tensor * v_cur,
1381
+ lm_ggml_tensor * kq_b,
1382
+ float kq_scale,
1383
+ int il) const {
1384
+ // these nodes are added to the graph together so that they are not reordered
1385
+ // by doing so, the number of splits in the graph is reduced
1386
+ lm_ggml_build_forward_expand(gf, q_cur);
1387
+ lm_ggml_build_forward_expand(gf, k_cur);
1388
+ lm_ggml_build_forward_expand(gf, v_cur);
1389
+
1390
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1391
+ const auto & n_ctx = cparams.n_ctx;
1392
+
1393
+ const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
1394
+ const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
1395
+
1396
+ const auto n_tokens = q_cur->ne[2];
1397
+
1398
+ const bool v_trans = !cparams.flash_attn;
1399
+
1400
+ // store to KV cache
1401
+ {
1402
+ LM_GGML_ASSERT(!kv_self->recurrent);
1403
+
1404
+ const auto kv_head = kv_self->head;
1405
+
1406
+ LM_GGML_ASSERT(kv_self->size == n_ctx);
1407
+
1408
+ lm_ggml_tensor * k_cache_view = lm_ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
1409
+ //cb(k_cache_view, "k_cache_view", il);
1410
+
1411
+ // note: storing RoPE-ed version of K in the KV cache
1412
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, k_cur, k_cache_view));
1413
+
1414
+ v_cur = lm_ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
1415
+
1416
+ lm_ggml_tensor * v_cache_view = nullptr;
1417
+
1418
+ if (!v_trans) {
1419
+ v_cache_view = lm_ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
1420
+ } else {
1421
+ // note: the V cache is transposed when not using flash attention
1422
+ v_cache_view = lm_ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
1423
+ ( n_ctx)*lm_ggml_element_size(kv_self->v_l[il]),
1424
+ (kv_head)*lm_ggml_element_size(kv_self->v_l[il]));
1425
+
1426
+ v_cur = lm_ggml_transpose(ctx0, v_cur);
1427
+ }
1428
+ //cb(v_cache_view, "v_cache_view", il);
1429
+
1430
+ lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, v_cur, v_cache_view));
1431
+ }
1432
+
1433
+ const bool is_swa = hparams.is_swa(il);
1434
+
1435
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
1436
+
1437
+ const auto n_kv = kv_self->n;
1438
+
1439
+ const int64_t n_head_kv = hparams.n_head_kv(il);
1440
+
1441
+ const auto & n_embd_head_k = hparams.n_embd_head_k;
1442
+ const auto & n_embd_head_v = hparams.n_embd_head_v;
1443
+
1444
+ lm_ggml_tensor * q = lm_ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1445
+ //cb(q, "q", il);
1446
+
1447
+ lm_ggml_tensor * k =
1448
+ lm_ggml_view_3d(ctx0, kv_self->k_l[il],
1449
+ n_embd_head_k, n_kv, n_head_kv,
1450
+ lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
1451
+ lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
1452
+ 0);
1453
+ //cb(k, "k", il);
1454
+
1455
+ lm_ggml_tensor * v = !v_trans ?
1456
+ lm_ggml_view_3d(ctx0, kv_self->v_l[il],
1457
+ n_embd_head_v, n_kv, n_head_kv,
1458
+ lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
1459
+ lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
1460
+ 0) :
1461
+ lm_ggml_view_3d(ctx0, kv_self->v_l[il],
1462
+ n_kv, n_embd_head_v, n_head_kv,
1463
+ lm_ggml_element_size(kv_self->v_l[il])*n_ctx,
1464
+ lm_ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
1465
+ 0);
1466
+
1467
+ lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale);
1468
+ cb(cur, "kqv_out", il);
1469
+
1470
+ if (wo) {
1471
+ cur = build_lora_mm(wo, cur);
1472
+ }
1473
+
1474
+ if (wo_b) {
1475
+ //cb(cur, "kqv_wo", il);
1476
+ }
1477
+
1478
+ if (wo_b) {
1479
+ cur = lm_ggml_add(ctx0, cur, wo_b);
1480
+ }
1481
+
1482
+ return cur;
1483
+ }
1484
+
1485
+ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
1486
+ auto inp = std::make_unique<llm_graph_input_attn_cross>(cross);
1487
+
1488
+ const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
1489
+
1490
+ inp->cross_kq_mask = lm_ggml_new_tensor_2d(ctx0, LM_GGML_TYPE_F32, n_enc, LM_GGML_PAD(n_tokens, LM_GGML_KQ_MASK_PAD));
1491
+ lm_ggml_set_input(inp->cross_kq_mask);
1492
+
1493
+ inp->cross_kq_mask_cnv = cparams.flash_attn ? lm_ggml_cast(ctx0, inp->cross_kq_mask, LM_GGML_TYPE_F16) : inp->cross_kq_mask;
1494
+
1495
+ return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
1496
+ }
1497
+
1498
+ lm_ggml_tensor * llm_graph_context::build_attn(
1499
+ llm_graph_input_attn_cross * inp,
1500
+ lm_ggml_cgraph * gf,
1501
+ lm_ggml_tensor * wo,
1502
+ lm_ggml_tensor * wo_b,
1503
+ lm_ggml_tensor * q_cur,
1504
+ lm_ggml_tensor * k_cur,
1505
+ lm_ggml_tensor * v_cur,
1506
+ lm_ggml_tensor * kq_b,
1507
+ float kq_scale,
1508
+ int il) const {
1509
+ // these nodes are added to the graph together so that they are not reordered
1510
+ // by doing so, the number of splits in the graph is reduced
1511
+ lm_ggml_build_forward_expand(gf, q_cur);
1512
+ lm_ggml_build_forward_expand(gf, k_cur);
1513
+ lm_ggml_build_forward_expand(gf, v_cur);
1514
+
1515
+ const auto & kq_mask = inp->get_kq_mask_cross();
1516
+
1517
+ lm_ggml_tensor * q = lm_ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
1518
+ //cb(q, "q", il);
1519
+
1520
+ lm_ggml_tensor * k = lm_ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
1521
+ //cb(k, "k", il);
1522
+
1523
+ lm_ggml_tensor * v = lm_ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
1524
+ //cb(k, "v", il);
1525
+
1526
+ lm_ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale);
1527
+
1528
+ cb(cur, "kqv_out", il);
1529
+
1530
+ if (wo) {
1531
+ cur = build_lora_mm(wo, cur);
1532
+ }
1533
+
1534
+ if (wo_b) {
1535
+ //cb(cur, "kqv_wo", il);
1536
+ }
1537
+
1538
+ if (wo_b) {
1539
+ cur = lm_ggml_add(ctx0, cur, wo_b);
1540
+ }
1541
+
1542
+ return cur;
1543
+ }
1544
+
1545
+ lm_ggml_tensor * llm_graph_context::build_copy_mask_state(
1546
+ lm_ggml_cgraph * gf,
1547
+ lm_ggml_tensor * s,
1548
+ lm_ggml_tensor * state_copy,
1549
+ lm_ggml_tensor * state_mask,
1550
+ int32_t n_state,
1551
+ int32_t n_seqs) const {
1552
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1553
+
1554
+ const auto n_kv = kv_self->n;
1555
+ const auto kv_head = kv_self->head;
1556
+
1557
+ lm_ggml_tensor * states = lm_ggml_reshape_2d(ctx0, s, n_state, kv_self->size);
1558
+
1559
+ // copy states
1560
+ // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
1561
+ // this shrinks the tensors's ne[1] to n_kv
1562
+ states = lm_ggml_get_rows(ctx0, states, state_copy);
1563
+
1564
+ // clear states of sequences which are starting at the beginning of this batch
1565
+ // FIXME: zero-out NANs?
1566
+ states = lm_ggml_mul(ctx0, states, state_mask);
1567
+
1568
+ // copy states which won't be changed further (between n_seqs and n_kv)
1569
+ lm_ggml_build_forward_expand(gf,
1570
+ lm_ggml_cpy(ctx0,
1571
+ lm_ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*lm_ggml_element_size(states)),
1572
+ lm_ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*lm_ggml_element_size(s))));
1573
+
1574
+ // the part of the states that will be used and modified
1575
+ return lm_ggml_view_2d(ctx0, states, n_state, n_seqs, states->nb[1], 0);
1576
+ }
1577
+
1578
+ lm_ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
1579
+ lm_ggml_cgraph * gf,
1580
+ lm_ggml_tensor * state_copy,
1581
+ lm_ggml_tensor * state_mask,
1582
+ const llama_ubatch & ubatch,
1583
+ int il) const {
1584
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1585
+
1586
+ const auto token_shift_count = hparams.token_shift_count;
1587
+
1588
+ const int64_t n_seqs = ubatch.n_seqs;
1589
+
1590
+ lm_ggml_tensor * token_shift_all = kv_self->k_l[il];
1591
+
1592
+ lm_ggml_tensor * token_shift = build_copy_mask_state(
1593
+ gf, token_shift_all, state_copy, state_mask,
1594
+ hparams.n_embd_k_s(), n_seqs);
1595
+
1596
+ token_shift = lm_ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
1597
+
1598
+ return token_shift;
1599
+ }
1600
+
1601
+ lm_ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
1602
+ lm_ggml_tensor * token_shift,
1603
+ const llama_ubatch & ubatch,
1604
+ int il) const {
1605
+ const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
1606
+
1607
+ const auto token_shift_count = hparams.token_shift_count;
1608
+ const auto n_embd = hparams.n_embd;
1609
+
1610
+ const int64_t n_seqs = ubatch.n_seqs;
1611
+
1612
+ const auto kv_head = kv_self->head;
1613
+
1614
+ return lm_ggml_cpy(
1615
+ ctx0,
1616
+ lm_ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * token_shift_count, 0),
1617
+ lm_ggml_view_1d(ctx0, kv_self->k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * lm_ggml_element_size(kv_self->k_l[il]))
1618
+ );
1619
+ }
1620
+
1621
+ void llm_graph_context::build_pooling(
1622
+ lm_ggml_cgraph * gf,
1623
+ lm_ggml_tensor * cls,
1624
+ lm_ggml_tensor * cls_b,
1625
+ lm_ggml_tensor * cls_out,
1626
+ lm_ggml_tensor * cls_out_b) const {
1627
+ if (!cparams.embeddings) {
1628
+ return;
1629
+ }
1630
+
1631
+ lm_ggml_tensor * inp = res->t_embd;
1632
+
1633
+ //// find result_norm tensor for input
1634
+ //for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
1635
+ // inp = lm_ggml_graph_node(gf, i);
1636
+ // if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
1637
+ // break;
1638
+ // }
1639
+
1640
+ // inp = nullptr;
1641
+ //}
1642
+
1643
+ LM_GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
1644
+
1645
+ lm_ggml_tensor * cur;
1646
+
1647
+ switch (pooling_type) {
1648
+ case LLAMA_POOLING_TYPE_NONE:
1649
+ {
1650
+ cur = inp;
1651
+ } break;
1652
+ case LLAMA_POOLING_TYPE_MEAN:
1653
+ {
1654
+ lm_ggml_tensor * inp_mean = build_inp_mean();
1655
+ cur = lm_ggml_mul_mat(ctx0, lm_ggml_cont(ctx0, lm_ggml_transpose(ctx0, inp)), inp_mean);
1656
+ } break;
1657
+ case LLAMA_POOLING_TYPE_CLS:
1658
+ case LLAMA_POOLING_TYPE_LAST:
1659
+ {
1660
+ lm_ggml_tensor * inp_cls = build_inp_cls();
1661
+ cur = lm_ggml_get_rows(ctx0, inp, inp_cls);
1662
+ } break;
1663
+ case LLAMA_POOLING_TYPE_RANK:
1664
+ {
1665
+ lm_ggml_tensor * inp_cls = build_inp_cls();
1666
+ inp = lm_ggml_get_rows(ctx0, inp, inp_cls);
1667
+
1668
+ // classification head
1669
+ // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
1670
+ LM_GGML_ASSERT(cls != nullptr);
1671
+ LM_GGML_ASSERT(cls_b != nullptr);
1672
+
1673
+ cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, cls, inp), cls_b);
1674
+ cur = lm_ggml_tanh(ctx0, cur);
1675
+
1676
+ // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
1677
+ // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
1678
+ if (cls_out) {
1679
+ LM_GGML_ASSERT(cls_out_b != nullptr);
1680
+
1681
+ cur = lm_ggml_add (ctx0, lm_ggml_mul_mat(ctx0, cls_out, cur), cls_out_b);
1682
+ }
1683
+ } break;
1684
+ default:
1685
+ {
1686
+ LM_GGML_ABORT("unknown pooling type");
1687
+ }
1688
+ }
1689
+
1690
+ cb(cur, "result_embd_pooled", -1);
1691
+ res->t_embd_pooled = cur;
1692
+
1693
+ lm_ggml_build_forward_expand(gf, cur);
1694
+ }
1695
+