@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-graph.cpp
@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE)      ? "LLAMA_SWA_TYPE_NONE" :
+                                (swa_type == LLAMA_SWA_TYPE_STANDARD)  ? "LLAMA_SWA_TYPE_STANDARD" :
+                                (swa_type == LLAMA_SWA_TYPE_CHUNKED)   ? "LLAMA_SWA_TYPE_CHUNKED" :
+                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+    LLAMA_LOG_DEBUG(" ");
+    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+        LLAMA_LOG_DEBUG("%2d", j);
+    }
+    LLAMA_LOG_DEBUG("\n");
+
+    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+        LLAMA_LOG_DEBUG(" %2d ", i);
+        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+            float val = data[i * n_kv + j];
+            if (val == -INFINITY) {
+                LLAMA_LOG_DEBUG(" ∞");
+            } else {
+                LLAMA_LOG_DEBUG(" 0");
+            }
+        }
+        LLAMA_LOG_DEBUG("\n");
+    }
+}
+
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 
     float * data = (float *) kq_mask->data;
 
+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -277,21 +310,33 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                 for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];
 
-                    // TODO: reimplement this like in llama_kv_cache
-                    if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
-                        if (hparams.use_alibi) {
-                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                        } else {
-                            f = 0.0f;
-                        }
-                        break;
+                    if (s0 != s1) {
+                        continue; // skip different sequences
                     }
-                }
 
+                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+                        continue; // skip future tokens for causal attention
+                    }
+
+                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    //    continue; // skip masked tokens for SWA
+                    //}
+
+                    // TODO: reimplement this like in llama_kv_cache_unified
+                    if (hparams.use_alibi) {
+                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
+                    } else {
+                        f = 0.0f;
+                    }
+                }
                 data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
     }
+
+    if (debug) {
+        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+    }
 }
 
 void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
@@ -314,8 +359,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
     res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
 
@@ -350,8 +393,6 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
     res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
 
@@ -1225,7 +1266,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_mask,
         ggml_tensor * sinks,
         ggml_tensor * v_mla,
-        float         kq_scale) const {
+        float         kq_scale,
+        int           il) const {
     const bool v_trans = v->nb[1] > v->nb[2];
 
     // split the batch into streams if needed
@@ -1260,6 +1302,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
                                   hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
+        cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
 
         ggml_flash_attn_ext_add_sinks(cur, sinks);
         ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
@@ -1275,6 +1318,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             // The permutations are noops and only change how the tensor data is interpreted.
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cb(cur, "fattn_mla", il);
             cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
             cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
 #endif
@@ -1283,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
     } else {
         ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
+        cb(kq, "kq", il);
 
         // note: this op tends to require high floating point range
         // while for some models F16 is enough, for others it is not, so we default to F32 here
@@ -1296,32 +1341,42 @@ ggml_tensor * llm_graph_context::build_attn_mha(
             // before the softmax below
 
             kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+            cb(kq, "kq_tanh", il);
             kq = ggml_scale(ctx0, kq, 30);
+            cb(kq, "kq_scaled", il);
         }
 
         if (hparams.attn_soft_cap) {
             kq = ggml_scale(ctx0, kq, 1.0f / hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_1", il);
             kq = ggml_tanh (ctx0, kq);
+            cb(kq, "kq_tanh", il);
             kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
+            cb(kq, "kq_scaled_2", il);
         }
 
         if (kq_b) {
             kq = ggml_add(ctx0, kq, kq_b);
+            cb(kq, "kq_plus_kq_b", il);
         }
 
         kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         ggml_soft_max_add_sinks(kq, sinks);
+        cb(kq, "kq_soft_max", il);
 
         if (!v_trans) {
            // note: avoid this branch
            v = ggml_cont(ctx0, ggml_transpose(ctx0, v));
+           cb(v, "v_cont", il);
         }
 
         ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
+        cb(kqv, "kqv", il);
 
         // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
         if (v_mla) {
             kqv = ggml_mul_mat(ctx0, v_mla, kqv);
+            cb(kqv, "kqv_mla", il);
         }
 
         cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
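The cb(...) calls added throughout build_attn_mha() give the intermediate attention tensors stable names ("kq", "kq_soft_max", "kqv", "kqv_mla", LLAMA_TENSOR_NAME_FATTN, ...), so they can be picked out from a graph-evaluation callback. The sketch below is only an illustration of that idea, assuming the ggml_backend_sched_eval_callback signature that llama.cpp exposes through llama_context_params.cb_eval; the wiring of the callback into a context is not shown here.

    #include <cstdio>
    #include <cstring>

    #include "ggml.h" // ggml_tensor

    // Ask-phase: return true for the tensors we want to observe. Data-phase: the
    // tensor has been computed and can be inspected (here we just print its shape).
    static bool attn_debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            return std::strcmp(t->name, "kq_soft_max") == 0;
        }
        std::printf("%s: [%lld, %lld, %lld, %lld]\n", t->name,
                    (long long) t->ne[0], (long long) t->ne[1],
                    (long long) t->ne[2], (long long) t->ne[3]);
        return true; // keep evaluating the graph
    }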
@@ -1376,13 +1431,13 @@ ggml_tensor * llm_graph_context::build_attn(
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-    assert(!ubatch.equal_seqs());
+    assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
 
     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1471,7 +1526,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1538,7 +1593,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = mctx_cur->get_k(ctx0, il);
     ggml_tensor * v = mctx_cur->get_v(ctx0, il);
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1593,7 +1648,7 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_tensor * k = k_cur;
     ggml_tensor * v = v_cur;
 
-    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale);
+    ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
     cb(cur, "kqv_out", il);
 
     if (wo) {
package/src/llama.cpp/src/llama-graph.h
@@ -78,6 +78,11 @@ struct llm_graph_params;
 
 class llm_graph_input_i {
 public:
+    llm_graph_input_i() {
+        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+    }
+
     virtual ~llm_graph_input_i() = default;
 
     virtual void set_input(const llama_ubatch * ubatch) = 0;
@@ -90,6 +95,9 @@ public:
         GGML_UNUSED(params);
         return false;
     }
+protected:
+    // env: LLAMA_GRAPH_INPUT_DEBUG
+    int debug = 0;
 };
 
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
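Both the new debug member and the print_mask() dump shown earlier are controlled by the LLAMA_GRAPH_INPUT_DEBUG environment variable, read once in the llm_graph_input_i constructor. A minimal sketch of enabling it from a host program follows (assuming a POSIX setenv; exporting the variable in the shell before launching works just as well, and the dump goes through LLAMA_LOG_DEBUG, so debug-level logging must be enabled):

    #include <cstdlib> // setenv (POSIX); use _putenv_s on Windows

    int main() {
        // must be set before the llama context is created, because the
        // llm_graph_input_i constructor caches the value via getenv()
        setenv("LLAMA_GRAPH_INPUT_DEBUG", "1", /*overwrite=*/1);

        // ... load the model and create the context as usual; the mask for the
        // no-cache attention input is then printed when the graph inputs are set
        return 0;
    }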
@@ -687,7 +695,8 @@ struct llm_graph_context {
             ggml_tensor * kq_mask,
             ggml_tensor * sinks,   // [n_head_q]
             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float         kq_scale) const;
+            float         kq_scale,
+            int           il) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
 
package/src/llama.cpp/src/llama-hparams.cpp
@@ -1,6 +1,7 @@
 #include "llama-hparams.h"
 
 #include "ggml.h"
+#include <cassert>
 
 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {
 
     return res;
 }
+
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+    assert(p0 >= 0 && p1 >= 0);
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_SYMMETRIC:
+            {
+                const int32_t half_n_swa = (int32_t) n_swa / 2;
+                const int32_t pos_diff = p1 - p0;
+
+                // Mask if outside the symmetric window
+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false;
+}
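To make the window types concrete, the following standalone sketch (my own illustration, not part of the package) mirrors the rule implemented by llama_hparams::is_masked_swa() and prints which key positions p0 a query at position p1 = 8 may attend to for n_swa = 4 under each type:

    #include <cstdint>
    #include <cstdio>

    // Re-statement of the masking rule above: returns true when key position p0
    // must be masked for query position p1 (causality is handled separately).
    enum swa_type { SWA_NONE, SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

    static bool is_masked(uint32_t n_swa, swa_type type, int32_t p0, int32_t p1) {
        switch (type) {
            case SWA_NONE:      return false;
            case SWA_STANDARD:  return p1 - p0 >= (int32_t) n_swa;                    // window trailing the query
            case SWA_CHUNKED:   return p0 < (p1 / (int32_t) n_swa) * (int32_t) n_swa; // same fixed-size chunk only
            case SWA_SYMMETRIC: {                                                     // window on both sides
                const int32_t half = (int32_t) n_swa / 2;
                return p1 - p0 < -half || p1 - p0 > half;
            }
        }
        return false;
    }

    int main() {
        const uint32_t n_swa = 4;
        const int32_t  p1    = 8; // query position
        for (int32_t p0 = 0; p0 <= 10; ++p0) {
            std::printf("p0=%2d  standard=%d  chunked=%d  symmetric=%d\n", p0,
                        !is_masked(n_swa, SWA_STANDARD,  p0, p1),
                        !is_masked(n_swa, SWA_CHUNKED,   p0, p1),
                        !is_masked(n_swa, SWA_SYMMETRIC, p0, p1));
        }
        return 0;
    }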
package/src/llama.cpp/src/llama-hparams.h
@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
 };
 
 enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE     = 0,
-    LLAMA_SWA_TYPE_STANDARD = 1,
-    LLAMA_SWA_TYPE_CHUNKED  = 2,
+    LLAMA_SWA_TYPE_NONE      = 0,
+    LLAMA_SWA_TYPE_STANDARD  = 1,
+    LLAMA_SWA_TYPE_CHUNKED   = 2,
+    LLAMA_SWA_TYPE_SYMMETRIC = 3,
 };
 
 struct llama_hparams_posnet {
@@ -227,6 +228,11 @@ struct llama_hparams {
 
     // number of layers for which has_kv() returns true
     uint32_t n_layer_kv() const;
+
+    // note that this function uses different SWA parameters from those in the hparams
+    // TODO: think of a better place for this function
+    // TODO: pack the SWA params in a struct?
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
package/src/llama.cpp/src/llama-impl.h
@@ -59,3 +59,5 @@ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne);
 std::string llama_format_tensor_shape(const struct ggml_tensor * t);
 
 std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);
+
+#define LLAMA_TENSOR_NAME_FATTN "__fattn__"
package/src/llama.cpp/src/llama-kv-cache.cpp
@@ -197,18 +197,6 @@ llama_kv_cache::llama_kv_cache(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }
 
 void llama_kv_cache::clear(bool data) {
@@ -551,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, false);
         if (sinfo_new.empty()) {
             success = false;
             break;
@@ -771,8 +756,8 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
             GGML_ASSERT(ubatch.seq_id [s*n_tokens][0] == seq_id);
         }
 
-        res.s0 = std::min<llama_seq_id>(res.s0, seq_to_stream[seq_id]);
-        res.s1 = std::max<llama_seq_id>(res.s1, seq_to_stream[seq_id]);
+        res.s0 = std::min<uint32_t>(res.s0, seq_to_stream[seq_id]);
+        res.s1 = std::max<uint32_t>(res.s1, seq_to_stream[seq_id]);
 
         res.strm[s] = seq_to_stream[seq_id];
         res.idxs[s].reserve(n_tokens);
@@ -964,11 +949,11 @@ bool llama_kv_cache::get_has_shift() const {
     return result;
 }
 
-uint32_t llama_kv_cache::get_n_kv() const {
+uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
     uint32_t result = 0;
 
-    for (uint32_t s = 0; s < n_stream; ++s) {
-        const auto & cells = v_cells[s];
+    for (uint32_t s = 0; s < sinfo.n_stream(); ++s) {
+        const auto & cells = v_cells[sinfo.strm[s]];
 
         result = std::max(std::min(cells.size(), std::max(n_pad, GGML_PAD(cells.used_max_p1(), n_pad))), result);
     }
@@ -976,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1017,52 +998,42 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
         // note: v->nb[1] <= v->nb[2]
         return ggml_view_4d(ctx, v,
                 hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
-                ggml_row_size(v->type, n_embd_v_gqa),          // v->nb[2]
-                ggml_row_size(v->type, n_embd_v_gqa*kv_size),  // v->nb[3]
+                ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1]
+                ggml_row_size(v->type, n_embd_v_gqa),          // v->nb[2]
+                ggml_row_size(v->type, n_embd_v_gqa*kv_size),  // v->nb[3]
                 ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
     }
 
     // note: v->nb[1] > v->nb[2]
     return ggml_view_4d(ctx, v,
             n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
-            ggml_row_size(v->type, kv_size),                       // v->nb[2]
-            ggml_row_size(v->type, kv_size*n_embd_v_gqa),          // v->nb[3]
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1]
+            ggml_row_size(v->type, kv_size),                       // v->nb[2]
+            ggml_row_size(v->type, kv_size*n_embd_v_gqa),          // v->nb[3]
             ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
 }
 
 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
 
-    if (k_idxs && supports_set_rows) {
-        if (k->ne[2] > 1) {
-            k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
@@ -1072,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
 
-    if (v_idxs && supports_set_rows) {
-        if (!v_trans) {
-            if (v->ne[2] > 1) {
-                v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-            }
-
-            return ggml_set_rows(ctx, v, v_cur, v_idxs);
-        }
-
-        // [TAG_V_CACHE_VARIABLE]
-        if (n_embd_v_gqa < v->ne[0]) {
-            v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
         }
 
-        // the row becomes a single element
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
 
-    ggml_tensor * v_view = nullptr;
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
 
-    if (!v_trans) {
-        v_view = ggml_view_1d(ctx, v,
-                n_tokens*n_embd_v_gqa,
-                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
-
-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1]    )*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
 
-    return ggml_cpy(ctx, v_cur, v_view);
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
 
 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
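With the LLAMA_SET_ROWS escape hatch removed, cpy_k()/cpy_v() now always scatter the new K/V rows into the cache with ggml_set_rows(), driven by the k_idxs/v_idxs index tensors whose inputs are filled below. As a rough mental model only (a plain C++ toy of the row-scatter semantics, not the ggml operator or its API):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // dst is the cache (n_cells rows of row_size floats), src holds one row per new
    // token, and idxs[i] is the cache cell that find_slot() assigned to token i.
    // Cells need not be contiguous, which is why the old ggml_cpy()-into-a-view
    // fallback required contiguous slots.
    static void set_rows_toy(std::vector<float> & dst, const std::vector<float> & src,
                             const std::vector<int64_t> & idxs, size_t row_size) {
        for (size_t i = 0; i < idxs.size(); ++i) {
            std::copy(src.begin() + i * row_size, src.begin() + (i + 1) * row_size,
                      dst.begin() + idxs[i] * row_size);
        }
    }

    int main() {
        std::vector<float> cache(8 * 4, 0.0f);                  // 8 cells, row_size = 4
        std::vector<float> new_rows = {1, 1, 1, 1, 2, 2, 2, 2}; // two new tokens
        set_rows_toy(cache, new_rows, {5, 2}, 4);               // token 0 -> cell 5, token 1 -> cell 2
        return 0;
    }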
@@ -1143,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
 }
 
 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1163,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
 
@@ -1453,29 +1393,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 }
 
 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
-    assert(p0 >= 0 && p1 >= 0);
-
-    switch (swa_type) {
-        case LLAMA_SWA_TYPE_NONE:
-            {
-            } break;
-        case LLAMA_SWA_TYPE_STANDARD:
-            {
-                if (p1 - p0 >= (int32_t) n_swa) {
-                    return true;
-                }
-            } break;
-        case LLAMA_SWA_TYPE_CHUNKED:
-            {
-                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
-                if (p0 < pos_chunk_start) {
-                    return true;
-                }
-            } break;
-    }
-
-    return false;
+    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
 }
 
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
@@ -1985,8 +1903,7 @@ bool llama_kv_cache_context::apply() {
     }
 
     kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
-
-    n_kv = kv->get_n_kv();
+    n_kv = kv->get_n_kv(sinfos[i_cur]);
 
     return true;
 }
@@ -2005,10 +1922,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }
package/src/llama.cpp/src/llama-kv-cache.h
@@ -38,8 +38,8 @@ public:
         using idx_vec_t = std::vector<uint32_t>;
 
         // number of streams: ns = s1 - s0 + 1
-        llama_seq_id s0;
-        llama_seq_id s1;
+        uint32_t s0;
+        uint32_t s1;
 
         std::vector<llama_seq_id> strm; // [ns]
         std::vector<idx_vec_t>    idxs; // [ns]
@@ -139,10 +139,7 @@ public:
     // graph_build API
     //
 
-    uint32_t get_n_kv() const;
-
-    // TODO: temporary
-    bool get_supports_set_rows() const;
+    uint32_t get_n_kv(const slot_info & sinfo) const;
 
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -215,10 +212,7 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
+    // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;
@@ -318,9 +312,6 @@ public:
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
package/src/llama.cpp/src/llama-model-loader.cpp
@@ -788,6 +788,7 @@ const struct ggml_tensor * llama_model_loader::check_tensor_dims(const std::stri
 }
 
 struct ggml_tensor * llama_model_loader::create_tensor(struct ggml_context * ctx, const std::string & name, const std::initializer_list<int64_t> & ne, int flags) {
+    LLAMA_LOG_DEBUG("%s: loading tensor %s\n", __func__, name.c_str());
     const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
     if (cur == NULL) {