@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-context.cpp

@@ -270,19 +270,7 @@ llama_context::llama_context(
  }
  }

- // resolve automatic Flash Attention use and reserve worst-case graph
  if (!hparams.vocab_only) {
- const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
- const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
- LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
-
- int n_splits_pp = -1;
- int n_nodes_pp = -1;
-
- int n_splits_tg = -1;
- int n_nodes_tg = -1;
-
  llama_memory_context_ptr mctx;
  if (memory) {
  LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -294,53 +282,68 @@ llama_context::llama_context(

  cross.v_embd.clear();

- // reserve pp (prompt processing) graph first so that buffers are only allocated once
- {
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+ const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+ // avoid reserving graphs with zero outputs - assume one output per sequence
+ n_outputs = n_seqs;
+
+ LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
+ // resolve automatic Flash Attention use
+ if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
+ auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
  if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
+ throw std::runtime_error("failed to split graph for Flash Attention check");
  }

- if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
- ggml_backend_sched_alloc_graph(sched.get(), gf);
-
- const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
- bool fa_device_mismatch = false;
- for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
- ggml_tensor * n = ggml_graph_node(gf, i);
- if (n->op != GGML_OP_FLASH_ATTN_EXT) {
- continue;
- }
- ggml_backend_dev_t device_fa = ggml_backend_get_device(
- ggml_backend_sched_get_tensor_backend(sched.get(), n));
-
- // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
- GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
- const int il = std::stoi(n->name + prefix_len);
- ggml_backend_dev_t device_kv = model.dev_layer(il);
- if (device_fa != device_kv) {
- LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
- "is assigned to device %s (usually due to missing support)\n",
- __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
- // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
- fa_device_mismatch = true;
- break;
- }
+ const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+ bool fa_device_mismatch = false;
+ for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+ ggml_tensor * n = ggml_graph_node(gf, i);
+ if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+ continue;
  }
- if (fa_device_mismatch) {
- cparams.flash_attn = false;
- LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
- if (ggml_is_quantized(params.type_v)) {
- throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
- }
- auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
- if (!gf) {
- throw std::runtime_error("failed to allocate compute pp buffers");
- }
- } else {
- cparams.flash_attn = true;
- LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ ggml_backend_dev_t device_fa = ggml_backend_get_device(
+ ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+ // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+ GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+ const int il = std::stoi(n->name + prefix_len);
+ ggml_backend_dev_t device_kv = model.dev_layer(il);
+ if (device_fa != device_kv) {
+ LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+ "is assigned to device %s (usually due to missing support)\n",
+ __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+ // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+ fa_device_mismatch = true;
+ break;
+ }
+ }
+ if (fa_device_mismatch) {
+ cparams.flash_attn = false;
+ LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+ if (ggml_is_quantized(params.type_v)) {
+ throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
  }
+ } else {
+ cparams.flash_attn = true;
+ LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+ }
+ }
+
+ // reserve worst-case graph
+ int n_splits_pp = -1;
+ int n_nodes_pp = -1;
+
+ int n_splits_tg = -1;
+ int n_nodes_tg = -1;
+
+ // reserve pp (prompt processing) graph first so that buffers are only allocated once
+ {
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+ if (!gf) {
+ throw std::runtime_error("failed to allocate compute pp buffers");
  }

  n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
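
With this release the context constructor resolves LLAMA_FLASH_ATTN_TYPE_AUTO up front: it reserves a small split-only graph, checks that every GGML_OP_FLASH_ATTN_EXT node lands on the same device as its layer's KV data, and only then enables cparams.flash_attn; on a mismatch it falls back to disabled and rejects a quantized V cache, which requires FA. A minimal sketch of requesting the automatic mode from application code, assuming the usual llama.cpp C API entry points (llama_context_default_params, llama_init_from_model) in this vendored revision:

    // Sketch only (not part of the diff): opting into automatic Flash Attention resolution.
    #include "llama.h"

    llama_context * make_context(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();

        // let the constructor probe the scheduled graph and decide whether to enable FA
        cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;

        // note: a quantized V cache (cparams.type_v) still requires Flash Attention;
        // if the probe has to disable FA, context creation fails as shown in the hunk above
        return llama_init_from_model(model, cparams);
    }
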
@@ -1366,8 +1369,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
  return static_cast<llm_graph_result *>(gf_res_reserve.get());
  }

- ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
+ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
  LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+ GGML_ASSERT(n_outputs >= 1);

  if (n_tokens % n_seqs != 0) {
  n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
@@ -1401,7 +1405,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
  this->n_outputs = save_n_outputs;

  // initialize scheduler with the specified graph
- if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+ if (split_only) {
+ ggml_backend_sched_split_graph(sched.get(), gf);
+ } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
  LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  return nullptr;
  }
@@ -1441,7 +1447,9 @@ ggml_status llama_context::graph_compute(
  if (backend_cpu != nullptr) {
  auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
  auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
- set_threadpool_fn(backend_cpu, tp);
+ if (set_threadpool_fn) {
+ set_threadpool_fn(backend_cpu, tp);
+ }
  }

  // set the number of threads for all the backends
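
The threadpool hookup in graph_compute is now defensive: the CPU backend's ggml_backend_cpu_set_threadpool is resolved through the backend registry and called only if the symbol is actually exported. A self-contained sketch of the same optional proc-address pattern (helper name hypothetical):

    // Sketch only: look up an optional backend function and skip the call if it is absent.
    #include "ggml-backend.h"
    #include "ggml-cpu.h"

    static void try_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t tp) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));

        // the cast type is taken from the declared function, so the signature stays in sync
        auto * fn = (decltype(ggml_backend_cpu_set_threadpool) *)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");

        if (fn) {
            fn(backend_cpu, tp); // only call when the backend actually provides the symbol
        }
    }
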
package/src/llama.cpp/src/llama-context.h

@@ -196,7 +196,7 @@ public:
  ggml_status graph_compute(ggml_cgraph * gf, bool batched);

  // reserve a graph with a dummy ubatch of the specified size
- ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+ ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);

  private:
  llm_graph_params graph_params(
package/src/llama.cpp/src/llama-graph.cpp

@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
  }
  }

+ static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+ LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+ const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
+ (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
+ (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
+ (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+ LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+ LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+ LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+ LLAMA_LOG_DEBUG(" ");
+ for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+ LLAMA_LOG_DEBUG("%2d", j);
+ }
+ LLAMA_LOG_DEBUG("\n");
+
+ for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+ LLAMA_LOG_DEBUG(" %2d ", i);
+ for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+ float val = data[i * n_kv + j];
+ if (val == -INFINITY) {
+ LLAMA_LOG_DEBUG(" ∞");
+ } else {
+ LLAMA_LOG_DEBUG(" 0");
+ }
+ }
+ LLAMA_LOG_DEBUG("\n");
+ }
+ }
+
  void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  const int64_t n_kv = ubatch->n_tokens;
  const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

  float * data = (float *) kq_mask->data;

+ // [TAG_NO_CACHE_ISWA]
+ GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
  for (int h = 0; h < 1; ++h) {
  for (int i1 = 0; i1 < n_tokens; ++i1) {
  const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -277,21 +310,33 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
  const llama_seq_id s0 = ubatch->seq_id[i0][0];

- // TODO: reimplement this like in llama_kv_cache
- if (s0 == s1 && (!cparams.causal_attn || ubatch->pos[i0] <= ubatch->pos[i1])) {
- if (hparams.use_alibi) {
- f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
- } else {
- f = 0.0f;
- }
- break;
+ if (s0 != s1) {
+ continue; // skip different sequences
  }
- }

+ if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+ continue; // skip future tokens for causal attention
+ }
+
+ // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+ //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+ // continue; // skip masked tokens for SWA
+ //}
+
+ // TODO: reimplement this like in llama_kv_cache_unified
+ if (hparams.use_alibi) {
+ f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
+ } else {
+ f = 0.0f;
+ }
+ }
  data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
  }
  }
  }
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+ }
  }

  void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
@@ -1228,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  // split the batch into streams if needed
  const auto n_stream = k->ne[3];

- q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
+ q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

  q = ggml_permute(ctx0, q, 0, 2, 1, 3);
  k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1386,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(

  // [TAG_NO_CACHE_PAD]
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
- assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));
+ // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+ //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

  ggml_tensor * q = q_cur;
  ggml_tensor * k = k_cur;
package/src/llama.cpp/src/llama-graph.h

@@ -78,6 +78,11 @@ struct llm_graph_params;

  class llm_graph_input_i {
  public:
+ llm_graph_input_i() {
+ const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+ debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+ }
+
  virtual ~llm_graph_input_i() = default;

  virtual void set_input(const llama_ubatch * ubatch) = 0;
@@ -90,6 +95,9 @@ public:
  GGML_UNUSED(params);
  return false;
  }
+ protected:
+ // env: LLAMA_GRAPH_INPUT_DEBUG
+ int debug = 0;
  };

  using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
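
Graph inputs now read the LLAMA_GRAPH_INPUT_DEBUG environment variable once, in the llm_graph_input_i constructor, and the cache-less attention input uses it to dump the first 20×20 entries of the attention mask via print_mask. A minimal sketch of turning this on from a host process, assuming the variable is set before the context (and therefore the graph inputs) is created; the dump goes through LLAMA_LOG_DEBUG, so debug-level logging must also be enabled to see it:

    // Sketch only: enable the attention-mask dump before creating the llama_context.
    #include <cstdlib>

    void enable_mask_debug() {
    #ifdef _WIN32
        _putenv_s("LLAMA_GRAPH_INPUT_DEBUG", "1");
    #else
        setenv("LLAMA_GRAPH_INPUT_DEBUG", "1", /*overwrite=*/1);
    #endif
    }
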
package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,7 @@
  #include "llama-hparams.h"

  #include "ggml.h"
+ #include <cassert>

  void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
  if (dense_first) {
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {

  return res;
  }
+
+ bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+ assert(p0 >= 0 && p1 >= 0);
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE:
+ {
+ } break;
+ case LLAMA_SWA_TYPE_STANDARD:
+ {
+ if (p1 - p0 >= (int32_t) n_swa) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_CHUNKED:
+ {
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+ if (p0 < pos_chunk_start) {
+ return true;
+ }
+ } break;
+ case LLAMA_SWA_TYPE_SYMMETRIC:
+ {
+ const int32_t half_n_swa = (int32_t) n_swa / 2;
+ const int32_t pos_diff = p1 - p0;
+
+ // Mask if outside the symmetric window
+ if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+ return true;
+ }
+ } break;
+ }
+
+ return false;
+ }
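
The SWA masking predicate now lives in llama_hparams as a static helper (and gains the new SYMMETRIC window), so llama_kv_cache::is_masked_swa can simply delegate to it, as a later hunk shows. A few illustrative calls, assuming the function exactly as added above, with p0 the key position, p1 the query position, and a window of n_swa = 4:

    // Illustration only (not part of the package); "masked" means the key at p0
    // is not visible to the query at p1.
    #include <cassert>
    #include "llama-hparams.h"

    static void swa_mask_examples() {
        const uint32_t n_swa = 4;

        // standard sliding window: keys n_swa or more positions behind the query are masked
        assert( llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_STANDARD, /*p0=*/5, /*p1=*/10)); // 10 - 5 >= 4
        assert(!llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_STANDARD, /*p0=*/7, /*p1=*/10)); // 10 - 7 <  4

        // chunked: keys before the query's chunk start are masked; for p1 = 10, (10 / 4) * 4 = 8
        assert( llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_CHUNKED, /*p0=*/5, /*p1=*/10));
        assert(!llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_CHUNKED, /*p0=*/9, /*p1=*/10));

        // symmetric (new): keys more than n_swa/2 = 2 positions away in either direction are masked
        assert( llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_SYMMETRIC, /*p0=*/13, /*p1=*/10)); // diff = -3
        assert(!llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_SYMMETRIC, /*p0=*/9,  /*p1=*/10)); // diff =  1

        // LLAMA_SWA_TYPE_NONE never masks
        assert(!llama_hparams::is_masked_swa(n_swa, LLAMA_SWA_TYPE_NONE, /*p0=*/0, /*p1=*/100));
    }
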
package/src/llama.cpp/src/llama-hparams.h

@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
  };

  enum llama_swa_type {
- LLAMA_SWA_TYPE_NONE = 0,
- LLAMA_SWA_TYPE_STANDARD = 1,
- LLAMA_SWA_TYPE_CHUNKED = 2,
+ LLAMA_SWA_TYPE_NONE = 0,
+ LLAMA_SWA_TYPE_STANDARD = 1,
+ LLAMA_SWA_TYPE_CHUNKED = 2,
+ LLAMA_SWA_TYPE_SYMMETRIC = 3,
  };

  struct llama_hparams_posnet {
@@ -158,6 +159,7 @@ struct llama_hparams {
  // needed by encoder-decoder models (e.g. T5, FLAN-T5)
  // ref: https://github.com/ggerganov/llama.cpp/pull/8141
  llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+ uint32_t dec_n_layer = 0;

  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -227,6 +229,11 @@ struct llama_hparams {

  // number of layers for which has_kv() returns true
  uint32_t n_layer_kv() const;
+
+ // note that this function uses different SWA parameters from those in the hparams
+ // TODO: think of a better place for this function
+ // TODO: pack the SWA params in a struct?
+ static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
  };

  static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm

  const int32_t ikv = map_layer_ids.at(il);

- auto * k = layers[ikv].k;
+ ggml_tensor * k = layers[ikv].k;
+
+ const int64_t n_embd_head = k_cur->ne[0];
+ const int64_t n_head = k_cur->ne[1];
+ const int64_t n_tokens = k_cur->ne[2];
+
+ const int64_t n_embd_gqa = n_embd_head*n_head;
+
+ // we can merge dims 0 and 1
+ // TODO: add ggml helper function for this?
+ GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);

- const int64_t n_tokens = k_cur->ne[2];
+ k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);

- k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
+ const int64_t n_stream = k->ne[2];
+
+ if (n_stream > 1) {
+ const int64_t kv_size = get_size();

- if (k->ne[2] > 1) {
- k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
+ assert(n_embd_gqa == k->ne[0]);
+ assert(kv_size == k->ne[1]);
+
+ // merge the buffer across all streams because the idxs are global
+ k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
  }

+ // store the current K values into the cache
  return ggml_set_rows(ctx, k, k_cur, k_idxs);
  }

@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm

  auto * v = layers[ikv].v;

- const int64_t n_embd_v_gqa = v_cur->ne[0]*v_cur->ne[1];
- const int64_t n_tokens = v_cur->ne[2];
+ const int64_t n_embd_head = v_cur->ne[0];
+ const int64_t n_head = v_cur->ne[1];
+ const int64_t n_tokens = v_cur->ne[2];
+
+ const int64_t n_embd_gqa = n_embd_head*n_head;
+
+ // we can merge dims 0 and 1
+ GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);

- v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+ const int64_t n_stream = v->ne[2];

+ // take this branch when FA is enabled (the V cache is not transposed)
  if (!v_trans) {
- if (v->ne[2] > 1) {
- v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
+ v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
+
+ if (n_stream > 1) {
+ const int64_t kv_size = get_size();
+
+ assert(n_embd_gqa == v->ne[0]);
+ assert(kv_size == v->ne[1]);
+
+ // merge the buffer across all streams because the idxs are global
+ v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
  }

  return ggml_set_rows(ctx, v, v_cur, v_idxs);
  }

+ if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+ // we can merge dims 0, 1 and 2
+ v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
+ } else {
+ // otherwise -> make a copy to get contiguous data
+ v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
+ }
+
  // [TAG_V_CACHE_VARIABLE]
- if (n_embd_v_gqa < v->ne[0]) {
- v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+ if (n_embd_gqa < v->ne[0]) {
+ v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
  }

- // the row becomes a single element
- ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
+ // in this branch the v_idxs are constructed in such a way that each row is a single head element
+ ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));

- v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
+ v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));

  return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
  }
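
Both copy paths now fold the head dimensions of the incoming [n_embd_head, n_head, n_tokens] tensor into a single row using a strided 2-D view rather than a reshape; the new GGML_ASSERT(ggml_row_size(...) == nb[1]) checks the layout that makes this legal, and cpy_v falls back to ggml_cont_2d when dims 0, 1 and 2 cannot be merged directly. A small sketch of that contiguity predicate (helper name hypothetical):

    // Sketch only; helper name is hypothetical. Dims 0 and 1 of a [n_embd_head, n_head, n_tokens]
    // tensor can be folded into a single [n_embd_head*n_head, n_tokens] view (as cpy_k/cpy_v do
    // above) only when the heads are laid out back-to-back, i.e. the stride of dim 1 equals the
    // byte size of one dim-0 row. Otherwise a contiguous copy is needed first.
    #include "ggml.h"

    static bool can_merge_head_dims(const struct ggml_tensor * cur) {
        const int64_t n_embd_head = cur->ne[0];
        return ggml_row_size(cur->type, n_embd_head) == cur->nb[1];
    }
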
@@ -1393,29 +1433,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
  }

  bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
- assert(p0 >= 0 && p1 >= 0);
-
- switch (swa_type) {
- case LLAMA_SWA_TYPE_NONE:
- {
- } break;
- case LLAMA_SWA_TYPE_STANDARD:
- {
- if (p1 - p0 >= (int32_t) n_swa) {
- return true;
- }
- } break;
- case LLAMA_SWA_TYPE_CHUNKED:
- {
- const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
- if (p0 < pos_chunk_start) {
- return true;
- }
- } break;
- }
-
- return false;
+ return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
  }

  void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
package/src/llama.cpp/src/llama-kv-cache.h

@@ -212,6 +212,7 @@ private:
  // env: LLAMA_KV_CACHE_DEBUG
  int debug = 0;

+ // this is the SWA type of the cache - not to be confused with the model SWA type
  const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

  std::vector<ggml_context_ptr> ctxs;
@@ -316,9 +317,17 @@ public:
  ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

  // store k_cur and v_cur in the cache based on the provided head location
+ // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+ // - k_cur [n_embd_head_k, n_head_k, n_tokens]
+ // - k_idxs [n_tokens]
+ // - v_cur [n_embd_head_v, n_head_v, n_tokens]
+ // - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
  ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
  ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;

+ // create destination indices for each head of the current batch for where it would be written in the KV cache
+ // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+ // helps understand the implementation logic of cpy_k and cpy_v
  ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
  ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;