@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/llama.cpp/src/llama-graph.cpp

@@ -9,33 +9,6 @@
 #include <cmath>
 #include <cstring>
 
-static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
-    // TODO move to hparams if a T5 variant appears that uses a different value
-    const int64_t max_distance = 128;
-
-    if (bidirectional) {
-        n_buckets >>= 1;
-    }
-
-    const int64_t max_exact = n_buckets >> 1;
-
-    int32_t relative_position = x - y;
-    int32_t relative_bucket = 0;
-
-    if (bidirectional) {
-        relative_bucket += (relative_position > 0) * n_buckets;
-        relative_position = abs(relative_position);
-    } else {
-        relative_position = -std::min<int32_t>(relative_position, 0);
-    }
-
-    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
-    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
-    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
-
-    return relative_bucket;
-}
-
 void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
     if (ubatch->token) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -110,22 +83,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
 
 void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) {
     if (pos_bucket) {
-        const int64_t n_tokens = ubatch->n_tokens;
-
-        GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
-        GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
-
-        int32_t * data = (int32_t *) pos_bucket->data;
-
-        const int64_t n_kv = kv_self->n;
-
-        for (int h = 0; h < 1; ++h) {
-            for (int j = 0; j < n_tokens; ++j) {
-                for (int i = 0; i < n_kv; ++i) {
-                    data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(kv_self->cells[i].pos, ubatch->pos[j], hparams.n_rel_attn_bkts, false);
-                }
-            }
-        }
+        kv_self->set_input_pos_bucket(pos_bucket, ubatch);
     }
 }
 
@@ -403,99 +361,18 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
 }
 
 void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
-    if (self_kq_mask || self_kq_mask_swa) {
-        const int64_t n_kv = kv_self->n;
-        const int64_t n_tokens = ubatch->n_tokens;
-        const int64_t n_seq_tokens = ubatch->n_seq_tokens;
-        const int64_t n_seqs = ubatch->n_seqs;
-
-        float * data = nullptr;
-        float * data_swa = nullptr;
-
-        if (self_kq_mask) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
-            data = (float *) self_kq_mask->data;
-        }
-
-        if (self_kq_mask_swa) {
-            GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
-            data_swa = (float *) self_kq_mask_swa->data;
-        }
-
-        // Use only the previous KV cells of the correct sequence for each token of the ubatch.
-        // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
-        // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
-        // Causal mask:
-        //    xxx-------
-        //    xxxx------
-        //    xxxxx-----
-        // Non-causal mask:
-        //    xxxxx-----
-        //    xxxxx-----
-        //    xxxxx-----
-        // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
-        for (int h = 0; h < 1; ++h) {
-            for (int s = 0; s < n_seqs; ++s) {
-                const llama_seq_id seq_id = ubatch->seq_id[s][0];
-
-                for (int j = 0; j < n_seq_tokens; ++j) {
-                    const llama_pos pos = ubatch->pos[s*n_seq_tokens + j];
-                    for (int i = 0; i < n_kv; ++i) {
-                        float f;
-                        // mask the token if:
-                        if (!kv_self->cells[i].has_seq_id(seq_id) // not the correct sequence
-                            || (cparams.causal_attn && kv_self->cells[i].pos > pos) // for causal, mask future tokens
-                        ) {
-                            f = -INFINITY;
-                        } else {
-                            if (hparams.use_alibi) {
-                                f = -std::abs(kv_self->cells[i].pos - pos);
-                            } else {
-                                f = 0.0f;
-                            }
-                        }
-
-                        if (data) {
-                            data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-
-                        // may need to cut off old tokens for sliding window
-                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
-                        if (data_swa) {
-                            if (hparams.n_attn_chunk) {
-                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
-                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
-                                    f = -INFINITY;
-                                }
-                            } else {
-                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                    f = -INFINITY;
-                                }
-                            }
-                            data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
-                        }
-                    }
-                }
-            }
+    if (self_kq_mask) {
+        kv_self->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+}
 
-            // mask padded tokens
-            if (data) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
+void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
+    if (self_kq_mask) {
+        kv_self->get_kv_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-            // mask padded tokens
-            if (data_swa) {
-                for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
-                    for (int j = 0; j < n_kv; ++j) {
-                        data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
-                    }
-                }
-            }
-        }
+    if (self_kq_mask_swa) {
+        kv_self->get_kv_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
     }
 }
 
@@ -545,7 +422,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     n_layer (hparams.n_layer),
    n_rot (hparams.n_rot),
     n_ctx (cparams.n_ctx),
-    n_ctx_per_seq (cparams.n_ctx / cparams.n_seq_max),
     n_head (hparams.n_head()),
     n_head_kv (hparams.n_head_kv()),
     n_embd_head_k (hparams.n_embd_head_k),
@@ -1153,7 +1029,7 @@ ggml_tensor * llm_graph_context::build_inp_pos_bucket_dec() const {
 
     auto inp = std::make_unique<llm_graph_input_pos_bucket_kv>(hparams, kv_self);
 
-    const auto n_kv = kv_self->n;
+    const auto n_kv = kv_self->get_n();
 
     auto & cur = inp->pos_bucket;
 
@@ -1188,16 +1064,12 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         ggml_tensor * kq_b,
         ggml_tensor * kq_mask,
         ggml_tensor * v_mla,
-        bool v_trans,
         float kq_scale) const {
-    //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-    //const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-    //const int64_t n_head = hparams.n_head(il);
-    //const int64_t n_head_kv = hparams.n_head_kv(il);
+    const bool v_trans = v->nb[1] > v->nb[2];
 
-    //const auto & n_embd_head_k = hparams.n_embd_head_k;
-    //const auto & n_embd_head_v = hparams.n_embd_head_v;
+    q = ggml_permute(ctx0, q, 0, 2, 1, 3);
+    k = ggml_permute(ctx0, k, 0, 2, 1, 3);
+    v = ggml_permute(ctx0, v, 0, 2, 1, 3);
 
     const auto n_tokens = q->ne[1];
     const auto n_head = q->ne[2];
@@ -1336,17 +1208,11 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask();
 
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;
 
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1369,22 +1235,16 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
 
     auto inp = std::make_unique<llm_graph_input_attn_kv_unified>(hparams, cparams, kv_self);
 
-    const auto n_kv = kv_self->n;
-
-    inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-    //cb(inp->self_kq_mask, "KQ_mask", -1);
-    ggml_set_input(inp->self_kq_mask);
-
-    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    {
+        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA");
 
-    if (hparams.n_swa_pattern > 1) {
-        GGML_ASSERT(hparams.n_swa > 0);
+        const auto n_kv = kv_self->get_n();
 
-        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
-        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
-        ggml_set_input(inp->self_kq_mask_swa);
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);
 
-        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
     }
 
     return (llm_graph_input_attn_kv_unified *) res->add_input(std::move(inp));
@@ -1409,85 +1269,108 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, v_cur);
 
     const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
-    const auto & n_ctx = cparams.n_ctx;
 
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv_self->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv_self->cpy_v(ctx0, v_cur, il));
+    }
+
+    const auto & kq_mask = inp->get_kq_mask();
 
-    const auto n_tokens = q_cur->ne[2];
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv_self->get_k(ctx0, il);
+    ggml_tensor * v = kv_self->get_v(ctx0, il);
 
-    const bool v_trans = !cparams.flash_attn;
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
+    cb(cur, "kqv_out", il);
 
-    // store to KV cache
-    {
-        const auto kv_head = kv_self->head;
+    if (wo) {
+        cur = build_lora_mm(wo, cur);
+    }
+
+    if (wo_b) {
+        cur = ggml_add(ctx0, cur, wo_b);
+    }
 
-        GGML_ASSERT(kv_self->size == n_ctx);
+    return cur;
+}
 
-        ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head);
-        //cb(k_cache_view, "k_cache_view", il);
+llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unified_iswa() const {
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
 
-        // note: storing RoPE-ed version of K in the KV cache
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view));
+    auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, kv_self);
 
-        v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens);
+    {
+        const auto n_kv = kv_self->get_kv_base()->get_n();
 
-        ggml_tensor * v_cache_view = nullptr;
+        inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask, "KQ_mask", -1);
+        ggml_set_input(inp->self_kq_mask);
 
-        if (!v_trans) {
-            v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head);
-        } else {
-            // note: the V cache is transposed when not using flash attention
-            v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa,
-                    ( n_ctx)*ggml_element_size(kv_self->v_l[il]),
-                    (kv_head)*ggml_element_size(kv_self->v_l[il]));
+        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
+    }
 
-            v_cur = ggml_transpose(ctx0, v_cur);
-        }
-        //cb(v_cache_view, "v_cache_view", il);
+    {
+        GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA");
+
+        const auto n_kv = kv_self->get_kv_swa()->get_n();
 
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view));
+        inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
+        //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
     }
 
+    return (llm_graph_input_attn_kv_unified_iswa *) res->add_input(std::move(inp));
+}
+
+ggml_tensor * llm_graph_context::build_attn(
+        llm_graph_input_attn_kv_unified_iswa * inp,
+        ggml_cgraph * gf,
+        ggml_tensor * wo,
+        ggml_tensor * wo_b,
+        ggml_tensor * q_cur,
+        ggml_tensor * k_cur,
+        ggml_tensor * v_cur,
+        ggml_tensor * kq_b,
+        ggml_tensor * v_mla,
+        float kq_scale,
+        int il) const {
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(gf, q_cur);
+    ggml_build_forward_expand(gf, k_cur);
+    ggml_build_forward_expand(gf, v_cur);
+
     const bool is_swa = hparams.is_swa(il);
 
+    const llama_kv_cache_unified_iswa * kv_self = static_cast<const llama_kv_cache_unified_iswa *>(memory);
+
+    const auto * kv = is_swa ? kv_self->get_kv_swa() : kv_self->get_kv_base();
+
+    // store to KV cache
+    {
+        ggml_build_forward_expand(gf, kv->cpy_k(ctx0, k_cur, il));
+        ggml_build_forward_expand(gf, kv->cpy_v(ctx0, v_cur, il));
+    }
+
     const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
-    const auto n_kv = kv_self->n;
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = kv->get_k(ctx0, il);
+    ggml_tensor * v = kv->get_v(ctx0, il);
 
-    const int64_t n_head_kv = hparams.n_head_kv(il);
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k =
-        ggml_view_3d(ctx0, kv_self->k_l[il],
-            n_embd_head_k, n_kv, n_head_kv,
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
-            ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
-            0);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = !v_trans ?
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-            n_embd_head_v, n_kv, n_head_kv,
-            ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
-            ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v),
-            0) :
-        ggml_view_3d(ctx0, kv_self->v_l[il],
-            n_kv, n_embd_head_v, n_head_kv,
-            ggml_element_size(kv_self->v_l[il])*n_ctx,
-            ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
-            0);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
         cur = build_lora_mm(wo, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (wo_b) {
@@ -1534,17 +1417,11 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto & kq_mask = inp->get_kq_mask_cross();
 
-    ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3);
-    //cb(q, "q", il);
-
-    ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3);
-    //cb(k, "k", il);
-
-    ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
-    //cb(k, "v", il);
-
-    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
+    ggml_tensor * q = q_cur;
+    ggml_tensor * k = k_cur;
+    ggml_tensor * v = v_cur;
 
+    ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
     cb(cur, "kqv_out", il);
 
     if (wo) {
@@ -1712,3 +1589,30 @@ void llm_graph_context::build_pooling(
 
     ggml_build_forward_expand(gf, cur);
 }
+
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
+    // TODO move to hparams if a T5 variant appears that uses a different value
+    const int64_t max_distance = 128;
+
+    if (bidirectional) {
+        n_buckets >>= 1;
+    }
+
+    const int64_t max_exact = n_buckets >> 1;
+
+    int32_t relative_position = x - y;
+    int32_t relative_bucket = 0;
+
+    if (bidirectional) {
+        relative_bucket += (relative_position > 0) * n_buckets;
+        relative_position = abs(relative_position);
+    } else {
+        relative_position = -std::min<int32_t>(relative_position, 0);
+    }
+
+    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
+    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
+    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
+
+    return relative_bucket;
+}
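
Note (reviewer annotation, not part of the package): llama_relative_position_bucket() is the T5-style relative-attention bias bucketing; this release only moves it to the end of llama-graph.cpp and exposes it in the header. The standalone sketch below reproduces the same mapping, restructured slightly so the logarithmic branch is only evaluated for large offsets, to make the bucket layout easy to print and inspect:

// standalone illustration - mirrors the bucketing formula from llama-graph.cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

static int32_t relative_position_bucket(int32_t key_pos, int32_t query_pos, uint64_t n_buckets, bool bidirectional) {
    const int64_t max_distance = 128;

    if (bidirectional) {
        n_buckets >>= 1; // half the buckets are reserved for keys that come after the query
    }

    const int64_t max_exact = n_buckets >> 1;

    int32_t relative_position = key_pos - query_pos;
    int32_t relative_bucket   = 0;

    if (bidirectional) {
        relative_bucket  += (relative_position > 0) * n_buckets;
        relative_position = abs(relative_position);
    } else {
        relative_position = -std::min<int32_t>(relative_position, 0);
    }

    // small offsets get one bucket each ...
    if (relative_position < max_exact) {
        return relative_bucket + relative_position;
    }

    // ... larger offsets are mapped logarithmically into the remaining buckets, saturating at max_distance
    int32_t if_large = floorf(max_exact + logf(1.0f * relative_position / max_exact) * (n_buckets - max_exact) / logf(1.0f * max_distance / max_exact));
    if_large = std::min<int32_t>(if_large, n_buckets - 1);

    return relative_bucket + if_large;
}

int main() {
    // T5-like defaults: 32 buckets, bidirectional; print the bucket assigned to a few offsets
    for (int d : {0, 1, 7, 8, 16, 64, 200}) {
        printf("key %4d positions before query -> bucket %d\n", d, relative_position_bucket(0, d, 32, true));
    }
    return 0;
}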
package/src/llama.cpp/src/llama-graph.h

@@ -19,6 +19,7 @@ struct llama_cparams;
 
 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
 class llama_kv_cache_recurrent;
 
 // certain models (typically multi-modal) can produce different types of graphs
@@ -255,6 +256,31 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
@@ -266,7 +292,7 @@ public:
     const llama_hparams & hparams;
     const llama_cparams & cparams;
 
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_iswa * kv_self;
 };
 
 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -378,7 +404,6 @@ struct llm_graph_context {
     const int64_t n_layer;
     const int64_t n_rot;
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
     const int64_t n_head;
     const int64_t n_head_kv;
     const int64_t n_embd_head_k;
@@ -507,13 +532,12 @@ struct llm_graph_context {
 
     ggml_tensor * build_attn_mha(
            ggml_cgraph * gf,
-           ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
-           ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
-           ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+           ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
+           ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
+           ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
            ggml_tensor * kq_b,
            ggml_tensor * kq_mask,
-           ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-           bool v_trans,
+           ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
           float kq_scale) const;
 
     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -546,6 +570,21 @@ struct llm_graph_context {
            float kq_scale,
            int il) const;
 
+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+           llm_graph_input_attn_kv_unified_iswa * inp,
+           ggml_cgraph * gf,
+           ggml_tensor * wo,
+           ggml_tensor * wo_b,
+           ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+           ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+           ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+           ggml_tensor * kq_b,
+           ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+           float kq_scale,
+           int il) const;
+
     llm_graph_input_attn_cross * build_attn_inp_cross() const;
 
     ggml_tensor * build_attn(
@@ -596,3 +635,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
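
Note (reviewer annotation, not part of the package): the new llm_graph_input_attn_kv_unified_iswa input carries two KQ masks, one for dense layers and one for SWA layers, and the per-cell masking that used to be built inline in llm_graph_input_attn_kv_unified::set_input now lives behind llama_kv_cache_unified::set_input_kq_mask. The sketch below restates the window rules from the removed code against the new LLAMA_SWA_TYPE_* values (standard sliding window vs llama4-style chunked attention); the names and the simplified signature are illustrative only:

// standalone illustration of the SWA masking rules (causal masking handled separately)
#include <cstdint>
#include <cstdio>

enum swa_type { SWA_NONE, SWA_STANDARD, SWA_CHUNKED };

// true if the KV cell at position kv_pos must be masked out for a query at position q_pos
static bool swa_masked(swa_type type, int32_t q_pos, int32_t kv_pos, uint32_t n_swa, uint32_t n_chunk) {
    switch (type) {
        case SWA_STANDARD:
            // standard sliding window: only the last n_swa positions stay visible
            return q_pos - kv_pos >= (int32_t) n_swa;
        case SWA_CHUNKED: {
            // chunked attention: only positions inside the query's chunk stay visible
            const int32_t chunk_start = (q_pos / (int32_t) n_chunk) * (int32_t) n_chunk;
            return kv_pos < chunk_start;
        }
        case SWA_NONE:
        default:
            return false;
    }
}

int main() {
    // query at position 10, window of 4, chunks of 8
    for (int kv = 0; kv <= 10; ++kv) {
        printf("kv=%2d  standard:%d  chunked:%d\n", kv,
               swa_masked(SWA_STANDARD, 10, kv, 4, 8),
               swa_masked(SWA_CHUNKED,  10, kv, 4, 8));
    }
    return 0;
}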
package/src/llama.cpp/src/llama-hparams.cpp

@@ -2,6 +2,22 @@
 
 #include "ggml.h"
 
+void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+    }
+}
+
+bool llama_hparams::is_swa_any() const {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (swa_layers[il]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 uint32_t llama_hparams::n_head(uint32_t il) const {
     if (il < n_layer) {
         return n_head_arr[il];
@@ -72,7 +88,7 @@ uint32_t llama_hparams::n_embd_v_s() const {
 
 bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
-        return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
+        return swa_layers[il];
     }
 
     GGML_ABORT("fatal error");
package/src/llama.cpp/src/llama-hparams.h

@@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 };
 
+enum llama_swa_type {
+    LLAMA_SWA_TYPE_NONE = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED = 2,
+};
+
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -35,8 +41,6 @@ struct llama_hparams {
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
@@ -96,6 +100,15 @@ struct llama_hparams {
 
     std::array<int, 4> rope_sections;
 
+    // Sliding Window Attention (SWA)
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+    // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa = 0;
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // by default, all layers are dense
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+
     // for State Space Models
     uint32_t ssm_d_conv = 0;
     uint32_t ssm_d_inner = 0;
@@ -116,11 +129,10 @@ struct llama_hparams {
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
+    bool use_kq_norm = true;
 
+    // llama4
     uint32_t n_moe_layer_step = 0;
-    bool use_kq_norm = true;
-    uint32_t n_attn_chunk = 0;
-    // values below seems to be fixed on llama4
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
     float f_attn_temp_scale = 0.1;
@@ -133,6 +145,23 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
+    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // note that if n_pattern == 0, all layers are SWA
+    // if n_pattern == 1, all layers are dense
+    // example: n_pattern = 3
+    //   il == 0: swa
+    //   il == 1: swa
+    //   il == 2: dense
+    //   il == 3: swa
+    //   il == 4: swa
+    //   il == 5: dense
+    //   il == 6: swa
+    //   etc ...
+    void set_swa_pattern(uint32_t n_pattern);
+
+    // return true if one of the layers is SWA
+    bool is_swa_any() const;
+
     uint32_t n_head(uint32_t il = 0) const;
 
     uint32_t n_head_kv(uint32_t il = 0) const;
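
Note (reviewer annotation, not part of the package): the sketch below mirrors the set_swa_pattern() rule added above, so the resulting SWA/dense layer layout can be printed and checked against the comment's example (with n_pattern = 3, layers 2, 5, 8, ... are dense):

// standalone illustration of the SWA layer pattern
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<bool> swa_pattern(uint32_t n_layer, uint32_t n_pattern) {
    std::vector<bool> swa_layers(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        // same rule as llama_hparams::set_swa_pattern(): every n_pattern-th layer is dense,
        // n_pattern == 0 makes all layers SWA, n_pattern == 1 makes all layers dense
        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
    }
    return swa_layers;
}

int main() {
    const uint32_t n_layer = 9;
    for (uint32_t n_pattern : {0u, 1u, 3u}) {
        printf("n_pattern = %u:", n_pattern);
        for (bool swa : swa_pattern(n_layer, n_pattern)) {
            printf(" %s", swa ? "swa" : "dense");
        }
        printf("\n");
    }
    return 0;
}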