@fugood/llama.node 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +9 -9
  8. package/src/LlamaCompletionWorker.cpp +73 -20
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/LlamaContext.cpp +9 -0
  11. package/src/common.hpp +8 -1
  12. package/src/llama.cpp/CMakeLists.txt +2 -0
  13. package/src/llama.cpp/common/arg.cpp +132 -41
  14. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  15. package/src/llama.cpp/common/chat.cpp +311 -9
  16. package/src/llama.cpp/common/chat.h +4 -1
  17. package/src/llama.cpp/common/common.cpp +54 -0
  18. package/src/llama.cpp/common/common.h +46 -9
  19. package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
  20. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  21. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  22. package/src/llama.cpp/ggml/include/ggml.h +28 -2
  23. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
  33. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  37. package/src/llama.cpp/include/llama.h +25 -0
  38. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  39. package/src/llama.cpp/src/llama-chat.cpp +2 -4
  40. package/src/llama.cpp/src/llama-context.cpp +29 -22
  41. package/src/llama.cpp/src/llama-context.h +6 -5
  42. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  43. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  44. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
  45. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  46. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  47. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  48. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  49. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  50. package/src/llama.cpp/src/llama-memory.h +2 -2
  51. package/src/llama.cpp/src/llama-model.cpp +81 -70
  52. package/src/llama.cpp/src/llama-model.h +2 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  54. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
     UNUSED(s);
     UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-    {
-        float sumf[8];
-        int sumi;
+    float sumf[8];
+    int sumi;
 
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
 
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }
 
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
-    UNUSED(s);
     UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
     UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
 
-    {
-        float sumf[4];
-        int sumi;
+    float sumf[4];
+    int sumi;
 
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
 
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }
 
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
 } // extern "C"
 
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
 
 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);
 
-    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+
     block_iq4_nl dst_tmp[4];
+
     int nrow = ggml_nrows(t);
     int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_0;
+    int nblocks = t->ne[0] / QK4_NL;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
 
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }
 
+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 4 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 8;
+            int src_offset = (i / 8) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 8);
+
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+
+    block_iq4_nl dst_tmp[8];
+
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_NL;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 //    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}
 
+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
 
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_avx2()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &iq4_nl_8x8_q8_0;
+            }
+        }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
                 return &iq4_nl_4x4_q8_0;
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
 
 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 
+struct block_iq4_nlx8 {
+    ggml_half d[8];           // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
 } // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 
 #endif
@@ -870,6 +870,29 @@ extern "C" {
             size_t n_token_capacity,
             size_t * n_token_count_out);
 
+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
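Note on the hunk above: the new `_ext` entry points take an extra `llama_state_seq_flags` argument; passing 0 matches the behaviour of the older wrappers, and `LLAMA_STATE_SEQ_FLAGS_SWA_ONLY` (per the flag name) restricts the snapshot to the SWA portion of the sequence state. A minimal usage sketch follows; the helper name `snapshot_seq` and the surrounding error handling are illustrative, not part of this diff.

    // Sketch: serialize and restore one sequence via the new *_ext API.
    // Assumes an initialized llama_context * ctx and a valid llama_seq_id seq_id.
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include "llama.h"

    static bool snapshot_seq(struct llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = 0; // or LLAMA_STATE_SEQ_FLAGS_SWA_ONLY

        // query how many bytes the sequence state needs with these flags
        const size_t n = llama_state_seq_get_size_ext(ctx, seq_id, flags);
        if (n == 0) {
            return false;
        }

        uint8_t * buf = malloc(n);
        if (buf == NULL) {
            return false;
        }

        // serialize, then restore into the same (or another) sequence id
        const size_t written = llama_state_seq_get_data_ext(ctx, buf, n, seq_id, flags);
        const size_t read    = llama_state_seq_set_data_ext(ctx, buf, n, seq_id, flags);

        free(buf);
        return written > 0 && read > 0;
    }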
@@ -1437,6 +1460,8 @@ extern "C" {
 
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };
 
     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
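Note on the hunk above: `llama_opt_params` now carries the optimizer selection through to `ggml_opt_init` (see the `opt_params.optimizer = lopt_params.optimizer_type;` change in llama-context.cpp further down). A rough sketch of how a caller might fill it in; the enum value `GGML_OPT_OPTIMIZER_TYPE_ADAMW` and the default-params callback are assumed from ggml-opt.h and are not part of this diff.

    // Sketch only: select the optimizer when setting up training on a context.
    // Fields and names other than the ones shown in this diff are assumptions.
    static void setup_training(struct llama_context * lctx, struct llama_model * model) {
        struct llama_opt_params lopt_params = {0};

        lopt_params.get_opt_pars    = ggml_opt_get_default_optimizer_params; // assumed default callback from ggml-opt.h
        lopt_params.get_opt_pars_ud = NULL;
        lopt_params.optimizer_type  = GGML_OPT_OPTIMIZER_TYPE_ADAMW;         // assumed enum value from ggml-opt.h

        llama_opt_init(lctx, model, lopt_params);
    }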
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
 
         return {};
     }
@@ -193,11 +193,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
-    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
@@ -625,8 +625,6 @@ int32_t llm_chat_apply_template(
     } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
         // Yandex template ("\n\n" is defined as EOT token)
 
-        ss << "<s>";
-
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
             if (role == "user") {
@@ -145,11 +145,6 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
-    if (!params.swa_full && cparams.n_seq_max > 1 && hparams.is_swa_any()) {
-        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
-                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
-    }
-
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
@@ -1657,30 +1652,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }
 
-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
     llama_io_write_dummy io;
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
     llama_io_write_buffer io(dst, size);
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
     llama_io_read_buffer io(src, size);
     try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
         return 0;
@@ -1778,7 +1773,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -1802,7 +1797,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 
     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);
 
     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@@ -1971,21 +1966,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }
 
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }
 
     return io.n_bytes();
@@ -2048,7 +2043,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period = n_batch / n_ubatch;
     opt_params.get_opt_pars = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
-
+    opt_params.optimizer = lopt_params.optimizer_type;
     opt_ctx = ggml_opt_init(opt_params);
 
     llama_opt_param_filter param_filter = lopt_params.param_filter;
@@ -2801,19 +2796,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }
 
 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx->state_seq_get_size(seq_id);
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }
 
 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
 }
 
-size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }
 
 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@@ -111,9 +111,9 @@ struct llama_context {
     size_t state_get_data( uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
@@ -152,6 +152,7 @@ struct llama_context {
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t dataset,
             ggml_opt_result_t result_train,
@@ -212,8 +213,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
     // members