@fugood/llama.node 1.1.6 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +9 -9
  8. package/src/LlamaCompletionWorker.cpp +73 -20
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +124 -40
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +312 -9
  14. package/src/llama.cpp/common/chat.h +4 -1
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +41 -7
  17. package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +28 -2
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
  30. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  31. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  33. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  34. package/src/llama.cpp/include/llama.h +25 -0
  35. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  36. package/src/llama.cpp/src/llama-chat.cpp +2 -4
  37. package/src/llama.cpp/src/llama-context.cpp +29 -17
  38. package/src/llama.cpp/src/llama-context.h +6 -5
  39. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  40. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  41. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
  42. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  43. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  44. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  45. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  46. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  47. package/src/llama.cpp/src/llama-memory.h +2 -2
  48. package/src/llama.cpp/src/llama-model.cpp +1 -0
  49. package/src/llama.cpp/src/llama-model.h +1 -0
  50. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  51. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
@@ -206,8 +206,9 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
     UNUSED(s);
     UNUSED(bs);
@@ -307,30 +308,28 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-    {
-        float sumf[8];
-        int sumi;
+    float sumf[8];
+    int sumi;
 
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
 
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
-                            const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
+                        const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
             }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
         }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
     }
 }
 
@@ -494,43 +493,73 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     const int ncols_interleaved = 4;
     const int blocklen = 4;
 
-    assert (n % qk == 0);
-    assert (nc % ncols_interleaved == 0);
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
 
-    UNUSED(s);
     UNUSED(bs);
-    UNUSED(vx);
-    UNUSED(vy);
     UNUSED(nr);
-    UNUSED(nc);
-    UNUSED(nb);
-    UNUSED(ncols_interleaved);
-    UNUSED(blocklen);
 
-    {
-        float sumf[4];
-        int sumi;
+    float sumf[4];
+    int sumi;
 
-        const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
-        for (int x = 0; x < nc / ncols_interleaved; x++) {
-            const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
 
-            for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
-            for (int l = 0; l < nb; l++) {
-                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
-                    for (int j = 0; j < ncols_interleaved; j++) {
-                        sumi = 0;
-                        for (int i = 0; i < blocklen; ++i) {
-                            const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
-                            const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
-                            sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
-                        }
-                        sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                     }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
                 }
            }
-            for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
+    }
+}
+
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[8];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                        const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                        sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
    }
 }
 
@@ -934,6 +963,50 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }
 
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 8;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][8];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / (2 * blocklen)); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
+                                const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
+                                sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
+                                         (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
+                            }
+                            sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++)
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+            }
+        }
+    }
+}
+
 } // extern "C"
 
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
@@ -1285,15 +1358,16 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
 
 static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
-    //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     GGML_ASSERT(interleave_block == 4);
 
-    block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+
     block_iq4_nl dst_tmp[4];
+
     int nrow = ggml_nrows(t);
     int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_0;
+    int nblocks = t->ne[0] / QK4_NL;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
 
@@ -1315,6 +1389,63 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_UNUSED(data_size);
 }
 
+static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
+    block_iq4_nlx8 out;
+
+    for (int i = 0; i < 8; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK4_NL * 4 / blck_size_interleave;
+
+    if (blck_size_interleave == 8) {
+        for (int i = 0; i < end; ++i) {
+            int src_id = i % 8;
+            int src_offset = (i / 8) * blck_size_interleave;
+            int dst_offset = i * blck_size_interleave;
+
+            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    return out;
+}
+
+static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
+    GGML_ASSERT(interleave_block == 8);
+
+    const block_iq4_nl * src = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+
+    block_iq4_nl dst_tmp[8];
+
+    int nrow = ggml_nrows(t);
+    int nrows_interleaved = 8;
+    int nblocks = t->ne[0] / QK4_NL;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
+
+    if (t->ne[1] % nrows_interleaved != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+
+    GGML_UNUSED(data_size);
+}
+
 namespace ggml::cpu::repack {
 // repack
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
@@ -1350,6 +1481,10 @@ template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void *
 //    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
 //}
 
+template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);
@@ -1378,6 +1513,10 @@ template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1406,6 +1545,10 @@ template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
 }
 
+template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -1680,6 +1823,7 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
 
     // instance for IQ4
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
 
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
@@ -1710,6 +1854,11 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
             }
         }
     } else if (cur->type == GGML_TYPE_IQ4_NL) {
+        if (ggml_cpu_has_avx2()) {
+            if (cur->ne[1] % 8 == 0) {
+                return &iq4_nl_8x8_q8_0;
+            }
+        }
         if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
             if (cur->ne[1] % 4 == 0) {
                 return &iq4_nl_4x4_q8_0;
@@ -67,6 +67,13 @@ struct block_iq4_nlx4 {
 
 static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
 
+struct block_iq4_nlx8 {
+    ggml_half d[8];           // deltas for 8 iq4_nl blocks
+    uint8_t   qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
+};
+
+static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
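A quick size check for the new layout (with QK4_NL = 32 in ggml): sizeof(block_iq4_nlx8) = 8 * 2 + 32 * 4 = 144 bytes, which is exactly eight plain iq4_nl blocks (8 * (2 + 16) = 144) with their fp16 deltas grouped at the front — this is what the static_assert above verifies.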
@@ -80,12 +87,14 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -97,12 +106,14 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
 #if defined(__cplusplus)
 } // extern "C"
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
 } // namespace ggml::cpu
 
 bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
 }
 
 bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
-    for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
+    for (auto extra : ggml_backend_cpu_get_extra_buffer_types()) {
         if (extra && extra->context) {
             auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
             auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
 } // namespace ggml::cpu
 
 // implemented in ggml-cpu.cpp.
-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
+std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_types();
 
 #endif
@@ -870,6 +870,29 @@ extern "C" {
                    size_t   n_token_capacity,
                    size_t * n_token_count_out);
 
+#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
+
+    typedef uint32_t llama_state_seq_flags;
+
+    LLAMA_API size_t llama_state_seq_get_size_ext(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_get_data_ext(
+            struct llama_context * ctx,
+            uint8_t * dst,
+            size_t size,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    LLAMA_API size_t llama_state_seq_set_data_ext(
+            struct llama_context * ctx,
+            const uint8_t * src,
+            size_t size,
+            llama_seq_id dest_seq_id,
+            llama_state_seq_flags flags);
+
     //
     // Decoding
     //
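The three _ext entry points mirror the existing sequence-state functions but take a llama_state_seq_flags word; LLAMA_STATE_SEQ_FLAGS_SWA_ONLY is the only flag defined in this release, and passing 0 reproduces the old behaviour (the non-_ext wrappers later in this diff forward a flags value of 0). A minimal caller-side sketch, assuming only a valid llama_context * ctx with sequence 0 populated — everything else comes from the declarations above:

    // sketch only: snapshot and restore the SWA part of the state of sequence 0
    #include <vector>
    #include "llama.h"

    static void snapshot_seq0(llama_context * ctx) {
        const size_t n_max = llama_state_seq_get_size_ext(ctx, 0, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
        std::vector<uint8_t> buf(n_max);
        const size_t n_written = llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), 0, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
        // later: restore with the same flags; flags = 0 would give the old full-state behaviour
        const size_t n_read = llama_state_seq_set_data_ext(ctx, buf.data(), n_written, 0, LLAMA_STATE_SEQ_FLAGS_SWA_ONLY);
        if (n_read != n_written) { /* restore failed or was truncated */ }
    }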
@@ -1437,6 +1460,8 @@ extern "C" {
 
         ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
         void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+
+        enum ggml_opt_optimizer_type optimizer_type;
     };
 
     LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
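The new optimizer_type member is plumbed through to ggml_opt_init later in this diff (opt_params.optimizer = lopt_params.optimizer_type in llama-context.cpp). A hedged sketch of how a caller might fill the struct; GGML_OPT_OPTIMIZER_TYPE_ADAMW, llama_opt_param_filter_all and ggml_opt_get_default_optimizer_params are assumed to come from ggml-opt.h / llama.h and are not shown in this diff:

    // sketch only: lctx and model are assumed to be an initialized context/model pair
    struct llama_opt_params lopt_params = {};                             // remaining fields stay zero-initialized
    lopt_params.param_filter    = llama_opt_param_filter_all;             // assumed helper from llama.h
    lopt_params.get_opt_pars    = ggml_opt_get_default_optimizer_params;  // assumed helper from ggml-opt.h
    lopt_params.get_opt_pars_ud = nullptr;
    lopt_params.optimizer_type  = GGML_OPT_OPTIMIZER_TYPE_ADAMW;          // assumed enum value from ggml-opt.h
    llama_opt_init(lctx, model, lopt_params);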
@@ -477,7 +477,7 @@ llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
 
 llama_ubatch llama_batch_allocr::split_equal(uint32_t n_ubatch, bool sequential) {
     if (sequential && has_cpl) {
-        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch\n", __func__);
+        LLAMA_LOG_ERROR("%s: sequential split is not supported when there are coupled sequences in the input batch (you may need to use the -kvu flag)\n", __func__);
 
         return {};
     }
@@ -193,11 +193,11 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
         return LLM_CHAT_TEMPLATE_DOTS1;
-    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
+    } else if (tmpl_contains("<|extra_0|>") && tmpl_contains("<|extra_4|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
-    } else if (tmpl_contains("<|hy_place▁holder▁no▁2|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
@@ -625,8 +625,6 @@ int32_t llm_chat_apply_template(
     } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
         // Yandex template ("\n\n" is defined as EOT token)
 
-        ss << "<s>";
-
         for (size_t i = 0; i < chat.size(); i++) {
             std::string role(chat[i]->role);
             if (role == "user") {
@@ -1657,30 +1657,30 @@ size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
     }
 }
 
-size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
+size_t llama_context::state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags) {
     llama_io_write_dummy io;
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
+size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size, llama_state_seq_flags flags) {
     llama_io_write_buffer io(dst, size);
     try {
-        return state_seq_write_data(io, seq_id);
+        return state_seq_write_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
         return 0;
     }
 }
 
-size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
+size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags) {
     llama_io_read_buffer io(src, size);
     try {
-        return state_seq_read_data(io, seq_id);
+        return state_seq_read_data(io, seq_id, flags);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
         return 0;
@@ -1778,7 +1778,7 @@ size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * file
     {
         const size_t state_size = file.size() - file.tell();
         llama_io_read_file io(&file);
-        const size_t nread = state_seq_read_data(io, seq_id);
+        const size_t nread = state_seq_read_data(io, seq_id, 0);
         if (!nread) {
             LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
             return 0;
@@ -1802,7 +1802,7 @@ size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * file
 
     // save the context state using stream saving
     llama_io_write_file io(&file);
-    state_seq_write_data(io, seq_id);
+    state_seq_write_data(io, seq_id, 0);
 
     const size_t res = file.tell();
     GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());
@@ -1971,21 +1971,21 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_write(io, seq_id);
+        memory->state_write(io, seq_id, flags);
     }
 
     return io.n_bytes();
 }
 
-size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
+size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     GGML_UNUSED(seq_id);
 
     if (memory) {
-        memory->state_read(io, seq_id);
+        memory->state_read(io, seq_id, flags);
     }
 
     return io.n_bytes();
@@ -2048,7 +2048,7 @@ void llama_context::opt_init(struct llama_model * model, struct llama_opt_params
     opt_params.opt_period = n_batch / n_ubatch;
     opt_params.get_opt_pars = lopt_params.get_opt_pars;
     opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
-
+    opt_params.optimizer = lopt_params.optimizer_type;
     opt_ctx = ggml_opt_init(opt_params);
 
     llama_opt_param_filter param_filter = lopt_params.param_filter;
@@ -2801,19 +2801,31 @@ bool llama_state_save_file(llama_context * ctx, const char * path_session, const
 }
 
 size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
-    return ctx->state_seq_get_size(seq_id);
+    return llama_state_seq_get_size_ext(ctx, seq_id, 0);
 }
 
 size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_get_data_ext(ctx, dst, size, seq_id, 0);
+}
+
+size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+    return llama_state_seq_set_data_ext(ctx, src, size, seq_id, 0);
+}
+
+size_t llama_state_seq_get_size_ext(llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    return ctx->state_seq_get_size(seq_id, flags);
+}
+
+size_t llama_state_seq_get_data_ext(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_get_data(seq_id, dst, size);
+    return ctx->state_seq_get_data(seq_id, dst, size, flags);
 }
 
-size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
+size_t llama_state_seq_set_data_ext(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
     ctx->synchronize();
 
-    return ctx->state_seq_set_data(seq_id, src, size);
+    return ctx->state_seq_set_data(seq_id, src, size, flags);
 }
 
 size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
@@ -111,9 +111,9 @@ struct llama_context {
     size_t state_get_data(      uint8_t * dst, size_t size);
     size_t state_set_data(const uint8_t * src, size_t size);
 
-    size_t state_seq_get_size(llama_seq_id seq_id);
-    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
-    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
+    size_t state_seq_get_size(llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size, llama_state_seq_flags flags);
+    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size, llama_state_seq_flags flags);
 
     bool state_load_file(
             const char * filepath,
@@ -152,6 +152,7 @@ struct llama_context {
 
     void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
 
+    // TODO: more flexible combinations of logical/physical batch size and context size
     void opt_epoch(
             ggml_opt_dataset_t dataset,
             ggml_opt_result_t result_train,
@@ -212,8 +213,8 @@ private:
     size_t state_write_data(llama_io_write_i & io);
     size_t state_read_data (llama_io_read_i & io);
 
-    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
-    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id);
+    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
+    size_t state_seq_read_data (llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags);
 
     //
    // members