@fugood/llama.node 1.4.8 → 1.4.10

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (35)
  1. package/lib/binding.ts +43 -0
  2. package/lib/parallel.js +26 -0
  3. package/lib/parallel.ts +33 -0
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +12 -14
  6. package/src/LlamaCompletionWorker.cpp +3 -1
  7. package/src/LlamaCompletionWorker.h +2 -0
  8. package/src/LlamaContext.cpp +16 -1
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +4 -4
  11. package/src/llama.cpp/common/arg.cpp +159 -42
  12. package/src/llama.cpp/common/arg.h +10 -1
  13. package/src/llama.cpp/common/common.cpp +1 -1
  14. package/src/llama.cpp/common/common.h +6 -2
  15. package/src/llama.cpp/common/preset.cpp +197 -5
  16. package/src/llama.cpp/common/preset.h +45 -3
  17. package/src/llama.cpp/common/sampling.cpp +51 -37
  18. package/src/llama.cpp/common/sampling.h +6 -3
  19. package/src/llama.cpp/common/speculative.cpp +1 -1
  20. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  31. package/src/llama.cpp/src/llama-mmap.h +5 -1
  32. package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
  33. package/src/llama.cpp/src/llama-model.cpp +7 -5
  34. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  35. package/src/llama.cpp/src/llama.cpp +22 -32
@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
  }
  }

+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
+ float * GGML_RESTRICT s,
+ size_t bs,
+ const void * GGML_RESTRICT vx,
+ const void * GGML_RESTRICT vy,
+ int nr,
+ int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 4;
+ const int blocklen = 4;
+
+ assert(nr == 1);
+ assert(n % qk == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(bs);
+ UNUSED(nr);
+
+ float sumf[4];
+ int sumi;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[x * ncols_interleaved + j] = sumf[j];
+ }
+ }
+ }
+
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
+ float * GGML_RESTRICT s,
+ size_t bs,
+ const void * GGML_RESTRICT vx,
+ const void * GGML_RESTRICT vy,
+ int nr,
+ int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 4;
+ const int blocklen = 8;
+
+ assert(nr == 1);
+ assert(n % qk == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ UNUSED(bs);
+ UNUSED(nr);
+
+ float sumf[4];
+ int sumi;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[j] = 0.0;
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+ }
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+ }
+ }
+ }
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[x * ncols_interleaved + j] = sumf[j];
+ }
+ }
+ }
+
  void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  const int qk = QK8_0;
  const int nb = n / qk;
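Note (illustration, not part of the published diff): the two generic GEMV kernels added above walk a weight matrix whose rows have been regrouped four at a time into block_q8_0x4 blocks. A minimal standalone sketch of the same accumulation, with ggml's fp16 scales simplified to plain floats so it compiles on its own:

    // Simplified model of ggml_gemv_q8_0_4x4_q8_0_generic: four interleaved
    // weight rows (W) times one quantized activation row (a). Scales are kept
    // as float here instead of ggml's fp16 so the sketch is self-contained.
    #include <stdint.h>

    #define QK8_0 32

    typedef struct { float d;    int8_t qs[QK8_0];     } blk_q8;    /* one block of one row   */
    typedef struct { float d[4]; int8_t qs[QK8_0 * 4]; } blk_q8x4;  /* four interleaved rows   */

    /* y[0..3] = dot(row_j, a) for the 4 rows packed in W; n must be a multiple of QK8_0 */
    static void gemv_q8x4_sketch(int n, float y[4], const blk_q8x4 *W, const blk_q8 *a) {
        const int nb       = n / QK8_0;   /* blocks per row                          */
        const int blocklen = 4;           /* bytes interleaved per row ("4x4" variant) */
        for (int j = 0; j < 4; j++) y[j] = 0.0f;
        for (int l = 0; l < nb; l++) {
            for (int k = 0; k < QK8_0 / blocklen; k++) {
                for (int j = 0; j < 4; j++) {
                    int sumi = 0;
                    for (int i = 0; i < blocklen; ++i) {
                        sumi += W[l].qs[k * 4 * blocklen + j * blocklen + i] * a[l].qs[k * blocklen + i];
                    }
                    y[j] += sumi * W[l].d[j] * a[l].d;
                }
            }
        }
    }

The 4x8 variant differs only in blocklen, which controls how many consecutive bytes of each row sit together in the packed block.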
@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
  }
  }

+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
+ float * GGML_RESTRICT s,
+ size_t bs,
+ const void * GGML_RESTRICT vx,
+ const void * GGML_RESTRICT vy,
+ int nr,
+ int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 4;
+ const int blocklen = 4;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ float sumf[4][4];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[m][j] = 0.0;
+ }
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+ }
+ sumf[m][j] +=
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+ }
+ }
+
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
+ float * GGML_RESTRICT s,
+ size_t bs,
+ const void * GGML_RESTRICT vx,
+ const void * GGML_RESTRICT vy,
+ int nr,
+ int nc) {
+ const int qk = QK8_0;
+ const int nb = n / qk;
+ const int ncols_interleaved = 4;
+ const int blocklen = 8;
+
+ assert(n % qk == 0);
+ assert(nr % 4 == 0);
+ assert(nc % ncols_interleaved == 0);
+
+ float sumf[4][4];
+ int sumi;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumf[m][j] = 0.0;
+ }
+ }
+ for (int l = 0; l < nb; l++) {
+ for (int k = 0; k < (qk / blocklen); k++) {
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ sumi = 0;
+ for (int i = 0; i < blocklen; ++i) {
+ const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+ sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+ }
+ sumf[m][j] +=
+ sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+ }
+ }
+ }
+ }
+ for (int m = 0; m < 4; m++) {
+ for (int j = 0; j < ncols_interleaved; j++) {
+ s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+ }
+ }
+ }
+ }
+ }
+
  } // extern "C"

+ static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
+ block_q8_0x4 out;
+
+ for (int i = 0; i < 4; i++) {
+ out.d[i] = in[i].d;
+ }
+
+ const int end = QK8_0 * 4 / blck_size_interleave;
+ for (int i = 0; i < end; ++i) {
+ int src_id = i % 4;
+ int src_offset = (i / 4) * blck_size_interleave;
+ int dst_offset = i * blck_size_interleave;
+ memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
+ }
+ return out;
+ }
+
  static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
  block_q4_0x4 out;

@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
  GGML_UNUSED(data_size);
  }

+ static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
+ int interleave_block,
+ const void * GGML_RESTRICT data,
+ size_t data_size) {
+ GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
+ GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+ constexpr int nrows_interleaved = 4;
+
+ block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
+ const block_q8_0 * src = (const block_q8_0 *) data;
+ block_q8_0 dst_tmp[4];
+ int nrow = ggml_nrows(t);
+ int nblocks = t->ne[0] / QK8_0;
+
+ GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
+
+ if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+ return -1;
+ }
+
+ for (int b = 0; b < nrow; b += nrows_interleaved) {
+ for (int64_t x = 0; x < nblocks; x++) {
+ for (int i = 0; i < nrows_interleaved; i++) {
+ dst_tmp[i] = src[x + i * nblocks];
+ }
+ *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
+ }
+ src += nrows_interleaved * nblocks;
+ }
+ return 0;
+ }
+
  static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
  block_iq4_nlx4 out;

@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
  return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
  }

+ template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
+ }
+
+ template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+ return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
+ }
+
  // gemv
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
  void gemv(int, float *, size_t, const void *, const void *, int, int);
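Note (illustration, not part of the published diff): the repack<block_q8_0, ...> specializations above route Q8_0 tensors through repack_q8_0_to_q8_0_4_bl, which interleaves four source rows via make_block_q8_0x4. The byte mapping that interleaving produces for blck_size_interleave == 4 can be enumerated with this small standalone program, which reuses the same index arithmetic:

    // Prints which source row and byte range land where in out.qs,
    // mirroring the index math of make_block_q8_0x4 (illustration only).
    #include <stdio.h>

    #define QK8_0 32

    int main(void) {
        const int blck = 4;                    /* blck_size_interleave (the 4x4 variant) */
        const int end  = QK8_0 * 4 / blck;     /* 32 chunks of 4 bytes                   */
        for (int i = 0; i < end; ++i) {
            int src_id     = i % 4;            /* which of the 4 interleaved rows        */
            int src_offset = (i / 4) * blck;   /* byte offset within that row            */
            int dst_offset = i * blck;         /* byte offset in the packed block        */
            printf("out.qs[%3d..%3d] <- row %d qs[%2d..%2d]\n",
                   dst_offset, dst_offset + blck - 1,
                   src_id, src_offset, src_offset + blck - 1);
        }
        return 0;
    }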
@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
  ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  }

+ template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+ }
+
+ template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+ }
+
  // gemm
  template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
  void gemm(int, float *, size_t, const void *, const void *, int, int);
@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
  ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  }

+ template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+ }
+
+ template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+ ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+ }
+
  class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
  virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
@@ -2168,6 +2439,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
  static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

+ // instance for Q8_0
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
+ static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
+
  if (cur->type == GGML_TYPE_Q4_0) {
  if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
  || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
@@ -2218,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
  return &iq4_nl_4x4_q8_0;
  }
  }
+ } else if (cur->type == GGML_TYPE_Q8_0) {
+ if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+ if (cur->ne[1] % 4 == 0) {
+ return &q8_0_4x8_q8_0;
+ }
+ }
+ if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+ if (cur->ne[1] % 4 == 0) {
+ return &q8_0_4x4_q8_0;
+ }
+ }
  }

  return nullptr;
@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

  // Native implementations
  void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
  void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
  void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+ void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

  #if defined(__cplusplus)
  } // extern "C"
@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
  sumf += (ggml_float)_mm_cvtss_f32(g);

  #undef LOAD
- #endif
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+ size_t vl = __riscv_vsetvlmax_e32m4();
+
+ // initialize accumulators to all zeroes
+ vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+ // calculate step size
+ const size_t epr = __riscv_vsetvlmax_e16m2();
+ const size_t step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (; i < np; i += step) {
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+ vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+ vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+ vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+ vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }

+ // accumulate in 1 register
+ vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+ // leftovers
+ for (i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+ vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+ vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+ }
+
+ // reduce
+ vl = __riscv_vsetvlmax_e32m4();
+ vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+ sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
+
+ #endif
  for (; i < n; ++i) {
  sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
  GGML_BF16_TO_FP32(y[i]));
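Note (illustration, not part of the published diff): the new RISC-V path above uses the Zvfbfwma widening multiply-accumulate; up to floating-point accumulation order it produces the same dot product as the scalar fallback that follows it, because bf16 is simply the upper 16 bits of an IEEE-754 float. A standalone scalar sketch of that equivalence:

    // bf16 -> fp32 is a 16-bit left shift into the high half of a float,
    // so the vector widening MAC accumulates exactly these products.
    #include <stdint.h>
    #include <string.h>

    static inline float bf16_to_f32(uint16_t h) {
        uint32_t u = (uint32_t) h << 16;   /* bf16 bits become the float's top half */
        float f;
        memcpy(&f, &u, sizeof(f));
        return f;
    }

    static double dot_bf16_sketch(int n, const uint16_t *x, const uint16_t *y) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            sum += (double) bf16_to_f32(x[i]) * (double) bf16_to_f32(y[i]);
        }
        return sum;
    }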
@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
  }
  GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
  GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
- #elif defined(__riscv_v_intrinsic)
- // todo: RVV impl
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
- sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
- }
- }
+
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+ size_t vl = __riscv_vsetvlmax_e32m4();
+
+ // initialize accumulators to all zeroes
+ vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+ vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+ // calculate step size
+ const size_t epr = __riscv_vsetvlmax_e16m2();
+ const size_t step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2 along the row dimension
+ for (int i = 0; i < np; i += step) {
+ vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+ vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+ vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+ vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+ vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+ vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+ vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+ vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+ vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+ vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+ }
+
+ vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+ vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+ // leftovers
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m2(n - i);
+ vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+ vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+ vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+ }
+
+ // reduce
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+ vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+ vl = __riscv_vsetvlmax_e32m2();
+ vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+ __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+ vl = __riscv_vsetvlmax_e32m1();
+ vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+ __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+ vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+ acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+ sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+ sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
  #else
  const int np = (n & ~(GGML_F16_STEP - 1));

@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
  }
  np = n;
  #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
- const int np = n;
- _Float16 hv = (_Float16)v;
- for (int i = 0, avl; i < n; i += avl) {
- avl = __riscv_vsetvl_e16m8(n - i);
- vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
- vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
- vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
- __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
+
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
+
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
  }
+ np = n;
  #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F16_STEP - 1));

@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
  svst1_f16(pg, (__fp16 *)(y + np), out);
  }
  #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
- for (int i = 0, vl; i < n; i += vl) {
- vl = __riscv_vsetvl_e16m2(n - i);
- vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
- vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
- vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
- vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
- __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+ const _Float16 scale = *(const _Float16*)(&s);
+
+ // calculate step size
+ const int epr = __riscv_vsetvlmax_e16m4();
+ const int step = epr * 2;
+ const int np = (n & ~(step - 1));
+
+ // unroll by 2
+ for (int i = 0; i < np; i += step) {
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+ ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+ __asm__ __volatile__ ("" ::: "memory");
+ }
+
+ // leftovers
+ int vl;
+ for (int i = np; i < n; i += vl) {
+ vl = __riscv_vsetvl_e16m4(n - i);
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
  }
  #elif defined(GGML_SIMD)
  const int np = (n & ~(GGML_F16_STEP - 1));
@@ -2055,7 +2055,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
  LLM_TENSOR_SHORTCONV_INPROJ,
  LLM_TENSOR_SHORTCONV_OUTPROJ,
  LLM_TENSOR_TOKEN_EMBD,
- LLM_TENSOR_OUTPUT_NORM,
+ LLM_TENSOR_OUTPUT_NORM_LFM2,
  LLM_TENSOR_FFN_GATE_INP,
  LLM_TENSOR_FFN_GATE_EXPS,
  LLM_TENSOR_FFN_DOWN_EXPS,