@fugood/llama.node 1.4.8 → 1.4.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +7 -7
- package/src/LlamaContext.cpp +2 -0
- package/src/llama.cpp/common/arg.cpp +107 -31
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32

package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp

@@ -692,6 +692,100 @@ void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(nr == 1);
+    assert(n % qk == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    UNUSED(bs);
+    UNUSED(nr);
+
+    float sumf[4];
+    int sumi;
+
+    const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+    for (int x = 0; x < nc / ncols_interleaved; x++) {
+        const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+
+        for (int j = 0; j < ncols_interleaved; j++) {
+            sumf[j] = 0.0;
+        }
+        for (int l = 0; l < nb; l++) {
+            for (int k = 0; k < (qk / blocklen); k++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumi = 0;
+                    for (int i = 0; i < blocklen; ++i) {
+                        const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                        sumi += v0 * a_ptr[l].qs[k * blocklen + i];
+                    }
+                    sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
+                }
+            }
+        }
+        for (int j = 0; j < ncols_interleaved; j++) {
+            s[x * ncols_interleaved + j] = sumf[j];
+        }
+    }
+}
+
 void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
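
Note (not part of the diff): the two GEMV kernels above only change how the weight blocks are laid out in memory; the arithmetic is the ordinary Q8_0 × Q8_0 block dot product. A minimal scalar sketch of the per-column math on a non-interleaved layout, using a simplified stand-in struct rather than ggml's real block_q8_0:

#include <stdint.h>

#define QK8_0 32

/* simplified stand-in for ggml's block_q8_0 (scale stored as float here) */
typedef struct {
    float  d;
    int8_t qs[QK8_0];
} block_q8_0_ref;

/* reference: dot product of one weight row and one activation row, nb blocks each;
 * this is the value the interleaved kernels accumulate per output column */
float q8_0_dot_ref(const block_q8_0_ref * w, const block_q8_0_ref * a, int nb) {
    float sum = 0.0f;
    for (int l = 0; l < nb; ++l) {
        int sumi = 0;
        for (int i = 0; i < QK8_0; ++i) {
            sumi += (int) w[l].qs[i] * (int) a[l].qs[i];  /* int8 products */
        }
        sum += sumi * w[l].d * a[l].d;                    /* one scale per block pair */
    }
    return sum;
}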

@@ -1219,8 +1313,129 @@ void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs
     }
 }

+void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 4;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
+                                     float * GGML_RESTRICT s,
+                                     size_t bs,
+                                     const void * GGML_RESTRICT vx,
+                                     const void * GGML_RESTRICT vy,
+                                     int nr,
+                                     int nc) {
+    const int qk = QK8_0;
+    const int nb = n / qk;
+    const int ncols_interleaved = 4;
+    const int blocklen = 8;
+
+    assert(n % qk == 0);
+    assert(nr % 4 == 0);
+    assert(nc % ncols_interleaved == 0);
+
+    float sumf[4][4];
+    int sumi;
+
+    for (int y = 0; y < nr / 4; y++) {
+        const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+        for (int x = 0; x < nc / ncols_interleaved; x++) {
+            const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    sumf[m][j] = 0.0;
+                }
+            }
+            for (int l = 0; l < nb; l++) {
+                for (int k = 0; k < (qk / blocklen); k++) {
+                    for (int m = 0; m < 4; m++) {
+                        for (int j = 0; j < ncols_interleaved; j++) {
+                            sumi = 0;
+                            for (int i = 0; i < blocklen; ++i) {
+                                const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
+                                sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
+                            }
+                            sumf[m][j] +=
+                                sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
+                        }
+                    }
+                }
+            }
+            for (int m = 0; m < 4; m++) {
+                for (int j = 0; j < ncols_interleaved; j++) {
+                    s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
+                }
+            }
+        }
+    }
+}
+
 } // extern "C"

+static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
+    block_q8_0x4 out;
+
+    for (int i = 0; i < 4; i++) {
+        out.d[i] = in[i].d;
+    }
+
+    const int end = QK8_0 * 4 / blck_size_interleave;
+    for (int i = 0; i < end; ++i) {
+        int src_id = i % 4;
+        int src_offset = (i / 4) * blck_size_interleave;
+        int dst_offset = i * blck_size_interleave;
+        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
+    }
+    return out;
+}
+
 static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
     block_q4_0x4 out;

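
Note (not part of the diff): make_block_q8_0x4 above packs four consecutive Q8_0 rows into one block_q8_0x4 by copying blck_size_interleave-byte chunks round-robin from the four sources. A standalone, purely illustrative sketch that prints the index mapping it performs for blck_size_interleave = 4:

#include <stdio.h>

int main(void) {
    const int QK8_0 = 32;
    const int blck_size_interleave = 4;
    const int end = QK8_0 * 4 / blck_size_interleave;

    for (int i = 0; i < end; ++i) {
        int src_id     = i % 4;                          /* which of the 4 input rows     */
        int src_offset = (i / 4) * blck_size_interleave; /* byte offset within that row   */
        int dst_offset = i * blck_size_interleave;       /* byte offset in the packed qs  */
        printf("dst[%3d..%3d] <- row %d, qs[%2d..%2d]\n",
               dst_offset, dst_offset + 3, src_id, src_offset, src_offset + 3);
    }
    return 0;
}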

@@ -1534,6 +1749,38 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }

+static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
+                                    int interleave_block,
+                                    const void * GGML_RESTRICT data,
+                                    size_t data_size) {
+    GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
+    GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
+    constexpr int nrows_interleaved = 4;
+
+    block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
+    const block_q8_0 * src = (const block_q8_0 *) data;
+    block_q8_0 dst_tmp[4];
+    int nrow = ggml_nrows(t);
+    int nblocks = t->ne[0] / QK8_0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
+
+    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
+        return -1;
+    }
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++) {
+            for (int i = 0; i < nrows_interleaved; i++) {
+                dst_tmp[i] = src[x + i * nblocks];
+            }
+            *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
+        }
+        src += nrows_interleaved * nblocks;
+    }
+    return 0;
+}
+
 static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
     block_iq4_nlx4 out;


@@ -1702,6 +1949,14 @@ template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void *
     return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
 }

+template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
+}
+
+template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
+    return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
+}
+
 // gemv
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemv(int, float *, size_t, const void *, const void *, int, int);

@@ -1738,6 +1993,14 @@ template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 // gemm
 template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
 void gemm(int, float *, size_t, const void *, const void *, int, int);

@@ -1774,6 +2037,14 @@ template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size
     ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
 }

+template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
+template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+    ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
+}
+
 class tensor_traits_base : public ggml::cpu::tensor_traits {
   public:
     virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;

@@ -2168,6 +2439,10 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
     static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

+    // instance for Q8_0
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
+    static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
+
     if (cur->type == GGML_TYPE_Q4_0) {
         if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
             || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {

@@ -2218,6 +2493,17 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
                 return &iq4_nl_4x4_q8_0;
             }
         }
+    } else if (cur->type == GGML_TYPE_Q8_0) {
+        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x8_q8_0;
+            }
+        }
+        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+            if (cur->ne[1] % 4 == 0) {
+                return &q8_0_4x4_q8_0;
+            }
+        }
     }

     return nullptr;
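
Note (not part of the diff): the dispatch added above prefers the 4x8 variant when NEON plus the int8 matrix-multiply extension (i8mm) is present, and falls back to the 4x4 variant when only the dot-product extension is available; in both cases ne[1] must be a multiple of 4 so that four rows can be interleaved per block_q8_0x4. A compact, hypothetical restatement of that choice (illustrative C, not ggml's API):

typedef enum { Q8_0_NO_REPACK, Q8_0_REPACK_4X4, Q8_0_REPACK_4X8 } q8_0_repack_choice;

q8_0_repack_choice choose_q8_0_repack(long ne1, int has_neon, int has_i8mm, int has_dotprod) {
    if (ne1 % 4 != 0) {
        return Q8_0_NO_REPACK;   /* four rows are packed together, so ne[1] must divide by 4 */
    }
    if (has_neon && has_i8mm) {
        return Q8_0_REPACK_4X8;  /* tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> */
    }
    if (has_neon && has_dotprod) {
        return Q8_0_REPACK_4X4;  /* tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> */
    }
    return Q8_0_NO_REPACK;
}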

package/src/llama.cpp/ggml/src/ggml-cpu/repack.h

@@ -98,6 +98,10 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 // Native implementations
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);

@@ -120,6 +124,10 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs,
 void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q8_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

 #if defined(__cplusplus)
 } // extern "C"

package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp

@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     sumf += (ggml_float)_mm_cvtss_f32(g);

 #undef LOAD
-#endif
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+        vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+        vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }

+    // accumulate in 1 register
+    vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+    // leftovers
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m4();
+    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
+
+#endif
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
                              GGML_BF16_TO_FP32(y[i]));
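
Note (not part of the diff): the new RVV path uses the widening bf16 multiply-accumulate (vfwmaccbf16) so products are accumulated in f32; per element it computes the same thing as the scalar loop in the surrounding context. A minimal sketch of that scalar math, where the conversion helper is a stand-in for GGML_BF16_TO_FP32:

#include <stdint.h>
#include <string.h>

/* stand-in for GGML_BF16_TO_FP32: bf16 is the top 16 bits of an IEEE-754 float */
float bf16_to_f32_ref(uint16_t h) {
    uint32_t u = (uint32_t) h << 16;
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

/* scalar equivalent of the widening FMA: acc += (float)x[i] * (float)y[i] */
double bf16_dot_ref(const uint16_t * x, const uint16_t * y, int n) {
    double sumf = 0.0;
    for (int i = 0; i < n; ++i) {
        sumf += (double) (bf16_to_f32_ref(x[i]) * bf16_to_f32_ref(y[i]));
    }
    return sumf;
}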

package/src/llama.cpp/ggml/src/ggml-cpu/vec.h

@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-
-
-
-
-
-
-
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));


@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     }
     np = n;
 #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
-const
-_Float16
-
-
-
-
-
-
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
+    np = n;
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));


@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-
-
-
-
-
-
-
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
     }
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
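
Note (not part of the diff): both RVV paths above convert the f32 scalar v to fp16 once and reinterpret its bits as _Float16 so it can be fed to the vector-scalar ops (vfmacc_vf / vfmul_vf). A scalar sketch of the two loops they vectorize, assuming a toolchain with _Float16 support (illustrative, not ggml code):

/* scalar version of what ggml_vec_mad_f16 computes: y[i] += x[i] * v */
void vec_mad_f16_ref(int n, _Float16 * y, const _Float16 * x, float v) {
    const _Float16 scale = (_Float16) v;   /* convert once, outside the loop */
    for (int i = 0; i < n; ++i) {
        y[i] += x[i] * scale;              /* vfmacc_vf in the RVV path */
    }
}

/* scalar version of what ggml_vec_scale_f16 computes: y[i] *= v */
void vec_scale_f16_ref(int n, _Float16 * y, float v) {
    const _Float16 scale = (_Float16) v;
    for (int i = 0; i < n; ++i) {
        y[i] *= scale;                     /* vfmul_vf in the RVV path */
    }
}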

package/src/llama.cpp/src/llama-arch.cpp

@@ -2055,7 +2055,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
             LLM_TENSOR_SHORTCONV_INPROJ,
             LLM_TENSOR_SHORTCONV_OUTPROJ,
             LLM_TENSOR_TOKEN_EMBD,
-
+            LLM_TENSOR_OUTPUT_NORM_LFM2,
             LLM_TENSOR_FFN_GATE_INP,
             LLM_TENSOR_FFN_GATE_EXPS,
             LLM_TENSOR_FFN_DOWN_EXPS,