llama_cpp 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
@@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #endif
 #endif
 
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
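Editor's note, a hedged sketch (not part of the diff): `__riscv_v_intrinsic` is the macro toolchains define when the RISC-V vector (RVV) intrinsics are available, e.g. when building with something like `-march=rv64gcv`. The kernels added below pass fixed `vl` values (16 or 32 elements), so they implicitly assume the hardware grants that many lanes at the chosen LMUL. A minimal compile-time probe, assuming the `__riscv_vsetvlmax_e8m1` intrinsic from the prefixed RVV API, might look like:

    #include <stddef.h>
    #ifdef __riscv_v_intrinsic
    #include <riscv_vector.h>
    /* max 8-bit elements per LMUL=1 register, i.e. VLEN/8 */
    static size_t rvv_vlen_bytes(void) { return __riscv_vsetvlmax_e8m1(); }
    #else
    static size_t rvv_vlen_bytes(void) { return 0; }  /* RVV path compiled out */
    #endif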
@@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 // 2-6 bit quantization in super-blocks
 //
 
-
 //
 // ===================== Helper functions
 //
@@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const float q4scale = 15.f;
 
     for (int i = 0; i < nb; i++) {
-
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
@@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        size_t vl = 16;
+
+        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
+        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+        sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+        vl = 32;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+        uint8_t is=0;
+        int isum=0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load Q2
+            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+            // duplicate scale elements for product
+            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+            // load Q8
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+            q2+=32; q8+=128; is=8;
+
+        }
+
+        sumf += dall * isum;
+
+    }
+
+    *s = sumf;
+
 #else
 
     float sumf = 0;
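Editor's note, a scalar sketch (not part of the diff) of what the four vand/vsrl pairs above extract: each byte of q2 carries four 2-bit quants, one per bit pair; each extracted plane is then weighted by a 4-bit scale gathered with vrgather before the widening multiply with q8. The helper name below is illustrative only.

    static inline void unpack_q2_byte(unsigned char b, int q[4]) {
        q[0] = (b >> 0) & 0x03;  /* what q2_0 holds */
        q[1] = (b >> 2) & 0x03;  /* what q2_1 holds */
        q[2] = (b >> 4) & 0x03;  /* what q2_2 holds */
        q[3] = (b >> 6) & 0x03;  /* what q2_3 holds */
    }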
@@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux32[2];
+    const uint8_t * scales = (const uint8_t *)aux32;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * (float)x[i].d;
+        const float dmin = -y[i].d * (float)x[i].dmin;
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+
+        aux32[0] = sc[0] & 0x0f0f0f0f;
+        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
+
+        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
+
+        int isum1 = 0;
+        int isum2 = 0;
+
+        size_t vl = 16;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        // load Q2
+        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
+
+        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
+        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
+        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
+        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
+
+        // load Q8, and take product with Q2
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
+        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
+        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
+        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
+
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
+
+        sumf += d * (isum1 + isum2);
+
+    }
+
+    *s = sumf;
+
 #else
 
     float sumf = 0;
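Editor's note, a hedged sketch (not part of the diff) of why y[i].bsums keeps appearing in these kernels. A k-quant weight dequantizes as w = d*scale*q - dmin*min, so the dot product with the q8 row splits into two terms, and the second only needs the per-group sums of q8, which quantize_row_q8_K precomputes in bsums. The helper name is illustrative only.

    /* one group's contribution: yd_d = y.d*x.d, yd_dmin = y.d*x.dmin,
       dot_q_q8 = sum(q*q8) over the group, bsum = sum(q8) over the group */
    static inline float group_contribution(float yd_d, int scale, int dot_q_q8,
                                           float yd_dmin, int min, int bsum) {
        return yd_d * scale * dot_q_q8 - yd_dmin * min * bsum;
    }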
@@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const int8_t * restrict q8 = y[i].qs;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+
+        size_t vl = 32;
+        uint8_t m = 1;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
+
+        int sum_t = 0;
+
+        for (int j = 0; j < QK_K; j += 128) {
+
+            vl = 32;
+
+            // load Q3
+            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
+
+            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
+            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
+            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
+            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
+
+            // compute mask for subtraction
+            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            m <<= 1;
+
+            // load Q8 and take product with Q3
+            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            // retreive lane to multiply with scale
+            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
+            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
+            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
+            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
+            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
+            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
+            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
+            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q3 += 32; q8 += 128; scale += 8;
+
+        }
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+        sumf += d*sum_t;
+
+    }
+
+    *s = sumf;
+
 #else
     // scalar version
     // This function is written like this so the compiler can manage to vectorize most of it
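Editor's note, a scalar sketch (not part of the diff) of the hmask handling above: the third bit of each q3_K weight lives in the hmask bitmask, and when that bit is clear the kernel subtracts 4, which is exactly what each vmseq_vx/vsub_vx_i8m1_m pair implements. The helper name is illustrative only.

    static inline int q3_weight(unsigned char lo2, int hmask_bit_set) {
        return (int)lo2 - (hmask_bit_set ? 0 : 4);   /* resulting range: -4..3 */
    }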
@@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    uint16_t aux16[2];
+    int8_t * scales = (int8_t *)aux16;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        for (int j = 0; j < 4; ++j) scales[j] -= 8;
+
+        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
+
+        const float d = y[i].d * (float)x[i].d;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load qh
+        vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
+        vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+        size_t vl = 16;
+
+        // extend and combine both qh_x1 and qh_x2
+        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
+        vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+        vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
+        vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+        vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
+
+        // load Q3
+        vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
+
+        vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
+        vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
+        vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
+        vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
+
+        vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
+        vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
+        vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
+        vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
+
+        // load Q8 and take product with Q3
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
+
+        sumf += d * isum;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
@@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
 
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        size_t vl = 8;
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        vl = 32;
+
+        int32_t sum_1 = 0;
+        int32_t sum_2 = 0;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q4
+            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+            // load Q8 and multiply it with lower Q4 nibble
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
+            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
+
+            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
+
+            // load Q8 and multiply it with upper Q4 nibble
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
+            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
+
+            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
+
+            q4 += 32; q8 += 64;
+
+        }
+
+        sumf += d*(sum_1 + sum_2);
+
+    }
+
+    *s = sumf;
+
 #else
 
 
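Editor's note, a scalar sketch (not part of the diff) of one iteration of the QK_K/64 loop above: the low nibbles of q4 pair with q8[0..31] and the high nibbles with q8[32..63], each group weighted by its own scale recovered from utmp; the caller then multiplies by d and folds the mins term in via bsums as shown earlier. The helper name is illustrative only.

    static inline int q4_step(const unsigned char *q4, const signed char *q8,
                              int scale_lo, int scale_hi) {
        int lo = 0, hi = 0;
        for (int k = 0; k < 32; ++k) {
            lo += (q4[k] & 0x0F) * q8[k];        /* lower nibble vs q8[0..31]  */
            hi += (q4[k] >>   4) * q8[k + 32];   /* upper nibble vs q8[32..63] */
        }
        return scale_lo * lo + scale_hi * hi;
    }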
@@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) - summs;
 
+#elif defined __riscv_v_intrinsic
+
+    uint16_t s16[2];
+    const uint8_t * restrict scales = (const uint8_t *)s16;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        s16[0] = b[0] & 0x0f0f;
+        s16[1] = (b[0] >> 4) & 0x0f0f;
+
+        sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
+
+        size_t vl = 32;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        // load Q4
+        vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+        // load Q8 and multiply it with lower Q4 nibble
+        vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+        vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
+        vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
+
+        sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
+
+        // load Q8 and multiply it with upper Q4 nibble
+        vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+        vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+        vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
+
+        sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
+
+    }
+
+    *s = sumf;
+
 #else
 
     uint8_t aux8[QK_K];
@@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+    float sums = 0.0;
+
+    size_t vl;
+
+    for (int i = 0; i < nb; ++i) {
+
+        vl = 8;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        vl = 32;
+        int32_t aux32 = 0;
+        int is = 0;
+
+        uint8_t m = 1;
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q5 and Q8
+            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
+            vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+
+            // compute mask for addition
+            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            m <<= 1;
+
+            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            m <<= 1;
+
+            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
+            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+
+            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
+            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+
+            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
+            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+
+            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+            q5 += 32; q8 += 64;
+
+        }
+
+        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
+        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+
+    }
+
+    *s = sumf+sums;
+
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
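Editor's note, a scalar sketch (not part of the diff) of the qh handling above: each q5_K quant keeps 4 bits in qs and its fifth bit in the qh bitmask, added as +16 when the bit is set, which is what the vmsne_vx/vadd_vx_i8m1_m pairs implement. The helper name is illustrative only.

    static inline int q5_weight(unsigned char nibble, int qh_bit_set) {
        return (int)nibble + (qh_bit_set ? 16 : 0);   /* resulting range: 0..31 */
    }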
@@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * (float)x[i].d;
+        const int8_t * sc = x[i].scales;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load qh
+        vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
+        vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+        size_t vl = 16;
+
+        // combine both qh_1 and qh_2
+        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
+        vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+        vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
+        vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
+        vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+
+        vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
+        vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
+        vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
+        vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
+
+        // load q5
+        vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
+        vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
+
+        vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
+        vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
+        vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
+        vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
+
+        vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
+        vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
+        vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
+        vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
+
+        // load Q8 and multiply it with Q5
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
+        int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
+        int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
+        int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
+
+        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
@@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        size_t vl;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        int sum_t = 0;
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            vl = 32;
+
+            // load qh
+            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
+
+            // load Q6
+            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
+            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
+
+            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
+            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
+            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
+            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
+
+            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
+            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
+
+            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
+            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
+            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
+            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
+
+            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
+            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
+            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
+            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
+
+            // load Q8 and take product
+            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
+            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
+            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
+            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
+            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
+            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
+            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
+            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q6 += 64; qh += 32; q8 += 128; is=8;
+
+        }
+
+        sumf += d * sum_t;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
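Editor's note, a scalar sketch (not part of the diff) of the recombination above: a q6_K weight takes its 4 low bits from ql and its 2 high bits from qh, then is re-centered by subtracting 32, which is what the vor/vsll followed by the vsub_vx ... 32 steps implement. The helper name is illustrative only.

    static inline int q6_weight(unsigned char ql_nibble, unsigned char qh_2bits) {
        return (int)((qh_2bits << 4) | ql_nibble) - 32;   /* resulting range: -32..31 */
    }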
@@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d_all = (float)x[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        int32_t isum = 0;
+
+        size_t vl = 16;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load Q6
+        vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
+        vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
+
+        // load qh
+        vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
+
+        vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+
+        vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
+        vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
+        vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
+        vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
+
+        vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
+        vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
+        vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
+        vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
+
+        // load Q8 and take product
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
+
+        sumf += isum * d_all * y[i].d;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];