llama_cpp 0.6.0 → 0.7.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
data/ext/llama_cpp/src/k_quants.c:

@@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #endif
 #endif
 
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
 #undef MIN
 #undef MAX
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
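The hunk above is what lets everything that follows compile only where it can: the RISC-V Vector (RVV) intrinsics header is pulled in solely when the toolchain defines `__riscv_v_intrinsic`, so builds for other targets are unchanged. A minimal sketch of that guard pattern, with a hypothetical `vec_add_f32` helper that is not part of the gem:

    #include <stddef.h>
    #ifdef __riscv_v_intrinsic
    #include <riscv_vector.h>
    #endif

    /* Adds two float arrays, using RVV when the toolchain advertises it. */
    static void vec_add_f32(float * z, const float * x, const float * y, size_t n) {
    #ifdef __riscv_v_intrinsic
        size_t i = 0;
        while (i < n) {
            size_t vl = __riscv_vsetvl_e32m1(n - i);              /* elements handled this pass */
            vfloat32m1_t vx = __riscv_vle32_v_f32m1(x + i, vl);
            vfloat32m1_t vy = __riscv_vle32_v_f32m1(y + i, vl);
            __riscv_vse32_v_f32m1(z + i, __riscv_vfadd_vv_f32m1(vx, vy, vl), vl);
            i += vl;
        }
    #else
        for (size_t i = 0; i < n; ++i) z[i] = x[i] + y[i];        /* portable scalar fallback */
    #endif
    }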
@@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 // 2-6 bit quantization in super-blocks
 //
 
-
 //
 // ===================== Helper functions
 //
@@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const float q4scale = 15.f;
 
     for (int i = 0; i < nb; i++) {
-
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
         for (int j = 0; j < QK_K/16; ++j) {
@@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * q2 = x[i].qs;
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * sc = x[i].scales;
+
+        const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        size_t vl = 16;
+
+        vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
+        vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
+
+        vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
+
+        vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
+        vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
+        vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
+        vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
+        vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+
+        sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
+
+        vl = 32;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
+
+        uint8_t is=0;
+        int isum=0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            // load Q2
+            vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
+
+            vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
+            vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
+
+            // duplicate scale elements for product
+            vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
+            vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
+            vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
+            vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
+
+            vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
+            vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
+            vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
+            vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
+
+            // load Q8
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
+            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
+
+            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
+            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
+            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
+            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
+
+            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
+
+            q2+=32; q8+=128; is=8;
+
+        }
+
+        sumf += dall * isum;
+
+    }
+
+    *s = sumf;
+
 #else
 
     float sumf = 0;
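The inner loop above consumes 32 bytes of `q2` per pass; each byte packs four 2-bit quants, extracted with the shifts 0/2/4/6 and mask 0x03, weighted by their per-group scale, and accumulated against `q8`, while the separate `dmin * bsums` term accounts for the per-group minimums. The unpacking step alone, as a scalar sketch that is not part of the diff:

    #include <stdint.h>

    /* Four 2-bit quants per byte, same shifts and mask as the q2_0..q2_3 vectors above. */
    static inline void unpack_q2_byte(uint8_t b, uint8_t q[4]) {
        q[0] = (b >> 0) & 0x03;
        q[1] = (b >> 2) & 0x03;
        q[2] = (b >> 4) & 0x03;
        q[3] = (b >> 6) & 0x03;
    }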
@@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux32[2];
+    const uint8_t * scales = (const uint8_t *)aux32;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * (float)x[i].d;
+        const float dmin = -y[i].d * (float)x[i].dmin;
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+        const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
+
+        aux32[0] = sc[0] & 0x0f0f0f0f;
+        aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
+
+        sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
+
+        int isum1 = 0;
+        int isum2 = 0;
+
+        size_t vl = 16;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        // load Q2
+        vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
+
+        vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
+        vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
+        vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
+        vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
+
+        // load Q8, and take product with Q2
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
+        vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
+        vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
+        vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
+
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
+        isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
+        isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
+
+        sumf += d * (isum1 + isum2);
+
+    }
+
+    *s = sumf;
+
 #else
 
     float sumf = 0;
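In this QK_K == 64 variant the per-group scales and minimums arrive packed one per nibble in `x[i].scales`; the two masked copies into `aux32` split them so that `scales[0..3]` carry the scales and `scales[4..7]` the minimums. Per byte that amounts to the scalar sketch below (illustrative only, assuming the layout this kernel reads):

    #include <stdint.h>

    /* Mirrors aux32[0] = sc[0] & 0x0f0f0f0f and aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f, one byte at a time. */
    static inline void split_scale_min(uint8_t b, uint8_t * scale, uint8_t * min) {
        *scale = b & 0x0F;
        *min   = b >> 4;
    }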
@@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    uint32_t aux[3];
+    uint32_t utmp[4];
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict qh = x[i].hmask;
+        const int8_t * restrict q8 = y[i].qs;
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        int8_t * scale = (int8_t *)utmp;
+        for (int j = 0; j < 16; ++j) scale[j] -= 32;
+
+
+        size_t vl = 32;
+        uint8_t m = 1;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
+
+        int sum_t = 0;
+
+        for (int j = 0; j < QK_K; j += 128) {
+
+            vl = 32;
+
+            // load Q3
+            vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
+
+            vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
+            vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
+            vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
+            vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
+
+            // compute mask for subtraction
+            vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            m <<= 1;
+
+            vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            m <<= 1;
+
+            // load Q8 and take product with Q3
+            vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            // retreive lane to multiply with scale
+            vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
+            vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
+            vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
+            vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
+            vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
+            vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
+            vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
+            vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q3 += 32; q8 += 128; scale += 8;
+
+        }
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+        sumf += d*sum_t;
+
+    }
+
+    *s = sumf;
+
 #else
 // scalar version
 // This function is written like this so the compiler can manage to vectorize most of it
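The masked vector subtract above is the Q3_K sign fix-up: each 2-bit value has 4 subtracted whenever its bit in `hmask` is clear, turning the unsigned 0..3 range into the signed -4..3 range the per-group scales expect. One element of that, as a scalar sketch (not part of the diff):

    #include <stdint.h>

    /* q3_lo: the 2 low bits already extracted from x[i].qs; hbit_set: the matching hmask bit. */
    static inline int8_t q3_value(uint8_t q3_lo, int hbit_set) {
        return (int8_t)(hbit_set ? q3_lo : q3_lo - 4);
    }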
@@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    uint16_t aux16[2];
+    int8_t * scales = (int8_t *)aux16;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        for (int j = 0; j < 4; ++j) scales[j] -= 8;
+
+        int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
+
+        const float d = y[i].d * (float)x[i].d;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load qh
+        vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
+        vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+        size_t vl = 16;
+
+        // extend and combine both qh_x1 and qh_x2
+        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
+        vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+        vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
+        vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
+        vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
+
+        // load Q3
+        vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
+
+        vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
+        vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
+        vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
+        vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
+
+        vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
+        vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
+        vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
+        vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
+
+        // load Q8 and take product with Q3
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
+
+        sumf += d * isum;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
@@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
 
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        size_t vl = 8;
+
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
+        const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        vl = 32;
+
+        int32_t sum_1 = 0;
+        int32_t sum_2 = 0;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q4
+            vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+            // load Q8 and multiply it with lower Q4 nibble
+            vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+            vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
+            vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
+
+            sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
+
+            // load Q8 and multiply it with upper Q4 nibble
+            vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
+            vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+            vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
+            vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
+
+            sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
+
+            q4 += 32; q8 += 64;
+
+        }
+
+        sumf += d*(sum_1 + sum_2);
+
+    }
+
+    *s = sumf;
+
 #else
 
 
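Each pass of the loop above consumes 32 bytes of `q4`: the low nibbles are multiplied against `q8[0..31]` and the high nibbles against `q8[32..63]`, each reduction then weighted by its 6-bit super-block scale, while the `dmin * bsums` product is subtracted once per block. The nibble split itself is simply (scalar sketch, not part of the diff):

    #include <stdint.h>

    static inline void unpack_q4_byte(uint8_t b, uint8_t * lo, uint8_t * hi) {
        *lo = b & 0x0F;  /* pairs with q8[0..31]  */
        *hi = b >> 4;    /* pairs with q8[32..63] */
    }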
@@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) - summs;
 
+#elif defined __riscv_v_intrinsic
+
+    uint16_t s16[2];
+    const uint8_t * restrict scales = (const uint8_t *)s16;
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        s16[0] = b[0] & 0x0f0f;
+        s16[1] = (b[0] >> 4) & 0x0f0f;
+
+        sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
+        const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
+
+        size_t vl = 32;
+
+        vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
+
+        // load Q4
+        vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
+
+        // load Q8 and multiply it with lower Q4 nibble
+        vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
+        vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
+        vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
+
+        sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
+
+        // load Q8 and multiply it with upper Q4 nibble
+        vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
+        vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+        vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
+
+        sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
+
+    }
+
+    *s = sumf;
+
 #else
 
     uint8_t aux8[QK_K];
@@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc) + summs;
 
+#elif defined __riscv_v_intrinsic
+
+    const uint8_t * scales = (const uint8_t*)&utmp[0];
+    const uint8_t * mins = (const uint8_t*)&utmp[2];
+
+    float sumf = 0;
+    float sums = 0.0;
+
+    size_t vl;
+
+    for (int i = 0; i < nb; ++i) {
+
+        vl = 8;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict hm = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+        const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
+
+        vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
+        vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
+        vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
+
+        memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
+        vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
+        vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
+
+        vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
+        sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
+
+        vl = 32;
+        int32_t aux32 = 0;
+        int is = 0;
+
+        uint8_t m = 1;
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+        vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            // load Q5 and Q8
+            vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
+            vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
+            vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
+
+            // compute mask for addition
+            vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
+            vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            m <<= 1;
+
+            vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
+            vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
+            vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            m <<= 1;
+
+            vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
+            vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
+
+            vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
+            vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
+
+            vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
+            vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
+
+            aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
+            q5 += 32; q8 += 64;
+
+        }
+
+        vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
+        sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
+
+    }
+
+    *s = sumf+sums;
+
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
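The masked add of 16 above supplies the fifth bit of each Q5_K value: a nibble from `qs` plus 16 whenever the matching bit in `qh` is set. Element-wise that is (scalar sketch, not part of the diff):

    #include <stdint.h>

    /* nibble: 4 low bits from x[i].qs; hbit_set: the matching bit from x[i].qh. */
    static inline uint8_t q5_value(uint8_t nibble, int hbit_set) {
        return hbit_set ? (uint8_t)(nibble + 16) : nibble;
    }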
@@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = y[i].d * (float)x[i].d;
+        const int8_t * sc = x[i].scales;
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load qh
+        vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
+        vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
+
+        size_t vl = 16;
+
+        // combine both qh_1 and qh_2
+        vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
+
+        vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+        vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
+        vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
+        vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
+
+        vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
+        vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
+        vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
+        vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
+
+        // load q5
+        vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
+        vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
+
+        vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
+        vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
+        vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
+        vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
+
+        vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
+        vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
+        vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
+        vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
+
+        // load Q8 and multiply it with Q5
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
+        int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
+        int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
+        int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
+
+        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
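The QK_K == 64 variant above goes the other way around: it builds a vector of 16s from the complemented `qh` bits (`~(qh << k) & 16`) and subtracts it, so 16 is removed wherever the high bit is clear. Per element that amounts to (scalar sketch, illustrative only):

    #include <stdint.h>

    static inline int8_t q5_value_small(uint8_t nibble, int hbit_set) {
        return (int8_t)(nibble - (hbit_set ? 0 : 16));
    }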
@@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        size_t vl;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        int sum_t = 0;
+        int is = 0;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+
+            vl = 32;
+
+            // load qh
+            vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
+
+            // load Q6
+            vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
+            vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
+
+            vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
+            vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
+            vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
+            vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
+
+            vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
+            vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
+            vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
+            vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
+
+            vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
+            vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
+            vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
+            vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
+
+            vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
+            vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
+            vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
+            vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
+
+            // load Q8 and take product
+            vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
+            vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
+            vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
+            vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
+
+            vl = 16;
+
+            vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
+            vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
+            vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
+            vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
+            vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
+            vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
+            vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
+            vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
+
+            vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
+            vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
+            vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
+            vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
+
+            sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
+
+            q6 += 64; qh += 32; q8 += 128; is=8;
+
+        }
+
+        sumf += d * sum_t;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];
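Q6_K values are reassembled above from 4 low bits in `ql` and 2 high bits in `qh`, then re-centered by subtracting the stored +32 bias before the products with `q8` are scaled and summed. One element of that, as a scalar sketch (not part of the diff):

    #include <stdint.h>

    /* ql4: 4 low bits from x[i].ql; qh2: the matching 2 bits from x[i].qh. */
    static inline int8_t q6_value(uint8_t ql4, uint8_t qh2) {
        return (int8_t)(((qh2 << 4) | ql4) - 32);
    }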
@@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     *s = hsum_float_8(acc);
 
+#elif defined __riscv_v_intrinsic
+
+    float sumf = 0;
+
+    for (int i = 0; i < nb; ++i) {
+
+        const float d_all = (float)x[i].d;
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+
+        const int8_t * restrict scale = x[i].scales;
+
+        int32_t isum = 0;
+
+        size_t vl = 16;
+
+        vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
+
+        // load Q6
+        vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
+        vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
+
+        // load qh
+        vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
+
+        vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+        qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
+        vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
+
+        vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
+        vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
+        vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
+        vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
+
+        vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
+        vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
+        vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
+        vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
+
+        // load Q8 and take product
+        vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
+        vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
+        vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
+        vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
+
+        vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
+        vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
+        vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
+        vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
+
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
+        isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
+
+        sumf += isum * d_all * y[i].d;
+
+    }
+
+    *s = sumf;
+
 #else
 
     int8_t aux8[QK_K];