llama_cpp 0.6.0 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -54,6 +54,10 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
54
54
  #endif
55
55
  #endif
56
56
 
57
+ #ifdef __riscv_v_intrinsic
58
+ #include <riscv_vector.h>
59
+ #endif
60
+
57
61
  #undef MIN
58
62
  #undef MAX
59
63
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -65,7 +69,6 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
65
69
  // 2-6 bit quantization in super-blocks
66
70
  //
67
71
 
68
-
69
72
  //
70
73
  // ===================== Helper functions
71
74
  //
@@ -344,7 +347,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
344
347
  const float q4scale = 15.f;
345
348
 
346
349
  for (int i = 0; i < nb; i++) {
347
-
348
350
  float max_scale = 0; // as we are deducting the min, scales are always positive
349
351
  float max_min = 0;
350
352
  for (int j = 0; j < QK_K/16; ++j) {
@@ -1582,6 +1584,90 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1582
1584
 
1583
1585
  *s = hsum_float_8(acc);
1584
1586
 
1587
+ #elif defined __riscv_v_intrinsic
1588
+
1589
+ float sumf = 0;
1590
+ uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1591
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
1592
+
1593
+ for (int i = 0; i < nb; ++i) {
1594
+
1595
+ const uint8_t * q2 = x[i].qs;
1596
+ const int8_t * q8 = y[i].qs;
1597
+ const uint8_t * sc = x[i].scales;
1598
+
1599
+ const float dall = y[i].d * ggml_fp16_to_fp32(x[i].d);
1600
+ const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
1601
+
1602
+ size_t vl = 16;
1603
+
1604
+ vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl);
1605
+ vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl);
1606
+
1607
+ vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl);
1608
+
1609
+ vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl);
1610
+ vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl);
1611
+ vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl));
1612
+ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl);
1613
+ vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
1614
+
1615
+ sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums);
1616
+
1617
+ vl = 32;
1618
+
1619
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
1620
+ vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl);
1621
+
1622
+ uint8_t is=0;
1623
+ int isum=0;
1624
+
1625
+ for (int j = 0; j < QK_K/128; ++j) {
1626
+ // load Q2
1627
+ vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl);
1628
+
1629
+ vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl);
1630
+ vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl);
1631
+ vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl);
1632
+ vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl);
1633
+
1634
+ // duplicate scale elements for product
1635
+ vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl);
1636
+ vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl);
1637
+ vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl);
1638
+ vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl);
1639
+
1640
+ vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl));
1641
+ vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl));
1642
+ vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl));
1643
+ vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl));
1644
+
1645
+ // load Q8
1646
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
1647
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
1648
+ vint8m1_t q8_2 = __riscv_vle8_v_i8m1(q8+64, vl);
1649
+ vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
1650
+
1651
+ vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
1652
+ vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
1653
+ vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
1654
+ vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
1655
+
1656
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
1657
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
1658
+
1659
+ isum += __riscv_vmv_x_s_i32m1_i32(isum1);
1660
+
1661
+ q2+=32; q8+=128; is=8;
1662
+
1663
+ }
1664
+
1665
+ sumf += dall * isum;
1666
+
1667
+ }
1668
+
1669
+ *s = sumf;
1670
+
1585
1671
  #else
1586
1672
 
1587
1673
  float sumf = 0;
@@ -1807,6 +1893,64 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1807
1893
 
1808
1894
  *s = hsum_float_8(acc) + summs;
1809
1895
 
1896
+ #elif defined __riscv_v_intrinsic
1897
+
1898
+ uint32_t aux32[2];
1899
+ const uint8_t * scales = (const uint8_t *)aux32;
1900
+
1901
+ float sumf = 0;
1902
+
1903
+ for (int i = 0; i < nb; ++i) {
1904
+
1905
+ const float d = y[i].d * (float)x[i].d;
1906
+ const float dmin = -y[i].d * (float)x[i].dmin;
1907
+
1908
+ const uint8_t * restrict q2 = x[i].qs;
1909
+ const int8_t * restrict q8 = y[i].qs;
1910
+ const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
1911
+
1912
+ aux32[0] = sc[0] & 0x0f0f0f0f;
1913
+ aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f;
1914
+
1915
+ sumf += dmin * (scales[4] * y[i].bsums[0] + scales[5] * y[i].bsums[1] + scales[6] * y[i].bsums[2] + scales[7] * y[i].bsums[3]);
1916
+
1917
+ int isum1 = 0;
1918
+ int isum2 = 0;
1919
+
1920
+ size_t vl = 16;
1921
+
1922
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
1923
+
1924
+ // load Q2
1925
+ vuint8mf2_t q2_x = __riscv_vle8_v_u8mf2(q2, vl);
1926
+
1927
+ vint8mf2_t q2_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q2_x, 0x03, vl));
1928
+ vint8mf2_t q2_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x2, vl), 0x03 , vl));
1929
+ vint8mf2_t q2_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x4, vl), 0x03 , vl));
1930
+ vint8mf2_t q2_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q2_x, 0x6, vl), 0x03 , vl));
1931
+
1932
+ // load Q8, and take product with Q2
1933
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q2_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
1934
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q2_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
1935
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q2_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
1936
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q2_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
1937
+
1938
+ vint16m1_t vs_0 = __riscv_vredsum_vs_i16m1_i16m1(p0, vzero, vl);
1939
+ vint16m1_t vs_1 = __riscv_vredsum_vs_i16m1_i16m1(p1, vzero, vl);
1940
+ vint16m1_t vs_2 = __riscv_vredsum_vs_i16m1_i16m1(p2, vzero, vl);
1941
+ vint16m1_t vs_3 = __riscv_vredsum_vs_i16m1_i16m1(p3, vzero, vl);
1942
+
1943
+ isum1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[0];
1944
+ isum2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[1];
1945
+ isum1 += __riscv_vmv_x_s_i16m1_i16(vs_2) * scales[2];
1946
+ isum2 += __riscv_vmv_x_s_i16m1_i16(vs_3) * scales[3];
1947
+
1948
+ sumf += d * (isum1 + isum2);
1949
+
1950
+ }
1951
+
1952
+ *s = sumf;
1953
+
1810
1954
  #else
1811
1955
 
1812
1956
  float sumf = 0;
@@ -2220,6 +2364,106 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2220
2364
 
2221
2365
  *s = hsum_float_8(acc);
2222
2366
 
2367
+ #elif defined __riscv_v_intrinsic
2368
+
2369
+ uint32_t aux[3];
2370
+ uint32_t utmp[4];
2371
+
2372
+ float sumf = 0;
2373
+ for (int i = 0; i < nb; ++i) {
2374
+
2375
+ const uint8_t * restrict q3 = x[i].qs;
2376
+ const uint8_t * restrict qh = x[i].hmask;
2377
+ const int8_t * restrict q8 = y[i].qs;
2378
+
2379
+ memcpy(aux, x[i].scales, 12);
2380
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
2381
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
2382
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
2383
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
2384
+
2385
+ int8_t * scale = (int8_t *)utmp;
2386
+ for (int j = 0; j < 16; ++j) scale[j] -= 32;
2387
+
2388
+
2389
+ size_t vl = 32;
2390
+ uint8_t m = 1;
2391
+
2392
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
2393
+ vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl);
2394
+
2395
+ int sum_t = 0;
2396
+
2397
+ for (int j = 0; j < QK_K; j += 128) {
2398
+
2399
+ vl = 32;
2400
+
2401
+ // load Q3
2402
+ vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl);
2403
+
2404
+ vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl));
2405
+ vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl));
2406
+ vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl));
2407
+ vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl));
2408
+
2409
+ // compute mask for subtraction
2410
+ vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
2411
+ vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
2412
+ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
2413
+ m <<= 1;
2414
+
2415
+ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
2416
+ vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
2417
+ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
2418
+ m <<= 1;
2419
+
2420
+ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
2421
+ vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
2422
+ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
2423
+ m <<= 1;
2424
+
2425
+ vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
2426
+ vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
2427
+ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
2428
+ m <<= 1;
2429
+
2430
+ // load Q8 and take product with Q3
2431
+ vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl);
2432
+ vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
2433
+ vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
2434
+ vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
2435
+
2436
+ vl = 16;
2437
+
2438
+ // retreive lane to multiply with scale
2439
+ vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
2440
+ vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
2441
+ vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
2442
+ vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl);
2443
+ vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl);
2444
+ vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl);
2445
+ vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl);
2446
+ vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl);
2447
+
2448
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl);
2449
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl);
2450
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl);
2451
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl);
2452
+
2453
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
2454
+
2455
+ q3 += 32; q8 += 128; scale += 8;
2456
+
2457
+ }
2458
+
2459
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
2460
+
2461
+ sumf += d*sum_t;
2462
+
2463
+ }
2464
+
2465
+ *s = sumf;
2466
+
2223
2467
  #else
2224
2468
  // scalar version
2225
2469
  // This function is written like this so the compiler can manage to vectorize most of it
@@ -2523,6 +2767,79 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2523
2767
 
2524
2768
  *s = hsum_float_8(acc);
2525
2769
 
2770
+ #elif defined __riscv_v_intrinsic
2771
+
2772
+ uint16_t aux16[2];
2773
+ int8_t * scales = (int8_t *)aux16;
2774
+
2775
+ float sumf = 0;
2776
+
2777
+ for (int i = 0; i < nb; ++i) {
2778
+
2779
+ const uint8_t * restrict q3 = x[i].qs;
2780
+ const int8_t * restrict q8 = y[i].qs;
2781
+
2782
+ const uint16_t a = *(const uint16_t *)x[i].scales;
2783
+ aux16[0] = a & 0x0f0f;
2784
+ aux16[1] = (a >> 4) & 0x0f0f;
2785
+
2786
+ for (int j = 0; j < 4; ++j) scales[j] -= 8;
2787
+
2788
+ int32_t isum = -4*(scales[0] * y[i].bsums[0] + scales[2] * y[i].bsums[1] + scales[1] * y[i].bsums[2] + scales[3] * y[i].bsums[3]);
2789
+
2790
+ const float d = y[i].d * (float)x[i].d;
2791
+
2792
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
2793
+
2794
+ // load qh
2795
+ vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(x[i].hmask, 8);
2796
+ vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
2797
+
2798
+ size_t vl = 16;
2799
+
2800
+ // extend and combine both qh_x1 and qh_x2
2801
+ vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
2802
+
2803
+ vuint8mf2_t qh_0 = __riscv_vand_vx_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
2804
+ vuint8mf2_t qh_1 = __riscv_vand_vx_u8mf2(qh_x, 0x4, vl);
2805
+ vuint8mf2_t qh_2 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl), 0x4, vl);
2806
+ vuint8mf2_t qh_3 = __riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), 0x4, vl);
2807
+
2808
+ // load Q3
2809
+ vuint8mf2_t q3_x = __riscv_vle8_v_u8mf2(q3, vl);
2810
+
2811
+ vuint8mf2_t q3h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q3_x, 0x3, vl), qh_0, vl);
2812
+ vuint8mf2_t q3h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 2, vl), 0x3, vl), qh_1, vl);
2813
+ vuint8mf2_t q3h_2 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 4, vl), 0x3, vl), qh_2, vl);
2814
+ vuint8mf2_t q3h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q3_x, 0x6, vl), qh_3, vl);
2815
+
2816
+ vint8mf2_t q3_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_0);
2817
+ vint8mf2_t q3_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_1);
2818
+ vint8mf2_t q3_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_2);
2819
+ vint8mf2_t q3_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(q3h_3);
2820
+
2821
+ // load Q8 and take product with Q3
2822
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q3_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
2823
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q3_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
2824
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q3_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
2825
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q3_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
2826
+
2827
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
2828
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
2829
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
2830
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
2831
+
2832
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scales[0];
2833
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scales[2];
2834
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scales[1];
2835
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scales[3];
2836
+
2837
+ sumf += d * isum;
2838
+
2839
+ }
2840
+
2841
+ *s = sumf;
2842
+
2526
2843
  #else
2527
2844
 
2528
2845
  int8_t aux8[QK_K];
@@ -2823,6 +3140,78 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2823
3140
 
2824
3141
  *s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
2825
3142
 
3143
+ #elif defined __riscv_v_intrinsic
3144
+
3145
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
3146
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
3147
+
3148
+ float sumf = 0;
3149
+
3150
+ for (int i = 0; i < nb; ++i) {
3151
+
3152
+ size_t vl = 8;
3153
+
3154
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
3155
+ const float dmin = y[i].d * ggml_fp16_to_fp32(x[i].dmin);
3156
+
3157
+ vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
3158
+ vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
3159
+ vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
3160
+
3161
+ memcpy(utmp, x[i].scales, 12);
3162
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
3163
+ const uint32_t uaux = utmp[1] & kmask1;
3164
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
3165
+ utmp[2] = uaux;
3166
+ utmp[0] &= kmask1;
3167
+
3168
+ vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
3169
+ vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
3170
+ vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
3171
+
3172
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
3173
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
3174
+
3175
+ const uint8_t * restrict q4 = x[i].qs;
3176
+ const int8_t * restrict q8 = y[i].qs;
3177
+
3178
+ vl = 32;
3179
+
3180
+ int32_t sum_1 = 0;
3181
+ int32_t sum_2 = 0;
3182
+
3183
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
3184
+
3185
+ for (int j = 0; j < QK_K/64; ++j) {
3186
+ // load Q4
3187
+ vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
3188
+
3189
+ // load Q8 and multiply it with lower Q4 nibble
3190
+ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl);
3191
+ vint8m1_t q4_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
3192
+ vint16m2_t qv_0 = __riscv_vwmul_vv_i16m2(q4_0, q8_0, vl);
3193
+ vint16m1_t vs_0 = __riscv_vredsum_vs_i16m2_i16m1(qv_0, vzero, vl);
3194
+
3195
+ sum_1 += __riscv_vmv_x_s_i16m1_i16(vs_0) * scales[2*j+0];
3196
+
3197
+ // load Q8 and multiply it with upper Q4 nibble
3198
+ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl);
3199
+ vint8m1_t q4_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
3200
+ vint16m2_t qv_1 = __riscv_vwmul_vv_i16m2(q4_1, q8_1, vl);
3201
+ vint16m1_t vs_1 = __riscv_vredsum_vs_i16m2_i16m1(qv_1, vzero, vl);
3202
+
3203
+ sum_2 += __riscv_vmv_x_s_i16m1_i16(vs_1) * scales[2*j+1];
3204
+
3205
+ q4 += 32; q8 += 64;
3206
+
3207
+ }
3208
+
3209
+ sumf += d*(sum_1 + sum_2);
3210
+
3211
+ }
3212
+
3213
+ *s = sumf;
3214
+
2826
3215
  #else
2827
3216
 
2828
3217
 
@@ -3064,6 +3453,50 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
3064
3453
 
3065
3454
  *s = hsum_float_8(acc) - summs;
3066
3455
 
3456
+ #elif defined __riscv_v_intrinsic
3457
+
3458
+ uint16_t s16[2];
3459
+ const uint8_t * restrict scales = (const uint8_t *)s16;
3460
+
3461
+ float sumf = 0;
3462
+
3463
+ for (int i = 0; i < nb; ++i) {
3464
+
3465
+ const uint8_t * restrict q4 = x[i].qs;
3466
+ const int8_t * restrict q8 = y[i].qs;
3467
+
3468
+ const uint16_t * restrict b = (const uint16_t *)x[i].scales;
3469
+ s16[0] = b[0] & 0x0f0f;
3470
+ s16[1] = (b[0] >> 4) & 0x0f0f;
3471
+
3472
+ sumf -= y[i].d * ggml_fp16_to_fp32(x[i].d[1]) * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
3473
+ const float d = y[i].d * ggml_fp16_to_fp32(x[i].d[0]);
3474
+
3475
+ size_t vl = 32;
3476
+
3477
+ vint16m1_t vzero = __riscv_vmv_v_x_i16m1(0, 1);
3478
+
3479
+ // load Q4
3480
+ vuint8m1_t q4_x = __riscv_vle8_v_u8m1(q4, vl);
3481
+
3482
+ // load Q8 and multiply it with lower Q4 nibble
3483
+ vint8m1_t q4_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q4_x, 0x0F, vl));
3484
+ vint16m2_t va_0 = __riscv_vwmul_vv_i16m2(q4_a, __riscv_vle8_v_i8m1(q8, vl), vl);
3485
+ vint16m1_t aux1 = __riscv_vredsum_vs_i16m2_i16m1(va_0, vzero, vl);
3486
+
3487
+ sumf += d*scales[0]*__riscv_vmv_x_s_i16m1_i16(aux1);
3488
+
3489
+ // load Q8 and multiply it with upper Q4 nibble
3490
+ vint8m1_t q4_s = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q4_x, 0x04, vl));
3491
+ vint16m2_t va_1 = __riscv_vwmul_vv_i16m2(q4_s, __riscv_vle8_v_i8m1(q8+32, vl), vl);
3492
+ vint16m1_t aux2 = __riscv_vredsum_vs_i16m2_i16m1(va_1, vzero, vl);
3493
+
3494
+ sumf += d*scales[1]*__riscv_vmv_x_s_i16m1_i16(aux2);
3495
+
3496
+ }
3497
+
3498
+ *s = sumf;
3499
+
3067
3500
  #else
3068
3501
 
3069
3502
  uint8_t aux8[QK_K];
@@ -3394,6 +3827,93 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3394
3827
 
3395
3828
  *s = hsum_float_8(acc) + summs;
3396
3829
 
3830
+ #elif defined __riscv_v_intrinsic
3831
+
3832
+ const uint8_t * scales = (const uint8_t*)&utmp[0];
3833
+ const uint8_t * mins = (const uint8_t*)&utmp[2];
3834
+
3835
+ float sumf = 0;
3836
+ float sums = 0.0;
3837
+
3838
+ size_t vl;
3839
+
3840
+ for (int i = 0; i < nb; ++i) {
3841
+
3842
+ vl = 8;
3843
+
3844
+ const uint8_t * restrict q5 = x[i].qs;
3845
+ const uint8_t * restrict hm = x[i].qh;
3846
+ const int8_t * restrict q8 = y[i].qs;
3847
+
3848
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
3849
+ const float dmin = ggml_fp16_to_fp32(x[i].dmin) * y[i].d;
3850
+
3851
+ vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
3852
+ vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
3853
+ vint16mf2_t q8sums = __riscv_vadd_vv_i16mf2(q8sums_0, q8sums_1, vl);
3854
+
3855
+ memcpy(utmp, x[i].scales, 12);
3856
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
3857
+ const uint32_t uaux = utmp[1] & kmask1;
3858
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
3859
+ utmp[2] = uaux;
3860
+ utmp[0] &= kmask1;
3861
+
3862
+ vuint8mf4_t mins8 = __riscv_vle8_v_u8mf4(mins, vl);
3863
+ vint16mf2_t v_mins = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vzext_vf2_u16mf2(mins8, vl));
3864
+ vint32m1_t prod = __riscv_vwmul_vv_i32m1(q8sums, v_mins, vl);
3865
+
3866
+ vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl);
3867
+ sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi);
3868
+
3869
+ vl = 32;
3870
+ int32_t aux32 = 0;
3871
+ int is = 0;
3872
+
3873
+ uint8_t m = 1;
3874
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
3875
+ vuint8m1_t vqh = __riscv_vle8_v_u8m1(hm, vl);
3876
+
3877
+ for (int j = 0; j < QK_K/64; ++j) {
3878
+ // load Q5 and Q8
3879
+ vuint8m1_t q5_x = __riscv_vle8_v_u8m1(q5, vl);
3880
+ vint8m1_t q8_y1 = __riscv_vle8_v_i8m1(q8, vl);
3881
+ vint8m1_t q8_y2 = __riscv_vle8_v_i8m1(q8+32, vl);
3882
+
3883
+ // compute mask for addition
3884
+ vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
3885
+ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
3886
+ vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
3887
+ vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
3888
+ m <<= 1;
3889
+
3890
+ vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
3891
+ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
3892
+ vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
3893
+ vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
3894
+ m <<= 1;
3895
+
3896
+ vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
3897
+ vint16m2_t v1 = __riscv_vwmul_vv_i16m2(q5_m2, q8_y2, vl);
3898
+
3899
+ vint32m4_t vs1 = __riscv_vwmul_vx_i32m4(v0, scales[is++], vl);
3900
+ vint32m4_t vs2 = __riscv_vwmul_vx_i32m4(v1, scales[is++], vl);
3901
+
3902
+ vint32m1_t vacc1 = __riscv_vredsum_vs_i32m4_i32m1(vs1, vzero, vl);
3903
+ vint32m1_t vacc2 = __riscv_vredsum_vs_i32m4_i32m1(vs2, vzero, vl);
3904
+
3905
+ aux32 += __riscv_vmv_x_s_i32m1_i32(vacc1) + __riscv_vmv_x_s_i32m1_i32(vacc2);
3906
+ q5 += 32; q8 += 64;
3907
+
3908
+ }
3909
+
3910
+ vfloat32m1_t vaux = __riscv_vfmul_vf_f32m1(__riscv_vfmv_v_f_f32m1(aux32, 1), d, 1);
3911
+ sums += __riscv_vfmv_f_s_f32m1_f32(vaux);
3912
+
3913
+ }
3914
+
3915
+ *s = sumf+sums;
3916
+
3397
3917
  #else
3398
3918
 
3399
3919
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -3639,6 +4159,76 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3639
4159
 
3640
4160
  *s = hsum_float_8(acc);
3641
4161
 
4162
+ #elif defined __riscv_v_intrinsic
4163
+
4164
+ float sumf = 0;
4165
+
4166
+ for (int i = 0; i < nb; ++i) {
4167
+
4168
+ const float d = y[i].d * (float)x[i].d;
4169
+ const int8_t * sc = x[i].scales;
4170
+
4171
+ const uint8_t * restrict q5 = x[i].qs;
4172
+ const uint8_t * restrict qh = x[i].qh;
4173
+ const int8_t * restrict q8 = y[i].qs;
4174
+
4175
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
4176
+
4177
+ // load qh
4178
+ vuint8mf4_t qh_x1 = __riscv_vle8_v_u8mf4(qh, 8);
4179
+ vuint8mf2_t qh_x2 = __riscv_vlmul_ext_v_u8mf4_u8mf2(__riscv_vsrl_vx_u8mf4(qh_x1, 1, 8));
4180
+
4181
+ size_t vl = 16;
4182
+
4183
+ // combine both qh_x1 and qh_x2
4184
+ vuint8mf2_t qh_x = __riscv_vslideup_vx_u8mf2(__riscv_vlmul_ext_v_u8mf4_u8mf2(qh_x1), qh_x2, vl/2, vl);
4185
+
4186
+ vuint8mf2_t qh_h0 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
4187
+ vuint8mf2_t qh_h1 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsll_vx_u8mf2(qh_x, 0x2, vl), vl), 16, vl);
4188
+ vuint8mf2_t qh_h2 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(qh_x, vl), 16, vl);
4189
+ vuint8mf2_t qh_h3 = __riscv_vand_vx_u8mf2(__riscv_vnot_v_u8mf2(__riscv_vsrl_vx_u8mf2(qh_x, 0x4, vl), vl), 16, vl);
4190
+
4191
+ vint8mf2_t qh_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h0);
4192
+ vint8mf2_t qh_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h1);
4193
+ vint8mf2_t qh_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h2);
4194
+ vint8mf2_t qh_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(qh_h3);
4195
+
4196
+ // load q5
4197
+ vuint8mf2_t q5_x1 = __riscv_vle8_v_u8mf2(q5, vl);
4198
+ vuint8mf2_t q5_x2 = __riscv_vle8_v_u8mf2(q5+16, vl);
4199
+
4200
+ vint8mf2_t q5s_0 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x1, 0xF, vl));
4201
+ vint8mf2_t q5s_1 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vand_vx_u8mf2(q5_x2, 0xF, vl));
4202
+ vint8mf2_t q5s_2 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x1, 0x4, vl));
4203
+ vint8mf2_t q5s_3 = __riscv_vreinterpret_v_u8mf2_i8mf2(__riscv_vsrl_vx_u8mf2(q5_x2, 0x4, vl));
4204
+
4205
+ vint8mf2_t q5_0 = __riscv_vsub_vv_i8mf2(q5s_0, qh_0, vl);
4206
+ vint8mf2_t q5_1 = __riscv_vsub_vv_i8mf2(q5s_1, qh_1, vl);
4207
+ vint8mf2_t q5_2 = __riscv_vsub_vv_i8mf2(q5s_2, qh_2, vl);
4208
+ vint8mf2_t q5_3 = __riscv_vsub_vv_i8mf2(q5s_3, qh_3, vl);
4209
+
4210
+ // load Q8 and multiply it with Q5
4211
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q5_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
4212
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q5_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
4213
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q5_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
4214
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q5_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
4215
+
4216
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
4217
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
4218
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
4219
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
4220
+
4221
+ int32_t sumi1 = sc[0] * __riscv_vmv_x_s_i32m1_i32(vs_0);
4222
+ int32_t sumi2 = sc[1] * __riscv_vmv_x_s_i32m1_i32(vs_1);
4223
+ int32_t sumi3 = sc[2] * __riscv_vmv_x_s_i32m1_i32(vs_2);
4224
+ int32_t sumi4 = sc[3] * __riscv_vmv_x_s_i32m1_i32(vs_3);
4225
+
4226
+ sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
4227
+
4228
+ }
4229
+
4230
+ *s = sumf;
4231
+
3642
4232
  #else
3643
4233
 
3644
4234
  int8_t aux8[QK_K];
@@ -4023,6 +4613,91 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
4023
4613
 
4024
4614
  *s = hsum_float_8(acc);
4025
4615
 
4616
+ #elif defined __riscv_v_intrinsic
4617
+
4618
+ float sumf = 0;
4619
+ for (int i = 0; i < nb; ++i) {
4620
+
4621
+ const float d = ggml_fp16_to_fp32(x[i].d) * y[i].d;
4622
+
4623
+ const uint8_t * restrict q6 = x[i].ql;
4624
+ const uint8_t * restrict qh = x[i].qh;
4625
+ const int8_t * restrict q8 = y[i].qs;
4626
+
4627
+ const int8_t * restrict scale = x[i].scales;
4628
+
4629
+ size_t vl;
4630
+
4631
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
4632
+
4633
+ int sum_t = 0;
4634
+ int is = 0;
4635
+
4636
+ for (int j = 0; j < QK_K/128; ++j) {
4637
+
4638
+ vl = 32;
4639
+
4640
+ // load qh
4641
+ vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl);
4642
+
4643
+ // load Q6
4644
+ vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl);
4645
+ vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl);
4646
+
4647
+ vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl);
4648
+ vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl);
4649
+ vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl);
4650
+ vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl);
4651
+
4652
+ vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl);
4653
+ vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl);
4654
+ vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, vl), 0x03 , vl);
4655
+ vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl);
4656
+
4657
+ vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl);
4658
+ vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl);
4659
+ vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl);
4660
+ vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl);
4661
+
4662
+ vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl);
4663
+ vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl);
4664
+ vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl);
4665
+ vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl);
4666
+
4667
+ // load Q8 and take product
4668
+ vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl);
4669
+ vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl);
4670
+ vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl);
4671
+ vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl);
4672
+
4673
+ vl = 16;
4674
+
4675
+ vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl);
4676
+ vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl);
4677
+ vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl);
4678
+ vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl);
4679
+ vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl);
4680
+ vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl);
4681
+ vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl);
4682
+ vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl);
4683
+
4684
+ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl);
4685
+ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl);
4686
+ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl);
4687
+ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl);
4688
+
4689
+ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3);
4690
+
4691
+ q6 += 64; qh += 32; q8 += 128; is=8;
4692
+
4693
+ }
4694
+
4695
+ sumf += d * sum_t;
4696
+
4697
+ }
4698
+
4699
+ *s = sumf;
4700
+
4026
4701
  #else
4027
4702
 
4028
4703
  int8_t aux8[QK_K];
@@ -4276,6 +4951,73 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
4276
4951
 
4277
4952
  *s = hsum_float_8(acc);
4278
4953
 
4954
+ #elif defined __riscv_v_intrinsic
4955
+
4956
+ float sumf = 0;
4957
+
4958
+ for (int i = 0; i < nb; ++i) {
4959
+
4960
+ const float d_all = (float)x[i].d;
4961
+
4962
+ const uint8_t * restrict q6 = x[i].ql;
4963
+ const uint8_t * restrict qh = x[i].qh;
4964
+ const int8_t * restrict q8 = y[i].qs;
4965
+
4966
+ const int8_t * restrict scale = x[i].scales;
4967
+
4968
+ int32_t isum = 0;
4969
+
4970
+ size_t vl = 16;
4971
+
4972
+ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1);
4973
+
4974
+ // load Q6
4975
+ vuint8mf2_t q6_0 = __riscv_vle8_v_u8mf2(q6, vl);
4976
+ vuint8mf2_t q6_1 = __riscv_vle8_v_u8mf2(q6+16, vl);
4977
+
4978
+ // load qh
4979
+ vuint8mf2_t qh_x = __riscv_vle8_v_u8mf2(qh, vl);
4980
+
4981
+ vuint8mf2_t qh0 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
4982
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
4983
+ vuint8mf2_t qh1 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
4984
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
4985
+ vuint8mf2_t qh2 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
4986
+ qh_x = __riscv_vsrl_vx_u8mf2(qh_x, 0x2, vl);
4987
+ vuint8mf2_t qh3 = __riscv_vsll_vx_u8mf2(__riscv_vand_vx_u8mf2(qh_x, 0x3, vl), 0x4, vl);
4988
+
4989
+ vuint8mf2_t q6h_0 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_0, 0xF, vl), qh0, vl);
4990
+ vuint8mf2_t q6h_1 = __riscv_vor_vv_u8mf2(__riscv_vand_vx_u8mf2(q6_1, 0xF, vl), qh1, vl);
4991
+ vuint8mf2_t q6h_2 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_0, 0x4, vl), qh2, vl);
4992
+ vuint8mf2_t q6h_3 = __riscv_vor_vv_u8mf2(__riscv_vsrl_vx_u8mf2(q6_1, 0x4, vl), qh3, vl);
4993
+
4994
+ vint8mf2_t q6v_0 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_0), 32, vl);
4995
+ vint8mf2_t q6v_1 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_1), 32, vl);
4996
+ vint8mf2_t q6v_2 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_2), 32, vl);
4997
+ vint8mf2_t q6v_3 = __riscv_vsub_vx_i8mf2(__riscv_vreinterpret_v_u8mf2_i8mf2(q6h_3), 32, vl);
4998
+
4999
+ // load Q8 and take product
5000
+ vint16m1_t p0 = __riscv_vwmul_vv_i16m1(q6v_0, __riscv_vle8_v_i8mf2(q8, vl), vl);
5001
+ vint16m1_t p1 = __riscv_vwmul_vv_i16m1(q6v_1, __riscv_vle8_v_i8mf2(q8+16, vl), vl);
5002
+ vint16m1_t p2 = __riscv_vwmul_vv_i16m1(q6v_2, __riscv_vle8_v_i8mf2(q8+32, vl), vl);
5003
+ vint16m1_t p3 = __riscv_vwmul_vv_i16m1(q6v_3, __riscv_vle8_v_i8mf2(q8+48, vl), vl);
5004
+
5005
+ vint32m1_t vs_0 = __riscv_vwredsum_vs_i16m1_i32m1(p0, vzero, vl);
5006
+ vint32m1_t vs_1 = __riscv_vwredsum_vs_i16m1_i32m1(p1, vzero, vl);
5007
+ vint32m1_t vs_2 = __riscv_vwredsum_vs_i16m1_i32m1(p2, vzero, vl);
5008
+ vint32m1_t vs_3 = __riscv_vwredsum_vs_i16m1_i32m1(p3, vzero, vl);
5009
+
5010
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_0) * scale[0];
5011
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_1) * scale[1];
5012
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_2) * scale[2];
5013
+ isum += __riscv_vmv_x_s_i32m1_i32(vs_3) * scale[3];
5014
+
5015
+ sumf += isum * d_all * y[i].d;
5016
+
5017
+ }
5018
+
5019
+ *s = sumf;
5020
+
4279
5021
  #else
4280
5022
 
4281
5023
  int8_t aux8[QK_K];