cui-llama.rn 1.2.3 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@
 
  #include "ggml-quants.h"
  #include "ggml-impl.h"
+ #include "ggml-cpu.h"
  #include "ggml-cpu-impl.h"
 
  #include <math.h>
@@ -991,6 +992,73 @@ void lm_ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  }
  }
  return;
+ #elif defined(__riscv_v_intrinsic)
+ if (__riscv_vlenb() >= QK4_0) {
+ const size_t vl = QK4_0;
+
+ const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+
+ vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+ for (int l = 0; l < nb; l++) {
+ const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+ const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+ const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+ const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+ __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+ const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+ const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+ const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+ const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));
+
+ const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+ const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+ const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+ const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+ const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+ const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+ const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+ const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+ const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+ const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+ const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+ const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_hi_m));
+ const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+ const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+ const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+ const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+ const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+ const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+ const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+ const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+ const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+ const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+ const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+ const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+ // vector version needs Zvfhmin extension
+ const float a_scale = LM_GGML_FP16_TO_FP32(a_ptr[l].d);
+ const float b_scales[8] = {
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
+ };
+ const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+ const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
+ sumf = __riscv_vfmacc_vv_f32m1(sumf, tmp1, b_scales_vec, vl / 4);
+ }
+ __riscv_vse32_v_f32m1(s + x * ncols_interleaved, sumf, vl / 4);
+ }
+ return;
+ }
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
  {
  float sumf[8];
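
Note: the RVV block above keeps eight per-column accumulators in one f32m1 register and collapses the widened int16 products into them with the vnsrl/vadd cascade. As a plain-C reference for what each q4_0x8 × q8_0 block contributes, under the assumption that the 4-bit weights have already been de-interleaved and sign-extended (the names below are illustrative, not the kernel's):

    #include <stdint.h>

    // Illustrative scalar sketch only. a_qs/a_scale come from one block_q8_0,
    // b_cols/b_scales from one block_q4_0x8 after de-interleaving; the real
    // kernel reads the nibbles directly from the interleaved qs buffer and
    // keeps the eight column sums in vector registers.
    static void q4_0x8_q8_0_block_ref(const int8_t a_qs[32], float a_scale,
                                      const int8_t b_cols[8][32], const float b_scales[8],
                                      float s[8]) {
        for (int j = 0; j < 8; j++) {
            int32_t sumi = 0;
            for (int k = 0; k < 32; k++) {      // QK4_0 == 32 elements per block
                sumi += (int32_t) a_qs[k] * (int32_t) b_cols[j][k];
            }
            s[j] += a_scale * b_scales[j] * (float) sumi;
        }
    }
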
@@ -3171,6 +3239,207 @@ void lm_ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void
  }
  }
  }
+ return;
+ }
+ #elif defined(__riscv_v_intrinsic)
+ if (__riscv_vlenb() >= QK4_0) {
+ const size_t vl = QK4_0;
+
+ for (int y = 0; y < nr / 4; y++) {
+ const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
+ for (int x = 0; x < nc / ncols_interleaved; x++) {
+ const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
+ vfloat32m1_t sumf0 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+ vfloat32m1_t sumf1 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+ vfloat32m1_t sumf2 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+ vfloat32m1_t sumf3 = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
+ for (int l = 0; l < nb; l++) {
+ const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
+ const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
+ const vint8m4_t rhs_vec_hi = __riscv_vsra_vx_i8m4(rhs_raw_vec, 4, vl * 4);
+ const vint8m2_t rhs_vec_lo_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 0);
+ const vint8m2_t rhs_vec_lo_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_lo, 1);
+ const vint8m2_t rhs_vec_hi_0 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 0);
+ const vint8m2_t rhs_vec_hi_1 = __riscv_vget_v_i8m4_i8m2(rhs_vec_hi, 1);
+
+ // vector version needs Zvfhmin extension
+ const float a_scales[4] = {
+ LM_GGML_FP16_TO_FP32(a_ptr[l].d[0]),
+ LM_GGML_FP16_TO_FP32(a_ptr[l].d[1]),
+ LM_GGML_FP16_TO_FP32(a_ptr[l].d[2]),
+ LM_GGML_FP16_TO_FP32(a_ptr[l].d[3])
+ };
+ const float b_scales[8] = {
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[0]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[1]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[2]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[3]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[4]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[5]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[6]),
+ LM_GGML_FP16_TO_FP32(b_ptr[l].d[7])
+ };
+ const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
+
+ const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+ const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+ const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+ const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+ __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+ vint16m4_t sumi_l0;
+ {
+ const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+ const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+ const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+ const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));
+ const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+ const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+ const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+ const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+ sumi_l0 = sumi_hi_m;
+ }
+
+ {
+ const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l0));
+ const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+ const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+ const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+ const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+ const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+ const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+ const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+ const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+ const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+ const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+ const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+ const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+ const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
+ sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
+ }
+
+ const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+ const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+ const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+ const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+ __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+ vint16m4_t sumi_l1;
+ {
+ const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+ const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+ const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+ const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));
+ const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+ const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+ const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+ const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+ sumi_l1 = sumi_hi_m;
+ }
+
+ {
+ const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l1));
+ const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+ const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+ const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+ const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+ const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+ const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+ const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+ const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+ const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+ const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+ const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+ const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+ const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
+ sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
+ }
+
+ const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+ const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+ const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+ const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+ __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+ vint16m4_t sumi_l2;
+ {
+ const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+ const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+ const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+ const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));
+ const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+ const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+ const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+ const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+ sumi_l2 = sumi_hi_m;
+ }
+
+ {
+ const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l2));
+ const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+ const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+ const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+ const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+ const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+ const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+ const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+ const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+ const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+ const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+ const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+ const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+ const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
+ sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
+ }
+
+ const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+ const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+ const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+ const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+ __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+ vint16m4_t sumi_l3;
+ {
+ const vint8m2_t lhs_0_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+ const vint8m2_t lhs_1_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+ const vint8m2_t lhs_2_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+ const vint8m2_t lhs_3_8 =__riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));
+ const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
+ const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
+ const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
+ const vint16m4_t sumi_hi_m = __riscv_vwmacc_vv_i16m4(sumi_hi_0, rhs_vec_hi_1, lhs_3_8, vl * 2);
+
+ sumi_l3 = sumi_hi_m;
+ }
+
+ {
+ const vuint32m4_t sumi_i32 = __riscv_vreinterpret_v_i32m4_u32m4(__riscv_vreinterpret_v_i16m4_i32m4(sumi_l3));
+ const vuint16m2_t sumi_h2_0 = __riscv_vnsrl_wx_u16m2(sumi_i32, 0, vl);
+ const vuint16m2_t sumi_h2_1 = __riscv_vnsrl_wx_u16m2(sumi_i32, 16, vl);
+ const vuint16m2_t sumi_h2 = __riscv_vadd_vv_u16m2(sumi_h2_0, sumi_h2_1, vl);
+ const vuint32m2_t sumi_h2_i32 = __riscv_vreinterpret_v_u16m2_u32m2(sumi_h2);
+ const vuint16m1_t sumi_h4_0 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 0, vl / 2);
+ const vuint16m1_t sumi_h4_1 = __riscv_vnsrl_wx_u16m1(sumi_h2_i32, 16, vl / 2);
+ const vuint16m1_t sumi_h4 = __riscv_vadd_vv_u16m1(sumi_h4_0, sumi_h4_1, vl / 2);
+ const vuint32m1_t sumi_h4_i32 = __riscv_vreinterpret_v_u16m1_u32m1(sumi_h4);
+ const vint16mf2_t sumi_h8_0 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 0, vl / 4));
+ const vint16mf2_t sumi_h8_1 = __riscv_vreinterpret_v_u16mf2_i16mf2(__riscv_vnsrl_wx_u16mf2(sumi_h4_i32, 16, vl / 4));
+ const vint32m1_t sumi_h8 = __riscv_vwadd_vv_i32m1(sumi_h8_0, sumi_h8_1, vl / 4);
+ const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
+
+ const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[3], vl / 4);
+ sumf3 = __riscv_vfmacc_vv_f32m1(sumf3, tmp1, b_scales_vec, vl / 4);
+ }
+ }
+ __riscv_vse32_v_f32m1(&s[(y * 4 + 0) * bs + x * ncols_interleaved], sumf0, vl / 4);
+ __riscv_vse32_v_f32m1(&s[(y * 4 + 1) * bs + x * ncols_interleaved], sumf1, vl / 4);
+ __riscv_vse32_v_f32m1(&s[(y * 4 + 2) * bs + x * ncols_interleaved], sumf2, vl / 4);
+ __riscv_vse32_v_f32m1(&s[(y * 4 + 3) * bs + x * ncols_interleaved], sumf3, vl / 4);
+ }
+ }
+
  return;
  }
  #endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__)
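
Note: both new RVV paths read the activation rows with scalar int64_t loads and then issue an empty inline-asm statement with a "memory" clobber; per the in-line comment, this stops GCC from fusing those loads into a vlse64 whose alignment requirements the byte-packed qs data may not satisfy. A minimal standalone sketch of that barrier idiom (the helper below is illustrative, not taken from the kernel):

    #include <stdint.h>
    #include <string.h>

    // Illustrative only: a group of scalar 64-bit loads followed by a compiler
    // barrier. The empty asm with a "memory" clobber emits no instructions and
    // is not a hardware fence; it simply forbids the compiler from reordering
    // or combining memory accesses across it.
    static int64_t sum_four_words(const uint8_t * base) {
        int64_t a0, a1, a2, a3;
        memcpy(&a0, base +  0, sizeof a0);
        memcpy(&a1, base +  8, sizeof a1);
        memcpy(&a2, base + 16, sizeof a2);
        memcpy(&a3, base + 24, sizeof a3);
        __asm__ __volatile__("" ::: "memory");  // keep the loads scalar, as in the kernel above
        return a0 + a1 + a2 + a3;
    }
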
package/cpp/ggml-alloc.c CHANGED
@@ -14,7 +14,7 @@
 
  //#define LM_GGML_ALLOCATOR_DEBUG
 
- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+ //#define AT_PRINTF(...) LM_GGML_LOG_DEBUG(__VA_ARGS__)
  #define AT_PRINTF(...)
 
 
@@ -89,7 +89,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
  size = LM_GGML_PAD(size, talloc->alignment);
 
  if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
  __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -172,7 +172,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
  // this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
  __func__, size, max_avail);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -209,16 +209,16 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  }
  }
  }
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ LM_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
  alloc->allocated_tensors[i].offset,
  alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor),
  lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
- fprintf(stderr, "\n");
+ LM_GGML_LOG_DEBUG("\n");
  }
  #endif
 
@@ -348,7 +348,6 @@ struct tensor_alloc {
  };
 
  struct leaf_alloc {
- int buffer_id;
  struct tensor_alloc leaf;
  };
 
@@ -740,7 +739,6 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  for (int i = 0; i < graph->n_leafs; i++) {
  struct lm_ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
  if (leaf->view_src || leaf->data) {
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +766,13 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif
 
  lm_ggml_backend_buffer_free(galloc->buffers[i]);
  galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
  if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ LM_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
  return false;
  }
  lm_ggml_backend_buffer_set_usage(galloc->buffers[i], LM_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +823,14 @@ static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct
  static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) {
  if (galloc->n_nodes != graph->n_nodes) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
  #endif
  return true;
  }
 
  if (galloc->n_leafs != graph->n_leafs) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
  #endif
  return true;
  }
@@ -843,7 +841,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
 
  if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ LM_GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
  #endif
  return true;
  }
@@ -855,7 +853,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
  }
  if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ LM_GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
  #endif
  return true;
  }
@@ -869,14 +867,14 @@ bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph
  if (lm_ggml_gallocr_needs_realloc(galloc, graph)) {
  if (galloc->n_buffers == 1) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
  #endif
  if (!lm_ggml_gallocr_reserve(galloc, graph)) {
  return false;
  }
  } else {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
  #endif
  return false;
  }
@@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct lm_ggml_context * ctx,
  lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
+ LM_GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
  #endif
  for (size_t i = 0; i < *n_buffers; i++) {
  lm_ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +988,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
  }
 
  if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
  __func__, t->name,
  lm_ggml_backend_buft_name(buft),
  this_size, max_size);
@@ -1022,7 +1020,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
 
  if (n_buffers == 0) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
  #endif
  return NULL;
  }
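
Note: the ggml-alloc.c changes above route allocator diagnostics through LM_GGML_LOG_DEBUG / LM_GGML_LOG_ERROR instead of writing to stderr directly, which lets the embedding application capture or silence them. A hedged sketch of how such level-tagged logging macros are commonly wired to a user-settable callback (names below are hypothetical, not the actual lm_ggml implementation):

    #include <stdarg.h>
    #include <stdio.h>

    enum log_level { LOG_LEVEL_DEBUG, LOG_LEVEL_ERROR };
    typedef void (*log_callback)(enum log_level level, const char * text, void * user_data);

    static log_callback g_log_cb   = NULL;   // set by the host application
    static void *       g_log_data = NULL;

    // Format the message, then hand it to the callback or fall back to stderr.
    static void log_dispatch(enum log_level level, const char * fmt, ...) {
        char buf[1024];
        va_list args;
        va_start(args, fmt);
        vsnprintf(buf, sizeof(buf), fmt, args);
        va_end(args);
        if (g_log_cb) {
            g_log_cb(level, buf, g_log_data);
        } else {
            fputs(buf, stderr);
        }
    }

    #define LOG_DEBUG(...) log_dispatch(LOG_LEVEL_DEBUG, __VA_ARGS__)
    #define LOG_ERROR(...) log_dispatch(LOG_LEVEL_ERROR, __VA_ARGS__)
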
@@ -22,7 +22,7 @@ extern "C" {
  size_t (*get_max_size) (lm_ggml_backend_buffer_type_t buft);
  // (optional) data size needed to allocate the tensor, including padding (defaults to lm_ggml_nbytes)
  size_t (*get_alloc_size)(lm_ggml_backend_buffer_type_t buft, const struct lm_ggml_tensor * tensor);
- // (optional) check if tensor data is in host memory (defaults to false)
+ // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
  bool (*is_host) (lm_ggml_backend_buffer_type_t buft);
  };
 
@@ -37,7 +37,6 @@ extern "C" {
  //
 
  struct lm_ggml_backend_buffer_i {
- const char * (*get_name) (lm_ggml_backend_buffer_t buffer);
  // (optional) free the buffer
  void (*free_buffer) (lm_ggml_backend_buffer_t buffer);
  // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
  void (*free)(lm_ggml_backend_t backend);
 
- // Will be moved to the device interface
- // buffer allocation
- lm_ggml_backend_buffer_type_t (*get_default_buffer_type)(lm_ggml_backend_t backend);
-
  // (optional) asynchronous tensor data access
  void (*set_tensor_async)(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  void (*get_tensor_async)(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
  bool (*cpy_tensor_async)(lm_ggml_backend_t backend_src, lm_ggml_backend_t backend_dst, const struct lm_ggml_tensor * src, struct lm_ggml_tensor * dst);
 
- // (optional) complete all pending operations
+ // (optional) complete all pending operations (required if the backend supports async operations)
  void (*synchronize)(lm_ggml_backend_t backend);
 
- // (optional) compute graph with a plan (not used currently)
+ // (optional) graph plans (not used currently)
+ // compute graph with a plan
  lm_ggml_backend_graph_plan_t (*graph_plan_create) (lm_ggml_backend_t backend, const struct lm_ggml_cgraph * cgraph);
  void (*graph_plan_free) (lm_ggml_backend_t backend, lm_ggml_backend_graph_plan_t plan);
  // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
  // compute graph (always async if supported by the backend)
  enum lm_ggml_status (*graph_compute) (lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph);
 
- // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
- // new backends should implement the device interface instead
- // These functions are being moved to the device interface
- bool (*supports_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
- bool (*supports_buft)(lm_ggml_backend_t backend, lm_ggml_backend_buffer_type_t buft);
- bool (*offload_op) (lm_ggml_backend_t backend, const struct lm_ggml_tensor * op);
-
  // (optional) event synchronization
  // record an event on this stream
  void (*event_record)(lm_ggml_backend_t backend, lm_ggml_backend_event_t event);