llama_cpp 0.15.1 → 0.15.2

@@ -14,6 +14,12 @@
  #include <stdlib.h> // for qsort
  #include <stdio.h> // for GGML_ASSERT

+ #if defined(_MSC_VER)
+ // disable "possible loss of data" to avoid warnings for hundreds of casts
+ // we should just be careful :)
+ #pragma warning(disable: 4244 4267)
+ #endif
+
  #define UNUSED GGML_UNUSED

  // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -235,7 +241,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // __AVX__ || __AVX2__ || __AVX512F__
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

- #if defined(__ARM_NEON) || defined(__wasm_simd128__)
+ #if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -637,6 +643,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  // store result
  __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
  }
+ #elif defined(__POWER9_VECTOR__)
+ for (int i = 0; i < nb; i++) {
+ vector float srcv [8];
+ vector float asrcv[8];
+ vector float amaxv[8];
+ vector signed int vi[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+ const vector float vid = vec_splats(id);
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ for (int j = 0; j < 8; j++) {
+ const vector float v = vec_round(vec_mul(srcv[j], vid));
+ vi[j] = vec_cts(v, 0);
+ }
+ vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
+ vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -892,6 +930,46 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
  y[i].s = GGML_FP32_TO_FP16(sum*d);
  }
+ #elif defined(__POWER9_VECTOR__)
+ for (int i = 0; i < nb; i++) {
+ vector float srcv [8];
+ vector float asrcv[8];
+ vector float amaxv[8];
+ vector signed int vi[8];
+
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+ vec_extract(amaxv[0], 1)),
+ MAX(vec_extract(amaxv[0], 2),
+ vec_extract(amaxv[0], 3)));
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+ const vector float vid = vec_splats(id);
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ vector int accv = vec_splats(0);
+
+ for (int j = 0; j < 8; j++) {
+ const vector float v = vec_round(vec_mul(srcv[j], vid));
+ vi[j] = vec_cts(v, 0);
+
+ accv = vec_add(accv, vi[j]);
+ }
+ vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
+ vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+ accv = vec_add(accv, vec_sld(accv, accv, 4));
+ accv = vec_add(accv, vec_sld(accv, accv, 8));
+ y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+ }
  #else
  GGML_UNUSED(nb);
  // scalar
@@ -1908,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri

  for (int j = 0; j < QK_K/16; ++j) {
  if (quant_weights) {
- const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
+ const float * qw = quant_weights + QK_K * i + 16*j;
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
  } else {
  for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
@@ -3409,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
  const block_q4_0 * restrict vx0 = vx;
- const block_q4_0 * restrict vx1 = vx + bx;
-
+ const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
  const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = vy + by;
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -3446,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const int8x16_t y1_l = vld1q_s8(b_y1->qs);
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ float32x4_t scale = vld1q_f32(_scale);

  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3734,6 +3813,46 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  *s = sumf;
+ #elif defined(__POWER9_VECTOR__)
+ const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+ const vector signed char v8 = vec_splats((signed char)0x8);
+
+ vector float vsumf0 = vec_splats(0.0f);
+
+ #pragma GCC unroll 4
+ for (int i = 0; i < nb; i++) {
+ __builtin_prefetch(x[i].qs, 0, 1);
+ __builtin_prefetch(y[i].qs, 0, 1);
+
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+ vector float vd = vec_mul(vxd, vyd);
+
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+ vector signed char q4x0 = vec_and(qxs, lowMask);
+ vector signed char q4x1 = vec_sr(qxs, v4);
+
+ q4x0 = vec_sub(q4x0, v8);
+ q4x1 = vec_sub(q4x1, v8);
+
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+ qv0 = vec_add(qv0, qv1);
+
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+ *s = vec_extract(vsumf0, 0);
  #else
  // scalar
  float sumf = 0.0;
@@ -3776,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
  const block_q4_1 * restrict vx0 = vx;
- const block_q4_1 * restrict vx1 = vx + bx;
+ const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
  const block_q8_1 * restrict vy0 = vy;
- const block_q8_1 * restrict vy1 = vy + by;
+ const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t summs0 = vdupq_n_f32(0.0f);
@@ -3789,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  const block_q8_1 * restrict b_y0 = &vy0[i];
  const block_q8_1 * restrict b_y1 = &vy1[i];

- float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
- GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
- summs0 += summs_t;
+ float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+ summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));

  const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -3813,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

  // mmla into int32x4_t
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
- GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
- GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
- GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+ float32x4_t scale = vld1q_f32(_scale);

  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3835,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

  float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
  float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
- sumv2 = sumv2 + summs0;
+ sumv2 = vaddq_f32(sumv2, summs0);

  vst1_f32(s, vget_low_f32(sumv2));
  vst1_f32(s + bs, vget_high_f32(sumv2));
@@ -3952,6 +4072,46 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  }

  *s = sumf;
+ #elif defined(__POWER9_VECTOR__)
+ const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+ vector float vsumf0 = vec_splats(0.0f);
+
+ #pragma GCC unroll 4
+ for (int i = 0; i < nb; i++) {
+ __builtin_prefetch(x[i].qs, 0, 1);
+ __builtin_prefetch(y[i].qs, 0, 1);
+
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+ vector float vd = vec_mul(vxd, vyd);
+
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+ vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.0f, 0.0f, 0.0f};
+ vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+ vector signed char q4x0 = vec_and(qxs, lowMask);
+ vector signed char q4x1 = vec_sr(qxs, v4);
+
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+ qv0 = vec_add(qv0, qv1);
+
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+ *s = vec_extract(vsumf0, 0);
  #else
  // scalar
  float sumf = 0.0;
@@ -4237,6 +4397,49 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  *s = sumf;
+ #elif defined(__POWER9_VECTOR__)
+ const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector unsigned char v4 = vec_splats((unsigned char)4);
+
+ vector float vsumf0 = vec_splats(0.0f);
+
+ #pragma GCC unroll 4
+ for (int i = 0; i < nb; ++i) {
+ __builtin_prefetch(x[i].qs, 0, 1);
+ __builtin_prefetch(y[i].qs, 0, 1);
+
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+ vector float vd = vec_mul(vxd, vyd);
+
+ vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[i].qh[0]]), (uint64_t)(table_b2b_1[x[i].qh[1]])};
+ vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[i].qh[2]]), (uint64_t)(table_b2b_1[x[i].qh[3]])};
+
+ vector signed char qh0 = (vector signed char)aux64x2_0;
+ vector signed char qh1 = (vector signed char)aux64x2_1;
+
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+ vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
+ vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
+
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
+ vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+ vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+ vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+ qv0 = vec_add(qv0, qv1);
+
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+ *s = vec_extract(vsumf0, 0);
  #else
  // scalar
  float sumf = 0.0;
@@ -4541,6 +4744,53 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  }

  *s = sumf;
+ #elif defined(__POWER9_VECTOR__)
+ const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+ vector float vsumf0 = vec_splats(0.0f);
+
+ #pragma GCC unroll 4
+ for (int i = 0; i < nb; ++i) {
+ __builtin_prefetch(x[i].qs, 0, 1);
+ __builtin_prefetch(y[i].qs, 0, 1);
+
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+ vector float vd = vec_mul(vxd, vyd);
+
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+ vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.f, 0.f, 0.f};
+ vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+ vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[i].qh[0]]), (uint64_t)(table_b2b_0[x[i].qh[1]])};
+ vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[i].qh[2]]), (uint64_t)(table_b2b_0[x[i].qh[3]])};
+
+ vector signed char qh0 = (vector signed char)aux64x2_0;
+ vector signed char qh1 = (vector signed char)aux64x2_1;
+
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+ vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
+ vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
+
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
+ vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+ vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+ vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+ qv0 = vec_add(qv0, qv1);
+
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+ *s = vec_extract(vsumf0, 0);
  #else
  // scalar
  float sumf = 0.0;
@@ -4589,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  #if defined(__ARM_FEATURE_MATMUL_INT8)
  if (nrc == 2) {
  const block_q8_0 * restrict vx0 = vx;
- const block_q8_0 * restrict vx1 = vx + bx;
+ const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
  const block_q8_0 * restrict vy0 = vy;
- const block_q8_0 * restrict vy1 = vy + by;
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

  float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -4613,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  const int8x16_t y1_l = vld1q_s8(b_y1->qs);
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+ float32x4_t scale = vld1q_f32(_scale);

  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4716,6 +4967,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  }

  *s = sumf;
+ #elif defined(__POWER9_VECTOR__)
+ vector float vsumf0 = vec_splats(0.0f);
+
+ #pragma GCC unroll 4
+ for (int i = 0; i < nb; i++) {
+ __builtin_prefetch(x[i].qs, 0, 1);
+ __builtin_prefetch(y[i].qs, 0, 1);
+
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+ vector float vd = vec_mul(vxd, vyd);
+
+ vector signed char q8x0 = vec_xl( 0, x[i].qs);
+ vector signed char q8x1 = vec_xl(16, x[i].qs);
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+ vector signed short qv0 = vec_mule(q8x0, q8y0);
+ vector signed short qv1 = vec_mulo(q8x0, q8y0);
+ vector signed short qv2 = vec_mule(q8x1, q8y1);
+ vector signed short qv3 = vec_mulo(q8x1, q8y1);
+
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
+ vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
+ vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
+ vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+
+ vsumi0 = vec_add(vsumi0, vsumi2);
+ vsumi1 = vec_add(vsumi1, vsumi3);
+
+ vsumi0 = vec_add(vsumi0, vsumi1);
+
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+ }
+
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+ *s = vec_extract(vsumf0, 0);
  #else
  // scalar
  float sumf = 0.0;
@@ -5071,6 +5361,147 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5071
5361
 
5072
5362
  *s = sumf;
5073
5363
 
5364
+ #elif defined(__POWER9_VECTOR__)
5365
+ const vector signed char lowMask = vec_splats((signed char)0x3);
5366
+ const vector signed char lowScaleMask = vec_splats((signed char)0xF);
5367
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5368
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5369
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5370
+
5371
+ vector float vsumf0 = vec_splats(0.0f);
5372
+ vector float vsumf1 = vec_splats(0.0f);
5373
+ vector float vsumf2 = vec_splats(0.0f);
5374
+ vector float vsumf3 = vec_splats(0.0f);
5375
+
5376
+ for (int i = 0; i < nb; ++i) {
5377
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
5378
+ vector float vyd = vec_splats(y[i].d);
5379
+ vector float vd = vec_mul(vxd, vyd);
5380
+
5381
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
5382
+ vector float vdmin = vec_mul(vxmin, vyd);
5383
+
5384
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
5385
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
5386
+
5387
+ vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
5388
+ vector signed char vscales = vec_and(q2xmins, lowScaleMask);
5389
+
5390
+ q2xmins = vec_sr(q2xmins, v4);
5391
+ vector signed short q2xmins0 = vec_unpackh(q2xmins);
5392
+ vector signed short q2xmins1 = vec_unpackl(q2xmins);
5393
+
5394
+ vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
5395
+ vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
5396
+ vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
5397
+ vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
5398
+
5399
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
5400
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
5401
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
5402
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
5403
+
5404
+ vector signed int vsumi0 = vec_splats((int32_t)0);
5405
+ vector signed int vsumi1 = vec_splats((int32_t)0);
5406
+ vector signed int vsumi2 = vec_splats((int32_t)0);
5407
+ vector signed int vsumi3 = vec_splats((int32_t)0);
5408
+ vector signed int vsumi4 = vec_splats((int32_t)0);
5409
+ vector signed int vsumi5 = vec_splats((int32_t)0);
5410
+ vector signed int vsumi6 = vec_splats((int32_t)0);
5411
+ vector signed int vsumi7 = vec_splats((int32_t)0);
5412
+
5413
+ const uint8_t * restrict q2 = x[i].qs;
5414
+ const int8_t * restrict q8 = y[i].qs;
5415
+
5416
+ for (int j = 0; j < QK_K/128; ++j) {
5417
+ __builtin_prefetch(q2, 0, 1);
5418
+ __builtin_prefetch(q8, 0, 1);
5419
+
5420
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
5421
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
5422
+ q2 += 32;
5423
+
5424
+ vector signed char q2x00 = vec_and(qxs0, lowMask);
5425
+ vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
5426
+ vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
5427
+ vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
5428
+ vector signed char q2x10 = vec_and(qxs1, lowMask);
5429
+ vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
5430
+ vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
5431
+ vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
5432
+
5433
+ vector signed char q8y00 = vec_xl( 0, q8);
5434
+ vector signed char q8y10 = vec_xl( 16, q8);
5435
+ vector signed char q8y01 = vec_xl( 32, q8);
5436
+ vector signed char q8y11 = vec_xl( 48, q8);
5437
+ vector signed char q8y02 = vec_xl( 64, q8);
5438
+ vector signed char q8y12 = vec_xl( 80, q8);
5439
+ vector signed char q8y03 = vec_xl( 96, q8);
5440
+ vector signed char q8y13 = vec_xl(112, q8);
5441
+ q8 += 128;
5442
+
5443
+ vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
5444
+ vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
5445
+ vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
5446
+ vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
5447
+ vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
5448
+ vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
5449
+ vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
5450
+ vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
5451
+
5452
+ vector signed short vscales_h = vec_unpackh(vscales);
5453
+ vector signed short vs0 = vec_splat(vscales_h, 0);
5454
+ vector signed short vs1 = vec_splat(vscales_h, 1);
5455
+ vector signed short vs2 = vec_splat(vscales_h, 2);
5456
+ vector signed short vs3 = vec_splat(vscales_h, 3);
5457
+ vector signed short vs4 = vec_splat(vscales_h, 4);
5458
+ vector signed short vs5 = vec_splat(vscales_h, 5);
5459
+ vector signed short vs6 = vec_splat(vscales_h, 6);
5460
+ vector signed short vs7 = vec_splat(vscales_h, 7);
5461
+ vscales = vec_sld(vscales, vscales, 8);
5462
+
5463
+ qv0 = vec_mul(qv0, vs0);
5464
+ qv1 = vec_mul(qv1, vs2);
5465
+ qv2 = vec_mul(qv2, vs4);
5466
+ qv3 = vec_mul(qv3, vs6);
5467
+
5468
+ qv0 = vec_madd(qv4, vs1, qv0);
5469
+ qv1 = vec_madd(qv5, vs3, qv1);
5470
+ qv2 = vec_madd(qv6, vs5, qv2);
5471
+ qv3 = vec_madd(qv7, vs7, qv3);
5472
+
5473
+ vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
5474
+ vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
5475
+ vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
5476
+ vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
5477
+
5478
+ vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
5479
+ vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
5480
+ vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
5481
+ vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
5482
+ }
5483
+
5484
+ vsumi0 = vec_add(vsumi0, vsumi4);
5485
+ vsumi1 = vec_add(vsumi1, vsumi5);
5486
+ vsumi2 = vec_add(vsumi2, vsumi6);
5487
+ vsumi3 = vec_add(vsumi3, vsumi7);
5488
+
5489
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5490
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
5491
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
5492
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
5493
+ }
5494
+
5495
+ vsumf0 = vec_add(vsumf0, vsumf2);
5496
+ vsumf1 = vec_add(vsumf1, vsumf3);
5497
+
5498
+ vsumf0 = vec_add(vsumf0, vsumf1);
5499
+
5500
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
5501
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
5502
+
5503
+ *s = vec_extract(vsumf0, 0);
5504
+
5074
5505
  #else
5075
5506
 
5076
5507
  float sumf = 0;
@@ -5341,6 +5772,87 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5341
5772
 
5342
5773
  *s = sumf;
5343
5774
 
5775
+ #elif defined(__POWER9_VECTOR__)
5776
+ const vector signed char lowMask = vec_splats((signed char)0x3);
5777
+ const vector signed char lowScaleMask = vec_splats((signed char)0xF);
5778
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5779
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5780
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5781
+
5782
+ vector float vsumf0 = vec_splats(0.0f);
5783
+ vector float vsumf1 = vec_splats(0.0f);
5784
+ vector float vsumf2 = vec_splats(0.0f);
5785
+ vector float vsumf3 = vec_splats(0.0f);
5786
+
5787
+ #pragma GCC unroll 2
5788
+ for (int i = 0; i < nb; ++i) {
5789
+ __builtin_prefetch(x[i].qs, 0, 1);
5790
+ __builtin_prefetch(y[i].qs, 0, 1);
5791
+
5792
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
5793
+ vector float vyd = vec_splats(y[i].d);
5794
+ vector float vd = vec_mul(vxd, vyd);
5795
+
5796
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
5797
+ vector float vdmin = vec_mul(vxmin, vyd);
5798
+
5799
+ vector signed short q8ysums0 = vec_xl_len(y[i].bsums, 8);
5800
+
5801
+ vector signed char q2xmins = (vector signed char)vec_xl_len(x[i].scales, 4);
5802
+ vector signed char vscales = vec_and(q2xmins, lowScaleMask);
5803
+
5804
+ q2xmins = vec_sr(q2xmins, v4);
5805
+ vector signed short q2xmins0 = vec_unpackh((vector signed char)q2xmins);
5806
+
5807
+ vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
5808
+ vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
5809
+
5810
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
5811
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
5812
+
5813
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
5814
+ vector signed char q2x00 = vec_and(qxs0, lowMask);
5815
+ vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
5816
+ vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
5817
+ vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
5818
+
5819
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
5820
+ vector signed char q8y01 = vec_xl( 16, y[i].qs);
5821
+ vector signed char q8y02 = vec_xl( 32, y[i].qs);
5822
+ vector signed char q8y03 = vec_xl( 48, y[i].qs);
5823
+
5824
+ vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
5825
+ vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
5826
+ vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
5827
+ vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
5828
+
5829
+ vector signed short vscales_h = vec_unpackh(vscales);
5830
+ vector signed short vs0 = vec_splat(vscales_h, 0);
5831
+ vector signed short vs1 = vec_splat(vscales_h, 1);
5832
+ vector signed short vs2 = vec_splat(vscales_h, 2);
5833
+ vector signed short vs3 = vec_splat(vscales_h, 3);
5834
+
5835
+ vector signed int vsumi0 = vec_add(vec_mule(qv0, vs0), vec_mulo(qv0, vs0));
5836
+ vector signed int vsumi1 = vec_add(vec_mule(qv1, vs1), vec_mulo(qv1, vs1));
5837
+ vector signed int vsumi2 = vec_add(vec_mule(qv2, vs2), vec_mulo(qv2, vs2));
5838
+ vector signed int vsumi3 = vec_add(vec_mule(qv3, vs3), vec_mulo(qv3, vs3));
5839
+
5840
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5841
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
5842
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
5843
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
5844
+ }
5845
+
5846
+ vsumf0 = vec_add(vsumf0, vsumf2);
5847
+ vsumf1 = vec_add(vsumf1, vsumf3);
5848
+
5849
+ vsumf0 = vec_add(vsumf0, vsumf1);
5850
+
5851
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
5852
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
5853
+
5854
+ *s = vec_extract(vsumf0, 0);
5855
+
5344
5856
  #else
5345
5857
 
5346
5858
  float sumf = 0;
@@ -5835,6 +6347,160 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5835
6347
 
5836
6348
  *s = sumf;
5837
6349
 
6350
+ #elif defined(__POWER9_VECTOR__)
6351
+ const vector signed char lowMask = vec_splats((signed char)0x3);
6352
+ const vector signed char v1 = vec_splats((signed char)0x1);
6353
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
6354
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
6355
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
6356
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
6357
+ const vector signed char off = vec_splats((signed char)0x20);
6358
+
6359
+ vector float vsumf0 = vec_splats(0.0f);
6360
+ vector float vsumf1 = vec_splats(0.0f);
6361
+ vector float vsumf2 = vec_splats(0.0f);
6362
+ vector float vsumf3 = vec_splats(0.0f);
6363
+
6364
+ for (int i = 0; i < nb; ++i) {
6365
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
6366
+ vector float vyd = vec_splats(y[i].d);
6367
+ vector float vd = vec_mul(vxd, vyd);
6368
+
6369
+ uint32_t aux[3];
6370
+ uint32_t utmp[4];
6371
+
6372
+ memcpy(aux, x[i].scales, 12);
6373
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
6374
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
6375
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
6376
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
6377
+
6378
+ vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
6379
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
6380
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
6381
+
6382
+ vscales = vec_sub(vscales, off);
6383
+
6384
+ vector signed int vsumi0 = vec_splats((int32_t)0);
6385
+ vector signed int vsumi1 = vec_splats((int32_t)0);
6386
+ vector signed int vsumi2 = vec_splats((int32_t)0);
6387
+ vector signed int vsumi3 = vec_splats((int32_t)0);
6388
+ vector signed int vsumi4 = vec_splats((int32_t)0);
6389
+ vector signed int vsumi5 = vec_splats((int32_t)0);
6390
+ vector signed int vsumi6 = vec_splats((int32_t)0);
6391
+ vector signed int vsumi7 = vec_splats((int32_t)0);
6392
+
6393
+ const uint8_t * restrict q3 = x[i].qs;
6394
+ const int8_t * restrict q8 = y[i].qs;
6395
+
6396
+ for (int j = 0; j < QK_K/128; ++j) {
6397
+ __builtin_prefetch(q3, 0, 1);
6398
+ __builtin_prefetch(q8, 0, 1);
6399
+
6400
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
6401
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
6402
+ q3 += 32;
6403
+
6404
+ //the low 2 bits
6405
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
6406
+ vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
6407
+ vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
6408
+ vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
6409
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
6410
+ vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
6411
+ vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
6412
+ vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
6413
+
6414
+ //the 3rd bit
6415
+ vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
6416
+ vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
6417
+ vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
6418
+ vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
6419
+ vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
6420
+ vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
6421
+ vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
6422
+ vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
6423
+ qxhs0 = vec_sr(qxhs0, v4);
6424
+ qxhs1 = vec_sr(qxhs1, v4);
6425
+
6426
+ vector signed char q3x00 = vec_sub(qxs00, qxh00);
6427
+ vector signed char q3x01 = vec_sub(qxs01, qxh01);
6428
+ vector signed char q3x02 = vec_sub(qxs02, qxh02);
6429
+ vector signed char q3x03 = vec_sub(qxs03, qxh03);
6430
+ vector signed char q3x10 = vec_sub(qxs10, qxh10);
6431
+ vector signed char q3x11 = vec_sub(qxs11, qxh11);
6432
+ vector signed char q3x12 = vec_sub(qxs12, qxh12);
6433
+ vector signed char q3x13 = vec_sub(qxs13, qxh13);
6434
+
6435
+ vector signed char q8y00 = vec_xl( 0, q8);
6436
+ vector signed char q8y10 = vec_xl( 16, q8);
6437
+ vector signed char q8y01 = vec_xl( 32, q8);
6438
+ vector signed char q8y11 = vec_xl( 48, q8);
6439
+ vector signed char q8y02 = vec_xl( 64, q8);
6440
+ vector signed char q8y12 = vec_xl( 80, q8);
6441
+ vector signed char q8y03 = vec_xl( 96, q8);
6442
+ vector signed char q8y13 = vec_xl(112, q8);
6443
+ q8 += 128;
6444
+
6445
+ vector signed short vscales_h = vec_unpackh(vscales);
6446
+ vector signed short vs0 = vec_splat(vscales_h, 0);
6447
+ vector signed short vs1 = vec_splat(vscales_h, 1);
6448
+ vector signed short vs2 = vec_splat(vscales_h, 2);
6449
+ vector signed short vs3 = vec_splat(vscales_h, 3);
6450
+ vector signed short vs4 = vec_splat(vscales_h, 4);
6451
+ vector signed short vs5 = vec_splat(vscales_h, 5);
6452
+ vector signed short vs6 = vec_splat(vscales_h, 6);
6453
+ vector signed short vs7 = vec_splat(vscales_h, 7);
6454
+ vscales = vec_sld(vscales, vscales, 8);
6455
+
6456
+ vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
6457
+ vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
6458
+ vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
6459
+ vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
6460
+ vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
6461
+ vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
6462
+ vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
6463
+ vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
6464
+
6465
+ vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
6466
+ vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
6467
+ vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
6468
+ vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
6469
+ vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
6470
+ vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
6471
+ vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
6472
+ vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
6473
+
6474
+ vsumi0 = vec_add(vsum0, vsumi0);
6475
+ vsumi1 = vec_add(vsum1, vsumi1);
6476
+ vsumi2 = vec_add(vsum2, vsumi2);
6477
+ vsumi3 = vec_add(vsum3, vsumi3);
6478
+ vsumi4 = vec_add(vsum4, vsumi4);
6479
+ vsumi5 = vec_add(vsum5, vsumi5);
6480
+ vsumi6 = vec_add(vsum6, vsumi6);
6481
+ vsumi7 = vec_add(vsum7, vsumi7);
6482
+ }
6483
+
6484
+ vsumi0 = vec_add(vsumi0, vsumi4);
6485
+ vsumi1 = vec_add(vsumi1, vsumi5);
6486
+ vsumi2 = vec_add(vsumi2, vsumi6);
6487
+ vsumi3 = vec_add(vsumi3, vsumi7);
6488
+
6489
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
6490
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
6491
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
6492
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
6493
+ }
6494
+
6495
+ vsumf0 = vec_add(vsumf0, vsumf2);
6496
+ vsumf1 = vec_add(vsumf1, vsumf3);
6497
+
6498
+ vsumf0 = vec_add(vsumf0, vsumf1);
6499
+
6500
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
6501
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
6502
+
6503
+ *s = vec_extract(vsumf0, 0);
5838
6504
  #else
5839
6505
  // scalar version
5840
6506
  // This function is written like this so the compiler can manage to vectorize most of it
@@ -6201,6 +6867,95 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6201
6867
 
6202
6868
  *s = sumf;
6203
6869
 
6870
+ #elif defined(__POWER9_VECTOR__)
6871
+ const vector signed char lowMask = vec_splats((signed char)0x3);
6872
+ const vector signed char v1 = vec_splats((signed char)0x1);
6873
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
6874
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
6875
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
6876
+ const vector signed char off = vec_splats((signed char)0x8);
6877
+
6878
+ vector float vsumf0 = vec_splats(0.0f);
6879
+ vector float vsumf1 = vec_splats(0.0f);
6880
+ vector float vsumf2 = vec_splats(0.0f);
6881
+ vector float vsumf3 = vec_splats(0.0f);
6882
+
6883
+ #pragma GCC unroll 2
6884
+ for (int i = 0; i < nb; ++i) {
6885
+ __builtin_prefetch(x[i].qs, 0, 1);
6886
+ __builtin_prefetch(y[i].qs, 0, 1);
6887
+
6888
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
6889
+ vector float vyd = vec_splats(y[i].d);
6890
+ vector float vd = vec_mul(vxd, vyd);
6891
+
6892
+ uint16_t aux16[2];
6893
+ int8_t * scales = (int8_t *)aux16;
6894
+
6895
+ const uint16_t a = *(const uint16_t *)x[i].scales;
6896
+ aux16[0] = a & 0x0f0f;
6897
+ aux16[1] = (a >> 4) & 0x0f0f;
6898
+
6899
+ vector signed char vscales = (vector signed char)vec_xl_len(scales, 8);
6900
+ vector signed char qxhs0 = (vector signed char)vec_xl_len(x[i].hmask, 8);
6901
+ qxhs0 = vec_or(qxhs0, vec_sr(vec_sld(qxhs0, qxhs0, 8), (vector unsigned char)v1));
6902
+
6903
+ vscales = vec_sub(vscales, off);
6904
+
6905
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
6906
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
6907
+ vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
6908
+ vector signed char qxs10 = vec_and(vec_sr(qxs0, v4), lowMask);
6909
+ vector signed char qxs11 = vec_and(vec_sr(qxs0, v6), lowMask);
6910
+
6911
+ //the 3rd bit
6912
+ vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
6913
+ vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
6914
+ vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v4)), v2);
6915
+ vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v6)), v2);
6916
+ qxhs0 = vec_sr(qxhs0, v4);
6917
+
6918
+ vector signed char q3x00 = vec_sub(qxs00, qxh00);
6919
+ vector signed char q3x01 = vec_sub(qxs01, qxh01);
6920
+ vector signed char q3x10 = vec_sub(qxs10, qxh02);
6921
+ vector signed char q3x11 = vec_sub(qxs11, qxh03);
6922
+
6923
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
6924
+ vector signed char q8y01 = vec_xl( 16, y[i].qs);
6925
+ vector signed char q8y10 = vec_xl( 32, y[i].qs);
6926
+ vector signed char q8y11 = vec_xl( 48, y[i].qs);
6927
+
6928
+ vector signed short vscales_h = vec_unpackh(vscales);
6929
+ vector signed short vs0 = vec_splat(vscales_h, 0);
6930
+ vector signed short vs1 = vec_splat(vscales_h, 1);
6931
+ vector signed short vs2 = vec_splat(vscales_h, 2);
6932
+ vector signed short vs3 = vec_splat(vscales_h, 3);
6933
+
6934
+ vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
6935
+ vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
6936
+ vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
6937
+ vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
6938
+
6939
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
6940
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
6941
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
6942
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
6943
+
6944
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
6945
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
6946
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
6947
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
6948
+ }
6949
+
6950
+ vsumf0 = vec_add(vsumf0, vsumf2);
6951
+ vsumf1 = vec_add(vsumf1, vsumf3);
6952
+
6953
+ vsumf0 = vec_add(vsumf0, vsumf1);
6954
+
6955
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
6956
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
6957
+
6958
+ *s = vec_extract(vsumf0, 0);
6204
6959
  #else
6205
6960
 
6206
6961
  int8_t aux8[QK_K];
@@ -6553,6 +7308,142 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6553
7308
 
6554
7309
  *s = sumf;
6555
7310
 
7311
+ #elif defined(__POWER9_VECTOR__)
7312
+ const vector signed char lowMask = vec_splats((signed char)0xF);
7313
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
7314
+
7315
+ vector float vsumf0 = vec_splats(0.0f);
7316
+ vector float vsumf1 = vec_splats(0.0f);
7317
+ vector float vsumf2 = vec_splats(0.0f);
7318
+ vector float vsumf3 = vec_splats(0.0f);
7319
+
7320
+ for (int i = 0; i < nb; ++i) {
7321
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
7322
+ vector float vyd = vec_splats(y[i].d);
7323
+ vector float vd = vec_mul(vxd, vyd);
7324
+
7325
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
7326
+ vector float vdmin = vec_mul(vxmin, vyd);
7327
+
7328
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7329
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7330
+
7331
+ memcpy(utmp, x[i].scales, 12);
7332
+
7333
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
7334
+ const uint32_t uaux = utmp[1] & kmask1;
7335
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7336
+ utmp[2] = uaux;
7337
+ utmp[0] &= kmask1;
7338
+
7339
+ vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
7340
+ vector signed short vscales = vec_unpackh(utmps);
7341
+ vector signed short q4xmins = vec_unpackl(utmps);
7342
+ vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
7343
+ vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
7344
+
7345
+ vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
7346
+ vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
7347
+ vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
7348
+ vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
7349
+
7350
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
7351
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
7352
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
7353
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
7354
+
7355
+ vector signed int vsumi0 = vec_splats((int32_t)0);
7356
+ vector signed int vsumi1 = vec_splats((int32_t)0);
7357
+ vector signed int vsumi2 = vec_splats((int32_t)0);
7358
+ vector signed int vsumi3 = vec_splats((int32_t)0);
7359
+ vector signed int vsumi4 = vec_splats((int32_t)0);
7360
+ vector signed int vsumi5 = vec_splats((int32_t)0);
7361
+ vector signed int vsumi6 = vec_splats((int32_t)0);
7362
+ vector signed int vsumi7 = vec_splats((int32_t)0);
7363
+
7364
+ const uint8_t * restrict q4 = x[i].qs;
7365
+ const int8_t * restrict q8 = y[i].qs;
7366
+
7367
+ for (int j = 0; j < QK_K/64; j+=2) {
7368
+ __builtin_prefetch(q4, 0, 1);
7369
+ __builtin_prefetch(q8, 0, 1);
7370
+
7371
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
7372
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
7373
+ vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
7374
+ vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
7375
+ q4 += 64;
7376
+
7377
+ vector signed char q4x00 = vec_and(qxs0, lowMask);
7378
+ vector signed char q4x01 = vec_sr(qxs0, v4);
7379
+ vector signed char q4x10 = vec_and(qxs1, lowMask);
7380
+ vector signed char q4x11 = vec_sr(qxs1, v4);
7381
+ vector signed char q4x20 = vec_and(qxs2, lowMask);
7382
+ vector signed char q4x21 = vec_sr(qxs2, v4);
7383
+ vector signed char q4x30 = vec_and(qxs3, lowMask);
7384
+ vector signed char q4x31 = vec_sr(qxs3, v4);
7385
+
7386
+ vector signed char q8y00 = vec_xl( 0, q8);
7387
+ vector signed char q8y10 = vec_xl( 16, q8);
7388
+ vector signed char q8y01 = vec_xl( 32, q8);
7389
+ vector signed char q8y11 = vec_xl( 48, q8);
7390
+ vector signed char q8y20 = vec_xl( 64, q8);
7391
+ vector signed char q8y30 = vec_xl( 80, q8);
7392
+ vector signed char q8y21 = vec_xl( 96, q8);
7393
+ vector signed char q8y31 = vec_xl(112, q8);
7394
+ q8 += 128;
7395
+
7396
+ vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
7397
+ vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
7398
+ vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
7399
+ vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
7400
+ vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
7401
+ vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
7402
+ vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
7403
+ vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
7404
+
7405
+ vector signed short vs0 = vec_splat(vscales, 0);
7406
+ vector signed short vs1 = vec_splat(vscales, 1);
7407
+ vector signed short vs2 = vec_splat(vscales, 2);
7408
+ vector signed short vs3 = vec_splat(vscales, 3);
7409
+ vscales = vec_sld(vscales, vscales, 8);
7410
+
7411
+ qv00 = vec_add(qv00, qv10);
7412
+ qv10 = vec_add(qv01, qv11);
7413
+ qv20 = vec_add(qv20, qv30);
7414
+ qv30 = vec_add(qv21, qv31);
7415
+
7416
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
7417
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
7418
+ vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
7419
+ vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
7420
+ vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
7421
+ vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
7422
+ vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
7423
+ vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
7424
+ }
7425
+
7426
+ vsumi0 = vec_add(vsumi0, vsumi4);
7427
+ vsumi1 = vec_add(vsumi1, vsumi5);
7428
+ vsumi2 = vec_add(vsumi2, vsumi6);
7429
+ vsumi3 = vec_add(vsumi3, vsumi7);
7430
+
7431
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7432
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7433
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
7434
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
7435
+ }
7436
+
7437
+ vsumf0 = vec_add(vsumf0, vsumf2);
7438
+ vsumf1 = vec_add(vsumf1, vsumf3);
7439
+
7440
+ vsumf0 = vec_add(vsumf0, vsumf1);
7441
+
7442
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
7443
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
7444
+
7445
+ *s = vec_extract(vsumf0, 0);
7446
+
6556
7447
  #else
6557
7448
 
6558
7449
 
@@ -6819,6 +7710,87 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6819
7710
 
6820
7711
  *s = sumf;
6821
7712
 
7713
+ #elif defined(__POWER9_VECTOR__)
7714
+ const vector signed char lowMask = vec_splats((signed char)0xF);
7715
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
7716
+
7717
+ vector float vsumf0 = vec_splats(0.0f);
7718
+ vector float vsumf1 = vec_splats(0.0f);
7719
+ vector float vsumf2 = vec_splats(0.0f);
7720
+ vector float vsumf3 = vec_splats(0.0f);
7721
+
7722
+ #pragma GCC unroll 2
7723
+ for (int i = 0; i < nb; ++i) {
7724
+ __builtin_prefetch(x[i].qs, 0, 1);
7725
+ __builtin_prefetch(y[i].qs, 0, 1);
7726
+
7727
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d[1]));
7728
+ vector float vyd = vec_splats(y[i].d);
7729
+ vector float vd= vec_mul(vxd, vyd);
7730
+
7731
+ uint16_t s16[2];
7732
+ const uint8_t * scales = (const uint8_t *)s16;
7733
+
7734
+ const uint16_t * restrict b = (const uint16_t *)x[i].scales;
7735
+ s16[0] = b[0] & 0x0f0f;
7736
+ s16[1] = (b[0] >> 4) & 0x0f0f;
7737
+
7738
+ vector signed char utmps = (vector signed char)vec_xl_len(scales, 4);
7739
+ vector signed short vscales = (vector signed short)vec_unpackh(utmps);
7740
+ vector signed short q4xmins0 = vec_mergeh(vscales, vscales);
7741
+ q4xmins0 = vec_sld(q4xmins0, q4xmins0, 8);
7742
+
7743
+ vector signed short q8ysums0 = vec_xl_len((const int16_t *)(y[i].bsums), 8);
7744
+
7745
+ vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
7746
+ vector signed int prod1 = vec_mulo(q4xmins0, q8ysums0);
7747
+
7748
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vd, vsumf0);
7749
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vd, vsumf1);
7750
+
7751
+ vd = vec_mul(vyd, vec_splats(GGML_FP16_TO_FP32(x[i].d[0])));
7752
+
7753
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
7754
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
7755
+ vector signed char q4x00 = vec_and(qxs0, lowMask);
7756
+ vector signed char q4x01 = vec_sr(qxs0, v4);
7757
+ vector signed char q4x10 = vec_and(qxs1, lowMask);
7758
+ vector signed char q4x11 = vec_sr(qxs1, v4);
7759
+
7760
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
7761
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
7762
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
7763
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
7764
+
7765
+ vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
7766
+ vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
7767
+ vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
7768
+ vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
7769
+
7770
+ vector signed short vs0 = vec_splat(vscales, 0);
7771
+ vector signed short vs1 = vec_splat(vscales, 1);
7772
+
7773
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
7774
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs0), vec_mulo(qv10, vs0));
7775
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs1), vec_mulo(qv01, vs1));
7776
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs1), vec_mulo(qv11, vs1));
7777
+
7778
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7779
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7780
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
7781
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
7782
+ }
7783
+
7784
+ vsumf0 = vec_add(vsumf0, vsumf2);
7785
+ vsumf1 = vec_add(vsumf1, vsumf3);
7786
+
7787
+ vsumf0 = vec_add(vsumf0, vsumf1);
7788
+
7789
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
7790
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
7791
+
7792
+ *s = vec_extract(vsumf0, 0);
7793
+
6822
7794
  #else
6823
7795
 
6824
7796
  uint8_t aux8[QK_K];
@@ -7220,6 +8192,130 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7220
8192
 
7221
8193
  *s = sumf+sums;
7222
8194
 
8195
+ #elif defined(__POWER9_VECTOR__)
8196
+ const vector signed char lowMask = vec_splats((signed char)0xF);
8197
+ const vector unsigned char v1 = vec_splats((unsigned char)0x1);
8198
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
8199
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
8200
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
8201
+
8202
+ vector float vsumf0 = vec_splats(0.0f);
8203
+ vector float vsumf1 = vec_splats(0.0f);
8204
+ vector float vsumf2 = vec_splats(0.0f);
8205
+ vector float vsumf3 = vec_splats(0.0f);
8206
+
8207
+ for (int i = 0; i < nb; ++i) {
8208
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
8209
+ vector float vyd = vec_splats(y[i].d);
8210
+ vector float vd = vec_mul(vxd, vyd);
8211
+
8212
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
8213
+ vector float vdmin = vec_mul(vxmin, vyd);
8214
+
8215
+ memcpy(utmp, x[i].scales, 12);
8216
+
8217
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
8218
+ const uint32_t uaux = utmp[1] & kmask1;
8219
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
8220
+ utmp[2] = uaux;
8221
+ utmp[0] &= kmask1;
8222
+
8223
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
8224
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
8225
+
8226
+ vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
8227
+ vector signed short vscales = vec_unpackh(utmps);
8228
+
8229
+ vector signed short q5xmins = vec_unpackl(utmps);
8230
+ vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
8231
+ vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
8232
+
8233
+ vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
8234
+ vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
8235
+ vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
8236
+ vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
8237
+
8238
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
8239
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
8240
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
8241
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
8242
+
8243
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
8244
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
8245
+
8246
+ vector signed int vsumi0 = vec_splats((int32_t)0);
8247
+ vector signed int vsumi1 = vec_splats((int32_t)0);
8248
+ vector signed int vsumi2 = vec_splats((int32_t)0);
8249
+ vector signed int vsumi3 = vec_splats((int32_t)0);
8250
+
8251
+ const uint8_t * restrict q5 = x[i].qs;
8252
+ const int8_t * restrict q8 = y[i].qs;
8253
+
8254
+ for (int j = 0; j < QK_K/64; ++j) {
8255
+ __builtin_prefetch(q5, 0, 1);
8256
+ __builtin_prefetch(q8, 0, 1);
8257
+
8258
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
8259
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
8260
+ q5 += 32;
8261
+
8262
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
8263
+ vector signed char qxs01 = vec_sr(qxs0, v4);
8264
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
8265
+ vector signed char qxs11 = vec_sr(qxs1, v4);
8266
+
8267
+ vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
8268
+ vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
8269
+ vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
8270
+ vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
8271
+ qxhs0 = vec_sr(qxhs0, v2);
8272
+ qxhs1 = vec_sr(qxhs1, v2);
8273
+
8274
+ vector signed char q5x00 = vec_or(q5h00, qxs00);
8275
+ vector signed char q5x01 = vec_or(q5h01, qxs01);
8276
+ vector signed char q5x10 = vec_or(q5h10, qxs10);
8277
+ vector signed char q5x11 = vec_or(q5h11, qxs11);
8278
+
8279
+ vector signed char q8y00 = vec_xl( 0, q8);
8280
+ vector signed char q8y10 = vec_xl(16, q8);
8281
+ vector signed char q8y01 = vec_xl(32, q8);
8282
+ vector signed char q8y11 = vec_xl(48, q8);
8283
+ q8 += 64;
8284
+
8285
+ vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
8286
+ vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
8287
+ vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
8288
+ vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
8289
+
8290
+ vector signed short vs0 = vec_splat(vscales, 0);
8291
+ vector signed short vs1 = vec_splat(vscales, 1);
8292
+ vscales = vec_sld(vscales, vscales, 12);
8293
+
8294
+ qv00 = vec_add(qv00, qv10);
8295
+ qv01 = vec_add(qv01, qv11);
8296
+
8297
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
8298
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
8299
+ vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
8300
+ vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
8301
+ }
8302
+
8303
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
8304
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
8305
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
8306
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
8307
+ }
8308
+
8309
+ vsumf0 = vec_add(vsumf0, vsumf2);
8310
+ vsumf1 = vec_add(vsumf1, vsumf3);
8311
+
8312
+ vsumf0 = vec_add(vsumf0, vsumf1);
8313
+
8314
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
8315
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
8316
+
8317
+ *s = vec_extract(vsumf0, 0);
8318
+
7223
8319
  #else
7224
8320
 
7225
8321
  const uint8_t * scales = (const uint8_t*)&utmp[0];
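
In the q5_K block above, each 5-bit quant is rebuilt by OR-ing a packed low nibble with one bit lifted out of qh into bit position 4; the v1/v2 masks and the shifts by 4 and 3 only select which qh bit lands there. A scalar sketch of that step, with bit standing for whichever qh bit the current pair uses:

    #include <stdint.h>

    /* sketch of vec_and/vec_sl/vec_or above: low 4 bits from the packed
     * nibble, 5th bit pulled out of the qh byte and shifted up into bit 4 */
    static uint8_t q5_reconstruct(uint8_t nibble, uint8_t qh_byte, unsigned bit) {
        uint8_t lo = nibble & 0x0F;                /* vec_and(qxs, lowMask)      */
        uint8_t hi = ((qh_byte >> bit) & 1) << 4;  /* vec_and + vec_sl on qxhs   */
        return (uint8_t)(lo | hi);                 /* vec_or(q5h, qxs) -> 0..31  */
    }
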
@@ -7517,6 +8613,83 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7517
8613
 
7518
8614
  *s = sumf;
7519
8615
 
8616
+ #elif defined(__POWER9_VECTOR__)
8617
+ const vector signed char lowMask = vec_splats((signed char)0xF);
8618
+ const vector unsigned char v1 = vec_splats((unsigned char)0x1);
8619
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
8620
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
8621
+
8622
+ vector float vsumf0 = vec_splats(0.0f);
8623
+ vector float vsumf1 = vec_splats(0.0f);
8624
+ vector float vsumf2 = vec_splats(0.0f);
8625
+ vector float vsumf3 = vec_splats(0.0f);
8626
+
8627
+ #pragma GCC unroll 2
8628
+ for (int i = 0; i < nb; ++i) {
8629
+ __builtin_prefetch(x[i].qs, 0, 1);
8630
+ __builtin_prefetch(y[i].qs, 0, 1);
8631
+
8632
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
8633
+ vector float vyd = vec_splats(y[i].d);
8634
+ vector float vd = vec_mul(vxd, vyd);
8635
+
8636
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
8637
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
8638
+ vector signed char qxs00 = (vector signed char)vec_and(qxs0, lowMask);
8639
+ vector signed char qxs01 = (vector signed char)vec_sr(qxs0, v4);
8640
+ vector signed char qxs10 = (vector signed char)vec_and(qxs1, lowMask);
8641
+ vector signed char qxs11 = (vector signed char)vec_sr(qxs1, v4);
8642
+
8643
+ vector signed char qxhs = (vector signed char)vec_xl_len(x[i].qh, 8);
8644
+ vector signed char qxhs0 = vec_or(qxhs, vec_sr(vec_sld(qxhs, qxhs, 8), v1));
8645
+ vector signed char qxhs1 = vec_sr(qxhs0, v2);
8646
+ vector signed char qxh00 = vec_sl(vec_andc((vector signed char)v1, qxhs0), v4);
8647
+ vector signed char qxh10 = vec_sl(vec_andc((vector signed char)v1, qxhs1), v4);
8648
+ vector signed char qxh01 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs0, v4)), v4);
8649
+ vector signed char qxh11 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs1, v4)), v4);
8650
+
8651
+ vector signed char q5x00 = vec_sub(qxs00, qxh00);
8652
+ vector signed char q5x10 = vec_sub(qxs10, qxh10);
8653
+ vector signed char q5x01 = vec_sub(qxs01, qxh01);
8654
+ vector signed char q5x11 = vec_sub(qxs11, qxh11);
8655
+
8656
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
8657
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
8658
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
8659
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
8660
+
8661
+ vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
8662
+ vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
8663
+ vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
8664
+ vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
8665
+
8666
+ vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
8667
+ vector signed short vs0 = vec_splat(vs, 0);
8668
+ vector signed short vs1 = vec_splat(vs, 1);
8669
+ vector signed short vs2 = vec_splat(vs, 2);
8670
+ vector signed short vs3 = vec_splat(vs, 3);
8671
+
8672
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
8673
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
8674
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
8675
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
8676
+
8677
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
8678
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
8679
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
8680
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
8681
+ }
8682
+
8683
+ vsumf0 = vec_add(vsumf0, vsumf2);
8684
+ vsumf1 = vec_add(vsumf1, vsumf3);
8685
+
8686
+ vsumf0 = vec_add(vsumf0, vsumf1);
8687
+
8688
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
8689
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
8690
+
8691
+ *s = vec_extract(vsumf0, 0);
8692
+
7520
8693
  #else
7521
8694
 
7522
8695
  int8_t aux8[QK_K];
@@ -7947,6 +9120,151 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7947
9120
 
7948
9121
  *s = sumf;
7949
9122
 
9123
+ #elif defined(__POWER9_VECTOR__)
9124
+ const vector signed char lowMask = vec_splats((signed char)0xF);
9125
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
9126
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
9127
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
9128
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
9129
+ const vector signed char off = vec_splats((signed char)0x20);
9130
+
9131
+ vector float vsumf0 = vec_splats(0.0f);
9132
+ vector float vsumf1 = vec_splats(0.0f);
9133
+ vector float vsumf2 = vec_splats(0.0f);
9134
+ vector float vsumf3 = vec_splats(0.0f);
9135
+
9136
+ for (int i = 0; i < nb; ++i) {
9137
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9138
+ vector float vyd = vec_splats(y[i].d);
9139
+ vector float vd = vec_mul(vxd, vyd);
9140
+
9141
+ vector signed int vsumi0 = vec_splats((int32_t)0);
9142
+ vector signed int vsumi1 = vec_splats((int32_t)0);
9143
+ vector signed int vsumi2 = vec_splats((int32_t)0);
9144
+ vector signed int vsumi3 = vec_splats((int32_t)0);
9145
+ vector signed int vsumi4 = vec_splats((int32_t)0);
9146
+ vector signed int vsumi5 = vec_splats((int32_t)0);
9147
+ vector signed int vsumi6 = vec_splats((int32_t)0);
9148
+ vector signed int vsumi7 = vec_splats((int32_t)0);
9149
+
9150
+ const uint8_t * restrict q6 = x[i].ql;
9151
+ const uint8_t * restrict qh = x[i].qh;
9152
+ const int8_t * restrict qs = x[i].scales;
9153
+ const int8_t * restrict q8 = y[i].qs;
9154
+
9155
+ for (int j = 0; j < QK_K/128; ++j) {
9156
+ __builtin_prefetch(q6, 0, 0);
9157
+ __builtin_prefetch(qh, 0, 0);
9158
+ __builtin_prefetch(q8, 0, 0);
9159
+
9160
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
9161
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
9162
+ vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
9163
+ vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
9164
+ q6 += 64;
9165
+
9166
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
9167
+ vector signed char qxs01 = vec_sr(qxs0, v4);
9168
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
9169
+ vector signed char qxs11 = vec_sr(qxs1, v4);
9170
+ vector signed char qxs20 = vec_and(qxs2, lowMask);
9171
+ vector signed char qxs21 = vec_sr(qxs2, v4);
9172
+ vector signed char qxs30 = vec_and(qxs3, lowMask);
9173
+ vector signed char qxs31 = vec_sr(qxs3, v4);
9174
+
9175
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
9176
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
9177
+ qh += 32;
9178
+
9179
+ vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
9180
+ vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
9181
+ vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
9182
+ vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
9183
+ vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
9184
+ vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
9185
+ vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
9186
+ vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
9187
+
9188
+ vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
9189
+ vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
9190
+ vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
9191
+ vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
9192
+ vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
9193
+ vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
9194
+ vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
9195
+ vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
9196
+
9197
+ vector signed char q8y00 = vec_xl( 0, q8);
9198
+ vector signed char q8y10 = vec_xl( 16, q8);
9199
+ vector signed char q8y20 = vec_xl( 32, q8);
9200
+ vector signed char q8y30 = vec_xl( 48, q8);
9201
+ vector signed char q8y01 = vec_xl( 64, q8);
9202
+ vector signed char q8y11 = vec_xl( 80, q8);
9203
+ vector signed char q8y21 = vec_xl( 96, q8);
9204
+ vector signed char q8y31 = vec_xl(112, q8);
9205
+ q8 += 128;
9206
+
9207
+ vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
9208
+ vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
9209
+ vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
9210
+ vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
9211
+ vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
9212
+ vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
9213
+ vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
9214
+ vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
9215
+
9216
+ vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
9217
+ qs += 8;
9218
+
9219
+ vector signed short vs0 = vec_splat(vscales, 0);
9220
+ vector signed short vs1 = vec_splat(vscales, 1);
9221
+ vector signed short vs2 = vec_splat(vscales, 2);
9222
+ vector signed short vs3 = vec_splat(vscales, 3);
9223
+ vector signed short vs4 = vec_splat(vscales, 4);
9224
+ vector signed short vs5 = vec_splat(vscales, 5);
9225
+ vector signed short vs6 = vec_splat(vscales, 6);
9226
+ vector signed short vs7 = vec_splat(vscales, 7);
9227
+
9228
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
9229
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
9230
+ vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
9231
+ vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
9232
+ vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
9233
+ vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
9234
+ vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
9235
+ vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
9236
+
9237
+ vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
9238
+ vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
9239
+ vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
9240
+ vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
9241
+ vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
9242
+ vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
9243
+ vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
9244
+ vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
9245
+ }
9246
+
9247
+ vsumi0 = vec_add(vsumi0, vsumi4);
9248
+ vsumi1 = vec_add(vsumi1, vsumi5);
9249
+ vsumi2 = vec_add(vsumi2, vsumi6);
9250
+ vsumi3 = vec_add(vsumi3, vsumi7);
9251
+
9252
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9253
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9254
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9255
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9256
+ }
9257
+
9258
+ vsumf0 = vec_add(vsumf0, vsumf2);
9259
+ vsumf1 = vec_add(vsumf1, vsumf3);
9260
+
9261
+ vsumf0 = vec_add(vsumf0, vsumf1);
9262
+
9263
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9264
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9265
+
9266
+ *s = vec_extract(vsumf0, 0);
9267
+
7950
9268
  #else
7951
9269
 
7952
9270
  int8_t aux8[QK_K];
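
The q6_K block above rebuilds each 6-bit quant from a low nibble of ql plus two bits of qh shifted into positions 4..5, then recentres it by subtracting the 0x20 off splat. A scalar sketch, with shift standing for the 0/2/4/6 step the vector code applies through v2/v4/v6:

    #include <stdint.h>

    /* sketch of the q6_K decode mirrored by the POWER9 block above */
    static int8_t q6_decode(uint8_t ql_nibble, uint8_t qh_byte, unsigned shift) {
        uint8_t lo = ql_nibble & 0x0F;                  /* vec_and(qxs, lowMask)          */
        uint8_t hi = ((qh_byte >> shift) & 0x03) << 4;  /* vec_sl(vec_and(v3, vec_sr...)) */
        return (int8_t)((lo | hi) - 32);                /* vec_sub(vec_or(...), off)      */
    }
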
@@ -8253,6 +9571,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8253
9571
 
8254
9572
  *s = sumf;
8255
9573
 
9574
+ #elif defined(__POWER9_VECTOR__)
9575
+ const vector signed char lowMask = vec_splats((signed char)0xF);
9576
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
9577
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
9578
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
9579
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
9580
+ const vector signed char off = vec_splats((signed char)0x20);
9581
+
9582
+ vector float vsumf0 = vec_splats(0.0f);
9583
+ vector float vsumf1 = vec_splats(0.0f);
9584
+ vector float vsumf2 = vec_splats(0.0f);
9585
+ vector float vsumf3 = vec_splats(0.0f);
9586
+
9587
+ #pragma GCC unroll 2
9588
+ for (int i = 0; i < nb; ++i) {
9589
+ __builtin_prefetch(x[i].ql, 0, 1);
9590
+ __builtin_prefetch(x[i].qh, 0, 1);
9591
+ __builtin_prefetch(y[i].qs, 0, 1);
9592
+
9593
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9594
+ vector float vyd = vec_splats(y[i].d);
9595
+ vector float vd = vec_mul(vxd, vyd);
9596
+
9597
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].ql);
9598
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].ql);
9599
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
9600
+ vector signed char qxs01 = vec_sr(qxs0, v4);
9601
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
9602
+ vector signed char qxs11 = vec_sr(qxs1, v4);
9603
+
9604
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
9605
+
9606
+ vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
9607
+ vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
9608
+ vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
9609
+ vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
9610
+
9611
+ vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
9612
+ vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
9613
+ vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
9614
+ vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
9615
+
9616
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
9617
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
9618
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
9619
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
9620
+
9621
+ vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
9622
+ vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
9623
+ vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
9624
+ vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
9625
+
9626
+ vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
9627
+ vector signed short vs0 = vec_splat(vs, 0);
9628
+ vector signed short vs1 = vec_splat(vs, 1);
9629
+ vector signed short vs2 = vec_splat(vs, 2);
9630
+ vector signed short vs3 = vec_splat(vs, 3);
9631
+
9632
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
9633
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
9634
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
9635
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
9636
+
9637
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9638
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9639
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9640
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9641
+ }
9642
+
9643
+ vsumf0 = vec_add(vsumf0, vsumf2);
9644
+ vsumf1 = vec_add(vsumf1, vsumf3);
9645
+
9646
+ vsumf0 = vec_add(vsumf0, vsumf1);
9647
+
9648
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9649
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9650
+
9651
+ *s = vec_extract(vsumf0, 0);
9652
+
8256
9653
  #else
8257
9654
 
8258
9655
  int8_t aux8[QK_K];
@@ -8294,7 +9691,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8294
9691
 
8295
9692
  #endif
8296
9693
 
8297
- #if defined (__AVX2__) || defined (__ARM_NEON)
9694
+ #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__)
8298
9695
  static const int8_t keven_signs_q2xs[1024] = {
8299
9696
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
8300
9697
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
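
The keven_signs_q2xs table whose first rows appear above packs eight +1/-1 signs per entry. As a sketch only (not part of this patch), one way to regenerate it is to expand the seven index bits into the first seven signs and append a parity sign that keeps the count of -1s even, which is why the table is named "keven":

    #include <stdint.h>

    /* fills the 1024-byte table: 128 entries of eight +1/-1 signs each */
    static void build_keven_signs(int8_t table[1024]) {
        for (int idx = 0; idx < 128; ++idx) {
            int minus = 0;
            for (int b = 0; b < 7; ++b) {
                int neg = (idx >> b) & 1;
                table[8*idx + b] = neg ? -1 : 1;
                minus += neg;
            }
            table[8*idx + 7] = (minus & 1) ? -1 : 1;  /* parity sign */
        }
    }
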
@@ -8427,6 +9824,103 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8427
9824
 
8428
9825
  *s = 0.125f * hsum_float_8(accumf);
8429
9826
 
9827
+ #elif defined(__POWER9_VECTOR__)
9828
+ vector float vsumf0 = vec_splats(0.0f);
9829
+ vector float vsumf1 = vec_splats(0.0f);
9830
+ vector float vsumf2 = vec_splats(0.0f);
9831
+ vector float vsumf3 = vec_splats(0.0f);
9832
+
9833
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9834
+
9835
+ for (int i = 0; i < nb; ++i) {
9836
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9837
+ vector float vyd = vec_splats(y[i].d);
9838
+ vector float vd = vec_mul(vxd, vyd);
9839
+
9840
+ vector signed int vsumi0 = vec_splats((int32_t)0);
9841
+ vector signed int vsumi1 = vec_splats((int32_t)0);
9842
+ vector signed int vsumi2 = vec_splats((int32_t)0);
9843
+ vector signed int vsumi3 = vec_splats((int32_t)0);
9844
+ vector signed int vsumi4 = vec_splats((int32_t)0);
9845
+ vector signed int vsumi5 = vec_splats((int32_t)0);
9846
+ vector signed int vsumi6 = vec_splats((int32_t)0);
9847
+ vector signed int vsumi7 = vec_splats((int32_t)0);
9848
+
9849
+ const uint16_t * restrict q2 = x[i].qs;
9850
+ const int8_t * restrict q8 = y[i].qs;
9851
+
9852
+ for (int j = 0; j < QK_K/32; j += 2) {
9853
+ __builtin_prefetch(q2, 0, 1);
9854
+ __builtin_prefetch(q8, 0, 1);
9855
+
9856
+ uint32_t aux32[4];
9857
+ const uint8_t * aux8 = (const uint8_t *)aux32;
9858
+
9859
+ memcpy(aux32, q2, 4*sizeof(uint32_t));
9860
+ q2 += 8;
9861
+
9862
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
9863
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
9864
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
9865
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
9866
+
9867
+ vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))};
9868
+ vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
9869
+ vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))};
9870
+ vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
9871
+
9872
+ vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
9873
+ vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
9874
+ vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
9875
+ vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
9876
+
9877
+ vector signed char q8y0 = vec_xl( 0, q8);
9878
+ vector signed char q8y1 = vec_xl(16, q8);
9879
+ vector signed char q8y2 = vec_xl(32, q8);
9880
+ vector signed char q8y3 = vec_xl(48, q8);
9881
+ q8 += 64;
9882
+
9883
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
9884
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
9885
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
9886
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
9887
+
9888
+ const uint16_t ls0 = aux32[1] >> 28;
9889
+ const uint16_t ls1 = aux32[3] >> 28;
9890
+
9891
+ vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
9892
+ vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
9893
+
9894
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
9895
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
9896
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
9897
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
9898
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
9899
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
9900
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
9901
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
9902
+ }
9903
+
9904
+ vsumi0 = vec_add(vsumi0, vsumi4);
9905
+ vsumi1 = vec_add(vsumi1, vsumi5);
9906
+ vsumi2 = vec_add(vsumi2, vsumi6);
9907
+ vsumi3 = vec_add(vsumi3, vsumi7);
9908
+
9909
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9910
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9911
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9912
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9913
+ }
9914
+
9915
+ vsumf0 = vec_add(vsumf0, vsumf2);
9916
+ vsumf1 = vec_add(vsumf1, vsumf3);
9917
+
9918
+ vsumf0 = vec_add(vsumf0, vsumf1);
9919
+
9920
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9921
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9922
+
9923
+ *s = 0.125f * vec_extract(vsumf0, 0);
8430
9924
  #else
8431
9925
 
8432
9926
  uint32_t aux32[2];
@@ -8702,6 +10196,104 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8702
10196
  *s = 0.125f * hsum_float_8(accumf);
8703
10197
  #endif
8704
10198
 
10199
+ #elif defined(__POWER9_VECTOR__)
10200
+ vector float vsumf0 = vec_splats(0.0f);
10201
+ vector float vsumf1 = vec_splats(0.0f);
10202
+ vector float vsumf2 = vec_splats(0.0f);
10203
+ vector float vsumf3 = vec_splats(0.0f);
10204
+
10205
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10206
+
10207
+ for (int i = 0; i < nb; ++i) {
10208
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
10209
+ vector float vyd = vec_splats(y[i].d);
10210
+ vector float vd = vec_mul(vxd, vyd);
10211
+
10212
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10213
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10214
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10215
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10216
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10217
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10218
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10219
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10220
+
10221
+ const uint16_t * restrict q2 = x[i].qs;
10222
+ const uint8_t * restrict sc = x[i].scales;
10223
+ const int8_t * restrict q8 = y[i].qs;
10224
+
10225
+ for (int j = 0; j < QK_K/64; ++j) {
10226
+ __builtin_prefetch(q2, 0, 1);
10227
+ __builtin_prefetch(q8, 0, 1);
10228
+
10229
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
10230
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
10231
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
10232
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
10233
+
10234
+ vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
10235
+ vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
10236
+ vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
10237
+ vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
10238
+ q2 += 8;
10239
+
10240
+ vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
10241
+ vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
10242
+ vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
10243
+ vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
10244
+
10245
+ vector signed char q8y0 = vec_xl( 0, q8);
10246
+ vector signed char q8y1 = vec_xl(16, q8);
10247
+ vector signed char q8y2 = vec_xl(32, q8);
10248
+ vector signed char q8y3 = vec_xl(48, q8);
10249
+ q8 += 64;
10250
+
10251
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
10252
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
10253
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
10254
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
10255
+
10256
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
10257
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
10258
+ const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
10259
+ const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
10260
+ sc += 2;
10261
+
10262
+ vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
10263
+ vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
10264
+ vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
10265
+ vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
10266
+
10267
+ vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
10268
+ vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
10269
+ vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
10270
+ vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
10271
+ vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
10272
+ vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
10273
+ vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
10274
+ vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
10275
+ }
10276
+
10277
+ vsumi0 = vec_add(vsumi0, vsumi4);
10278
+ vsumi1 = vec_add(vsumi1, vsumi5);
10279
+ vsumi2 = vec_add(vsumi2, vsumi6);
10280
+ vsumi3 = vec_add(vsumi3, vsumi7);
10281
+
10282
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10283
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10284
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10285
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10286
+ }
10287
+
10288
+ vsumf0 = vec_add(vsumf0, vsumf2);
10289
+ vsumf1 = vec_add(vsumf1, vsumf3);
10290
+
10291
+ vsumf0 = vec_add(vsumf0, vsumf1);
10292
+
10293
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10294
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10295
+
10296
+ *s = 0.125f * vec_extract(vsumf0, 0);
8705
10297
  #else
8706
10298
 
8707
10299
  float sumf = 0.f;
@@ -8902,6 +10494,124 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8902
10494
 
8903
10495
  *s = 0.125f * hsum_float_8(accumf);
8904
10496
 
10497
+ #elif defined(__POWER9_VECTOR__)
10498
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10499
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10500
+ };
10501
+
10502
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10503
+
10504
+ vector float vsumf0 = vec_splats(0.0f);
10505
+ vector float vsumf1 = vec_splats(0.0f);
10506
+ vector float vsumf2 = vec_splats(0.0f);
10507
+ vector float vsumf3 = vec_splats(0.0f);
10508
+
10509
+ const vector unsigned char mask0 = vec_xl( 0, k_mask1);
10510
+ const vector unsigned char mask1 = vec_xl(16, k_mask1);
10511
+ const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
10512
+
10513
+ for (int i = 0; i < nb; ++i) {
10514
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
10515
+ vector float vyd = vec_splats(y[i].d);
10516
+ vector float vd = vec_mul(vxd, vyd);
10517
+
10518
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10519
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10520
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10521
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10522
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10523
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10524
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10525
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10526
+
10527
+ const uint8_t * restrict q2 = x[i].qs;
10528
+ const uint8_t * restrict qh = x[i].qh;
10529
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10530
+ const uint8_t * restrict sc = x[i].scales;
10531
+ const int8_t * restrict q8 = y[i].qs;
10532
+
10533
+ for (int j = 0; j < QK_K/32; j += 2) {
10534
+ __builtin_prefetch(q2, 0, 1);
10535
+ __builtin_prefetch(q8, 0, 1);
10536
+
10537
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
10538
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
10539
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
10540
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
10541
+ q2 += 8;
10542
+ qh += 2;
10543
+
10544
+ vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
10545
+ vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
10546
+ signs += 4;
10547
+
10548
+ vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
10549
+ vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
10550
+ vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
10551
+ vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
10552
+
10553
+ vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
10554
+ vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
10555
+ vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
10556
+ vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
10557
+
10558
+ vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
10559
+ vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
10560
+ vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
10561
+ vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
10562
+
10563
+ vector signed char q8y0 = vec_xl( 0, q8);
10564
+ vector signed char q8y1 = vec_xl(16, q8);
10565
+ vector signed char q8y2 = vec_xl(32, q8);
10566
+ vector signed char q8y3 = vec_xl(48, q8);
10567
+ q8 += 64;
10568
+
10569
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
10570
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
10571
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
10572
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
10573
+
10574
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
10575
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
10576
+ const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
10577
+ const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
10578
+ sc += 2;
10579
+
10580
+ vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
10581
+ vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
10582
+ vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
10583
+ vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
10584
+
10585
+ vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
10586
+ vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
10587
+ vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
10588
+ vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
10589
+ vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
10590
+ vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
10591
+ vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
10592
+ vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
10593
+ }
10594
+
10595
+ vsumi0 = vec_add(vsumi0, vsumi4);
10596
+ vsumi1 = vec_add(vsumi1, vsumi5);
10597
+ vsumi2 = vec_add(vsumi2, vsumi6);
10598
+ vsumi3 = vec_add(vsumi3, vsumi7);
10599
+
10600
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10601
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10602
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10603
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10604
+ }
10605
+
10606
+ vsumf0 = vec_add(vsumf0, vsumf2);
10607
+ vsumf1 = vec_add(vsumf1, vsumf3);
10608
+
10609
+ vsumf0 = vec_add(vsumf0, vsumf1);
10610
+
10611
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10612
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10613
+
10614
+ *s = 0.125f * vec_extract(vsumf0, 0);
8905
10615
  #else
8906
10616
 
8907
10617
  float sumf = 0;
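
The iq2_s block above (and the iq3_s one further down) applies signs without branches: vec_cmpeq expands each sign bit into an all-ones or all-zero byte mask m, and (x ^ m) - m then yields x or -x. A scalar sketch of that identity:

    #include <stdint.h>

    /* m == 0x00 leaves x unchanged; m == 0xFF gives ~x + 1 == -x */
    static int8_t apply_sign(int8_t x, int sign_bit) {
        int8_t m = sign_bit ? (int8_t)0xFF : 0;  /* vec_cmpeq(vec_and(signs, mask2), mask2) */
        return (int8_t)((x ^ m) - m);            /* vec_sub(vec_xor(m, x), m)               */
    }
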
@@ -9046,6 +10756,101 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9046
10756
 
9047
10757
  *s = 0.25f * hsum_float_8(accumf);
9048
10758
 
10759
+ #elif defined(__POWER9_VECTOR__)
10760
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10761
+
10762
+ vector float vsumf0 = vec_splats(0.0f);
10763
+ vector float vsumf1 = vec_splats(0.0f);
10764
+ vector float vsumf2 = vec_splats(0.0f);
10765
+ vector float vsumf3 = vec_splats(0.0f);
10766
+
10767
+ for (int i = 0; i < nb; ++i) {
10768
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
10769
+ vector float vyd = vec_splats(y[i].d);
10770
+ vector float vd = vec_mul(vxd, vyd);
10771
+
10772
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10773
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10774
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10775
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10776
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10777
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10778
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10779
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10780
+
10781
+ const uint8_t * restrict q3 = x[i].qs;
10782
+ const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
10783
+ const int8_t * restrict q8 = y[i].qs;
10784
+
10785
+ #pragma GCC unroll 1
10786
+ for (int j = 0; j < QK_K/32; j += 2) {
10787
+ __builtin_prefetch(q3, 0, 1);
10788
+ __builtin_prefetch(q8, 0, 1);
10789
+
10790
+ vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
10791
+ vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
10792
+ vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
10793
+ vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
10794
+ q3 += 16;
10795
+
10796
+ vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])};
10797
+ vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
10798
+ vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])};
10799
+ vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
10800
+
10801
+ vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
10802
+ vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
10803
+ vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
10804
+ vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
10805
+
10806
+ vector signed char q8y0 = vec_xl( 0, q8);
10807
+ vector signed char q8y1 = vec_xl(16, q8);
10808
+ vector signed char q8y2 = vec_xl(32, q8);
10809
+ vector signed char q8y3 = vec_xl(48, q8);
10810
+ q8 += 64;
10811
+
10812
+ vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
10813
+ vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
10814
+ vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
10815
+ vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
10816
+
10817
+ const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
10818
+ const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
10819
+ signs += 2;
10820
+
10821
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10822
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10823
+
10824
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10825
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10826
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10827
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10828
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10829
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10830
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10831
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10832
+ }
10833
+
10834
+ vsumi0 = vec_add(vsumi0, vsumi4);
10835
+ vsumi1 = vec_add(vsumi1, vsumi5);
10836
+ vsumi2 = vec_add(vsumi2, vsumi6);
10837
+ vsumi3 = vec_add(vsumi3, vsumi7);
10838
+
10839
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10840
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10841
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10842
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10843
+ }
10844
+
10845
+ vsumf0 = vec_add(vsumf0, vsumf2);
10846
+ vsumf1 = vec_add(vsumf1, vsumf3);
10847
+
10848
+ vsumf0 = vec_add(vsumf0, vsumf1);
10849
+
10850
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10851
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10852
+
10853
+ *s = 0.25f * vec_extract(vsumf0, 0);
9049
10854
  #else
9050
10855
 
9051
10856
  uint32_t aux32;
@@ -9273,6 +11078,124 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9273
11078
 
9274
11079
  *s = hsum_float_8(accumf);
9275
11080
 
11081
+ #elif defined(__POWER9_VECTOR__)
11082
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
11083
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
11084
+ };
11085
+
11086
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
11087
+
11088
+ vector float vsumf0 = vec_splats(0.0f);
11089
+ vector float vsumf1 = vec_splats(0.0f);
11090
+ vector float vsumf2 = vec_splats(0.0f);
11091
+ vector float vsumf3 = vec_splats(0.0f);
11092
+
11093
+ const vector unsigned char mask0 = vec_xl( 0, k_mask1);
11094
+ const vector unsigned char mask1 = vec_xl(16, k_mask1);
11095
+ const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
11096
+
11097
+ for (int i = 0; i < nb; ++i) {
11098
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
11099
+ vector float vyd = vec_splats(y[i].d);
11100
+ vector float vd = vec_mul(vxd, vyd);
11101
+
11102
+ const uint8_t * restrict q3 = x[i].qs;
11103
+ const uint8_t * restrict qh = x[i].qh;
11104
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
11105
+ const uint8_t * restrict sc = x[i].scales;
11106
+ const int8_t * restrict q8 = y[i].qs;
11107
+
11108
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11109
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11110
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11111
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11112
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11113
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11114
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11115
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11116
+
11117
+ for (int j = 0; j < QK_K/32; j += 2) {
11118
+ __builtin_prefetch(q3, 0, 1);
11119
+ __builtin_prefetch(q8, 0, 1);
11120
+
11121
+ vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
11122
+ iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
11123
+ vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
11124
+ iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
11125
+ vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
11126
+ iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
11127
+ vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
11128
+ iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
11129
+ q3 += 16;
11130
+ qh += 2;
11131
+
11132
+ vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
11133
+ vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
11134
+ signs += 4;
11135
+
11136
+ vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
11137
+ vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
11138
+ vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
11139
+ vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
11140
+
11141
+ vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
11142
+ vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
11143
+ vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
11144
+ vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
11145
+
11146
+ vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
11147
+ vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
11148
+ vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
11149
+ vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
11150
+
11151
+ vector signed char q8y0 = vec_xl( 0, q8);
11152
+ vector signed char q8y1 = vec_xl(16, q8);
11153
+ vector signed char q8y2 = vec_xl(32, q8);
11154
+ vector signed char q8y3 = vec_xl(48, q8);
11155
+ q8 += 64;
11156
+
11157
+ vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
11158
+ vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
11159
+ vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
11160
+ vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
11161
+
11162
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
11163
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
11164
+ sc ++;
11165
+
11166
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
11167
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
11168
+
11169
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11170
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11171
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11172
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11173
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11174
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11175
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11176
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11177
+ }
11178
+
11179
+ vsumi0 = vec_add(vsumi0, vsumi4);
11180
+ vsumi1 = vec_add(vsumi1, vsumi5);
11181
+ vsumi2 = vec_add(vsumi2, vsumi6);
11182
+ vsumi3 = vec_add(vsumi3, vsumi7);
11183
+
11184
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11185
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11186
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
11187
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
11188
+ }
11189
+
11190
+ vsumf0 = vec_add(vsumf0, vsumf2);
11191
+ vsumf1 = vec_add(vsumf1, vsumf3);
11192
+
11193
+ vsumf0 = vec_add(vsumf0, vsumf1);
11194
+
11195
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11196
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11197
+
11198
+ *s = vec_extract(vsumf0, 0);
9276
11199
  #else
9277
11200
 
9278
11201
  float sumf = 0.f;
@@ -9427,6 +11350,113 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9427
11350
 
9428
11351
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
9429
11352
 
11353
+ #elif defined(__POWER9_VECTOR__)
11354
+ const vector unsigned char v0 = vec_splats((unsigned char)0x0);
11355
+ const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
11356
+
11357
+ vector float vsumf0 = vec_splats(0.0f);
11358
+ vector float vsumf1 = vec_splats(0.0f);
11359
+ vector float vsumf2 = vec_splats(0.0f);
11360
+ vector float vsumf3 = vec_splats(0.0f);
11361
+
11362
+ for (int i = 0; i < nb; ++i) {
11363
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
11364
+ vector float vyd = vec_splats(y[i].d);
11365
+ vector float vd = vec_mul(vxd, vyd);
11366
+
11367
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11368
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11369
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11370
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11371
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11372
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11373
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11374
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11375
+ vector signed int vsumi8 = vec_splats((int32_t)0);
11376
+
11377
+ const uint8_t * restrict q1 = x[i].qs;
11378
+ const uint16_t * restrict qh = x[i].qh;
11379
+ const int8_t * restrict q8 = y[i].qs;
11380
+ const int16_t * restrict qs = y[i].bsums;
11381
+
11382
+ for (int j = 0; j < QK_K/32; j += 2) {
11383
+ __builtin_prefetch(q1, 0, 1);
11384
+ __builtin_prefetch(qh, 0, 1);
11385
+ __builtin_prefetch(q8, 0, 1);
11386
+
11387
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
11388
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
11389
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
11390
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
11391
+ q1 += 8;
11392
+
11393
+ vector signed char q1x0 = (vector signed char)aux64x2_0;
11394
+ vector signed char q1x1 = (vector signed char)aux64x2_1;
11395
+ vector signed char q1x2 = (vector signed char)aux64x2_2;
11396
+ vector signed char q1x3 = (vector signed char)aux64x2_3;
11397
+
11398
+ vector signed char q8y0 = vec_xl( 0, q8);
11399
+ vector signed char q8y1 = vec_xl(16, q8);
11400
+ vector signed char q8y2 = vec_xl(32, q8);
11401
+ vector signed char q8y3 = vec_xl(48, q8);
11402
+ q8 += 64;
11403
+
11404
+ vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
11405
+ vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
11406
+ vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
11407
+ vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
11408
+
11409
+ const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
11410
+ const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
11411
+
11412
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
11413
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
11414
+ vector signed short vscales = vec_sld(vscales23, vscales01, 8);
11415
+
11416
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11417
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11418
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11419
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11420
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11421
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11422
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11423
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11424
+
11425
+ vector signed short q8ysums = vec_xl_len(qs, 8);
11426
+ qs += 4;
11427
+ q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
11428
+
11429
+ vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
11430
+ qh += 2;
11431
+ vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
11432
+
11433
+ vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
11434
+
11435
+ vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
11436
+ }
11437
+
11438
+ vsumi0 = vec_add(vsumi0, vsumi4);
11439
+ vsumi1 = vec_add(vsumi1, vsumi5);
11440
+ vsumi2 = vec_add(vsumi2, vsumi6);
11441
+ vsumi3 = vec_add(vsumi3, vsumi7);
11442
+
11443
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11444
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11445
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
11446
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
11447
+
11448
+ vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
11449
+ }
11450
+
11451
+ vsumf0 = vec_add(vsumf0, vsumf2);
11452
+ vsumf1 = vec_add(vsumf1, vsumf3);
11453
+
11454
+ vsumf0 = vec_add(vsumf0, vsumf1);
11455
+
11456
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11457
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11458
+
11459
+ *s = vec_extract(vsumf0, 0);
9430
11460
  #else
9431
11461
 
9432
11462
  float sumf = 0;
@@ -9783,6 +11813,51 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9783
11813
 
9784
11814
  *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
9785
11815
 
11816
+ #elif defined(__POWER9_VECTOR__)
11817
+ const vector signed char lowMask = vec_splats((signed char)0xF);
11818
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11819
+
11820
+ vector float vsumf0 = vec_splats(0.0f);
11821
+ vector float vsumf1 = vec_splats(0.0f);
11822
+
11823
+ const vector signed char values = vec_xl( 0, kvalues_iq4nl);
11824
+
11825
+ #pragma GCC unroll 4
11826
+ for (int ib = 0; ib < nb; ++ib) {
11827
+ __builtin_prefetch(x[ib].qs, 0, 1);
11828
+ __builtin_prefetch(y[ib].qs, 0, 1);
11829
+
11830
+
11831
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
11832
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
11833
+ vector float vd = vec_mul(vxd, vyd);
11834
+
11835
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
11836
+ vector signed char q4x0 = vec_and(qxs, lowMask);
11837
+ vector signed char q4x1 = vec_sr(qxs, v4);
11838
+
11839
+ q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
11840
+ q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
11841
+
11842
+ vector signed char q8y0 = vec_xl( 0, y[ib].qs);
11843
+ vector signed char q8y1 = vec_xl(16, y[ib].qs);
11844
+
11845
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
11846
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
11847
+
11848
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
11849
+ vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
11850
+
11851
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11852
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11853
+ }
11854
+
11855
+ vsumf0 = vec_add(vsumf0, vsumf1);
11856
+
11857
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11858
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11859
+
11860
+ *s = vec_extract(vsumf0, 0);
9786
11861
  #else
9787
11862
  float sumf = 0;
9788
11863
  for (int ib = 0; ib < nb; ++ib) {
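
In the iq4_nl block above, vec_perm(values, values, q4x) is simply a 16-entry table lookup: each 4-bit index selects one signed codebook value from kvalues_iq4nl (declared elsewhere in this file). A scalar sketch of that dequantisation step:

    #include <stdint.h>

    /* one codebook value per nibble, same effect as the vec_perm lookups above */
    static void iq4_lookup(const int8_t kvalues[16], const uint8_t nibbles[16], int8_t out[16]) {
        for (int k = 0; k < 16; ++k) {
            out[k] = kvalues[nibbles[k] & 0x0F];
        }
    }
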
@@ -9894,6 +11969,105 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9894
11969
 
9895
11970
  *s = hsum_float_8(accum);
9896
11971
 
11972
+ #elif defined(__POWER9_VECTOR__)
11973
+ const vector signed char lowMask = vec_splats((signed char)0xF);
11974
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11975
+
11976
+ vector float vsumf0 = vec_splats(0.0f);
11977
+ vector float vsumf1 = vec_splats(0.0f);
11978
+ vector float vsumf2 = vec_splats(0.0f);
11979
+ vector float vsumf3 = vec_splats(0.0f);
11980
+
11981
+ const vector signed char values = vec_xl( 0, kvalues_iq4nl);
11982
+
11983
+ for (int ibl = 0; ibl < nb; ++ibl) {
11984
+
11985
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
11986
+ vector float vyd = vec_splats(y[ibl].d);
11987
+ vector float vd = vec_mul(vxd, vyd);
11988
+
11989
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11990
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11991
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11992
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11993
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11994
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11995
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11996
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11997
+
11998
+ uint16_t h = x[ibl].scales_h;
11999
+
12000
+ const uint8_t * restrict q4 = x[ibl].qs;
12001
+ const uint8_t * restrict sc = x[ibl].scales_l;
12002
+ const int8_t * restrict q8 = y[ibl].qs;
12003
+
12004
+ for (int ib = 0; ib < QK_K/64; ib ++ ) {
12005
+ __builtin_prefetch(q4, 0, 1);
12006
+ __builtin_prefetch(q8, 0, 1);
12007
+
12008
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
12009
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
12010
+ q4 += 32;
12011
+
12012
+ vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
12013
+ vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
12014
+ vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
12015
+ vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
12016
+
12017
+ q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
12018
+ q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
12019
+ q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
12020
+ q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
12021
+
12022
+ vector signed char q8y0 = vec_xl( 0, q8);
12023
+ vector signed char q8y1 = vec_xl(16, q8);
12024
+ vector signed char q8y2 = vec_xl(32, q8);
12025
+ vector signed char q8y3 = vec_xl(48, q8);
12026
+ q8 += 64;
12027
+
12028
+ vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
12029
+ vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
12030
+ vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
12031
+ vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
12032
+
12033
+ const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
12034
+ const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32);
12035
+ h >>= 4;
12036
+ sc ++;
12037
+
12038
+ vector signed short vscales01 = vec_splats((int16_t)ls0);
12039
+ vector signed short vscales23 = vec_splats((int16_t)ls1);
12040
+
12041
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
12042
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
12043
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
12044
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
12045
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
12046
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
12047
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
12048
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
12049
+ }
12050
+
12051
+ vsumi0 = vec_add(vsumi0, vsumi4);
12052
+ vsumi1 = vec_add(vsumi1, vsumi5);
12053
+ vsumi2 = vec_add(vsumi2, vsumi6);
12054
+ vsumi3 = vec_add(vsumi3, vsumi7);
12055
+
12056
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
12057
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
12058
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
12059
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
12060
+ }
12061
+
12062
+ vsumf0 = vec_add(vsumf0, vsumf2);
12063
+ vsumf1 = vec_add(vsumf1, vsumf3);
12064
+
12065
+ vsumf0 = vec_add(vsumf0, vsumf1);
12066
+
12067
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
12068
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
12069
+
12070
+ *s = vec_extract(vsumf0, 0);
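In the IQ4_XS block above, each 256-value super-block carries eight 6-bit sub-block scales: the low 4 bits sit in scales_l (two per byte) and the high 2 bits in the scales_h bit-field, and the ls0/ls1 expressions rebuild them and subtract 32 to re-center before weighting the vec_mule/vec_mulo products. A scalar sketch of that unpacking, written only to illustrate this reading of the bit layout (the helper name is not from the source):

#include <stdint.h>

// Rebuild the eight signed 6-bit sub-block scales of one iq4_xs super-block.
static void unpack_iq4_xs_scales(uint16_t scales_h, const uint8_t scales_l[4],
                                 int8_t out[8]) {
    for (int j = 0; j < 8; ++j) {
        const uint8_t lo = (j & 1) ? (scales_l[j/2] >> 4) : (scales_l[j/2] & 0xF);
        const uint8_t hi = (scales_h >> (2*j)) & 0x3;
        out[j] = (int8_t)((lo | (hi << 4)) - 32);  // 6-bit value re-centered to [-32, 31]
    }
}

This mirrors the per-iteration ls0/ls1 computation in the vector loop above, which consumes one scales_l byte and four scales_h bits for every 64 quants.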
9897
12071
  #else
9898
12072
  float sumf = 0;
9899
12073
  for (int ibl = 0; ibl < nb; ++ibl) {