llama_cpp 0.15.1 → 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,12 @@
14
14
  #include <stdlib.h> // for qsort
15
15
  #include <stdio.h> // for GGML_ASSERT
16
16
 
17
+ #if defined(_MSC_VER)
18
+ // disable "possible loss of data" to avoid warnings for hundreds of casts
19
+ // we should just be careful :)
20
+ #pragma warning(disable: 4244 4267)
21
+ #endif
22
+
17
23
  #define UNUSED GGML_UNUSED
18
24
 
19
25
  // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -235,7 +241,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
235
241
  #endif // __AVX__ || __AVX2__ || __AVX512F__
236
242
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
237
243
 
238
- #if defined(__ARM_NEON) || defined(__wasm_simd128__)
244
+ #if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
239
245
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
240
246
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
241
247
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -637,6 +643,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
637
643
  // store result
638
644
  __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
639
645
  }
646
+ #elif defined(__POWER9_VECTOR__)
647
+ for (int i = 0; i < nb; i++) {
648
+ vector float srcv [8];
649
+ vector float asrcv[8];
650
+ vector float amaxv[8];
651
+ vector signed int vi[8];
652
+
653
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
654
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
655
+
656
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
657
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
658
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
659
+
660
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
661
+ vec_extract(amaxv[0], 1)),
662
+ MAX(vec_extract(amaxv[0], 2),
663
+ vec_extract(amaxv[0], 3)));
664
+
665
+ const float d = amax / ((1 << 7) - 1);
666
+ const float id = d ? 1.0f/d : 0.0f;
667
+ const vector float vid = vec_splats(id);
668
+
669
+ y[i].d = GGML_FP32_TO_FP16(d);
670
+
671
+ for (int j = 0; j < 8; j++) {
672
+ const vector float v = vec_round(vec_mul(srcv[j], vid));
673
+ vi[j] = vec_cts(v, 0);
674
+ }
675
+ vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
676
+ vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
677
+ }
640
678
  #else
641
679
  GGML_UNUSED(nb);
642
680
  // scalar
@@ -892,6 +930,46 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
892
930
  int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
893
931
  y[i].s = GGML_FP32_TO_FP16(sum*d);
894
932
  }
933
+ #elif defined(__POWER9_VECTOR__)
934
+ for (int i = 0; i < nb; i++) {
935
+ vector float srcv [8];
936
+ vector float asrcv[8];
937
+ vector float amaxv[8];
938
+ vector signed int vi[8];
939
+
940
+ for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
941
+ for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
942
+
943
+ for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
944
+ for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
945
+ for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
946
+
947
+ const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
948
+ vec_extract(amaxv[0], 1)),
949
+ MAX(vec_extract(amaxv[0], 2),
950
+ vec_extract(amaxv[0], 3)));
951
+
952
+ const float d = amax / ((1 << 7) - 1);
953
+ const float id = d ? 1.0f/d : 0.0f;
954
+ const vector float vid = vec_splats(id);
955
+
956
+ y[i].d = GGML_FP32_TO_FP16(d);
957
+
958
+ vector int accv = vec_splats(0);
959
+
960
+ for (int j = 0; j < 8; j++) {
961
+ const vector float v = vec_round(vec_mul(srcv[j], vid));
962
+ vi[j] = vec_cts(v, 0);
963
+
964
+ accv = vec_add(accv, vi[j]);
965
+ }
966
+ vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
967
+ vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
968
+
969
+ accv = vec_add(accv, vec_sld(accv, accv, 4));
970
+ accv = vec_add(accv, vec_sld(accv, accv, 8));
971
+ y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
972
+ }
895
973
  #else
896
974
  GGML_UNUSED(nb);
897
975
  // scalar
@@ -1908,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
1908
1986
 
1909
1987
  for (int j = 0; j < QK_K/16; ++j) {
1910
1988
  if (quant_weights) {
1911
- const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
1989
+ const float * qw = quant_weights + QK_K * i + 16*j;
1912
1990
  for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
1913
1991
  } else {
1914
1992
  for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
@@ -3409,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3409
3487
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3410
3488
  if (nrc == 2) {
3411
3489
  const block_q4_0 * restrict vx0 = vx;
3412
- const block_q4_0 * restrict vx1 = vx + bx;
3413
-
3490
+ const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
3414
3491
  const block_q8_0 * restrict vy0 = vy;
3415
- const block_q8_0 * restrict vy1 = vy + by;
3492
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
3416
3493
 
3417
3494
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3418
3495
 
@@ -3446,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3446
3523
  const int8x16_t y1_l = vld1q_s8(b_y1->qs);
3447
3524
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
3448
3525
 
3449
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
3450
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
3451
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
3452
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
3526
+ float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
3527
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
3528
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
3529
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
3530
+
3531
+ float32x4_t scale = vld1q_f32(_scale);
3453
3532
 
3454
3533
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
3455
3534
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3734,6 +3813,46 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
3734
3813
  }
3735
3814
 
3736
3815
  *s = sumf;
3816
+ #elif defined(__POWER9_VECTOR__)
3817
+ const vector signed char lowMask = vec_splats((signed char)0xF);
3818
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
3819
+ const vector signed char v8 = vec_splats((signed char)0x8);
3820
+
3821
+ vector float vsumf0 = vec_splats(0.0f);
3822
+
3823
+ #pragma GCC unroll 4
3824
+ for (int i = 0; i < nb; i++) {
3825
+ __builtin_prefetch(x[i].qs, 0, 1);
3826
+ __builtin_prefetch(y[i].qs, 0, 1);
3827
+
3828
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
3829
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
3830
+ vector float vd = vec_mul(vxd, vyd);
3831
+
3832
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
3833
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
3834
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
3835
+
3836
+ vector signed char q4x0 = vec_and(qxs, lowMask);
3837
+ vector signed char q4x1 = vec_sr(qxs, v4);
3838
+
3839
+ q4x0 = vec_sub(q4x0, v8);
3840
+ q4x1 = vec_sub(q4x1, v8);
3841
+
3842
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
3843
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
3844
+
3845
+ qv0 = vec_add(qv0, qv1);
3846
+
3847
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
3848
+
3849
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
3850
+ }
3851
+
3852
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
3853
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
3854
+
3855
+ *s = vec_extract(vsumf0, 0);
3737
3856
  #else
3738
3857
  // scalar
3739
3858
  float sumf = 0.0;
@@ -3776,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3776
3895
  #if defined(__ARM_FEATURE_MATMUL_INT8)
3777
3896
  if (nrc == 2) {
3778
3897
  const block_q4_1 * restrict vx0 = vx;
3779
- const block_q4_1 * restrict vx1 = vx + bx;
3898
+ const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
3780
3899
  const block_q8_1 * restrict vy0 = vy;
3781
- const block_q8_1 * restrict vy1 = vy + by;
3900
+ const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
3782
3901
 
3783
3902
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
3784
3903
  float32x4_t summs0 = vdupq_n_f32(0.0f);
@@ -3789,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3789
3908
  const block_q8_1 * restrict b_y0 = &vy0[i];
3790
3909
  const block_q8_1 * restrict b_y1 = &vy1[i];
3791
3910
 
3792
- float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
3793
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
3794
- GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
3795
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
3796
- summs0 += summs_t;
3911
+ float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
3912
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
3913
+ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
3914
+ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
3915
+ summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
3797
3916
 
3798
3917
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
3799
3918
 
@@ -3813,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3813
3932
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
3814
3933
 
3815
3934
  // mmla into int32x4_t
3816
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
3817
- GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
3818
- GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
3819
- GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
3935
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
3936
+ GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
3937
+ GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
3938
+ GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
3939
+ float32x4_t scale = vld1q_f32(_scale);
3820
3940
 
3821
3941
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
3822
3942
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3835,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3835
3955
 
3836
3956
  float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
3837
3957
  float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
3838
- sumv2 = sumv2 + summs0;
3958
+ sumv2 = vaddq_f32(sumv2, summs0);
3839
3959
 
3840
3960
  vst1_f32(s, vget_low_f32(sumv2));
3841
3961
  vst1_f32(s + bs, vget_high_f32(sumv2));
@@ -3952,6 +4072,46 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
3952
4072
  }
3953
4073
 
3954
4074
  *s = sumf;
4075
+ #elif defined(__POWER9_VECTOR__)
4076
+ const vector signed char lowMask = vec_splats((signed char)0xF);
4077
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4078
+
4079
+ vector float vsumf0 = vec_splats(0.0f);
4080
+
4081
+ #pragma GCC unroll 4
4082
+ for (int i = 0; i < nb; i++) {
4083
+ __builtin_prefetch(x[i].qs, 0, 1);
4084
+ __builtin_prefetch(y[i].qs, 0, 1);
4085
+
4086
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
4087
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
4088
+ vector float vd = vec_mul(vxd, vyd);
4089
+
4090
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
4091
+ vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.0f, 0.0f, 0.0f};
4092
+ vsumf0 = vec_madd(vxmin, vys, vsumf0);
4093
+
4094
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
4095
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
4096
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
4097
+
4098
+ vector signed char q4x0 = vec_and(qxs, lowMask);
4099
+ vector signed char q4x1 = vec_sr(qxs, v4);
4100
+
4101
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
4102
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
4103
+
4104
+ qv0 = vec_add(qv0, qv1);
4105
+
4106
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
4107
+
4108
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4109
+ }
4110
+
4111
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
4112
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
4113
+
4114
+ *s = vec_extract(vsumf0, 0);
3955
4115
  #else
3956
4116
  // scalar
3957
4117
  float sumf = 0.0;
@@ -4237,6 +4397,49 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4237
4397
  }
4238
4398
 
4239
4399
  *s = sumf;
4400
+ #elif defined(__POWER9_VECTOR__)
4401
+ const vector signed char lowMask = vec_splats((signed char)0xF);
4402
+ const vector unsigned char v4 = vec_splats((unsigned char)4);
4403
+
4404
+ vector float vsumf0 = vec_splats(0.0f);
4405
+
4406
+ #pragma GCC unroll 4
4407
+ for (int i = 0; i < nb; ++i) {
4408
+ __builtin_prefetch(x[i].qs, 0, 1);
4409
+ __builtin_prefetch(y[i].qs, 0, 1);
4410
+
4411
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
4412
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
4413
+ vector float vd = vec_mul(vxd, vyd);
4414
+
4415
+ vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[i].qh[0]]), (uint64_t)(table_b2b_1[x[i].qh[1]])};
4416
+ vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[i].qh[2]]), (uint64_t)(table_b2b_1[x[i].qh[3]])};
4417
+
4418
+ vector signed char qh0 = (vector signed char)aux64x2_0;
4419
+ vector signed char qh1 = (vector signed char)aux64x2_1;
4420
+
4421
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
4422
+
4423
+ vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
4424
+ vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
4425
+
4426
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
4427
+ vector signed char q8y1 = vec_xl( 16, y[i].qs);
4428
+
4429
+ vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
4430
+ vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
4431
+
4432
+ qv0 = vec_add(qv0, qv1);
4433
+
4434
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
4435
+
4436
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4437
+ }
4438
+
4439
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
4440
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
4441
+
4442
+ *s = vec_extract(vsumf0, 0);
4240
4443
  #else
4241
4444
  // scalar
4242
4445
  float sumf = 0.0;
@@ -4541,6 +4744,53 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
4541
4744
  }
4542
4745
 
4543
4746
  *s = sumf;
4747
+ #elif defined(__POWER9_VECTOR__)
4748
+ const vector signed char lowMask = vec_splats((signed char)0xF);
4749
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
4750
+
4751
+ vector float vsumf0 = vec_splats(0.0f);
4752
+
4753
+ #pragma GCC unroll 4
4754
+ for (int i = 0; i < nb; ++i) {
4755
+ __builtin_prefetch(x[i].qs, 0, 1);
4756
+ __builtin_prefetch(y[i].qs, 0, 1);
4757
+
4758
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
4759
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
4760
+ vector float vd = vec_mul(vxd, vyd);
4761
+
4762
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
4763
+ vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.f, 0.f, 0.f};
4764
+ vsumf0 = vec_madd(vxmin, vys, vsumf0);
4765
+
4766
+ vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[i].qh[0]]), (uint64_t)(table_b2b_0[x[i].qh[1]])};
4767
+ vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[i].qh[2]]), (uint64_t)(table_b2b_0[x[i].qh[3]])};
4768
+
4769
+ vector signed char qh0 = (vector signed char)aux64x2_0;
4770
+ vector signed char qh1 = (vector signed char)aux64x2_1;
4771
+
4772
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
4773
+
4774
+ vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
4775
+ vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
4776
+
4777
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
4778
+ vector signed char q8y1 = vec_xl( 16, y[i].qs);
4779
+
4780
+ vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
4781
+ vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
4782
+
4783
+ qv0 = vec_add(qv0, qv1);
4784
+
4785
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
4786
+
4787
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
4788
+ }
4789
+
4790
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
4791
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
4792
+
4793
+ *s = vec_extract(vsumf0, 0);
4544
4794
  #else
4545
4795
  // scalar
4546
4796
  float sumf = 0.0;
@@ -4589,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4589
4839
  #if defined(__ARM_FEATURE_MATMUL_INT8)
4590
4840
  if (nrc == 2) {
4591
4841
  const block_q8_0 * restrict vx0 = vx;
4592
- const block_q8_0 * restrict vx1 = vx + bx;
4842
+ const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
4593
4843
  const block_q8_0 * restrict vy0 = vy;
4594
- const block_q8_0 * restrict vy1 = vy + by;
4844
+ const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
4595
4845
 
4596
4846
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
4597
4847
 
@@ -4613,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4613
4863
  const int8x16_t y1_l = vld1q_s8(b_y1->qs);
4614
4864
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
4615
4865
 
4616
- float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
4617
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
4618
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
4619
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
4866
+ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
4867
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
4868
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
4869
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
4870
+ float32x4_t scale = vld1q_f32(_scale);
4620
4871
 
4621
4872
  int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
4622
4873
  int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4716,6 +4967,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
4716
4967
  }
4717
4968
 
4718
4969
  *s = sumf;
4970
+ #elif defined(__POWER9_VECTOR__)
4971
+ vector float vsumf0 = vec_splats(0.0f);
4972
+
4973
+ #pragma GCC unroll 4
4974
+ for (int i = 0; i < nb; i++) {
4975
+ __builtin_prefetch(x[i].qs, 0, 1);
4976
+ __builtin_prefetch(y[i].qs, 0, 1);
4977
+
4978
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
4979
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
4980
+ vector float vd = vec_mul(vxd, vyd);
4981
+
4982
+ vector signed char q8x0 = vec_xl( 0, x[i].qs);
4983
+ vector signed char q8x1 = vec_xl(16, x[i].qs);
4984
+ vector signed char q8y0 = vec_xl( 0, y[i].qs);
4985
+ vector signed char q8y1 = vec_xl(16, y[i].qs);
4986
+
4987
+ vector signed short qv0 = vec_mule(q8x0, q8y0);
4988
+ vector signed short qv1 = vec_mulo(q8x0, q8y0);
4989
+ vector signed short qv2 = vec_mule(q8x1, q8y1);
4990
+ vector signed short qv3 = vec_mulo(q8x1, q8y1);
4991
+
4992
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
4993
+ vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
4994
+ vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
4995
+ vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
4996
+
4997
+ vsumi0 = vec_add(vsumi0, vsumi2);
4998
+ vsumi1 = vec_add(vsumi1, vsumi3);
4999
+
5000
+ vsumi0 = vec_add(vsumi0, vsumi1);
5001
+
5002
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5003
+ }
5004
+
5005
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
5006
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
5007
+
5008
+ *s = vec_extract(vsumf0, 0);
4719
5009
  #else
4720
5010
  // scalar
4721
5011
  float sumf = 0.0;
@@ -5071,6 +5361,147 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5071
5361
 
5072
5362
  *s = sumf;
5073
5363
 
5364
+ #elif defined(__POWER9_VECTOR__)
5365
+ const vector signed char lowMask = vec_splats((signed char)0x3);
5366
+ const vector signed char lowScaleMask = vec_splats((signed char)0xF);
5367
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5368
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5369
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5370
+
5371
+ vector float vsumf0 = vec_splats(0.0f);
5372
+ vector float vsumf1 = vec_splats(0.0f);
5373
+ vector float vsumf2 = vec_splats(0.0f);
5374
+ vector float vsumf3 = vec_splats(0.0f);
5375
+
5376
+ for (int i = 0; i < nb; ++i) {
5377
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
5378
+ vector float vyd = vec_splats(y[i].d);
5379
+ vector float vd = vec_mul(vxd, vyd);
5380
+
5381
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
5382
+ vector float vdmin = vec_mul(vxmin, vyd);
5383
+
5384
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
5385
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
5386
+
5387
+ vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
5388
+ vector signed char vscales = vec_and(q2xmins, lowScaleMask);
5389
+
5390
+ q2xmins = vec_sr(q2xmins, v4);
5391
+ vector signed short q2xmins0 = vec_unpackh(q2xmins);
5392
+ vector signed short q2xmins1 = vec_unpackl(q2xmins);
5393
+
5394
+ vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
5395
+ vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
5396
+ vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
5397
+ vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
5398
+
5399
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
5400
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
5401
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
5402
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
5403
+
5404
+ vector signed int vsumi0 = vec_splats((int32_t)0);
5405
+ vector signed int vsumi1 = vec_splats((int32_t)0);
5406
+ vector signed int vsumi2 = vec_splats((int32_t)0);
5407
+ vector signed int vsumi3 = vec_splats((int32_t)0);
5408
+ vector signed int vsumi4 = vec_splats((int32_t)0);
5409
+ vector signed int vsumi5 = vec_splats((int32_t)0);
5410
+ vector signed int vsumi6 = vec_splats((int32_t)0);
5411
+ vector signed int vsumi7 = vec_splats((int32_t)0);
5412
+
5413
+ const uint8_t * restrict q2 = x[i].qs;
5414
+ const int8_t * restrict q8 = y[i].qs;
5415
+
5416
+ for (int j = 0; j < QK_K/128; ++j) {
5417
+ __builtin_prefetch(q2, 0, 1);
5418
+ __builtin_prefetch(q8, 0, 1);
5419
+
5420
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
5421
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
5422
+ q2 += 32;
5423
+
5424
+ vector signed char q2x00 = vec_and(qxs0, lowMask);
5425
+ vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
5426
+ vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
5427
+ vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
5428
+ vector signed char q2x10 = vec_and(qxs1, lowMask);
5429
+ vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
5430
+ vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
5431
+ vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
5432
+
5433
+ vector signed char q8y00 = vec_xl( 0, q8);
5434
+ vector signed char q8y10 = vec_xl( 16, q8);
5435
+ vector signed char q8y01 = vec_xl( 32, q8);
5436
+ vector signed char q8y11 = vec_xl( 48, q8);
5437
+ vector signed char q8y02 = vec_xl( 64, q8);
5438
+ vector signed char q8y12 = vec_xl( 80, q8);
5439
+ vector signed char q8y03 = vec_xl( 96, q8);
5440
+ vector signed char q8y13 = vec_xl(112, q8);
5441
+ q8 += 128;
5442
+
5443
+ vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
5444
+ vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
5445
+ vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
5446
+ vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
5447
+ vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
5448
+ vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
5449
+ vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
5450
+ vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
5451
+
5452
+ vector signed short vscales_h = vec_unpackh(vscales);
5453
+ vector signed short vs0 = vec_splat(vscales_h, 0);
5454
+ vector signed short vs1 = vec_splat(vscales_h, 1);
5455
+ vector signed short vs2 = vec_splat(vscales_h, 2);
5456
+ vector signed short vs3 = vec_splat(vscales_h, 3);
5457
+ vector signed short vs4 = vec_splat(vscales_h, 4);
5458
+ vector signed short vs5 = vec_splat(vscales_h, 5);
5459
+ vector signed short vs6 = vec_splat(vscales_h, 6);
5460
+ vector signed short vs7 = vec_splat(vscales_h, 7);
5461
+ vscales = vec_sld(vscales, vscales, 8);
5462
+
5463
+ qv0 = vec_mul(qv0, vs0);
5464
+ qv1 = vec_mul(qv1, vs2);
5465
+ qv2 = vec_mul(qv2, vs4);
5466
+ qv3 = vec_mul(qv3, vs6);
5467
+
5468
+ qv0 = vec_madd(qv4, vs1, qv0);
5469
+ qv1 = vec_madd(qv5, vs3, qv1);
5470
+ qv2 = vec_madd(qv6, vs5, qv2);
5471
+ qv3 = vec_madd(qv7, vs7, qv3);
5472
+
5473
+ vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
5474
+ vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
5475
+ vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
5476
+ vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
5477
+
5478
+ vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
5479
+ vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
5480
+ vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
5481
+ vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
5482
+ }
5483
+
5484
+ vsumi0 = vec_add(vsumi0, vsumi4);
5485
+ vsumi1 = vec_add(vsumi1, vsumi5);
5486
+ vsumi2 = vec_add(vsumi2, vsumi6);
5487
+ vsumi3 = vec_add(vsumi3, vsumi7);
5488
+
5489
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5490
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
5491
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
5492
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
5493
+ }
5494
+
5495
+ vsumf0 = vec_add(vsumf0, vsumf2);
5496
+ vsumf1 = vec_add(vsumf1, vsumf3);
5497
+
5498
+ vsumf0 = vec_add(vsumf0, vsumf1);
5499
+
5500
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
5501
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
5502
+
5503
+ *s = vec_extract(vsumf0, 0);
5504
+
5074
5505
  #else
5075
5506
 
5076
5507
  float sumf = 0;
@@ -5341,6 +5772,87 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5341
5772
 
5342
5773
  *s = sumf;
5343
5774
 
5775
+ #elif defined(__POWER9_VECTOR__)
5776
+ const vector signed char lowMask = vec_splats((signed char)0x3);
5777
+ const vector signed char lowScaleMask = vec_splats((signed char)0xF);
5778
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
5779
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
5780
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
5781
+
5782
+ vector float vsumf0 = vec_splats(0.0f);
5783
+ vector float vsumf1 = vec_splats(0.0f);
5784
+ vector float vsumf2 = vec_splats(0.0f);
5785
+ vector float vsumf3 = vec_splats(0.0f);
5786
+
5787
+ #pragma GCC unroll 2
5788
+ for (int i = 0; i < nb; ++i) {
5789
+ __builtin_prefetch(x[i].qs, 0, 1);
5790
+ __builtin_prefetch(y[i].qs, 0, 1);
5791
+
5792
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
5793
+ vector float vyd = vec_splats(y[i].d);
5794
+ vector float vd = vec_mul(vxd, vyd);
5795
+
5796
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
5797
+ vector float vdmin = vec_mul(vxmin, vyd);
5798
+
5799
+ vector signed short q8ysums0 = vec_xl_len(y[i].bsums, 8);
5800
+
5801
+ vector signed char q2xmins = (vector signed char)vec_xl_len(x[i].scales, 4);
5802
+ vector signed char vscales = vec_and(q2xmins, lowScaleMask);
5803
+
5804
+ q2xmins = vec_sr(q2xmins, v4);
5805
+ vector signed short q2xmins0 = vec_unpackh((vector signed char)q2xmins);
5806
+
5807
+ vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
5808
+ vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
5809
+
5810
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
5811
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
5812
+
5813
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
5814
+ vector signed char q2x00 = vec_and(qxs0, lowMask);
5815
+ vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
5816
+ vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
5817
+ vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
5818
+
5819
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
5820
+ vector signed char q8y01 = vec_xl( 16, y[i].qs);
5821
+ vector signed char q8y02 = vec_xl( 32, y[i].qs);
5822
+ vector signed char q8y03 = vec_xl( 48, y[i].qs);
5823
+
5824
+ vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
5825
+ vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
5826
+ vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
5827
+ vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
5828
+
5829
+ vector signed short vscales_h = vec_unpackh(vscales);
5830
+ vector signed short vs0 = vec_splat(vscales_h, 0);
5831
+ vector signed short vs1 = vec_splat(vscales_h, 1);
5832
+ vector signed short vs2 = vec_splat(vscales_h, 2);
5833
+ vector signed short vs3 = vec_splat(vscales_h, 3);
5834
+
5835
+ vector signed int vsumi0 = vec_add(vec_mule(qv0, vs0), vec_mulo(qv0, vs0));
5836
+ vector signed int vsumi1 = vec_add(vec_mule(qv1, vs1), vec_mulo(qv1, vs1));
5837
+ vector signed int vsumi2 = vec_add(vec_mule(qv2, vs2), vec_mulo(qv2, vs2));
5838
+ vector signed int vsumi3 = vec_add(vec_mule(qv3, vs3), vec_mulo(qv3, vs3));
5839
+
5840
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
5841
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
5842
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
5843
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
5844
+ }
5845
+
5846
+ vsumf0 = vec_add(vsumf0, vsumf2);
5847
+ vsumf1 = vec_add(vsumf1, vsumf3);
5848
+
5849
+ vsumf0 = vec_add(vsumf0, vsumf1);
5850
+
5851
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
5852
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
5853
+
5854
+ *s = vec_extract(vsumf0, 0);
5855
+
5344
5856
  #else
5345
5857
 
5346
5858
  float sumf = 0;
@@ -5835,6 +6347,160 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
5835
6347
 
5836
6348
  *s = sumf;
5837
6349
 
6350
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: per-block integer dot products are scaled into four float accumulators whose horizontal sum is stored to *s
6351
+ const vector signed char lowMask = vec_splats((signed char)0x3); // keeps the low 2 bits of every byte
6352
+ const vector signed char v1 = vec_splats((signed char)0x1); // one-bit mask; also cast to unsigned below and used as a shift count of 1
6353
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2); // v2/v3/v4/v6: per-byte shift counts
6354
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
6355
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
6356
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
6357
+ const vector signed char off = vec_splats((signed char)0x20); // 32, subtracted from every unpacked scale byte
6358
+ // float accumulators carried across all blocks; reduced to a scalar after the outer loop
6359
+ vector float vsumf0 = vec_splats(0.0f);
6360
+ vector float vsumf1 = vec_splats(0.0f);
6361
+ vector float vsumf2 = vec_splats(0.0f);
6362
+ vector float vsumf3 = vec_splats(0.0f);
6363
+
6364
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
6365
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
6366
+ vector float vyd = vec_splats(y[i].d);
6367
+ vector float vd = vec_mul(vxd, vyd); // combined block scale x[i].d * y[i].d in every lane
6368
+ // unpack the 12 packed scale bytes into 16 bytes held in utmp
6369
+ uint32_t aux[3];
6370
+ uint32_t utmp[4];
6371
+
6372
+ memcpy(aux, x[i].scales, 12);
6373
+ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); // kmask1/kmask2 are defined outside this hunk
6374
+ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
6375
+ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
6376
+ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
6377
+
6378
+ vector signed char vscales = (vector signed char)vec_xl( 0, utmp); // all 16 unpacked scale bytes
6379
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask); // 32 bytes of hmask; 4 bits of each byte are consumed per inner iteration
6380
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
6381
+
6382
+ vscales = vec_sub(vscales, off); // scale byte - 32
6383
+ // eight integer accumulators, one per (q3 vector, 2-bit field) combination
6384
+ vector signed int vsumi0 = vec_splats((int32_t)0);
6385
+ vector signed int vsumi1 = vec_splats((int32_t)0);
6386
+ vector signed int vsumi2 = vec_splats((int32_t)0);
6387
+ vector signed int vsumi3 = vec_splats((int32_t)0);
6388
+ vector signed int vsumi4 = vec_splats((int32_t)0);
6389
+ vector signed int vsumi5 = vec_splats((int32_t)0);
6390
+ vector signed int vsumi6 = vec_splats((int32_t)0);
6391
+ vector signed int vsumi7 = vec_splats((int32_t)0);
6392
+
6393
+ const uint8_t * restrict q3 = x[i].qs;
6394
+ const int8_t * restrict q8 = y[i].qs;
6395
+
6396
+ for (int j = 0; j < QK_K/128; ++j) { // each iteration consumes 32 bytes of q3 and 128 bytes of q8
6397
+ __builtin_prefetch(q3, 0, 1);
6398
+ __builtin_prefetch(q8, 0, 1);
6399
+
6400
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
6401
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
6402
+ q3 += 32;
6403
+
6404
+ // the low 2 bits: four 2-bit fields per source byte, selected with shifts of 0/2/4/6
6405
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
6406
+ vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
6407
+ vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
6408
+ vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
6409
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
6410
+ vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
6411
+ vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
6412
+ vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
6413
+
6414
+ // the 3rd bit: vec_andc(v1, h) is 1 where the selected hmask bit is clear; shifted left by 2 it yields 4 (else 0)
6415
+ vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
6416
+ vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
6417
+ vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
6418
+ vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
6419
+ vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
6420
+ vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
6421
+ vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
6422
+ vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
6423
+ qxhs0 = vec_sr(qxhs0, v4); // drop the four hmask bits just used so the next iteration sees the following four
6424
+ qxhs1 = vec_sr(qxhs1, v4);
6425
+ // signed 3-bit value = 2-bit field minus 4 where the hmask bit was clear
6426
+ vector signed char q3x00 = vec_sub(qxs00, qxh00);
6427
+ vector signed char q3x01 = vec_sub(qxs01, qxh01);
6428
+ vector signed char q3x02 = vec_sub(qxs02, qxh02);
6429
+ vector signed char q3x03 = vec_sub(qxs03, qxh03);
6430
+ vector signed char q3x10 = vec_sub(qxs10, qxh10);
6431
+ vector signed char q3x11 = vec_sub(qxs11, qxh11);
6432
+ vector signed char q3x12 = vec_sub(qxs12, qxh12);
6433
+ vector signed char q3x13 = vec_sub(qxs13, qxh13);
6434
+ // q8 is read in memory order; the name suffix gives the q3 vector (first digit) and 2-bit field (second digit) it pairs with
6435
+ vector signed char q8y00 = vec_xl( 0, q8);
6436
+ vector signed char q8y10 = vec_xl( 16, q8);
6437
+ vector signed char q8y01 = vec_xl( 32, q8);
6438
+ vector signed char q8y11 = vec_xl( 48, q8);
6439
+ vector signed char q8y02 = vec_xl( 64, q8);
6440
+ vector signed char q8y12 = vec_xl( 80, q8);
6441
+ vector signed char q8y03 = vec_xl( 96, q8);
6442
+ vector signed char q8y13 = vec_xl(112, q8);
6443
+ q8 += 128;
6444
+ // widen half of the scale bytes to 16 bit and broadcast each of the eight scales into its own vector
6445
+ vector signed short vscales_h = vec_unpackh(vscales);
6446
+ vector signed short vs0 = vec_splat(vscales_h, 0);
6447
+ vector signed short vs1 = vec_splat(vscales_h, 1);
6448
+ vector signed short vs2 = vec_splat(vscales_h, 2);
6449
+ vector signed short vs3 = vec_splat(vscales_h, 3);
6450
+ vector signed short vs4 = vec_splat(vscales_h, 4);
6451
+ vector signed short vs5 = vec_splat(vscales_h, 5);
6452
+ vector signed short vs6 = vec_splat(vscales_h, 6);
6453
+ vector signed short vs7 = vec_splat(vscales_h, 7);
6454
+ vscales = vec_sld(vscales, vscales, 8); // rotate by 8 bytes so the next iteration unpacks the other eight scales
6455
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
6456
+ vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
6457
+ vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
6458
+ vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
6459
+ vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
6460
+ vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
6461
+ vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
6462
+ vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
6463
+ vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
6464
+ // apply the scales in q8 memory order: qv00->vs0, qv10->vs1, qv01->vs2, qv11->vs3, ... widening to 32 bit
6465
+ vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
6466
+ vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
6467
+ vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
6468
+ vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
6469
+ vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
6470
+ vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
6471
+ vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
6472
+ vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
6473
+
6474
+ vsumi0 = vec_add(vsum0, vsumi0);
6475
+ vsumi1 = vec_add(vsum1, vsumi1);
6476
+ vsumi2 = vec_add(vsum2, vsumi2);
6477
+ vsumi3 = vec_add(vsum3, vsumi3);
6478
+ vsumi4 = vec_add(vsum4, vsumi4);
6479
+ vsumi5 = vec_add(vsum5, vsumi5);
6480
+ vsumi6 = vec_add(vsum6, vsumi6);
6481
+ vsumi7 = vec_add(vsum7, vsumi7);
6482
+ }
6483
+ // fold the eight integer accumulators into four
6484
+ vsumi0 = vec_add(vsumi0, vsumi4);
6485
+ vsumi1 = vec_add(vsumi1, vsumi5);
6486
+ vsumi2 = vec_add(vsumi2, vsumi6);
6487
+ vsumi3 = vec_add(vsumi3, vsumi7);
6488
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
6489
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
6490
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
6491
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
6492
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
6493
+ }
6494
+ // combine the four float accumulators into vsumf0
6495
+ vsumf0 = vec_add(vsumf0, vsumf2);
6496
+ vsumf1 = vec_add(vsumf1, vsumf3);
6497
+
6498
+ vsumf0 = vec_add(vsumf0, vsumf1);
6499
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
6500
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
6501
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
6502
+
6503
+ *s = vec_extract(vsumf0, 0);
5838
6504
  #else
5839
6505
  // scalar version
5840
6506
  // This function is written like this so the compiler can manage to vectorize most of it
@@ -6201,6 +6867,95 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6201
6867
 
6202
6868
  *s = sumf;
6203
6869
 
6870
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path for blocks of 64 values: four 16-byte q8 loads per block, result stored to *s
6871
+ const vector signed char lowMask = vec_splats((signed char)0x3); // keeps the low 2 bits of every byte
6872
+ const vector signed char v1 = vec_splats((signed char)0x1); // one-bit mask; also cast to unsigned below and used as a shift count of 1
6873
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2); // v2/v4/v6: per-byte shift counts
6874
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
6875
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
6876
+ const vector signed char off = vec_splats((signed char)0x8); // 8, subtracted from every 4-bit scale
6877
+ // float accumulators carried across all blocks; reduced to a scalar after the loop
6878
+ vector float vsumf0 = vec_splats(0.0f);
6879
+ vector float vsumf1 = vec_splats(0.0f);
6880
+ vector float vsumf2 = vec_splats(0.0f);
6881
+ vector float vsumf3 = vec_splats(0.0f);
6882
+
6883
+ #pragma GCC unroll 2
6884
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
6885
+ __builtin_prefetch(x[i].qs, 0, 1);
6886
+ __builtin_prefetch(y[i].qs, 0, 1);
6887
+
6888
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
6889
+ vector float vyd = vec_splats(y[i].d);
6890
+ vector float vd = vec_mul(vxd, vyd); // combined block scale x[i].d * y[i].d in every lane
6891
+ // split two packed scale bytes into four 4-bit scales stored in the 4-byte aux16 buffer
6892
+ uint16_t aux16[2];
6893
+ int8_t * scales = (int8_t *)aux16;
6894
+
6895
+ const uint16_t a = *(const uint16_t *)x[i].scales;
6896
+ aux16[0] = a & 0x0f0f; // low nibbles of the two scale bytes
6897
+ aux16[1] = (a >> 4) & 0x0f0f; // high nibbles of the two scale bytes
6898
+
6899
+ vector signed char vscales = (vector signed char)vec_xl_len(scales, 4); // aux16 holds only 4 bytes; a length of 8 would read past it, and only lanes 0..3 are used below
6900
+ vector signed char qxhs0 = (vector signed char)vec_xl_len(x[i].hmask, 8);
6901
+ qxhs0 = vec_or(qxhs0, vec_sr(vec_sld(qxhs0, qxhs0, 8), (vector unsigned char)v1)); // merge the 8 hmask bytes with a copy rotated into the other half and shifted right by one bit
6902
+
6903
+ vscales = vec_sub(vscales, off); // scale - 8
6904
+ // four 2-bit fields per source byte, selected with shifts of 0/2/4/6
6905
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
6906
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
6907
+ vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
6908
+ vector signed char qxs10 = vec_and(vec_sr(qxs0, v4), lowMask);
6909
+ vector signed char qxs11 = vec_and(vec_sr(qxs0, v6), lowMask);
6910
+
6911
+ // the 3rd bit: vec_andc(v1, h) is 1 where the selected hmask bit is clear; shifted left by 2 it yields 4 (else 0)
6912
+ vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
6913
+ vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
6914
+ vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v4)), v2);
6915
+ vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v6)), v2);
6916
+ qxhs0 = vec_sr(qxhs0, v4); // NOTE(review): qxhs0 is not read again before it is re-declared in the next iteration
6917
+ // signed 3-bit value = 2-bit field minus 4 where the hmask bit was clear
6918
+ vector signed char q3x00 = vec_sub(qxs00, qxh00);
6919
+ vector signed char q3x01 = vec_sub(qxs01, qxh01);
6920
+ vector signed char q3x10 = vec_sub(qxs10, qxh02);
6921
+ vector signed char q3x11 = vec_sub(qxs11, qxh03);
6922
+ // the 64 q8 bytes of the block, in memory order
6923
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
6924
+ vector signed char q8y01 = vec_xl( 16, y[i].qs);
6925
+ vector signed char q8y10 = vec_xl( 32, y[i].qs);
6926
+ vector signed char q8y11 = vec_xl( 48, y[i].qs);
6927
+ // widen the scales to 16 bit and broadcast the first four into their own vectors
6928
+ vector signed short vscales_h = vec_unpackh(vscales);
6929
+ vector signed short vs0 = vec_splat(vscales_h, 0);
6930
+ vector signed short vs1 = vec_splat(vscales_h, 1);
6931
+ vector signed short vs2 = vec_splat(vscales_h, 2);
6932
+ vector signed short vs3 = vec_splat(vscales_h, 3);
6933
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
6934
+ vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
6935
+ vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
6936
+ vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
6937
+ vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
6938
+ // apply one scale per 16-value group, widening to 32 bit
6939
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
6940
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
6941
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
6942
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
6943
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
6944
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
6945
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
6946
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
6947
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
6948
+ }
6949
+ // combine the four float accumulators into vsumf0
6950
+ vsumf0 = vec_add(vsumf0, vsumf2);
6951
+ vsumf1 = vec_add(vsumf1, vsumf3);
6952
+
6953
+ vsumf0 = vec_add(vsumf0, vsumf1);
6954
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
6955
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
6956
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
6957
+
6958
+ *s = vec_extract(vsumf0, 0);
6204
6959
  #else
6205
6960
 
6206
6961
  int8_t aux8[QK_K];
@@ -6553,6 +7308,142 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6553
7308
 
6554
7309
  *s = sumf;
6555
7310
 
7311
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: subtracts the min contribution, adds the scaled nibble dot products, and stores the horizontal sum to *s
7312
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of every byte
7313
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4); // shift count that exposes the high nibble
7314
+ // float accumulators carried across all blocks; reduced to a scalar after the outer loop
7315
+ vector float vsumf0 = vec_splats(0.0f);
7316
+ vector float vsumf1 = vec_splats(0.0f);
7317
+ vector float vsumf2 = vec_splats(0.0f);
7318
+ vector float vsumf3 = vec_splats(0.0f);
7319
+
7320
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
7321
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
7322
+ vector float vyd = vec_splats(y[i].d);
7323
+ vector float vd = vec_mul(vxd, vyd); // x[i].d * y[i].d: factor for the quantized products
7324
+
7325
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
7326
+ vector float vdmin = vec_mul(vxmin, vyd); // x[i].dmin * y[i].d: factor for the min correction
7327
+ // 16 int16 partial sums from y[i].bsums (two vectors of eight)
7328
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
7329
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
7330
+
7331
+ memcpy(utmp, x[i].scales, 12); // utmp and kmask1/kmask2/kmask3 are declared outside this hunk
7332
+ // rearrange the 12 packed bytes in place into 16 bytes across utmp[0..3]
7333
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
7334
+ const uint32_t uaux = utmp[1] & kmask1;
7335
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
7336
+ utmp[2] = uaux;
7337
+ utmp[0] &= kmask1;
7338
+
7339
+ vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
7340
+ vector signed short vscales = vec_unpackh(utmps); // one half of the bytes, widened: the eight scales
7341
+ vector signed short q4xmins = vec_unpackl(utmps); // the other half, widened: the eight mins
7342
+ vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins); // merging a vector with itself duplicates each min into two adjacent lanes
7343
+ vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
7344
+ // min * bsums products; even lanes via vec_mule, odd lanes via vec_mulo
7345
+ vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
7346
+ vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
7347
+ vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
7348
+ vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
7349
+ // vec_nmsub(a, b, c) = c - a*b: subtract vdmin * (min * bsum) from the accumulators
7350
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
7351
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
7352
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
7353
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
7354
+ // integer accumulators for the scaled q4 * q8 products of this block
7355
+ vector signed int vsumi0 = vec_splats((int32_t)0);
7356
+ vector signed int vsumi1 = vec_splats((int32_t)0);
7357
+ vector signed int vsumi2 = vec_splats((int32_t)0);
7358
+ vector signed int vsumi3 = vec_splats((int32_t)0);
7359
+ vector signed int vsumi4 = vec_splats((int32_t)0);
7360
+ vector signed int vsumi5 = vec_splats((int32_t)0);
7361
+ vector signed int vsumi6 = vec_splats((int32_t)0);
7362
+ vector signed int vsumi7 = vec_splats((int32_t)0);
7363
+
7364
+ const uint8_t * restrict q4 = x[i].qs;
7365
+ const int8_t * restrict q8 = y[i].qs;
7366
+
7367
+ for (int j = 0; j < QK_K/64; j+=2) { // each iteration consumes 64 bytes of q4 and 128 bytes of q8
7368
+ __builtin_prefetch(q4, 0, 1);
7369
+ __builtin_prefetch(q8, 0, 1);
7370
+
7371
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
7372
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
7373
+ vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
7374
+ vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
7375
+ q4 += 64;
7376
+ // split every byte into its low nibble (suffix 0) and high nibble (suffix 1)
7377
+ vector signed char q4x00 = vec_and(qxs0, lowMask);
7378
+ vector signed char q4x01 = vec_sr(qxs0, v4);
7379
+ vector signed char q4x10 = vec_and(qxs1, lowMask);
7380
+ vector signed char q4x11 = vec_sr(qxs1, v4);
7381
+ vector signed char q4x20 = vec_and(qxs2, lowMask);
7382
+ vector signed char q4x21 = vec_sr(qxs2, v4);
7383
+ vector signed char q4x30 = vec_and(qxs3, lowMask);
7384
+ vector signed char q4x31 = vec_sr(qxs3, v4);
7385
+ // q8 is read in memory order; the name suffix gives the q4 vector (first digit) and nibble (second digit) it pairs with
7386
+ vector signed char q8y00 = vec_xl( 0, q8);
7387
+ vector signed char q8y10 = vec_xl( 16, q8);
7388
+ vector signed char q8y01 = vec_xl( 32, q8);
7389
+ vector signed char q8y11 = vec_xl( 48, q8);
7390
+ vector signed char q8y20 = vec_xl( 64, q8);
7391
+ vector signed char q8y30 = vec_xl( 80, q8);
7392
+ vector signed char q8y21 = vec_xl( 96, q8);
7393
+ vector signed char q8y31 = vec_xl(112, q8);
7394
+ q8 += 128;
7395
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
7396
+ vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
7397
+ vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
7398
+ vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
7399
+ vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
7400
+ vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
7401
+ vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
7402
+ vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
7403
+ vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
7404
+ // broadcast four scales, then rotate by 8 bytes (four shorts) so the next iteration picks up the other four
7405
+ vector signed short vs0 = vec_splat(vscales, 0);
7406
+ vector signed short vs1 = vec_splat(vscales, 1);
7407
+ vector signed short vs2 = vec_splat(vscales, 2);
7408
+ vector signed short vs3 = vec_splat(vscales, 3);
7409
+ vscales = vec_sld(vscales, vscales, 8);
7410
+ // merge the two 16-byte halves that share a scale; note qv10 and qv30 are reused to hold the high-nibble sums
7411
+ qv00 = vec_add(qv00, qv10);
7412
+ qv10 = vec_add(qv01, qv11);
7413
+ qv20 = vec_add(qv20, qv30);
7414
+ qv30 = vec_add(qv21, qv31);
7415
+ // scale and accumulate; even and odd lanes are kept in separate accumulators
7416
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
7417
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
7418
+ vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
7419
+ vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
7420
+ vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
7421
+ vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
7422
+ vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
7423
+ vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
7424
+ }
7425
+ // fold the eight integer accumulators into four
7426
+ vsumi0 = vec_add(vsumi0, vsumi4);
7427
+ vsumi1 = vec_add(vsumi1, vsumi5);
7428
+ vsumi2 = vec_add(vsumi2, vsumi6);
7429
+ vsumi3 = vec_add(vsumi3, vsumi7);
7430
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
7431
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7432
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7433
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
7434
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
7435
+ }
7436
+ // combine the four float accumulators into vsumf0
7437
+ vsumf0 = vec_add(vsumf0, vsumf2);
7438
+ vsumf1 = vec_add(vsumf1, vsumf3);
7439
+
7440
+ vsumf0 = vec_add(vsumf0, vsumf1);
7441
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
7442
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
7443
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
7444
+
7445
+ *s = vec_extract(vsumf0, 0);
7446
+
6556
7447
  #else
6557
7448
 
6558
7449
 
@@ -6819,6 +7710,87 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6819
7710
 
6820
7711
  *s = sumf;
6821
7712
 
7713
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path for blocks of 64 values: four 16-byte q8 loads per block, result stored to *s
7714
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of every byte
7715
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4); // shift count that exposes the high nibble
7716
+ // float accumulators carried across all blocks; reduced to a scalar after the loop
7717
+ vector float vsumf0 = vec_splats(0.0f);
7718
+ vector float vsumf1 = vec_splats(0.0f);
7719
+ vector float vsumf2 = vec_splats(0.0f);
7720
+ vector float vsumf3 = vec_splats(0.0f);
7721
+
7722
+ #pragma GCC unroll 2
7723
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
7724
+ __builtin_prefetch(x[i].qs, 0, 1);
7725
+ __builtin_prefetch(y[i].qs, 0, 1);
7726
+
7727
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d[1])); // d[1] is the factor used for the min correction below
7728
+ vector float vyd = vec_splats(y[i].d);
7729
+ vector float vd= vec_mul(vxd, vyd);
7730
+ // split two packed bytes into four 4-bit values stored in the 4-byte s16 buffer
7731
+ uint16_t s16[2];
7732
+ const uint8_t * scales = (const uint8_t *)s16;
7733
+
7734
+ const uint16_t * restrict b = (const uint16_t *)x[i].scales;
7735
+ s16[0] = b[0] & 0x0f0f; // low nibbles
7736
+ s16[1] = (b[0] >> 4) & 0x0f0f; // high nibbles
7737
+
7738
+ vector signed char utmps = (vector signed char)vec_xl_len(scales, 4); // loads exactly the 4 bytes of s16
7739
+ vector signed short vscales = (vector signed short)vec_unpackh(utmps);
7740
+ vector signed short q4xmins0 = vec_mergeh(vscales, vscales); // each of the four values duplicated into two adjacent lanes
7741
+ q4xmins0 = vec_sld(q4xmins0, q4xmins0, 8); // rotate by 8 bytes so the second pair of values lines up with the loaded bsums lanes
7742
+
7743
+ vector signed short q8ysums0 = vec_xl_len((const int16_t *)(y[i].bsums), 8); // 8 bytes = four int16 partial sums
7744
+ // min * bsums products; even lanes via vec_mule, odd lanes via vec_mulo
7745
+ vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
7746
+ vector signed int prod1 = vec_mulo(q4xmins0, q8ysums0);
7747
+ // vec_nmsub(a, b, c) = c - a*b: subtract the min contribution
7748
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vd, vsumf0);
7749
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vd, vsumf1);
7750
+
7751
+ vd = vec_mul(vyd, vec_splats(GGML_FP16_TO_FP32(x[i].d[0]))); // from here on vd is y[i].d * x[i].d[0], the factor for the quantized products
7752
+
7753
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
7754
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
7755
+ vector signed char q4x00 = vec_and(qxs0, lowMask); // low nibbles
7756
+ vector signed char q4x01 = vec_sr(qxs0, v4); // high nibbles
7757
+ vector signed char q4x10 = vec_and(qxs1, lowMask);
7758
+ vector signed char q4x11 = vec_sr(qxs1, v4);
7759
+ // q8 in memory order: low-nibble partners first (offsets 0, 16), then high-nibble partners (32, 48)
7760
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
7761
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
7762
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
7763
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
7764
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
7765
+ vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
7766
+ vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
7767
+ vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
7768
+ vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
7769
+ // first two unpacked values are the scales: vs0 for the low-nibble half, vs1 for the high-nibble half
7770
+ vector signed short vs0 = vec_splat(vscales, 0);
7771
+ vector signed short vs1 = vec_splat(vscales, 1);
7772
+
7773
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
7774
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs0), vec_mulo(qv10, vs0));
7775
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs1), vec_mulo(qv01, vs1));
7776
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs1), vec_mulo(qv11, vs1));
7777
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
7778
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
7779
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
7780
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
7781
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
7782
+ }
7783
+ // combine the four float accumulators into vsumf0
7784
+ vsumf0 = vec_add(vsumf0, vsumf2);
7785
+ vsumf1 = vec_add(vsumf1, vsumf3);
7786
+
7787
+ vsumf0 = vec_add(vsumf0, vsumf1);
7788
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
7789
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
7790
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
7791
+
7792
+ *s = vec_extract(vsumf0, 0);
7793
+
6822
7794
  #else
6823
7795
 
6824
7796
  uint8_t aux8[QK_K];
@@ -7220,6 +8192,130 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7220
8192
 
7221
8193
  *s = sumf+sums;
7222
8194
 
8195
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: subtracts the min contribution, adds the scaled 5-bit dot products, and stores the horizontal sum to *s
8196
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of every byte
8197
+ const vector unsigned char v1 = vec_splats((unsigned char)0x1); // v1/v2 double as bit masks (cast to signed) for the qh bits
8198
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
8199
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3); // v3/v4: per-byte shift counts
8200
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
8201
+ // float accumulators carried across all blocks; reduced to a scalar after the outer loop
8202
+ vector float vsumf0 = vec_splats(0.0f);
8203
+ vector float vsumf1 = vec_splats(0.0f);
8204
+ vector float vsumf2 = vec_splats(0.0f);
8205
+ vector float vsumf3 = vec_splats(0.0f);
8206
+
8207
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
8208
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
8209
+ vector float vyd = vec_splats(y[i].d);
8210
+ vector float vd = vec_mul(vxd, vyd); // x[i].d * y[i].d: factor for the quantized products
8211
+
8212
+ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
8213
+ vector float vdmin = vec_mul(vxmin, vyd); // x[i].dmin * y[i].d: factor for the min correction
8214
+
8215
+ memcpy(utmp, x[i].scales, 12); // utmp and kmask1/kmask2/kmask3 are declared outside this hunk
8216
+ // rearrange the 12 packed bytes in place into 16 bytes across utmp[0..3]
8217
+ utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
8218
+ const uint32_t uaux = utmp[1] & kmask1;
8219
+ utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
8220
+ utmp[2] = uaux;
8221
+ utmp[0] &= kmask1;
8222
+ // 16 int16 partial sums from y[i].bsums (two vectors of eight)
8223
+ vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
8224
+ vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
8225
+
8226
+ vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
8227
+ vector signed short vscales = vec_unpackh(utmps); // one half of the bytes, widened: the eight scales
8228
+
8229
+ vector signed short q5xmins = vec_unpackl(utmps); // the other half, widened: the eight mins
8230
+ vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins); // merging a vector with itself duplicates each min into two adjacent lanes
8231
+ vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
8232
+ // min * bsums products; even lanes via vec_mule, odd lanes via vec_mulo
8233
+ vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
8234
+ vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
8235
+ vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
8236
+ vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
8237
+ // vec_nmsub(a, b, c) = c - a*b: subtract vdmin * (min * bsum) from the accumulators
8238
+ vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
8239
+ vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
8240
+ vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
8241
+ vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
8242
+ // 32 bytes of high bits; two bits of each byte are consumed per inner iteration
8243
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
8244
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
8245
+
8246
+ vector signed int vsumi0 = vec_splats((int32_t)0);
8247
+ vector signed int vsumi1 = vec_splats((int32_t)0);
8248
+ vector signed int vsumi2 = vec_splats((int32_t)0);
8249
+ vector signed int vsumi3 = vec_splats((int32_t)0);
8250
+
8251
+ const uint8_t * restrict q5 = x[i].qs;
8252
+ const int8_t * restrict q8 = y[i].qs;
8253
+
8254
+ for (int j = 0; j < QK_K/64; ++j) { // each iteration consumes 32 bytes of q5 and 64 bytes of q8
8255
+ __builtin_prefetch(q5, 0, 1);
8256
+ __builtin_prefetch(q8, 0, 1);
8257
+
8258
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
8259
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
8260
+ q5 += 32;
8261
+ // low nibble (suffix 0) and high nibble (suffix 1) of every byte
8262
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
8263
+ vector signed char qxs01 = vec_sr(qxs0, v4);
8264
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
8265
+ vector signed char qxs11 = vec_sr(qxs1, v4);
8266
+ // move qh bit 0 (for the low nibble) and bit 1 (for the high nibble) to bit position 4, i.e. value 16
8267
+ vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
8268
+ vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
8269
+ vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
8270
+ vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
8271
+ qxhs0 = vec_sr(qxhs0, v2); // drop the two qh bits just used
8272
+ qxhs1 = vec_sr(qxhs1, v2);
8273
+ // 5-bit value = nibble | (high bit << 4)
8274
+ vector signed char q5x00 = vec_or(q5h00, qxs00);
8275
+ vector signed char q5x01 = vec_or(q5h01, qxs01);
8276
+ vector signed char q5x10 = vec_or(q5h10, qxs10);
8277
+ vector signed char q5x11 = vec_or(q5h11, qxs11);
8278
+ // q8 in memory order: low-nibble partners first (offsets 0, 16), then high-nibble partners (32, 48)
8279
+ vector signed char q8y00 = vec_xl( 0, q8);
8280
+ vector signed char q8y10 = vec_xl(16, q8);
8281
+ vector signed char q8y01 = vec_xl(32, q8);
8282
+ vector signed char q8y11 = vec_xl(48, q8);
8283
+ q8 += 64;
8284
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
8285
+ vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
8286
+ vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
8287
+ vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
8288
+ vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
8289
+ // broadcast two scales, then rotate vscales so lanes 0 and 1 hold the pair for the next iteration
8290
+ vector signed short vs0 = vec_splat(vscales, 0);
8291
+ vector signed short vs1 = vec_splat(vscales, 1);
8292
+ vscales = vec_sld(vscales, vscales, 12); // NOTE(review): a 12-byte rotate advancing by two shorts presumably relies on little-endian lane numbering; confirm
8293
+ // merge the two 16-byte halves that share a scale
8294
+ qv00 = vec_add(qv00, qv10);
8295
+ qv01 = vec_add(qv01, qv11);
8296
+ // scale and accumulate; even and odd lanes are kept in separate accumulators
8297
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
8298
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
8299
+ vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
8300
+ vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
8301
+ }
8302
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
8303
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
8304
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
8305
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
8306
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
8307
+ }
8308
+ // combine the four float accumulators into vsumf0
8309
+ vsumf0 = vec_add(vsumf0, vsumf2);
8310
+ vsumf1 = vec_add(vsumf1, vsumf3);
8311
+
8312
+ vsumf0 = vec_add(vsumf0, vsumf1);
8313
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
8314
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
8315
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
8316
+
8317
+ *s = vec_extract(vsumf0, 0);
8318
+
7223
8319
  #else
7224
8320
 
7225
8321
  const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -7517,6 +8613,83 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7517
8613
 
7518
8614
  *s = sumf;
7519
8615
 
8616
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path for blocks of 64 values: four 16-byte q8 loads per block, result stored to *s
8617
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of every byte
8618
+ const vector unsigned char v1 = vec_splats((unsigned char)0x1); // shift count of 1; cast to signed it is also the one-bit mask
8619
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2); // v2/v4: per-byte shift counts
8620
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
8621
+ // float accumulators carried across all blocks; reduced to a scalar after the loop
8622
+ vector float vsumf0 = vec_splats(0.0f);
8623
+ vector float vsumf1 = vec_splats(0.0f);
8624
+ vector float vsumf2 = vec_splats(0.0f);
8625
+ vector float vsumf3 = vec_splats(0.0f);
8626
+
8627
+ #pragma GCC unroll 2
8628
+ for (int i = 0; i < nb; ++i) { // x, y and nb are declared outside this hunk
8629
+ __builtin_prefetch(x[i].qs, 0, 1);
8630
+ __builtin_prefetch(y[i].qs, 0, 1);
8631
+
8632
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
8633
+ vector float vyd = vec_splats(y[i].d);
8634
+ vector float vd= vec_mul(vxd, vyd); // combined block scale x[i].d * y[i].d in every lane
8635
+ // 32 bytes of packed nibbles: low nibble (suffix 0) and high nibble (suffix 1) of every byte
8636
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
8637
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
8638
+ vector signed char qxs00 = (vector signed char)vec_and(qxs0, lowMask);
8639
+ vector signed char qxs01 = (vector signed char)vec_sr(qxs0, v4);
8640
+ vector signed char qxs10 = (vector signed char)vec_and(qxs1, lowMask);
8641
+ vector signed char qxs11 = (vector signed char)vec_sr(qxs1, v4);
8642
+ // high bits: 8 bytes of qh are merged with a copy rotated into the other half and shifted right by one bit
8643
+ vector signed char qxhs = (vector signed char)vec_xl_len(x[i].qh, 8);
8644
+ vector signed char qxhs0 = vec_or(qxhs, vec_sr(vec_sld(qxhs, qxhs, 8), v1));
8645
+ vector signed char qxhs1 = vec_sr(qxhs0, v2);
8646
+ vector signed char qxh00 = vec_sl(vec_andc((vector signed char)v1, qxhs0), v4); // vec_andc(1, h) << 4: 16 where the selected high bit is clear, else 0
8647
+ vector signed char qxh10 = vec_sl(vec_andc((vector signed char)v1, qxhs1), v4);
8648
+ vector signed char qxh01 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs0, v4)), v4);
8649
+ vector signed char qxh11 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs1, v4)), v4);
8650
+ // signed 5-bit value = nibble minus 16 where the high bit was clear
8651
+ vector signed char q5x00 = vec_sub(qxs00, qxh00);
8652
+ vector signed char q5x10 = vec_sub(qxs10, qxh10);
8653
+ vector signed char q5x01 = vec_sub(qxs01, qxh01);
8654
+ vector signed char q5x11 = vec_sub(qxs11, qxh11);
8655
+ // q8 in memory order: low-nibble partners first (offsets 0, 16), then high-nibble partners (32, 48)
8656
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
8657
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
8658
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
8659
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
8660
+ // vec_mule/vec_mulo give the widened products of the even/odd lanes; their sum holds eight 16-bit pair sums per 16 bytes
8661
+ vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
8662
+ vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
8663
+ vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
8664
+ vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
8665
+ // load 4 scale bytes, widen them to 16 bit and broadcast each one
8666
+ vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
8667
+ vector signed short vs0 = vec_splat(vs, 0);
8668
+ vector signed short vs1 = vec_splat(vs, 1);
8669
+ vector signed short vs2 = vec_splat(vs, 2);
8670
+ vector signed short vs3 = vec_splat(vs, 3);
8671
+ // apply the scales in q8 memory order: qv00->vs0, qv10->vs1, qv01->vs2, qv11->vs3
8672
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
8673
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
8674
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
8675
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
8676
+ // convert to float (vec_ctf with scale exponent 0) and accumulate vd * sum
8677
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
8678
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
8679
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
8680
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
8681
+ }
8682
+ // combine the four float accumulators into vsumf0
8683
+ vsumf0 = vec_add(vsumf0, vsumf2);
8684
+ vsumf1 = vec_add(vsumf1, vsumf3);
8685
+
8686
+ vsumf0 = vec_add(vsumf0, vsumf1);
8687
+ // horizontal sum: add the vector to itself rotated by 4 and then 8 bytes, leaving the total in every lane
8688
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
8689
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
8690
+
8691
+ *s = vec_extract(vsumf0, 0);
8692
+
7520
8693
  #else
7521
8694
 
7522
8695
  int8_t aux8[QK_K];
@@ -7947,6 +9120,151 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
7947
9120
 
7948
9121
  *s = sumf;
7949
9122
 
9123
+ #elif defined(__POWER9_VECTOR__)
9124
+ const vector signed char lowMask = vec_splats((signed char)0xF);
9125
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
9126
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
9127
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
9128
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
9129
+ const vector signed char off = vec_splats((signed char)0x20);
9130
+
9131
+ vector float vsumf0 = vec_splats(0.0f);
9132
+ vector float vsumf1 = vec_splats(0.0f);
9133
+ vector float vsumf2 = vec_splats(0.0f);
9134
+ vector float vsumf3 = vec_splats(0.0f);
9135
+
9136
+ for (int i = 0; i < nb; ++i) {
9137
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9138
+ vector float vyd = vec_splats(y[i].d);
9139
+ vector float vd = vec_mul(vxd, vyd);
9140
+
9141
+ vector signed int vsumi0 = vec_splats((int32_t)0);
9142
+ vector signed int vsumi1 = vec_splats((int32_t)0);
9143
+ vector signed int vsumi2 = vec_splats((int32_t)0);
9144
+ vector signed int vsumi3 = vec_splats((int32_t)0);
9145
+ vector signed int vsumi4 = vec_splats((int32_t)0);
9146
+ vector signed int vsumi5 = vec_splats((int32_t)0);
9147
+ vector signed int vsumi6 = vec_splats((int32_t)0);
9148
+ vector signed int vsumi7 = vec_splats((int32_t)0);
9149
+
9150
+ const uint8_t * restrict q6 = x[i].ql;
9151
+ const uint8_t * restrict qh = x[i].qh;
9152
+ const int8_t * restrict qs = x[i].scales;
9153
+ const int8_t * restrict q8 = y[i].qs;
9154
+
9155
+ for (int j = 0; j < QK_K/128; ++j) {
9156
+ __builtin_prefetch(q6, 0, 0);
9157
+ __builtin_prefetch(qh, 0, 0);
9158
+ __builtin_prefetch(q8, 0, 0);
9159
+
9160
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
9161
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
9162
+ vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
9163
+ vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
9164
+ q6 += 64;
9165
+
9166
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
9167
+ vector signed char qxs01 = vec_sr(qxs0, v4);
9168
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
9169
+ vector signed char qxs11 = vec_sr(qxs1, v4);
9170
+ vector signed char qxs20 = vec_and(qxs2, lowMask);
9171
+ vector signed char qxs21 = vec_sr(qxs2, v4);
9172
+ vector signed char qxs30 = vec_and(qxs3, lowMask);
9173
+ vector signed char qxs31 = vec_sr(qxs3, v4);
9174
+
9175
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
9176
+ vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
9177
+ qh += 32;
9178
+
9179
+ vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
9180
+ vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
9181
+ vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
9182
+ vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
9183
+ vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
9184
+ vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
9185
+ vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
9186
+ vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
9187
+
9188
+ vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
9189
+ vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
9190
+ vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
9191
+ vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
9192
+ vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
9193
+ vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
9194
+ vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
9195
+ vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
9196
+
9197
+ vector signed char q8y00 = vec_xl( 0, q8);
9198
+ vector signed char q8y10 = vec_xl( 16, q8);
9199
+ vector signed char q8y20 = vec_xl( 32, q8);
9200
+ vector signed char q8y30 = vec_xl( 48, q8);
9201
+ vector signed char q8y01 = vec_xl( 64, q8);
9202
+ vector signed char q8y11 = vec_xl( 80, q8);
9203
+ vector signed char q8y21 = vec_xl( 96, q8);
9204
+ vector signed char q8y31 = vec_xl(112, q8);
9205
+ q8 += 128;
9206
+
9207
+ vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
9208
+ vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
9209
+ vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
9210
+ vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
9211
+ vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
9212
+ vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
9213
+ vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
9214
+ vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
9215
+
9216
+ vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
9217
+ qs += 8;
9218
+
9219
+ vector signed short vs0 = vec_splat(vscales, 0);
9220
+ vector signed short vs1 = vec_splat(vscales, 1);
9221
+ vector signed short vs2 = vec_splat(vscales, 2);
9222
+ vector signed short vs3 = vec_splat(vscales, 3);
9223
+ vector signed short vs4 = vec_splat(vscales, 4);
9224
+ vector signed short vs5 = vec_splat(vscales, 5);
9225
+ vector signed short vs6 = vec_splat(vscales, 6);
9226
+ vector signed short vs7 = vec_splat(vscales, 7);
9227
+
9228
+ vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
9229
+ vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
9230
+ vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
9231
+ vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
9232
+ vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
9233
+ vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
9234
+ vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
9235
+ vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
9236
+
9237
+ vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
9238
+ vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
9239
+ vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
9240
+ vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
9241
+ vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
9242
+ vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
9243
+ vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
9244
+ vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
9245
+ }
9246
+
9247
+ vsumi0 = vec_add(vsumi0, vsumi4);
9248
+ vsumi1 = vec_add(vsumi1, vsumi5);
9249
+ vsumi2 = vec_add(vsumi2, vsumi6);
9250
+ vsumi3 = vec_add(vsumi3, vsumi7);
9251
+
9252
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9253
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9254
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9255
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9256
+ }
9257
+
9258
+ vsumf0 = vec_add(vsumf0, vsumf2);
9259
+ vsumf1 = vec_add(vsumf1, vsumf3);
9260
+
9261
+ vsumf0 = vec_add(vsumf0, vsumf1);
9262
+
9263
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9264
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9265
+
9266
+ *s = vec_extract(vsumf0, 0);
9267
+
7950
9268
  #else
7951
9269
 
7952
9270
  int8_t aux8[QK_K];
@@ -8253,6 +9571,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8253
9571
 
8254
9572
  *s = sumf;
8255
9573
 
9574
+ #elif defined(__POWER9_VECTOR__)
9575
+ const vector signed char lowMask = vec_splats((signed char)0xF);
9576
+ const vector unsigned char v2 = vec_splats((unsigned char)0x2);
9577
+ const vector unsigned char v3 = vec_splats((unsigned char)0x3);
9578
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4);
9579
+ const vector unsigned char v6 = vec_splats((unsigned char)0x6);
9580
+ const vector signed char off = vec_splats((signed char)0x20);
9581
+
9582
+ vector float vsumf0 = vec_splats(0.0f);
9583
+ vector float vsumf1 = vec_splats(0.0f);
9584
+ vector float vsumf2 = vec_splats(0.0f);
9585
+ vector float vsumf3 = vec_splats(0.0f);
9586
+
9587
+ #pragma GCC unroll 2
9588
+ for (int i = 0; i < nb; ++i) {
9589
+ __builtin_prefetch(x[i].ql, 0, 1);
9590
+ __builtin_prefetch(x[i].qh, 0, 1);
9591
+ __builtin_prefetch(y[i].qs, 0, 1);
9592
+
9593
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9594
+ vector float vyd = vec_splats(y[i].d);
9595
+ vector float vd= vec_mul(vxd, vyd);
9596
+
9597
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].ql);
9598
+ vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].ql);
9599
+ vector signed char qxs00 = vec_and(qxs0, lowMask);
9600
+ vector signed char qxs01 = vec_sr(qxs0, v4);
9601
+ vector signed char qxs10 = vec_and(qxs1, lowMask);
9602
+ vector signed char qxs11 = vec_sr(qxs1, v4);
9603
+
9604
+ vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
9605
+
9606
+ vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
9607
+ vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
9608
+ vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
9609
+ vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
9610
+
9611
+ vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
9612
+ vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
9613
+ vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
9614
+ vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
9615
+
9616
+ vector signed char q8y00 = vec_xl( 0, y[i].qs);
9617
+ vector signed char q8y10 = vec_xl(16, y[i].qs);
9618
+ vector signed char q8y01 = vec_xl(32, y[i].qs);
9619
+ vector signed char q8y11 = vec_xl(48, y[i].qs);
9620
+
9621
+ vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
9622
+ vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
9623
+ vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
9624
+ vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
9625
+
9626
+ vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
9627
+ vector signed short vs0 = vec_splat(vs, 0);
9628
+ vector signed short vs1 = vec_splat(vs, 1);
9629
+ vector signed short vs2 = vec_splat(vs, 2);
9630
+ vector signed short vs3 = vec_splat(vs, 3);
9631
+
9632
+ vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
9633
+ vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
9634
+ vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
9635
+ vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
9636
+
9637
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9638
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9639
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9640
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9641
+ }
9642
+
9643
+ vsumf0 = vec_add(vsumf0, vsumf2);
9644
+ vsumf1 = vec_add(vsumf1, vsumf3);
9645
+
9646
+ vsumf0 = vec_add(vsumf0, vsumf1);
9647
+
9648
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9649
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9650
+
9651
+ *s = vec_extract(vsumf0, 0);
9652
+
8256
9653
  #else
8257
9654
 
8258
9655
  int8_t aux8[QK_K];
@@ -8294,7 +9691,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8294
9691
 
8295
9692
  #endif
8296
9693
 
8297
- #if defined (__AVX2__) || defined (__ARM_NEON)
9694
+ #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__)
8298
9695
  static const int8_t keven_signs_q2xs[1024] = {
8299
9696
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
8300
9697
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8427,6 +9824,103 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
8427
9824
 
8428
9825
  *s = 0.125f * hsum_float_8(accumf);
8429
9826
 
9827
+ #elif defined(__POWER9_VECTOR__)
9828
+ vector float vsumf0 = vec_splats(0.0f);
9829
+ vector float vsumf1 = vec_splats(0.0f);
9830
+ vector float vsumf2 = vec_splats(0.0f);
9831
+ vector float vsumf3 = vec_splats(0.0f);
9832
+
9833
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
9834
+
9835
+ for (int i = 0; i < nb; ++i) {
9836
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
9837
+ vector float vyd = vec_splats(y[i].d);
9838
+ vector float vd = vec_mul(vxd, vyd);
9839
+
9840
+ vector signed int vsumi0 = vec_splats((int32_t)0);
9841
+ vector signed int vsumi1 = vec_splats((int32_t)0);
9842
+ vector signed int vsumi2 = vec_splats((int32_t)0);
9843
+ vector signed int vsumi3 = vec_splats((int32_t)0);
9844
+ vector signed int vsumi4 = vec_splats((int32_t)0);
9845
+ vector signed int vsumi5 = vec_splats((int32_t)0);
9846
+ vector signed int vsumi6 = vec_splats((int32_t)0);
9847
+ vector signed int vsumi7 = vec_splats((int32_t)0);
9848
+
9849
+ const uint16_t * restrict q2 = x[i].qs;
9850
+ const int8_t * restrict q8 = y[i].qs;
9851
+
9852
+ for (int j = 0; j < QK_K/32; j += 2) {
9853
+ __builtin_prefetch(q2, 0, 1);
9854
+ __builtin_prefetch(q8, 0, 1);
9855
+
9856
+ uint32_t aux32[4];
9857
+ const uint8_t * aux8 = (const uint8_t *)aux32;
9858
+
9859
+ memcpy(aux32, q2, 4*sizeof(uint32_t));
9860
+ q2 += 8;
9861
+
9862
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
9863
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
9864
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
9865
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
9866
+
9867
+ vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))};
9868
+ vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
9869
+ vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))};
9870
+ vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
9871
+
9872
+ vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
9873
+ vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
9874
+ vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
9875
+ vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
9876
+
9877
+ vector signed char q8y0 = vec_xl( 0, q8);
9878
+ vector signed char q8y1 = vec_xl(16, q8);
9879
+ vector signed char q8y2 = vec_xl(32, q8);
9880
+ vector signed char q8y3 = vec_xl(48, q8);
9881
+ q8 += 64;
9882
+
9883
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
9884
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
9885
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
9886
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
9887
+
9888
+ const uint16_t ls0 = aux32[1] >> 28;
9889
+ const uint16_t ls1 = aux32[3] >> 28;
9890
+
9891
+ vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
9892
+ vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
9893
+
9894
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
9895
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
9896
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
9897
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
9898
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
9899
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
9900
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
9901
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
9902
+ }
9903
+
9904
+ vsumi0 = vec_add(vsumi0, vsumi4);
9905
+ vsumi1 = vec_add(vsumi1, vsumi5);
9906
+ vsumi2 = vec_add(vsumi2, vsumi6);
9907
+ vsumi3 = vec_add(vsumi3, vsumi7);
9908
+
9909
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9910
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9911
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
9912
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
9913
+ }
9914
+
9915
+ vsumf0 = vec_add(vsumf0, vsumf2);
9916
+ vsumf1 = vec_add(vsumf1, vsumf3);
9917
+
9918
+ vsumf0 = vec_add(vsumf0, vsumf1);
9919
+
9920
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
9921
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
9922
+
9923
+ *s = 0.125f * vec_extract(vsumf0, 0);
8430
9924
  #else
8431
9925
 
8432
9926
  uint32_t aux32[2];
@@ -8702,6 +10196,104 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
8702
10196
  *s = 0.125f * hsum_float_8(accumf);
8703
10197
  #endif
8704
10198
 
10199
+ #elif defined(__POWER9_VECTOR__)
10200
+ vector float vsumf0 = vec_splats(0.0f);
10201
+ vector float vsumf1 = vec_splats(0.0f);
10202
+ vector float vsumf2 = vec_splats(0.0f);
10203
+ vector float vsumf3 = vec_splats(0.0f);
10204
+
10205
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10206
+
10207
+ for (int i = 0; i < nb; ++i) {
10208
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
10209
+ vector float vyd = vec_splats(y[i].d);
10210
+ vector float vd = vec_mul(vxd, vyd);
10211
+
10212
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10213
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10214
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10215
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10216
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10217
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10218
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10219
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10220
+
10221
+ const uint16_t * restrict q2 = x[i].qs;
10222
+ const uint8_t * restrict sc = x[i].scales;
10223
+ const int8_t * restrict q8 = y[i].qs;
10224
+
10225
+ for (int j = 0; j < QK_K/64; ++j) {
10226
+ __builtin_prefetch(q2, 0, 1);
10227
+ __builtin_prefetch(q8, 0, 1);
10228
+
10229
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
10230
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
10231
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
10232
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
10233
+
10234
+ vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
10235
+ vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
10236
+ vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
10237
+ vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
10238
+ q2 += 8;
10239
+
10240
+ vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
10241
+ vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
10242
+ vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
10243
+ vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
10244
+
10245
+ vector signed char q8y0 = vec_xl( 0, q8);
10246
+ vector signed char q8y1 = vec_xl(16, q8);
10247
+ vector signed char q8y2 = vec_xl(32, q8);
10248
+ vector signed char q8y3 = vec_xl(48, q8);
10249
+ q8 += 64;
10250
+
10251
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
10252
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
10253
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
10254
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
10255
+
10256
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
10257
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
10258
+ const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
10259
+ const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
10260
+ sc += 2;
10261
+
10262
+ vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
10263
+ vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
10264
+ vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
10265
+ vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
10266
+
10267
+ vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
10268
+ vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
10269
+ vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
10270
+ vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
10271
+ vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
10272
+ vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
10273
+ vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
10274
+ vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
10275
+ }
10276
+
10277
+ vsumi0 = vec_add(vsumi0, vsumi4);
10278
+ vsumi1 = vec_add(vsumi1, vsumi5);
10279
+ vsumi2 = vec_add(vsumi2, vsumi6);
10280
+ vsumi3 = vec_add(vsumi3, vsumi7);
10281
+
10282
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10283
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10284
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10285
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10286
+ }
10287
+
10288
+ vsumf0 = vec_add(vsumf0, vsumf2);
10289
+ vsumf1 = vec_add(vsumf1, vsumf3);
10290
+
10291
+ vsumf0 = vec_add(vsumf0, vsumf1);
10292
+
10293
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10294
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10295
+
10296
+ *s = 0.125f * vec_extract(vsumf0, 0);
8705
10297
  #else
8706
10298
 
8707
10299
  float sumf = 0.f;
@@ -8902,6 +10494,124 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
8902
10494
 
8903
10495
  *s = 0.125f * hsum_float_8(accumf);
8904
10496
 
10497
+ #elif defined(__POWER9_VECTOR__)
10498
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10499
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10500
+ };
10501
+
10502
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10503
+
10504
+ vector float vsumf0 = vec_splats(0.0f);
10505
+ vector float vsumf1 = vec_splats(0.0f);
10506
+ vector float vsumf2 = vec_splats(0.0f);
10507
+ vector float vsumf3 = vec_splats(0.0f);
10508
+
10509
+ const vector unsigned char mask0 = vec_xl( 0, k_mask1);
10510
+ const vector unsigned char mask1 = vec_xl(16, k_mask1);
10511
+ const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
10512
+
10513
+ for (int i = 0; i < nb; ++i) {
10514
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
10515
+ vector float vyd = vec_splats(y[i].d);
10516
+ vector float vd = vec_mul(vxd, vyd);
10517
+
10518
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10519
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10520
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10521
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10522
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10523
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10524
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10525
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10526
+
10527
+ const uint8_t * restrict q2 = x[i].qs;
10528
+ const uint8_t * restrict qh = x[i].qh;
10529
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
10530
+ const uint8_t * restrict sc = x[i].scales;
10531
+ const int8_t * restrict q8 = y[i].qs;
10532
+
10533
+ for (int j = 0; j < QK_K/32; j += 2) {
10534
+ __builtin_prefetch(q2, 0, 1);
10535
+ __builtin_prefetch(q8, 0, 1);
10536
+
10537
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
10538
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
10539
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
10540
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
10541
+ q2 += 8;
10542
+ qh += 2;
10543
+
10544
+ vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
10545
+ vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
10546
+ signs += 4;
10547
+
10548
+ vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
10549
+ vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
10550
+ vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
10551
+ vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
10552
+
10553
+ vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
10554
+ vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
10555
+ vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
10556
+ vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
10557
+
10558
+ vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
10559
+ vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
10560
+ vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
10561
+ vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
10562
+
10563
+ vector signed char q8y0 = vec_xl( 0, q8);
10564
+ vector signed char q8y1 = vec_xl(16, q8);
10565
+ vector signed char q8y2 = vec_xl(32, q8);
10566
+ vector signed char q8y3 = vec_xl(48, q8);
10567
+ q8 += 64;
10568
+
10569
+ vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
10570
+ vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
10571
+ vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
10572
+ vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
10573
+
10574
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
10575
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
10576
+ const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
10577
+ const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
10578
+ sc += 2;
10579
+
10580
+ vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
10581
+ vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
10582
+ vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
10583
+ vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
10584
+
10585
+ vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
10586
+ vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
10587
+ vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
10588
+ vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
10589
+ vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
10590
+ vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
10591
+ vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
10592
+ vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
10593
+ }
10594
+
10595
+ vsumi0 = vec_add(vsumi0, vsumi4);
10596
+ vsumi1 = vec_add(vsumi1, vsumi5);
10597
+ vsumi2 = vec_add(vsumi2, vsumi6);
10598
+ vsumi3 = vec_add(vsumi3, vsumi7);
10599
+
10600
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10601
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10602
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10603
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10604
+ }
10605
+
10606
+ vsumf0 = vec_add(vsumf0, vsumf2);
10607
+ vsumf1 = vec_add(vsumf1, vsumf3);
10608
+
10609
+ vsumf0 = vec_add(vsumf0, vsumf1);
10610
+
10611
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10612
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10613
+
10614
+ *s = 0.125f * vec_extract(vsumf0, 0);
8905
10615
  #else
8906
10616
 
8907
10617
  float sumf = 0;
@@ -9046,6 +10756,101 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9046
10756
 
9047
10757
  *s = 0.25f * hsum_float_8(accumf);
9048
10758
 
10759
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: accumulates the block dot products in float vectors and writes 0.25 * their total to *s
10760
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // table viewed as 64-bit entries; each entry is later reinterpreted as 8 byte-wise multipliers (presumably +1/-1 values; confirm against the table definition)
10761
+ // four independent float accumulators, merged after the block loop
10762
+ vector float vsumf0 = vec_splats(0.0f);
10763
+ vector float vsumf1 = vec_splats(0.0f);
10764
+ vector float vsumf2 = vec_splats(0.0f);
10765
+ vector float vsumf3 = vec_splats(0.0f);
10766
+
10767
+ for (int i = 0; i < nb; ++i) {
10768
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); // x block scale, converted from fp16
10769
+ vector float vyd = vec_splats(y[i].d); // y block scale, used as-is
10770
+ vector float vd = vec_mul(vxd, vyd); // combined scale applied once per block
10771
+ // per-block 32-bit integer accumulators: 0..3 take even 16-bit lanes, 4..7 take odd lanes
10772
+ vector signed int vsumi0 = vec_splats((int32_t)0);
10773
+ vector signed int vsumi1 = vec_splats((int32_t)0);
10774
+ vector signed int vsumi2 = vec_splats((int32_t)0);
10775
+ vector signed int vsumi3 = vec_splats((int32_t)0);
10776
+ vector signed int vsumi4 = vec_splats((int32_t)0);
10777
+ vector signed int vsumi5 = vec_splats((int32_t)0);
10778
+ vector signed int vsumi6 = vec_splats((int32_t)0);
10779
+ vector signed int vsumi7 = vec_splats((int32_t)0);
10780
+
10781
+ const uint8_t * restrict q3 = x[i].qs; // grid indices, one byte each
10782
+ const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4); // 32-bit words starting QK_K/4 bytes into qs; they supply both the signs64 indices and the scales below
10783
+ const int8_t * restrict q8 = y[i].qs;
10784
+ // "unroll 1" asks GCC not to unroll the loop below
10785
+ #pragma GCC unroll 1
10786
+ for (int j = 0; j < QK_K/32; j += 2) { // each pass consumes 16 q3 bytes, 2 signs words and 64 q8 bytes
10787
+ __builtin_prefetch(q3, 0, 1);
10788
+ __builtin_prefetch(q8, 0, 1);
10789
+ // every q3 byte selects one iq3xxs_grid entry; four entries fill one 16-byte vector
10790
+ vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
10791
+ vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
10792
+ vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
10793
+ vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
10794
+ q3 += 16;
10795
+ // the 7-bit fields at bit offsets 0, 7, 14 and 21 of each signs word index signs64
10796
+ vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])};
10797
+ vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
10798
+ vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])};
10799
+ vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
10800
+ // byte-wise product of the signs64 bytes and the grid bytes
10801
+ vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
10802
+ vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
10803
+ vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
10804
+ vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
10805
+
10806
+ vector signed char q8y0 = vec_xl( 0, q8);
10807
+ vector signed char q8y1 = vec_xl(16, q8);
10808
+ vector signed char q8y2 = vec_xl(32, q8);
10809
+ vector signed char q8y3 = vec_xl(48, q8);
10810
+ q8 += 64;
10811
+ // even-lane and odd-lane byte products are widened to 16 bits and added pairwise
10812
+ vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
10813
+ vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
10814
+ vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
10815
+ vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
10816
+
10817
+ const uint16_t ls0 = (uint16_t)(signs[0] >> 28); // top 4 bits of the first signs word
10818
+ const uint16_t ls1 = (uint16_t)(signs[1] >> 28); // top 4 bits of the second signs word
10819
+ signs += 2;
10820
+ // the multiplier used is 2*ls+1; ls0 applies to qv0/qv1 and ls1 to qv2/qv3
10821
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10822
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10823
+ // scale and widen to 32 bits: vec_mule handles even 16-bit lanes, vec_mulo the odd ones
10824
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10825
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10826
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10827
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10828
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10829
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10830
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10831
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10832
+ }
10833
+ // fold the odd-lane accumulators into the even-lane ones
10834
+ vsumi0 = vec_add(vsumi0, vsumi4);
10835
+ vsumi1 = vec_add(vsumi1, vsumi5);
10836
+ vsumi2 = vec_add(vsumi2, vsumi6);
10837
+ vsumi3 = vec_add(vsumi3, vsumi7);
10838
+ // convert the integer sums to float (scale factor 0) and accumulate them multiplied by vd
10839
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10840
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10841
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
10842
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
10843
+ }
10844
+ // merge the four float accumulators into vsumf0
10845
+ vsumf0 = vec_add(vsumf0, vsumf2);
10846
+ vsumf1 = vec_add(vsumf1, vsumf3);
10847
+
10848
+ vsumf0 = vec_add(vsumf0, vsumf1);
10849
+ // horizontal sum: adding the vector rotated by 4 and then 8 bytes leaves the total in every lane
10850
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
10851
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
10852
+
10853
+ *s = 0.25f * vec_extract(vsumf0, 0); // the result carries a constant 0.25 factor
9049
10854
  #else
9050
10855
 
9051
10856
  uint32_t aux32;
@@ -9273,6 +11078,124 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
9273
11078
 
9274
11079
  *s = hsum_float_8(accumf);
9275
11080
 
11081
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: accumulates the block dot products in float vectors and writes their total to *s
11082
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // vec_perm control bytes: each source byte index 0..3 is repeated 8 times
11083
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
11084
+ };
11085
+ // one distinct bit per byte lane (0x01..0x80), repeated for both 8-byte halves
11086
+ static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
11087
+ // four independent float accumulators, merged after the block loop
11088
+ vector float vsumf0 = vec_splats(0.0f);
11089
+ vector float vsumf1 = vec_splats(0.0f);
11090
+ vector float vsumf2 = vec_splats(0.0f);
11091
+ vector float vsumf3 = vec_splats(0.0f);
11092
+
11093
+ const vector unsigned char mask0 = vec_xl( 0, k_mask1); // selects source bytes 0 and 1
11094
+ const vector unsigned char mask1 = vec_xl(16, k_mask1); // selects source bytes 2 and 3
11095
+ const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
11096
+
11097
+ for (int i = 0; i < nb; ++i) {
11098
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); // x block scale, converted from fp16
11099
+ vector float vyd = vec_splats(y[i].d); // y block scale, used as-is
11100
+ vector float vd = vec_mul(vxd, vyd); // combined scale applied once per block
11101
+
11102
+ const uint8_t * restrict q3 = x[i].qs; // low 8 bits of each grid index
11103
+ const uint8_t * restrict qh = x[i].qh; // one extra index bit per q3 byte
11104
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
11105
+ const uint8_t * restrict sc = x[i].scales; // two 4-bit scale fields per byte
11106
+ const int8_t * restrict q8 = y[i].qs;
11107
+ // per-block 32-bit integer accumulators: 0..3 take even 16-bit lanes, 4..7 take odd lanes
11108
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11109
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11110
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11111
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11112
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11113
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11114
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11115
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11116
+
11117
+ for (int j = 0; j < QK_K/32; j += 2) { // each pass consumes 16 q3 bytes, 2 qh bytes, 4 signs words, 1 scale byte and 64 q8 bytes
11118
+ __builtin_prefetch(q3, 0, 1);
11119
+ __builtin_prefetch(q8, 0, 1);
11120
+ // 9-bit grid index: the q3 byte plus one qh bit moved to bit 8 (bit k of qh goes with the k-th q3 byte of its group of 8)
11121
+ vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
11122
+ iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
11123
+ vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
11124
+ iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
11125
+ vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
11126
+ iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
11127
+ vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
11128
+ iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
11129
+ q3 += 16;
11130
+ qh += 2;
11131
+ // NOTE(review): the 32-bit loads below read two adjacent uint16_t sign words through a uint32_t pointer (type-punned access)
11132
+ vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]); // signs[0..1] replicated across the vector
11133
+ vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]); // signs[2..3] replicated across the vector
11134
+ signs += 4;
11135
+ // spread the sign bytes: each of the four bytes is copied into 8 consecutive lanes
11136
+ vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
11137
+ vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
11138
+ vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
11139
+ vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
11140
+ // test one bit per lane: a lane becomes all-ones when its bit is set in the sign byte, else zero
11141
+ vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
11142
+ vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
11143
+ vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
11144
+ vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
11145
+ // (g ^ m) - m negates the grid byte where m is all-ones and leaves it unchanged where m is zero
11146
+ vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
11147
+ vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
11148
+ vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
11149
+ vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
11150
+
11151
+ vector signed char q8y0 = vec_xl( 0, q8);
11152
+ vector signed char q8y1 = vec_xl(16, q8);
11153
+ vector signed char q8y2 = vec_xl(32, q8);
11154
+ vector signed char q8y3 = vec_xl(48, q8);
11155
+ q8 += 64;
11156
+ // even-lane and odd-lane byte products are widened to 16 bits and added pairwise
11157
+ vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
11158
+ vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
11159
+ vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
11160
+ vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
11161
+
11162
+ const uint16_t ls0 = (uint16_t)(sc[0] & 0xf); // low nibble of the scale byte
11163
+ const uint16_t ls1 = (uint16_t)(sc[0] >> 4); // high nibble of the scale byte
11164
+ sc ++;
11165
+ // the multiplier used is 2*ls+1; ls0 applies to qv0/qv1 and ls1 to qv2/qv3
11166
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
11167
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
11168
+ // scale and widen to 32 bits: vec_mule handles even 16-bit lanes, vec_mulo the odd ones
11169
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11170
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11171
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11172
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11173
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11174
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11175
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11176
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11177
+ }
11178
+ // fold the odd-lane accumulators into the even-lane ones
11179
+ vsumi0 = vec_add(vsumi0, vsumi4);
11180
+ vsumi1 = vec_add(vsumi1, vsumi5);
11181
+ vsumi2 = vec_add(vsumi2, vsumi6);
11182
+ vsumi3 = vec_add(vsumi3, vsumi7);
11183
+ // convert the integer sums to float (scale factor 0) and accumulate them multiplied by vd
11184
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11185
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11186
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
11187
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
11188
+ }
11189
+ // merge the four float accumulators into vsumf0
11190
+ vsumf0 = vec_add(vsumf0, vsumf2);
11191
+ vsumf1 = vec_add(vsumf1, vsumf3);
11192
+
11193
+ vsumf0 = vec_add(vsumf0, vsumf1);
11194
+ // horizontal sum: adding the vector rotated by 4 and then 8 bytes leaves the total in every lane
11195
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11196
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11197
+
11198
+ *s = vec_extract(vsumf0, 0); // no extra constant factor in this branch
9276
11199
  #else
9277
11200
 
9278
11201
  float sumf = 0.f;
@@ -9427,6 +11350,113 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
9427
11350
 
9428
11351
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
9429
11352
 
11353
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: sums grid-value products plus an IQ1S_DELTA-weighted term built from y[i].bsums, then writes the total to *s
11354
+ const vector unsigned char v0 = vec_splats((unsigned char)0x0); // all-zero vector, reused as zero operand and comparison value
11355
+ const vector unsigned short vsign = vec_splats((unsigned short)0x8000); // top bit of every 16-bit lane
11356
+ // four independent float accumulators, merged after the block loop
11357
+ vector float vsumf0 = vec_splats(0.0f);
11358
+ vector float vsumf1 = vec_splats(0.0f);
11359
+ vector float vsumf2 = vec_splats(0.0f);
11360
+ vector float vsumf3 = vec_splats(0.0f);
11361
+
11362
+ for (int i = 0; i < nb; ++i) {
11363
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); // x block scale, converted from fp16
11364
+ vector float vyd = vec_splats(y[i].d); // y block scale, used as-is
11365
+ vector float vd = vec_mul(vxd, vyd); // combined scale applied once per block
11366
+ // vsumi0..7 accumulate the grid products; vsumi8 accumulates the bsums-based term separately
11367
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11368
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11369
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11370
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11371
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11372
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11373
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11374
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11375
+ vector signed int vsumi8 = vec_splats((int32_t)0);
11376
+
11377
+ const uint8_t * restrict q1 = x[i].qs; // low 8 bits of each grid index
11378
+ const uint16_t * restrict qh = x[i].qh; // per 16-bit word: extra index bits, a 3-bit scale (bits 12..14) and bit 15, which is tested further down
11379
+ const int8_t * restrict q8 = y[i].qs;
11380
+ const int16_t * restrict qs = y[i].bsums; // presumably precomputed partial sums of q8 (confirm against the block definition)
11381
+
11382
+ for (int j = 0; j < QK_K/32; j += 2) { // each pass consumes 8 q1 bytes, 2 qh words, 64 q8 bytes and 4 bsums entries
11383
+ __builtin_prefetch(q1, 0, 1);
11384
+ __builtin_prefetch(qh, 0, 1);
11385
+ __builtin_prefetch(q8, 0, 1);
11386
+ // 11-bit grid index: the q1 byte plus three qh bits placed at bits 8..10 (mask 0x700); each lookup reads a 64-bit value
11387
+ vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
11388
+ vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
11389
+ vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
11390
+ vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
11391
+ q1 += 8;
11392
+ // reinterpret each pair of 64-bit grid values as 16 signed bytes
11393
+ vector signed char q1x0 = (vector signed char)aux64x2_0;
11394
+ vector signed char q1x1 = (vector signed char)aux64x2_1;
11395
+ vector signed char q1x2 = (vector signed char)aux64x2_2;
11396
+ vector signed char q1x3 = (vector signed char)aux64x2_3;
11397
+
11398
+ vector signed char q8y0 = vec_xl( 0, q8);
11399
+ vector signed char q8y1 = vec_xl(16, q8);
11400
+ vector signed char q8y2 = vec_xl(32, q8);
11401
+ vector signed char q8y3 = vec_xl(48, q8);
11402
+ q8 += 64;
11403
+ // even-lane and odd-lane byte products are widened to 16 bits and added pairwise
11404
+ vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
11405
+ vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
11406
+ vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
11407
+ vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
11408
+
11409
+ const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7); // bits 12..14 of the first qh word
11410
+ const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7); // bits 12..14 of the second qh word
11411
+ // the multiplier used is 2*ls+1; ls0 applies to qv0/qv1 and ls1 to qv2/qv3
11412
+ vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
11413
+ vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
11414
+ vector signed short vscales = vec_sld(vscales23, vscales01, 8); // one half from each splat, so a single vector carries both multipliers for the bsums term
11415
+ // scale and widen to 32 bits: vec_mule handles even 16-bit lanes, vec_mulo the odd ones
11416
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11417
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11418
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11419
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11420
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11421
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11422
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11423
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11424
+ // bsums term: load exactly 8 bytes (four int16 entries); vec_xl_len zero-fills the rest of the vector
11425
+ vector signed short q8ysums = vec_xl_len(qs, 8);
11426
+ qs += 4;
11427
+ q8ysums = vec_mergeh(q8ysums, (vector signed short)v0); // interleave with zero lanes so only every other 16-bit lane is non-zero
11428
+
11429
+ vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8); // half a vector of each qh word, combined the same way as vscales
11430
+ qh += 2;
11431
+ vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0); // true where the qh word is non-negative as int16, i.e. bit 15 is clear
11432
+ // keep q8ysums where bit 15 of qh is clear; otherwise use q8ysums XOR 0x8000
11433
+ vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
11434
+
11435
+ vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8); // only the even 16-bit lanes contribute (vec_mule)
11436
+ }
11437
+ // fold the odd-lane accumulators into the even-lane ones
11438
+ vsumi0 = vec_add(vsumi0, vsumi4);
11439
+ vsumi1 = vec_add(vsumi1, vsumi5);
11440
+ vsumi2 = vec_add(vsumi2, vsumi6);
11441
+ vsumi3 = vec_add(vsumi3, vsumi7);
11442
+ // convert the integer sums to float (scale factor 0) and accumulate them multiplied by vd
11443
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11444
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11445
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
11446
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
11447
+ // the bsums term is additionally weighted by IQ1S_DELTA
11448
+ vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
11449
+ }
11450
+ // merge the four float accumulators into vsumf0
11451
+ vsumf0 = vec_add(vsumf0, vsumf2);
11452
+ vsumf1 = vec_add(vsumf1, vsumf3);
11453
+
11454
+ vsumf0 = vec_add(vsumf0, vsumf1);
11455
+ // horizontal sum: adding the vector rotated by 4 and then 8 bytes leaves the total in every lane
11456
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11457
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11458
+
11459
+ *s = vec_extract(vsumf0, 0);
9430
11460
  #else
9431
11461
 
9432
11462
  float sumf = 0;
@@ -9783,6 +11813,51 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
9783
11813
 
9784
11814
  *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
9785
11815
 
11816
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: per block, maps 4-bit codes through a 16-entry table, multiplies by 32 bytes of y and accumulates in float
11817
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of each byte
11818
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4); // per-byte shift count for the high nibble
11819
+ // two independent float accumulators, merged after the block loop
11820
+ vector float vsumf0 = vec_splats(0.0f);
11821
+ vector float vsumf1 = vec_splats(0.0f);
11822
+
11823
+ const vector signed char values = vec_xl( 0, kvalues_iq4nl); // 16 bytes of kvalues_iq4nl, used below as a lookup table
11824
+
11825
+ #pragma GCC unroll 4
11826
+ for (int ib = 0; ib < nb; ++ib) {
11827
+ __builtin_prefetch(x[ib].qs, 0, 1);
11828
+ __builtin_prefetch(y[ib].qs, 0, 1);
11829
+
11830
+ // both block scales go through GGML_FP16_TO_FP32 in this function
11831
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
11832
+ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
11833
+ vector float vd = vec_mul(vxd, vyd); // combined scale for this block
11834
+
11835
+ vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); // 16 bytes, two 4-bit codes per byte
11836
+ vector signed char q4x0 = vec_and(qxs, lowMask); // low nibbles
11837
+ vector signed char q4x1 = vec_sr(qxs, v4); // high nibbles (each byte shifted right by 4)
11838
+ // table lookup: each nibble selects one byte of values
11839
+ q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
11840
+ q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
11841
+ // low-nibble values pair with y bytes 0..15, high-nibble values with y bytes 16..31
11842
+ vector signed char q8y0 = vec_xl( 0, y[ib].qs);
11843
+ vector signed char q8y1 = vec_xl(16, y[ib].qs);
11844
+ // even-lane and odd-lane byte products are widened to 16 bits and added pairwise
11845
+ vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
11846
+ vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
11847
+ // widen both halves of the 16-bit sums to 32 bits and add them
11848
+ vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
11849
+ vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
11850
+ // convert to float (scale factor 0) and accumulate multiplied by vd
11851
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11852
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11853
+ }
11854
+
11855
+ vsumf0 = vec_add(vsumf0, vsumf1);
11856
+ // horizontal sum: adding the vector rotated by 4 and then 8 bytes leaves the total in every lane
11857
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
11858
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
11859
+
11860
+ *s = vec_extract(vsumf0, 0);
9786
11861
  #else
9787
11862
  float sumf = 0;
9788
11863
  for (int ib = 0; ib < nb; ++ib) {
@@ -9894,6 +11969,105 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
9894
11969
 
9895
11970
  *s = hsum_float_8(accum);
9896
11971
 
11972
+ #elif defined(__POWER9_VECTOR__) // POWER9 vector path: maps 4-bit codes through a 16-entry table, applies 6-bit sub-block scales offset by -32 and accumulates in float
11973
+ const vector signed char lowMask = vec_splats((signed char)0xF); // keeps the low nibble of each byte
11974
+ const vector unsigned char v4 = vec_splats((unsigned char)0x4); // per-byte shift count for the high nibble
11975
+ // four independent float accumulators, merged after the block loop
11976
+ vector float vsumf0 = vec_splats(0.0f);
11977
+ vector float vsumf1 = vec_splats(0.0f);
11978
+ vector float vsumf2 = vec_splats(0.0f);
11979
+ vector float vsumf3 = vec_splats(0.0f);
11980
+
11981
+ const vector signed char values = vec_xl( 0, kvalues_iq4nl); // 16 bytes of kvalues_iq4nl, used below as a lookup table
11982
+
11983
+ for (int ibl = 0; ibl < nb; ++ibl) {
11984
+
11985
+ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d)); // x block scale, converted from fp16
11986
+ vector float vyd = vec_splats(y[ibl].d); // y block scale, used as-is
11987
+ vector float vd = vec_mul(vxd, vyd); // combined scale applied once per block
11988
+ // per-block 32-bit integer accumulators: 0..3 take even 16-bit lanes, 4..7 take odd lanes
11989
+ vector signed int vsumi0 = vec_splats((int32_t)0);
11990
+ vector signed int vsumi1 = vec_splats((int32_t)0);
11991
+ vector signed int vsumi2 = vec_splats((int32_t)0);
11992
+ vector signed int vsumi3 = vec_splats((int32_t)0);
11993
+ vector signed int vsumi4 = vec_splats((int32_t)0);
11994
+ vector signed int vsumi5 = vec_splats((int32_t)0);
11995
+ vector signed int vsumi6 = vec_splats((int32_t)0);
11996
+ vector signed int vsumi7 = vec_splats((int32_t)0);
11997
+
11998
+ uint16_t h = x[ibl].scales_h; // high scale bits; 4 bits are consumed per inner iteration (h >>= 4)
11999
+
12000
+ const uint8_t * restrict q4 = x[ibl].qs; // two 4-bit codes per byte
12001
+ const uint8_t * restrict sc = x[ibl].scales_l; // low 4 bits of two scales per byte
12002
+ const int8_t * restrict q8 = y[ibl].qs;
12003
+
12004
+ for (int ib = 0; ib < QK_K/64; ib ++ ) { // each pass consumes 32 q4 bytes, 1 scales_l byte and 64 q8 bytes
12005
+ __builtin_prefetch(q4, 0, 1);
12006
+ __builtin_prefetch(q8, 0, 1);
12007
+
12008
+ vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
12009
+ vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
12010
+ q4 += 32;
12011
+ // split every byte into its low and high nibble
12012
+ vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
12013
+ vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
12014
+ vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
12015
+ vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
12016
+ // table lookup: each nibble selects one byte of values
12017
+ q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
12018
+ q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
12019
+ q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
12020
+ q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
12021
+
12022
+ vector signed char q8y0 = vec_xl( 0, q8);
12023
+ vector signed char q8y1 = vec_xl(16, q8);
12024
+ vector signed char q8y2 = vec_xl(32, q8);
12025
+ vector signed char q8y3 = vec_xl(48, q8);
12026
+ q8 += 64;
12027
+ // even-lane and odd-lane byte products are widened to 16 bits and added pairwise
12028
+ vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
12029
+ vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
12030
+ vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
12031
+ vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
12032
+ // 6-bit scale = nibble of sc[0] | two bits of h at positions 4..5, then minus 32; the uint16_t wraps for negative results and the int16_t cast below reads it back as signed
12033
+ const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
12034
+ const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32);
12035
+ h >>= 4;
12036
+ sc ++;
12037
+ // ls0 applies to qv0/qv1 and ls1 to qv2/qv3
12038
+ vector signed short vscales01 = vec_splats((int16_t)ls0);
12039
+ vector signed short vscales23 = vec_splats((int16_t)ls1);
12040
+ // scale and widen to 32 bits: vec_mule handles even 16-bit lanes, vec_mulo the odd ones
12041
+ vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
12042
+ vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
12043
+ vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
12044
+ vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
12045
+ vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
12046
+ vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
12047
+ vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
12048
+ vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
12049
+ }
12050
+ // fold the odd-lane accumulators into the even-lane ones
12051
+ vsumi0 = vec_add(vsumi0, vsumi4);
12052
+ vsumi1 = vec_add(vsumi1, vsumi5);
12053
+ vsumi2 = vec_add(vsumi2, vsumi6);
12054
+ vsumi3 = vec_add(vsumi3, vsumi7);
12055
+ // convert the integer sums to float (scale factor 0) and accumulate them multiplied by vd
12056
+ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
12057
+ vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
12058
+ vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
12059
+ vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
12060
+ }
12061
+ // merge the four float accumulators into vsumf0
12062
+ vsumf0 = vec_add(vsumf0, vsumf2);
12063
+ vsumf1 = vec_add(vsumf1, vsumf3);
12064
+
12065
+ vsumf0 = vec_add(vsumf0, vsumf1);
12066
+ // horizontal sum: adding the vector rotated by 4 and then 8 bytes leaves the total in every lane
12067
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
12068
+ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
12069
+
12070
+ *s = vec_extract(vsumf0, 0);
9897
12071
  #else
9898
12072
  float sumf = 0;
9899
12073
  for (int ibl = 0; ibl < nb; ++ibl) {