llama_cpp 0.16.1 → 0.16.2

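This release bumps the bundled ggml-quants.c. The recurring change in the POWER9 hunks below is replacing the two-step vec_mule/vec_mulo multiply plus vec_unpackh/vec_unpackl widening with the fused multiply-sum intrinsics vec_msum and vec_sum4s, which fold the 8-bit products straight into 32-bit accumulators. The diff also widens `#pragma GCC unroll` from 4 to 8 in two kernels, adds a missing closing brace to the POWER9 quantize_row_q8_0/quantize_row_q8_1 loops, drops a duplicate ggml-common.h include, and adds SSE-based `__AVX__` fallbacks for the iq2 dot products. As a rough sketch of the accumulation rewrite (a standalone, hypothetical example, not code from the patch):

// Hedged sketch of the POWER9 accumulation change; assumes a POWER
// compiler (e.g. gcc -mcpu=power9 -maltivec). Helper names are made up.
#include <altivec.h>

// Before: even/odd byte multiplies give 16-bit lanes, which are then
// widened to 32 bits and added -- roughly four instructions per step.
static vector signed int dot_old(vector signed char q8, vector signed char qx,
                                 vector signed int acc) {
    vector signed short qv = vec_add(vec_mule(qx, q8), vec_mulo(qx, q8));
    return vec_add(acc, vec_add(vec_unpackh(qv), vec_unpackl(qv)));
}

// After: vec_msum multiplies bytes and folds four products into each
// 32-bit lane in one instruction. qx must be unsigned, which is why the
// diff casts the masked nibble vectors to vector unsigned char.
static vector signed int dot_new(vector signed char q8, vector unsigned char qx,
                                 vector signed int acc) {
    return vec_msum(q8, qx, acc);
}

The two forms distribute partial sums across lanes differently, but these kernels only consume the across-vector total (the lanes are reduced after vec_ctf/vec_madd), so the final dot product is unchanged while the inner loop gets shorter.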
@@ -4,8 +4,6 @@
  #include "ggml-quants.h"
  #include "ggml-impl.h"
 
- #define GGML_COMMON_IMPL_C
- #include "ggml-common.h"
 
  #include <math.h>
  #include <string.h>
@@ -1078,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  }
  vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
  vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+ }
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -1437,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  accv = vec_add(accv, vec_sld(accv, accv, 4));
  accv = vec_add(accv, vec_sld(accv, accv, 8));
  y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+ }
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -4113,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
  const vector signed char v8 = vec_splats((signed char)0x8);
 
  vector float vsumf0 = vec_splats(0.0f);
 
- #pragma GCC unroll 4
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -4140,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
 
- qv0 = vec_add(qv0, qv1);
+ vector signed int vsumi0 = v0;
 
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi0 = vec_sum4s(qv1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -4516,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -4537,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl(16, y[i].qs);
 
- vector signed char q4x0 = vec_and(qxs, lowMask);
- vector signed char q4x1 = vec_sr(qxs, v4);
+ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
 
- vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
- vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+ vector signed int vsumi0 = v0;
 
- qv0 = vec_add(qv0, qv1);
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -5247,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -5272,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
 
- vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
- vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
+ vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+ vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
 
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl( 16, y[i].qs);
 
- vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
- vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+ vector signed int vsumi0 = v0;
 
- qv0 = vec_add(qv0, qv1);
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -5523,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
 
  #elif defined(__POWER9_VECTOR__)
+ const vector signed int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
 
- #pragma GCC unroll 4
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -5544,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv2 = vec_mule(q8x1, q8y1);
  vector signed short qv3 = vec_mulo(q8x1, q8y1);
 
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
- vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
- vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
- vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
 
- vsumi0 = vec_add(vsumi0, vsumi2);
- vsumi1 = vec_add(vsumi1, vsumi3);
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi1 = vec_sum4s(qv1, vsumi1);
+ vsumi0 = vec_sum4s(qv2, vsumi0);
+ vsumi1 = vec_sum4s(qv3, vsumi1);
 
  vsumi0 = vec_add(vsumi0, vsumi1);
 
@@ -5938,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
  const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v6 = vec_splats((unsigned char)0x6);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -5975,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
 
  for (int j = 0; j < QK_K/128; ++j) {
  __builtin_prefetch(q2, 0, 1);
@@ -5993,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
  q2 += 32;
 
- vector signed char q2x00 = vec_and(qxs0, lowMask);
- vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
- vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
- vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
- vector signed char q2x10 = vec_and(qxs1, lowMask);
- vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
- vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
- vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
+ vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+ vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+ vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+ vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+ vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+ vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -6012,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y13 = vec_xl(112, q8);
  q8 += 128;
 
- vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
- vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
- vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
- vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
- vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
- vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
- vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
- vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
-
- vector signed short vscales_h = vec_unpackh(vscales);
- vector signed short vs0 = vec_splat(vscales_h, 0);
- vector signed short vs1 = vec_splat(vscales_h, 1);
- vector signed short vs2 = vec_splat(vscales_h, 2);
- vector signed short vs3 = vec_splat(vscales_h, 3);
- vector signed short vs4 = vec_splat(vscales_h, 4);
- vector signed short vs5 = vec_splat(vscales_h, 5);
- vector signed short vs6 = vec_splat(vscales_h, 6);
- vector signed short vs7 = vec_splat(vscales_h, 7);
+ vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+ vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+ vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+ vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+ vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+ vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+ vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+ vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+ vector signed short vscales_07 = vec_unpackh(vscales);
+ vector signed int vscales_03 = vec_unpackh(vscales_07);
+ vector signed int vscales_47 = vec_unpackl(vscales_07);
+ vector signed int vs0 = vec_splat(vscales_03, 0);
+ vector signed int vs1 = vec_splat(vscales_03, 1);
+ vector signed int vs2 = vec_splat(vscales_03, 2);
+ vector signed int vs3 = vec_splat(vscales_03, 3);
+ vector signed int vs4 = vec_splat(vscales_47, 0);
+ vector signed int vs5 = vec_splat(vscales_47, 1);
+ vector signed int vs6 = vec_splat(vscales_47, 2);
+ vector signed int vs7 = vec_splat(vscales_47, 3);
  vscales = vec_sld(vscales, vscales, 8);
 
- qv0 = vec_mul(qv0, vs0);
- qv1 = vec_mul(qv1, vs2);
- qv2 = vec_mul(qv2, vs4);
- qv3 = vec_mul(qv3, vs6);
-
- qv0 = vec_madd(qv4, vs1, qv0);
- qv1 = vec_madd(qv5, vs3, qv1);
- qv2 = vec_madd(qv6, vs5, qv2);
- qv3 = vec_madd(qv7, vs7, qv3);
-
- vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
- vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
- vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
- vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
-
- vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
- vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
- vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
- vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+ vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+ vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+ vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+ vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -6641,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
+ const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector signed char v1 = vec_splats((signed char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -6658,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- uint32_t aux[3];
- uint32_t utmp[4];
+ UNUSED(kmask1);
+ UNUSED(kmask2);
 
- memcpy(aux, x[i].scales, 12);
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(u0, lowMask1);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+ vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+ vector signed char u31 = vec_and(u3, lowMask2);
 
- vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
+ u1 = vec_or(u1, u30);
+ u2 = vec_or(vec_sr(u0, v4), u31);
+
+ vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
 
  vscales = vec_sub(vscales, off);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
-
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
  const uint8_t * restrict q3 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -6755,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
  vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
 
- vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
- vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
- vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
- vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
- vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
- vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
- vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
- vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
-
- vsumi0 = vec_add(vsum0, vsumi0);
- vsumi1 = vec_add(vsum1, vsumi1);
- vsumi2 = vec_add(vsum2, vsumi2);
- vsumi3 = vec_add(vsum3, vsumi3);
- vsumi4 = vec_add(vsum4, vsumi4);
- vsumi5 = vec_add(vsum5, vsumi5);
- vsumi6 = vec_add(vsum6, vsumi6);
- vsumi7 = vec_add(vsum7, vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs2, vsumi1);
+ vsumi2 = vec_msum(qv02, vs4, vsumi2);
+ vsumi3 = vec_msum(qv03, vs6, vsumi3);
+ vsumi4 = vec_msum(qv10, vs1, vsumi4);
+ vsumi5 = vec_msum(qv11, vs3, vsumi5);
+ vsumi6 = vec_msum(qv12, vs5, vsumi6);
+ vsumi7 = vec_msum(qv13, vs7, vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -7270,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
+ const vector unsigned char v2 = vec_splats((uint8_t)2);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -7288,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
 
- memcpy(utmp, x[i].scales, 12);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
 
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
 
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);
  vector signed short q4xmins = vec_unpackl(utmps);
  vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
@@ -7312,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint8_t * restrict q4 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7334,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
  q4 += 64;
 
- vector signed char q4x00 = vec_and(qxs0, lowMask);
- vector signed char q4x01 = vec_sr(qxs0, v4);
- vector signed char q4x10 = vec_and(qxs1, lowMask);
- vector signed char q4x11 = vec_sr(qxs1, v4);
- vector signed char q4x20 = vec_and(qxs2, lowMask);
- vector signed char q4x21 = vec_sr(qxs2, v4);
- vector signed char q4x30 = vec_and(qxs3, lowMask);
- vector signed char q4x31 = vec_sr(qxs3, v4);
+ vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+ vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+ vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+ vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+ vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+ vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -7353,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y31 = vec_xl(112, q8);
  q8 += 128;
 
- vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
- vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
- vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
- vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
- vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
- vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
- vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
- vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
-
- vector signed short vs0 = vec_splat(vscales, 0);
- vector signed short vs1 = vec_splat(vscales, 1);
- vector signed short vs2 = vec_splat(vscales, 2);
- vector signed short vs3 = vec_splat(vscales, 3);
+ vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+ vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+ vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+ vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+ vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
+ vector signed int vs2 = vec_splat(vscales_h, 2);
+ vector signed int vs3 = vec_splat(vscales_h, 3);
  vscales = vec_sld(vscales, vscales, 8);
 
- qv00 = vec_add(qv00, qv10);
- qv10 = vec_add(qv01, qv11);
- qv20 = vec_add(qv20, qv30);
- qv30 = vec_add(qv21, qv31);
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
 
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
- vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
- vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -7889,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v1 = vec_splats((unsigned char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -7907,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);
 
- memcpy(utmp, x[i].scales, 12);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
 
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
 
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
 
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);
 
  vector signed short q5xmins = vec_unpackl(utmps);
@@ -7938,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint8_t * restrict q5 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7966,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  qxhs0 = vec_sr(qxhs0, v2);
  qxhs1 = vec_sr(qxhs1, v2);
 
- vector signed char q5x00 = vec_or(q5h00, qxs00);
- vector signed char q5x01 = vec_or(q5h01, qxs01);
- vector signed char q5x10 = vec_or(q5h10, qxs10);
- vector signed char q5x11 = vec_or(q5h11, qxs11);
+ vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+ vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+ vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+ vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl(16, q8);
@@ -7977,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y11 = vec_xl(48, q8);
  q8 += 64;
 
- vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
- vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
- vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
- vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+ vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
 
- vector signed short vs0 = vec_splat(vscales, 0);
- vector signed short vs1 = vec_splat(vscales, 1);
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
  vscales = vec_sld(vscales, vscales, 12);
 
- qv00 = vec_add(qv00, qv10);
- qv01 = vec_add(qv01, qv11);
-
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
  }
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
@@ -8553,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -8569,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
  const uint8_t * restrict q6 = x[i].ql;
  const uint8_t * restrict qh = x[i].qh;
@@ -8656,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short vs6 = vec_splat(vscales, 6);
  vector signed short vs7 = vec_splat(vscales, 7);
 
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
- vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
- vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
-
- vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
- vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
- vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
- vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs4, vsumi1);
+ vsumi2 = vec_msum(qv10, vs1, vsumi2);
+ vsumi3 = vec_msum(qv11, vs5, vsumi3);
+ vsumi4 = vec_msum(qv20, vs2, vsumi4);
+ vsumi5 = vec_msum(qv21, vs6, vsumi5);
+ vsumi6 = vec_msum(qv30, vs3, vsumi6);
+ vsumi7 = vec_msum(qv31, vs7, vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -8819,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #endif
  }
 
- #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+ #if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
  static const int8_t keven_signs_q2xs[1024] = {
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8952,7 +8947,63 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 
  *s = 0.125f * hsum_float_8(accumf);
 
+ #elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[4];
+ const uint8_t * aux8 = (const uint8_t *)aux32;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[1] >> 28;
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -8965,14 +9016,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint16_t * restrict q2 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -9019,21 +9066,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
  vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
 
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9307,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  }
 
  *s = 0.125f * hsum_float_8(accumf);
+
+ #elif defined(__AVX__)
+ const __m128i mone = _mm_set1_epi8(1);
+ static const char block_sign_shuffle_mask_1[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = {
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+ const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+ const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+ const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+ const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+ const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+ const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+ const __m128i m511 = _mm_set1_epi16(511);
+ const __m128i m4 = _mm_set1_epi8(0xf);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ uint64_t aux64;
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = _mm_set1_epi64x(aux64);
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+ const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
+ aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+ const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+ const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+ const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+ const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+ const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+ const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+ const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+ const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+ const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+ const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+ const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+ const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+ const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+ const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+ // AVX2 full_signs_1 is full_sign_bits_0 here
+ // AVX2 full_signs_2 is full_sign_bits_1 here
+ __m128i signs_0, signs_1;
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+ const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+ const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+ const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+ __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+ const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+ const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+ const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+ const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
 
  #elif defined(__loongarch_asx)
 
@@ -9425,6 +9622,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
  *s = 0.125f * hsum_float_8(accumf);
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -9437,14 +9635,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint16_t * restrict q2 = x[i].qs;
  const uint8_t * restrict sc = x[i].scales;
@@ -9492,21 +9686,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
 
- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9722,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

  *s = 0.125f * hsum_float_8(accumf);

+ #elif defined(__AVX__)
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ };
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i m4 = _mm_set1_epi8(0xf);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+ uint64_t aux64;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict qs = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+ const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+ const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+ qs += 8;
+
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+ __m128i aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+ aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+ aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+ signs += 4;
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
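[Editor's note] The new __AVX__ branch above applies the packed sign bits to the q8 bytes with a cmpeq/xor/sub sequence rather than _mm_sign_epi8, because the mask produced by _mm_cmpeq_epi8 is 0x00/0xFF and bytes under a 0x00 mask must be kept, not zeroed. The byte-wise identity it relies on (a sketch, assuming m is 0 or -1):

    #include <stdint.h>

    /* (v ^ m) - m == (m == -1) ? -v : v  -- conditional two's-complement negation */
    static inline int8_t negate_if(int8_t v, int8_t m) {
        return (int8_t)((v ^ m) - m);
    }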
@@ -9729,6 +10006,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

+ const vector int v0 = vec_splats((int32_t)0);
+
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -9743,14 +10022,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint8_t * restrict q2 = x[i].qs;
  const uint8_t * restrict qh = x[i].qh;
@@ -9810,21 +10085,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10059,9 +10325,68 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void

  *s = 0.25f * hsum_float_8(accumf);

+ #elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[2];
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict q3 = x[i].qs;
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
+ const int8_t * restrict q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+ q3 += 8;
+ const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+ const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+ q3 += 8;
+ memcpy(aux32, gas, 8); gas += 8;
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[0] >> 28;
+ const uint16_t ls2 = aux32[1] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.25f * hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

+ const vector int v0 = vec_splats((int32_t)0);
+
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
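[Editor's note] In the iq3_xxs branch just added, each 32-bit word read into aux32 packs four 7-bit sign indices (bits 0-27) and a 4-bit block scale (bits 28-31). The indices select precomputed 64-bit masks from keven_signs_q2xs, which appear to hold eight ±1 bytes each, and _mm_sign_epi8 applies them to q8. A hedged scalar sketch of that unpacking (layout inferred from the shifts in the diff; the helper name is hypothetical):

    #include <stdint.h>

    /* unpack one 32-bit sign/scale word into four 8-byte sign masks */
    static void unpack_signs(uint32_t aux, const uint64_t * signs64,
                             uint64_t s[4], int * ls) {
        for (int k = 0; k < 4; ++k) {
            s[k] = signs64[(aux >> (7*k)) & 127];  /* eight +/-1 bytes per entry */
        }
        *ls = 2*(int)(aux >> 28) + 1;              /* odd 4-bit block scale */
    }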
@@ -10072,14 +10397,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint8_t * restrict q3 = x[i].qs;
  const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
@@ -10124,21 +10445,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10421,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *

  *s = hsum_float_8(accumf);

+ #elif defined(__AVX__)
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ };
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+ const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
+ const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
+ const __m128i idx_mask = _mm_set1_epi32(256);
+
+ typedef union {
+ __m128i vec[4];
+ uint32_t index[16];
+ } index_t;
+
+ index_t idx;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict qs = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
+ const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
+ const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
+ idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
+ idx.vec[1] = idx.vec[0];
+ idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
+ idx.vec[3] = idx.vec[2];
+
+ idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
+ idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
+ idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
+ idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
+
+ idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
+ idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
+ idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
+ idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
+
+ const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
+ const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
+ const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
+ const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
+
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
+ __m128i aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+ aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
+ aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+ signs += 4;
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
@@ -10428,6 +10846,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *

  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

+ const vector int v0 = vec_splats((int32_t)0);
+
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -10448,14 +10868,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
  const uint8_t * restrict sc = x[i].scales;
  const int8_t * restrict q8 = y[i].qs;

- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  for (int j = 0; j < QK_K/32; j += 2) {
  __builtin_prefetch(q3, 0, 1);
@@ -10509,21 +10925,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10669,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
  }


+ #if defined(__AVX__)
+ static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+ const __m128i ax = _mm_sign_epi8(x, x);
+ const __m128i sy = _mm_sign_epi8(y, x);
+ return _mm_maddubs_epi16(ax, sy);
+ }
+ #endif
+
  #if defined(__AVX2__)
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
  const __m256i ax = _mm256_sign_epi8(x, x);
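[Editor's note] mul_add_epi8_sse, added above, is the 128-bit counterpart of the existing mul_add_epi8: _mm_maddubs_epi16 treats its first operand as unsigned, so the helper feeds it |x| (via _mm_sign_epi8(x, x)) and transfers x's sign onto y (via _mm_sign_epi8(y, x)), which preserves every product x[k]*y[k]. A scalar model (a sketch; it ignores the instruction's int16 saturation, which the small quant magnitudes stay inside of):

    #include <stdint.h>

    /* per 16-bit lane i: r[i] = x[2i]*y[2i] + x[2i+1]*y[2i+1] */
    static void mul_add_epi8_model(const int8_t x[16], const int8_t y[16],
                                   int16_t r[8]) {
        for (int i = 0; i < 8; ++i) {
            r[i] = (int16_t)(x[2*i]*y[2*i] + x[2*i+1]*y[2*i+1]);
        }
    }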
@@ -10786,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void

  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

+ #elif defined __AVX__
+ __m256 accum = _mm256_setzero_ps();
+ float accum1 = 0;
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint16_t * qh = x[i].qh;
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ int sumi1 = 0;
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+ const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
+ const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+ const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
+ qs += 8;
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+ }
+
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
+ accum1 += d * sumi1;
+
+ }
+
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
  #elif defined(__POWER9_VECTOR__)
  const vector unsigned char v0 = vec_splats((unsigned char)0x0);
  const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
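[Editor's note] All of the __AVX__ branches introduced in this release follow one recipe: the existing __AVX2__ kernel is replayed with every 256-bit integer op split into _0/_1 128-bit halves (AVX1 has no 256-bit integer instructions), and the halves are recombined only for the float accumulation. A sketch of that recurring tail step, using ggml's MM256_SET_M128I macro from the diff; the fold_block name is hypothetical:

    #include <immintrin.h>

    /* fold two 128-bit int32 partial sums into the float accumulator */
    static inline __m256 fold_block(__m256 acc, __m128i lo, __m128i hi, float d) {
        const __m256i s = MM256_SET_M128I(hi, lo);  /* ggml helper: hi:lo -> __m256i */
        return _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d),
                                           _mm256_cvtepi32_ps(s)), acc);
    }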
@@ -10804,10 +11267,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
  vector signed int vsumi1 = vec_splats((int32_t)0);
  vector signed int vsumi2 = vec_splats((int32_t)0);
  vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
  vector signed int vsumi8 = vec_splats((int32_t)0);

  const uint8_t * restrict q1 = x[i].qs;
@@ -10849,14 +11308,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
  vector signed short vscales = vec_sld(vscales23, vscales01, 8);

- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);

  vector signed short q8ysums = vec_xl_len(qs, 8);
  qs += 4;
@@ -10871,11 +11326,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
  vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -11137,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void

  *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);

+ #elif defined __AVX__
+ const __m128i mask = _mm_set1_epi16(0x7);
+ const __m128i mone = _mm_set1_epi16(1);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+
+ const int8_t * q8 = y[i].qs;
+ const uint8_t * qs = x[i].qs;
+ const uint8_t * qh = x[i].qh;
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q1b_1_0 = _mm_set_epi64x(
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
+ const __m128i q1b_1_1 = _mm_set_epi64x(
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
+ const __m128i q1b_2_0 = _mm_set_epi64x(
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
+ const __m128i q1b_2_1 = _mm_set_epi64x(
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+
+ const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+ const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+ const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
+ const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
+ const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
+ const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
+
+ __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
+ __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
+ __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
+ __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
+
+ scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
+ scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
+ scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
+ scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
+ const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
+ const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
+ const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
+ const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
+
+ qs += 8; qh += 4;
+ }
+
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+ accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
+ accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
+ }
+
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
  #else

  int sum1[2], sum2[2], delta[4];
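[Editor's note] In the iq1_m branch above, the block's shared fp16 scale is stored scattered across the top four bits of the four 16-bit words of x[i].scales; the scale.u16 expression reassembles it (scale is presumably the iq1m_scale_t { fp16 / u16 } union ggml declares earlier in this file). A sketch of the bit gathering in isolation:

    #include <stdint.h>

    /* gather nibbles: bits 12..15 of sc[k] become bits 4k..4k+3 of the result */
    static uint16_t iq1m_gather_scale(const uint16_t sc[4]) {
        return (uint16_t)((sc[0] >> 12)
                        | ((sc[1] >>  8) & 0x00f0)
                        | ((sc[2] >>  4) & 0x0f00)
                        |  (sc[3]        & 0xf000));
    }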
@@ -11267,8 +11803,47 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *

  *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

+ #elif defined __AVX__
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+ const __m128i m4b = _mm_set1_epi8(0x0f);
+ const __m128i mone = _mm_set1_epi16(1);
+
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (int ib = 0; ib < nb; ib += 2) {
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
+
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+ accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
+ accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+ y += 2;
+ x += 2;
+ }
+
+ *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);

  vector float vsumf0 = vec_splats(0.0f);
@@ -11299,8 +11874,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
- vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi1 = vec_sum4s(qv1, vsumi1);

  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
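[Editor's note] The vec_sum4s form above sums the two 16-bit product sums inside each 32-bit lane into the accumulator in one instruction, replacing the vec_unpackh/vec_unpackl widen-then-add pair. The lane placement differs from the old code, but the subsequent vec_ctf/vec_madd reduction sums every lane, so the result should be identical. A scalar model for signed-short input (a sketch; the hardware form saturates, which these magnitudes do not reach):

    #include <stdint.h>

    /* per 32-bit lane i: acc[i] += a[2i] + a[2i+1] */
    static void vec_sum4s_model(const int16_t a[8], int32_t acc[4]) {
        for (int i = 0; i < 4; ++i) {
            acc[i] += a[2*i] + a[2*i+1];
        }
    }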
@@ -11453,8 +12031,57 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *

  *s = hsum_float_8(accum);

+ #elif defined __AVX__
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+ const __m128i m4b = _mm_set1_epi8(0x0f);
+
+ __m256 accum = _mm256_setzero_ps();
+ for (int ibl = 0; ibl < nb; ++ibl) {
+ const uint8_t * qs = x[ibl].qs;
+ const int8_t * q8 = y[ibl].qs;
+ uint16_t sh = x[ibl].scales_h;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
+ sh >>= 4;
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
+ sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
+ sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
+ sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
+ sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
+ }
+ __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
+ __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
+ }
+
+ *s = hsum_float_8(accum);
+
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);

  vector float vsumf0 = vec_splats(0.0f);
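[Editor's note] The iq4_xs block scales decoded in the branch above are 6-bit values split between scales_l (one nibble per block) and scales_h (two bits per block, consumed via sh >>= 4 every pair of blocks), biased by 32. An equivalent scalar decode for block index ib (a sketch derived from the ls1/ls2 expressions in the diff):

    #include <stdint.h>

    static int iq4_xs_block_scale(const uint8_t * scales_l, uint16_t scales_h, int ib) {
        const int lo = (scales_l[ib/2] >> 4*(ib & 1)) & 0xf;  /* low 4 bits  */
        const int hi = (scales_h >> 2*ib) & 3;                /* high 2 bits */
        return ((hi << 4) | lo) - 32;                         /* centred on 0 */
    }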
@@ -11470,14 +12097,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector float vyd = vec_splats(y[ibl].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  uint16_t h = x[ibl].scales_h;

@@ -11522,21 +12145,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector signed short vscales01 = vec_splats((int16_t)ls0);
  vector signed short vscales23 = vec_splats((int16_t)ls1);

- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -13139,7 +13753,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
  const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
  int num_neighbors = neighbours[0];
  GGML_ASSERT(num_neighbors > 0);
- float best_score = 0;
+ float best_score = -FLT_MAX;
  int grid_index = -1;
  for (int j = 1; j <= num_neighbors; ++j) {
  const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
@@ -13337,7 +13951,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
  sumw[j+1] = sumw[j] + weight[i];
  }
  }
- float best_score = 0, scale = max;
+ float best_score = -FLT_MIN, scale = max;
  int besti1 = -1, besti2 = -1, best_shift = 0;
  for (int i1 = 0; i1 <= block_size; ++i1) {
  for (int i2 = i1; i2 <= block_size; ++i2) {
@@ -13513,7 +14127,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
  idx[2*j] = j;
  }
  qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
- float best_score = 0, scale = max;
+ float best_score = -FLT_MIN, scale = max;
  int besti1 = -1, besti2 = -1, best_k = -1;
  // 0: +, +
  // 1: +, -
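[Editor's note] The last three hunks fix the same latent bug: best_score started at 0, so a search whose candidate scores never exceeded 0 accepted nothing and left grid_index or besti1/besti2 at -1. Seeding with a negative sentinel (-FLT_MAX for the neighbour search; the tiny -FLT_MIN in the two scale searches, where a zero-valued score should still be accepted) guarantees at least one update. The failure mode in miniature (a sketch, not code from the diff):

    #include <float.h>

    /* with best = 0 instead of -FLT_MAX, all-negative scores return -1 */
    static int argmax(const float * score, int n) {
        float best = -FLT_MAX;
        int best_j = -1;
        for (int j = 0; j < n; ++j) {
            if (score[j] > best) { best = score[j]; best_j = j; }
        }
        return best_j;
    }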