llama_cpp 0.16.1 → 0.16.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
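What changed, at a glance (summary of the hunks below; the file is not named in the diff, but it evidently contains ggml's quantization kernels, given the ggml-quants.h include and the quantize_row_*/ggml_vec_dot_* functions): on POWER9/VSX, the dot-product kernels for q4_0·q8_0, q4_1·q8_1, q5_1·q8_1, q8_0·q8_0, the q2_K–q6_K K-quants, and the iq2_xxs/iq2_xs formats replace the vec_mule/vec_mulo plus vec_unpackh/vec_unpackl reduction with single multiply-sum intrinsics (vec_msum, vec_sum4s), cut several kernels from eight 32-bit accumulators to four, raise #pragma GCC unroll from 4 to 8, and move the K-quant scale unpacking from scalar memcpy/utmp bit twiddling into vector registers via vec_xl_len. Independently, new plain-AVX (pre-AVX2) code paths are added for iq2_xxs, iq2_xs, and iq2_s, and the keven_signs_q2xs table is now also compiled when only __AVX__ is defined. Closing braces are added after the final stores in the POWER9 paths of quantize_row_q8_0 and quantize_row_q8_1, and the GGML_COMMON_IMPL_C define together with the ggml-common.h include is dropped from the top of the file.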
@@ -4,8 +4,6 @@
  #include "ggml-quants.h"
  #include "ggml-impl.h"
 
- #define GGML_COMMON_IMPL_C
- #include "ggml-common.h"
 
  #include <math.h>
  #include <string.h>
@@ -1078,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  }
  vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
  vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+ }
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -1437,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  accv = vec_add(accv, vec_sld(accv, accv, 4));
  accv = vec_add(accv, vec_sld(accv, accv, 8));
  y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+ }
 
  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -4113,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
  const vector signed char v8 = vec_splats((signed char)0x8);
 
  vector float vsumf0 = vec_splats(0.0f);
 
- #pragma GCC unroll 4
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -4140,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
 
- qv0 = vec_add(qv0, qv1);
+ vector signed int vsumi0 = v0;
 
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi0 = vec_sum4s(qv1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
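The q4_0·q8_0 rewrite above is the template for several hunks that follow: instead of widening the 16-bit products with vec_unpackh/vec_unpackl and adding, vec_sum4s folds adjacent products straight into the 32-bit accumulator. A minimal scalar model of the halfword form assumed here (vsum4shs; the hardware instruction additionally saturates, which this sketch omits, and the names are illustrative rather than taken from the diff):

    #include <stdint.h>

    /* vec_sum4s(qv, acc) on vector signed short, modeled per lane:
       each 32-bit lane gains the two adjacent 16-bit elements above it. */
    static void sum4s_model(const int16_t qv[8], int32_t acc[4]) {
        for (int lane = 0; lane < 4; ++lane) {
            acc[lane] += (int32_t)qv[2*lane] + (int32_t)qv[2*lane + 1];
        }
    }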
@@ -4516,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -4537,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl(16, y[i].qs);
 
- vector signed char q4x0 = vec_and(qxs, lowMask);
- vector signed char q4x1 = vec_sr(qxs, v4);
+ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);
 
- vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
- vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+ vector signed int vsumi0 = v0;
 
- qv0 = vec_add(qv0, qv1);
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q4x1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
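In the q4_1·q8_1 hunk the nibble vectors are recast to vector unsigned char because the vec_msum form used here multiplies signed bytes (the q8 operand) by unsigned bytes (the q4 operand) and sums four products into each 32-bit lane, collapsing the old mule/mulo/unpack chain into one instruction per input vector. A hedged scalar model of that form (vmsummbm semantics; illustrative names):

    #include <stdint.h>

    /* vec_msum(q8, q4, acc): signed x unsigned byte multiply-sum,
       four products accumulated into each 32-bit lane. */
    static void msum_bytes_model(const int8_t q8[16], const uint8_t q4[16],
                                 int32_t acc[4]) {
        for (int lane = 0; lane < 4; ++lane) {
            for (int j = 0; j < 4; ++j) {
                acc[lane] += (int32_t)q8[4*lane + j] * (int32_t)q4[4*lane + j];
            }
        }
    }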
@@ -5247,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -5272,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 
  vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
 
- vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
- vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
+ vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+ vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);
 
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl( 16, y[i].qs);
 
- vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
- vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+ vector signed int vsumi0 = v0;
 
- qv0 = vec_add(qv0, qv1);
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q5x1, vsumi0);
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -5523,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;
 
  #elif defined(__POWER9_VECTOR__)
+ const vector signed int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
 
- #pragma GCC unroll 4
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -5544,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv2 = vec_mule(q8x1, q8y1);
  vector signed short qv3 = vec_mulo(q8x1, q8y1);
 
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
- vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
- vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
- vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
 
- vsumi0 = vec_add(vsumi0, vsumi2);
- vsumi1 = vec_add(vsumi1, vsumi3);
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi1 = vec_sum4s(qv1, vsumi1);
+ vsumi0 = vec_sum4s(qv2, vsumi0);
+ vsumi1 = vec_sum4s(qv3, vsumi1);
 
  vsumi0 = vec_add(vsumi0, vsumi1);
 
@@ -5938,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
  const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v6 = vec_splats((unsigned char)0x6);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -5975,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
 
  for (int j = 0; j < QK_K/128; ++j) {
  __builtin_prefetch(q2, 0, 1);
@@ -5993,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
  q2 += 32;
 
- vector signed char q2x00 = vec_and(qxs0, lowMask);
- vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
- vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
- vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
- vector signed char q2x10 = vec_and(qxs1, lowMask);
- vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
- vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
- vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
+ vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+ vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+ vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+ vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+ vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+ vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -6012,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y13 = vec_xl(112, q8);
  q8 += 128;
 
- vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
- vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
- vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
- vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
- vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
- vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
- vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
- vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
-
- vector signed short vscales_h = vec_unpackh(vscales);
- vector signed short vs0 = vec_splat(vscales_h, 0);
- vector signed short vs1 = vec_splat(vscales_h, 1);
- vector signed short vs2 = vec_splat(vscales_h, 2);
- vector signed short vs3 = vec_splat(vscales_h, 3);
- vector signed short vs4 = vec_splat(vscales_h, 4);
- vector signed short vs5 = vec_splat(vscales_h, 5);
- vector signed short vs6 = vec_splat(vscales_h, 6);
- vector signed short vs7 = vec_splat(vscales_h, 7);
+ vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+ vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+ vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+ vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+ vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+ vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+ vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+ vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+ vector signed short vscales_07 = vec_unpackh(vscales);
+ vector signed int vscales_03 = vec_unpackh(vscales_07);
+ vector signed int vscales_47 = vec_unpackl(vscales_07);
+ vector signed int vs0 = vec_splat(vscales_03, 0);
+ vector signed int vs1 = vec_splat(vscales_03, 1);
+ vector signed int vs2 = vec_splat(vscales_03, 2);
+ vector signed int vs3 = vec_splat(vscales_03, 3);
+ vector signed int vs4 = vec_splat(vscales_47, 0);
+ vector signed int vs5 = vec_splat(vscales_47, 1);
+ vector signed int vs6 = vec_splat(vscales_47, 2);
+ vector signed int vs7 = vec_splat(vscales_47, 3);
  vscales = vec_sld(vscales, vscales, 8);
 
- qv0 = vec_mul(qv0, vs0);
- qv1 = vec_mul(qv1, vs2);
- qv2 = vec_mul(qv2, vs4);
- qv3 = vec_mul(qv3, vs6);
-
- qv0 = vec_madd(qv4, vs1, qv0);
- qv1 = vec_madd(qv5, vs3, qv1);
- qv2 = vec_madd(qv6, vs5, qv2);
- qv3 = vec_madd(qv7, vs7, qv3);
-
- vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
- vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
- vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
- vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
-
- vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
- vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
- vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
- vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+ vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+ vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+ vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+ vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -6641,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
+ const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector signed char v1 = vec_splats((signed char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -6658,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- uint32_t aux[3];
- uint32_t utmp[4];
+ UNUSED(kmask1);
+ UNUSED(kmask2);
 
- memcpy(aux, x[i].scales, 12);
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(u0, lowMask1);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+ vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+ vector signed char u31 = vec_and(u3, lowMask2);
 
- vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
+ u1 = vec_or(u1, u30);
+ u2 = vec_or(vec_sr(u0, v4), u31);
+
+ vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
 
  vscales = vec_sub(vscales, off);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
-
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
  const uint8_t * restrict q3 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -6755,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
  vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
 
- vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
- vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
- vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
- vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
- vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
- vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
- vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
- vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
-
- vsumi0 = vec_add(vsum0, vsumi0);
- vsumi1 = vec_add(vsum1, vsumi1);
- vsumi2 = vec_add(vsum2, vsumi2);
- vsumi3 = vec_add(vsum3, vsumi3);
- vsumi4 = vec_add(vsum4, vsumi4);
- vsumi5 = vec_add(vsum5, vsumi5);
- vsumi6 = vec_add(vsum6, vsumi6);
- vsumi7 = vec_add(vsum7, vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs2, vsumi1);
+ vsumi2 = vec_msum(qv02, vs4, vsumi2);
+ vsumi3 = vec_msum(qv03, vs6, vsumi3);
+ vsumi4 = vec_msum(qv10, vs1, vsumi4);
+ vsumi5 = vec_msum(qv11, vs3, vsumi5);
+ vsumi6 = vec_msum(qv12, vs5, vsumi6);
+ vsumi7 = vec_msum(qv13, vs7, vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
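q3_K keeps its 16-bit products (qv00..qv13) but fuses the per-subblock scale multiply and the widening accumulate into one vec_msum per accumulator; with two signed-short operands this should be the vmsumshm form. A minimal scalar model under that assumption (illustrative names):

    #include <stdint.h>

    /* vec_msum(qv, vs, acc) on vector signed short: two 16x16-bit
       products per 32-bit lane, accumulated with modulo arithmetic. */
    static void msum_halfwords_model(const int16_t qv[8], const int16_t vs[8],
                                     int32_t acc[4]) {
        for (int lane = 0; lane < 4; ++lane) {
            acc[lane] += (int32_t)qv[2*lane]     * (int32_t)vs[2*lane]
                       + (int32_t)qv[2*lane + 1] * (int32_t)vs[2*lane + 1];
        }
    }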
@@ -7270,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
+ const vector unsigned char v2 = vec_splats((uint8_t)2);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 
  vector float vsumf0 = vec_splats(0.0f);
@@ -7288,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
 
- memcpy(utmp, x[i].scales, 12);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
 
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
 
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);
  vector signed short q4xmins = vec_unpackl(utmps);
  vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
@@ -7312,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint8_t * restrict q4 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7334,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
  q4 += 64;
 
- vector signed char q4x00 = vec_and(qxs0, lowMask);
- vector signed char q4x01 = vec_sr(qxs0, v4);
- vector signed char q4x10 = vec_and(qxs1, lowMask);
- vector signed char q4x11 = vec_sr(qxs1, v4);
- vector signed char q4x20 = vec_and(qxs2, lowMask);
- vector signed char q4x21 = vec_sr(qxs2, v4);
- vector signed char q4x30 = vec_and(qxs3, lowMask);
- vector signed char q4x31 = vec_sr(qxs3, v4);
+ vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+ vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+ vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+ vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+ vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+ vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -7353,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y31 = vec_xl(112, q8);
  q8 += 128;
 
- vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
- vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
- vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
- vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
- vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
- vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
- vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
- vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
-
- vector signed short vs0 = vec_splat(vscales, 0);
- vector signed short vs1 = vec_splat(vscales, 1);
- vector signed short vs2 = vec_splat(vscales, 2);
- vector signed short vs3 = vec_splat(vscales, 3);
+ vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+ vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+ vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+ vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+ vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
+ vector signed int vs2 = vec_splat(vscales_h, 2);
+ vector signed int vs3 = vec_splat(vscales_h, 3);
  vscales = vec_sld(vscales, vscales, 8);
 
- qv00 = vec_add(qv00, qv10);
- qv10 = vec_add(qv01, qv11);
- qv20 = vec_add(qv20, qv30);
- qv30 = vec_add(qv21, qv31);
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);
 
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
- vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
- vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -7889,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v1 = vec_splats((unsigned char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -7907,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);
 
- memcpy(utmp, x[i].scales, 12);
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);
 
- utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
- const uint32_t uaux = utmp[1] & kmask1;
- utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
- utmp[2] = uaux;
- utmp[0] &= kmask1;
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);
 
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
 
- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);
 
  vector signed short q5xmins = vec_unpackl(utmps);
@@ -7938,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint8_t * restrict q5 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7966,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  qxhs0 = vec_sr(qxhs0, v2);
  qxhs1 = vec_sr(qxhs1, v2);
 
- vector signed char q5x00 = vec_or(q5h00, qxs00);
- vector signed char q5x01 = vec_or(q5h01, qxs01);
- vector signed char q5x10 = vec_or(q5h10, qxs10);
- vector signed char q5x11 = vec_or(q5h11, qxs11);
+ vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+ vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+ vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+ vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);
 
  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl(16, q8);
@@ -7977,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y11 = vec_xl(48, q8);
  q8 += 64;
 
- vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
- vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
- vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
- vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+ vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q5x11, v0);
 
- vector signed short vs0 = vec_splat(vscales, 0);
- vector signed short vs1 = vec_splat(vscales, 1);
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
  vscales = vec_sld(vscales, vscales, 12);
 
- qv00 = vec_add(qv00, qv10);
- qv01 = vec_add(qv01, qv11);
-
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
  }
 
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
@@ -8553,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -8569,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;
 
  const uint8_t * restrict q6 = x[i].ql;
  const uint8_t * restrict qh = x[i].qh;
@@ -8656,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short vs6 = vec_splat(vscales, 6);
  vector signed short vs7 = vec_splat(vscales, 7);
 
- vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
- vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
- vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
-
- vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
- vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
- vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
- vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs4, vsumi1);
+ vsumi2 = vec_msum(qv10, vs1, vsumi2);
+ vsumi3 = vec_msum(qv11, vs5, vsumi3);
+ vsumi4 = vec_msum(qv20, vs2, vsumi4);
+ vsumi5 = vec_msum(qv21, vs6, vsumi5);
+ vsumi6 = vec_msum(qv30, vs3, vsumi6);
+ vsumi7 = vec_msum(qv31, vs7, vsumi7);
  }
 
  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -8819,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #endif
  }
 
- #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+ #if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
  static const int8_t keven_signs_q2xs[1024] = {
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8952,7 +8947,63 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 
  *s = 0.125f * hsum_float_8(accumf);
 
+ #elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[4];
+ const uint8_t * aux8 = (const uint8_t *)aux32;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[1] >> 28;
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
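The new __AVX__ path above (like the iq2_xs and iq2_s ones further down) follows the standard pre-AVX2 recipe: since AVX1 has no 256-bit integer instructions, all integer work runs in pairs of __m128i halves, and the halves are joined into a 256-bit register only for the float accumulation. A sketch of that closing idiom (MM256_SET_M128I is ggml's helper for building an __m256i from two halves; spelled out with raw intrinsics here so the sketch stands alone, and the names are illustrative):

    #include <immintrin.h>

    /* Fold two 128-bit integer partial sums into the running float
       accumulator, scaled by the per-block factor d. Mirrors the
       accumf update in the hunk above. */
    static __m256 fold_halves(__m256 accumf, float d, __m128i lo, __m128i hi) {
        const __m256i sumi =
            _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
        return _mm256_add_ps(
            _mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)),
            accumf);
    }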
@@ -8965,14 +9016,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint16_t * restrict q2 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -9019,21 +9066,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
  vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
 
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9307,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  }
 
  *s = 0.125f * hsum_float_8(accumf);
+
+ #elif defined(__AVX__)
+ const __m128i mone = _mm_set1_epi8(1);
+ static const char block_sign_shuffle_mask_1[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = {
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+ const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+ const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+ const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+ const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+ const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+ const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+ const __m128i m511 = _mm_set1_epi16(511);
+ const __m128i m4 = _mm_set1_epi8(0xf);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ uint64_t aux64;
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = _mm_set1_epi64x(aux64);
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+ const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
+ aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+ const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+ const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+ const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+ const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+ const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+ const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+ const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+ const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+ const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+ const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+ const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+ const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+ const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+ const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+ // AVX2 full_signs_1 is full_sign_bits_0 here
+ // AVX2 full_signs_2 is full_sign_bits_1 here
+ __m128i signs_0, signs_1;
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+ const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+ const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+ const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+ __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+ const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+ const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+ const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+ const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__loongarch_asx)
 
  const __m256i mone = __lasx_xvreplgr2vr_b(1);
@@ -9425,6 +9622,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
  *s = 0.125f * hsum_float_8(accumf);
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -9437,14 +9635,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);
 
- vector signed int vsumi0 = vec_splats((int32_t)0);
- vector signed int vsumi1 = vec_splats((int32_t)0);
- vector signed int vsumi2 = vec_splats((int32_t)0);
- vector signed int vsumi3 = vec_splats((int32_t)0);
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
 
  const uint16_t * restrict q2 = x[i].qs;
  const uint8_t * restrict sc = x[i].scales;
@@ -9492,21 +9686,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
 
- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
  }
 
- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9722,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
 
  *s = 0.125f * hsum_float_8(accumf);
 
+ #elif defined(__AVX__)
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+ };
+
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i m4 = _mm_set1_epi8(0xf);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+ uint64_t aux64;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint8_t * restrict qs = x[i].qs;
+ const uint8_t * restrict qh = x[i].qh;
+ const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+ const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+ const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+ iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+ iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+ iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+ iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+ qs += 8;
+
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+ __m128i aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+ aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+ aux128_1 = aux128_0;
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+ signs += 4;
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
9988
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
9989
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
9990
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
9991
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
9992
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
9993
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
9994
+ }
9995
+
9996
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
9997
+
9998
+ }
9999
+
10000
+ *s = 0.125f * hsum_float_8(accumf);
10001
+
9725
10002
  #elif defined(__POWER9_VECTOR__)
9726
10003
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
9727
10004
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
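
In the new AVX fallback each 256-bit AVX2 operation is split into _0/_1 128-bit halves that are only recombined with MM256_SET_M128I at the final float accumulation, since pre-AVX2 hardware lacks 256-bit integer ops. The shuffle/and/cmpeq sequence expands the 16 packed sign bits into a full byte mask, and the xor/sub pair applies a branch-free conditional negation to the q8 bytes; xor/sub is used rather than _mm_sign_epi8 because a zero mask byte must leave q8 unchanged, not zero it. A scalar model of that sign trick (illustrative, not ggml code):

    #include <stdint.h>

    /* Expand bit j of `bits` to 0x00/0xFF and negate q8[j] when set:
     * (x ^ 0xFF) - 0xFF == -x and (x ^ 0x00) - 0x00 == x, which is
     * what the _mm_xor_si128/_mm_sub_epi8 pair does lane-wise. */
    static void apply_signs_model(uint16_t bits, int8_t q8[16]) {
        for (int j = 0; j < 16; ++j) {
            const int8_t s = (bits >> j) & 1 ? (int8_t)0xFF : 0;
            q8[j] = (int8_t)((q8[j] ^ s) - s);
        }
    }
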
@@ -9729,6 +10006,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9729
10006
 
9730
10007
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
9731
10008
 
10009
+ const vector int v0 = vec_splats((int32_t)0);
10010
+
9732
10011
  vector float vsumf0 = vec_splats(0.0f);
9733
10012
  vector float vsumf1 = vec_splats(0.0f);
9734
10013
  vector float vsumf2 = vec_splats(0.0f);
@@ -9743,14 +10022,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9743
10022
  vector float vyd = vec_splats(y[i].d);
9744
10023
  vector float vd = vec_mul(vxd, vyd);
9745
10024
 
9746
- vector signed int vsumi0 = vec_splats((int32_t)0);
9747
- vector signed int vsumi1 = vec_splats((int32_t)0);
9748
- vector signed int vsumi2 = vec_splats((int32_t)0);
9749
- vector signed int vsumi3 = vec_splats((int32_t)0);
9750
- vector signed int vsumi4 = vec_splats((int32_t)0);
9751
- vector signed int vsumi5 = vec_splats((int32_t)0);
9752
- vector signed int vsumi6 = vec_splats((int32_t)0);
9753
- vector signed int vsumi7 = vec_splats((int32_t)0);
10025
+ vector signed int vsumi0 = v0;
10026
+ vector signed int vsumi1 = v0;
10027
+ vector signed int vsumi2 = v0;
10028
+ vector signed int vsumi3 = v0;
9754
10029
 
9755
10030
  const uint8_t * restrict q2 = x[i].qs;
9756
10031
  const uint8_t * restrict qh = x[i].qh;
@@ -9810,21 +10085,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
9810
10085
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
9811
10086
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
9812
10087
 
9813
- vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
9814
- vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
9815
- vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
9816
- vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
9817
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
9818
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
9819
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
9820
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
10088
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
10089
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
10090
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
10091
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
9821
10092
  }
9822
10093
 
9823
- vsumi0 = vec_add(vsumi0, vsumi4);
9824
- vsumi1 = vec_add(vsumi1, vsumi5);
9825
- vsumi2 = vec_add(vsumi2, vsumi6);
9826
- vsumi3 = vec_add(vsumi3, vsumi7);
9827
-
9828
10094
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
9829
10095
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
9830
10096
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10059,9 +10325,68 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10059
10325
 
10060
10326
  *s = 0.25f * hsum_float_8(accumf);
10061
10327
 
10328
+ #elif defined(__AVX__)
10329
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10330
+
10331
+ uint32_t aux32[2];
10332
+
10333
+ __m256 accumf = _mm256_setzero_ps();
10334
+ for (int i = 0; i < nb; ++i) {
10335
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10336
+ const uint8_t * restrict q3 = x[i].qs;
10337
+ const uint8_t * restrict gas = x[i].qs + QK_K/4;
10338
+ const int8_t * restrict q8 = y[i].qs;
10339
+ __m128i sumi1_0 = _mm_setzero_si128();
10340
+ __m128i sumi1_1 = _mm_setzero_si128();
10341
+ __m128i sumi2_0 = _mm_setzero_si128();
10342
+ __m128i sumi2_1 = _mm_setzero_si128();
10343
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10344
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10345
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10346
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10347
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10348
+ const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10349
+ const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
10350
+ q3 += 8;
10351
+ const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
10352
+ const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
10353
+ q3 += 8;
10354
+ memcpy(aux32, gas, 8); gas += 8;
10355
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
10356
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
10357
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
10358
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
10359
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
10360
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
10361
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
10362
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
10363
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
10364
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
10365
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
10366
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
10367
+ const uint16_t ls1 = aux32[0] >> 28;
10368
+ const uint16_t ls2 = aux32[1] >> 28;
10369
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
10370
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
10371
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
10372
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
10373
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
10374
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
10375
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
10376
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
10377
+ }
10378
+
10379
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
10380
+
10381
+ }
10382
+
10383
+ *s = 0.25f * hsum_float_8(accumf);
10384
+
10062
10385
  #elif defined(__POWER9_VECTOR__)
10063
10386
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
10064
10387
 
10388
+ const vector int v0 = vec_splats((int32_t)0);
10389
+
10065
10390
  vector float vsumf0 = vec_splats(0.0f);
10066
10391
  vector float vsumf1 = vec_splats(0.0f);
10067
10392
  vector float vsumf2 = vec_splats(0.0f);
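
For iq3_xxs the signs come from the precomputed keven_signs_q2xs table, so each 64-bit entry already holds bytes of +1/-1 and the AVX path can apply them with _mm_sign_epi8 directly instead of the xor/sub idiom used for iq2_s. A scalar model of _mm_sign_epi8 semantics (illustrative only):

    #include <stdint.h>

    /* r[j] = -a[j] if b[j] < 0, 0 if b[j] == 0, else a[j] --
     * with b holding +1/-1 bytes this simply copies signs onto a. */
    static void sign_epi8_model(const int8_t a[16], const int8_t b[16],
                                int8_t r[16]) {
        for (int j = 0; j < 16; ++j) {
            r[j] = b[j] < 0 ? (int8_t)-a[j] : (b[j] == 0 ? 0 : a[j]);
        }
    }
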
@@ -10072,14 +10397,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10072
10397
  vector float vyd = vec_splats(y[i].d);
10073
10398
  vector float vd = vec_mul(vxd, vyd);
10074
10399
 
10075
- vector signed int vsumi0 = vec_splats((int32_t)0);
10076
- vector signed int vsumi1 = vec_splats((int32_t)0);
10077
- vector signed int vsumi2 = vec_splats((int32_t)0);
10078
- vector signed int vsumi3 = vec_splats((int32_t)0);
10079
- vector signed int vsumi4 = vec_splats((int32_t)0);
10080
- vector signed int vsumi5 = vec_splats((int32_t)0);
10081
- vector signed int vsumi6 = vec_splats((int32_t)0);
10082
- vector signed int vsumi7 = vec_splats((int32_t)0);
10400
+ vector signed int vsumi0 = v0;
10401
+ vector signed int vsumi1 = v0;
10402
+ vector signed int vsumi2 = v0;
10403
+ vector signed int vsumi3 = v0;
10083
10404
 
10084
10405
  const uint8_t * restrict q3 = x[i].qs;
10085
10406
  const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
@@ -10124,21 +10445,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
10124
10445
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10125
10446
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10126
10447
 
10127
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10128
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10129
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10130
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10131
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10132
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10133
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10134
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10448
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
10449
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
10450
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
10451
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
10135
10452
  }
10136
10453
 
10137
- vsumi0 = vec_add(vsumi0, vsumi4);
10138
- vsumi1 = vec_add(vsumi1, vsumi5);
10139
- vsumi2 = vec_add(vsumi2, vsumi6);
10140
- vsumi3 = vec_add(vsumi3, vsumi7);
10141
-
10142
10454
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10143
10455
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10144
10456
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10421,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10421
10733
 
10422
10734
  *s = hsum_float_8(accumf);
10423
10735
 
10736
+ #elif defined(__AVX__)
10737
+ static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10738
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
10739
+ };
10740
+
10741
+ static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10742
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
10743
+ };
10744
+
10745
+ const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
10746
+ const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
10747
+ const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
10748
+ const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
10749
+
10750
+ const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
10751
+ const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
10752
+ const __m128i idx_mask = _mm_set1_epi32(256);
10753
+
10754
+ typedef union {
10755
+ __m128i vec[4];
10756
+ uint32_t index[16];
10757
+ } index_t;
10758
+
10759
+ index_t idx;
10760
+
10761
+ __m256 accumf = _mm256_setzero_ps();
10762
+ for (int i = 0; i < nb; ++i) {
10763
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
10764
+ const uint8_t * restrict qs = x[i].qs;
10765
+ const uint8_t * restrict qh = x[i].qh;
10766
+ const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
10767
+ const int8_t * restrict q8 = y[i].qs;
10768
+ __m128i sumi1_0 = _mm_setzero_si128();
10769
+ __m128i sumi1_1 = _mm_setzero_si128();
10770
+ __m128i sumi2_0 = _mm_setzero_si128();
10771
+ __m128i sumi2_1 = _mm_setzero_si128();
10772
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
10773
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10774
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10775
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10776
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
10777
+ const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
10778
+ const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
10779
+ const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
10780
+ idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
10781
+ idx.vec[1] = idx.vec[0];
10782
+ idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
10783
+ idx.vec[3] = idx.vec[2];
10784
+
10785
+ idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
10786
+ idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
10787
+ idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
10788
+ idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
10789
+
10790
+ idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
10791
+ idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
10792
+ idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
10793
+ idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
10794
+
10795
+ const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
10796
+ const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
10797
+ const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
10798
+ const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
10799
+
10800
+ __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
10801
+ __m128i aux128_1 = aux128_0;
10802
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
10803
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
10804
+ const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
10805
+ const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
10806
+ const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
10807
+ const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
10808
+
10809
+ aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
10810
+ aux128_1 = aux128_0;
10811
+ aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
10812
+ aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
10813
+ const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
10814
+ const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
10815
+ const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
10816
+ const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
10817
+
10818
+ signs += 4;
10819
+
10820
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
10821
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
10822
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
10823
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
10824
+ const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
10825
+ const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
10826
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
10827
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
10828
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
10829
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
10830
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
10831
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
10832
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
10833
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
10834
+ }
10835
+
10836
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
10837
+
10838
+ }
10839
+
10840
+ *s = hsum_float_8(accumf);
10841
+
10424
10842
  #elif defined(__POWER9_VECTOR__)
10425
10843
  static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
10426
10844
  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
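
The idx_mul_0/idx_mul_1 constants in the iq3_s AVX path work around the lack of per-lane variable shifts in SSE: multiplying the broadcast qh byte by a descending power of two moves a different qh bit into bit 8 of each 32-bit lane, and the 0x100 (idx_mask's 256) mask isolates it before it is OR-ed with the low 8 index bits from qs. Scalar equivalent for one lane, where j is the element's position within the qh byte (illustrative):

    #include <stdint.h>

    /* Full 9-bit iq3s_grid index for element j of a 32-value block:
     * low 8 bits from qs[j], 9th bit is bit j of the qh byte. The SSE
     * code computes ((qh >> j) & 1) << 8 as (qh * (256 >> j)) & 0x100. */
    static uint32_t iq3s_index_model(uint8_t qs_j, uint8_t qh, int j) {
        return (uint32_t)qs_j | ((uint32_t)(qh * (256u >> j)) & 0x100u);
    }
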
@@ -10428,6 +10846,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10428
10846
 
10429
10847
  static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
10430
10848
 
10849
+ const vector int v0 = vec_splats((int32_t)0);
10850
+
10431
10851
  vector float vsumf0 = vec_splats(0.0f);
10432
10852
  vector float vsumf1 = vec_splats(0.0f);
10433
10853
  vector float vsumf2 = vec_splats(0.0f);
@@ -10448,14 +10868,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10448
10868
  const uint8_t * restrict sc = x[i].scales;
10449
10869
  const int8_t * restrict q8 = y[i].qs;
10450
10870
 
10451
- vector signed int vsumi0 = vec_splats((int32_t)0);
10452
- vector signed int vsumi1 = vec_splats((int32_t)0);
10453
- vector signed int vsumi2 = vec_splats((int32_t)0);
10454
- vector signed int vsumi3 = vec_splats((int32_t)0);
10455
- vector signed int vsumi4 = vec_splats((int32_t)0);
10456
- vector signed int vsumi5 = vec_splats((int32_t)0);
10457
- vector signed int vsumi6 = vec_splats((int32_t)0);
10458
- vector signed int vsumi7 = vec_splats((int32_t)0);
10871
+ vector signed int vsumi0 = v0;
10872
+ vector signed int vsumi1 = v0;
10873
+ vector signed int vsumi2 = v0;
10874
+ vector signed int vsumi3 = v0;
10459
10875
 
10460
10876
  for (int j = 0; j < QK_K/32; j += 2) {
10461
10877
  __builtin_prefetch(q3, 0, 1);
@@ -10509,21 +10925,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10509
10925
  vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
10510
10926
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10511
10927
 
10512
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10513
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10514
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10515
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10516
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10517
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10518
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10519
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
10928
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
10929
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
10930
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
10931
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
10520
10932
  }
10521
10933
 
10522
- vsumi0 = vec_add(vsumi0, vsumi4);
10523
- vsumi1 = vec_add(vsumi1, vsumi5);
10524
- vsumi2 = vec_add(vsumi2, vsumi6);
10525
- vsumi3 = vec_add(vsumi3, vsumi7);
10526
-
10527
10934
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10528
10935
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10529
10936
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10669,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
10669
11076
  }
10670
11077
 
10671
11078
 
11079
+ #if defined(__AVX__)
11080
+ static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
11081
+ const __m128i ax = _mm_sign_epi8(x, x);
11082
+ const __m128i sy = _mm_sign_epi8(y, x);
11083
+ return _mm_maddubs_epi16(ax, sy);
11084
+ }
11085
+ #endif
11086
+
10672
11087
  #if defined(__AVX2__)
10673
11088
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
10674
11089
  const __m256i ax = _mm256_sign_epi8(x, x);
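
_mm_maddubs_epi16 requires an unsigned first operand, so the new mul_add_epi8_sse helper mirrors the AVX2 mul_add_epi8 below it: it feeds |x| and a copy of y carrying x's sign, and |x| * sign(x)*y == x*y byte-wise (when x is 0 the sign-adjusted y is also zeroed, so the product is still 0). A scalar model of one output lane, assuming the products stay small enough to avoid maddubs' int16 saturation, which holds for these quant grids:

    #include <stdint.h>

    /* One int16 lane of mul_add_epi8_sse: two adjacent byte products
     * summed, computed via |x| and sign-adjusted y. */
    static int16_t mul_add_model(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
        const int ax0 = x0 < 0 ? -x0 : x0, ax1 = x1 < 0 ? -x1 : x1;
        const int sy0 = x0 < 0 ? -y0 : (x0 == 0 ? 0 : y0);
        const int sy1 = x1 < 0 ? -y1 : (x1 == 0 ? 0 : y1);
        return (int16_t)(ax0 * sy0 + ax1 * sy1);  /* == x0*y0 + x1*y1 */
    }
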
@@ -10786,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10786
11201
 
10787
11202
  *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
10788
11203
 
11204
+ #elif defined __AVX__
11205
+ __m256 accum = _mm256_setzero_ps();
11206
+ float accum1 = 0;
11207
+ for (int i = 0; i < nb; ++i) {
11208
+
11209
+ const int8_t * q8 = y[i].qs;
11210
+ const uint8_t * qs = x[i].qs;
11211
+ const uint16_t * qh = x[i].qh;
11212
+
11213
+ __m128i sumi1_0 = _mm_setzero_si128();
11214
+ __m128i sumi1_1 = _mm_setzero_si128();
11215
+ int sumi1 = 0;
11216
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
11217
+ const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
11218
+ const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
11219
+ const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
11220
+ const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
11221
+ qs += 8;
11222
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11223
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11224
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11225
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11226
+
11227
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
11228
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
11229
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
11230
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
11231
+ const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
11232
+ const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
11233
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
11234
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
11235
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
11236
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
11237
+
11238
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
11239
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
11240
+ sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
11241
+ + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
11242
+ }
11243
+
11244
+ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
11245
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
11246
+ accum1 += d * sumi1;
11247
+
11248
+ }
11249
+
11250
+ *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
11251
+
10789
11252
  #elif defined(__POWER9_VECTOR__)
10790
11253
  const vector unsigned char v0 = vec_splats((unsigned char)0x0);
10791
11254
  const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
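
The scalar sumi1 term in the iq1_s AVX path exists because each IQ1_S block really encodes values g[j] ± IQ1S_DELTA, and the delta part of the dot product factors out as ±delta * sum(q8) — a sum the q8_K block already carries in bsums — so the vector loop only has to handle the grid values. A small illustrative check of that factoring (not ggml code; 0.125f stands in for IQ1S_DELTA and the identity is exact for small integer inputs):

    #include <assert.h>

    /* sum((g[j] + d) * q8[j]) == sum(g[j]*q8[j]) + d * sum(q8[j]) */
    static void delta_factoring_check(const int g[4], const int q8[4]) {
        const float d = 0.125f;
        float lhs = 0.0f, dot = 0.0f;
        int bsum = 0;
        for (int j = 0; j < 4; ++j) {
            lhs  += (g[j] + d) * q8[j];
            dot  += (float)(g[j] * q8[j]);
            bsum += q8[j];
        }
        assert(lhs == dot + d * bsum);
    }
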
@@ -10804,10 +11267,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10804
11267
  vector signed int vsumi1 = vec_splats((int32_t)0);
10805
11268
  vector signed int vsumi2 = vec_splats((int32_t)0);
10806
11269
  vector signed int vsumi3 = vec_splats((int32_t)0);
10807
- vector signed int vsumi4 = vec_splats((int32_t)0);
10808
- vector signed int vsumi5 = vec_splats((int32_t)0);
10809
- vector signed int vsumi6 = vec_splats((int32_t)0);
10810
- vector signed int vsumi7 = vec_splats((int32_t)0);
10811
11270
  vector signed int vsumi8 = vec_splats((int32_t)0);
10812
11271
 
10813
11272
  const uint8_t * restrict q1 = x[i].qs;
@@ -10849,14 +11308,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10849
11308
  vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
10850
11309
  vector signed short vscales = vec_sld(vscales23, vscales01, 8);
10851
11310
 
10852
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
10853
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
10854
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
10855
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
10856
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
10857
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
10858
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
10859
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
11311
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
11312
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
11313
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
11314
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
10860
11315
 
10861
11316
  vector signed short q8ysums = vec_xl_len(qs, 8);
10862
11317
  qs += 4;
@@ -10871,11 +11326,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
10871
11326
  vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
10872
11327
  }
10873
11328
 
10874
- vsumi0 = vec_add(vsumi0, vsumi4);
10875
- vsumi1 = vec_add(vsumi1, vsumi5);
10876
- vsumi2 = vec_add(vsumi2, vsumi6);
10877
- vsumi3 = vec_add(vsumi3, vsumi7);
10878
-
10879
11329
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
10880
11330
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
10881
11331
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -11137,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
11137
11587
 
11138
11588
  *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
11139
11589
 
11590
+ #elif defined __AVX__
11591
+ const __m128i mask = _mm_set1_epi16(0x7);
11592
+ const __m128i mone = _mm_set1_epi16(1);
11593
+
11594
+ __m256 accum1 = _mm256_setzero_ps();
11595
+ __m256 accum2 = _mm256_setzero_ps();
11596
+ for (int i = 0; i < nb; ++i) {
11597
+
11598
+ const int8_t * q8 = y[i].qs;
11599
+ const uint8_t * qs = x[i].qs;
11600
+ const uint8_t * qh = x[i].qh;
11601
+ const uint16_t * sc = (const uint16_t *)x[i].scales;
11602
+
11603
+ scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
11604
+
11605
+ __m128i sumi1_0 = _mm_setzero_si128();
11606
+ __m128i sumi1_1 = _mm_setzero_si128();
11607
+ __m128i sumi2_0 = _mm_setzero_si128();
11608
+ __m128i sumi2_1 = _mm_setzero_si128();
11609
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
11610
+ const __m128i q1b_1_0 = _mm_set_epi64x(
11611
+ iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
11612
+ const __m128i q1b_1_1 = _mm_set_epi64x(
11613
+ iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
11614
+ const __m128i q1b_2_0 = _mm_set_epi64x(
11615
+ iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
11616
+ const __m128i q1b_2_1 = _mm_set_epi64x(
11617
+ iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
11618
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11619
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11620
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11621
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
11622
+
11623
+ const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
11624
+ const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
11625
+ const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
11626
+ const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
11627
+
11628
+ const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
11629
+ qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11630
+ const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
11631
+ qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11632
+ const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
11633
+ qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11634
+ const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
11635
+ qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
11636
+
11637
+ const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
11638
+ const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
11639
+ const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
11640
+ const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
11641
+
11642
+ __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
11643
+ __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
11644
+ __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
11645
+ __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
11646
+
11647
+ scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
11648
+ scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
11649
+ scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
11650
+ scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
11651
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
11652
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
11653
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
11654
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
11655
+ const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
11656
+ const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
11657
+ const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
11658
+ const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
11659
+
11660
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
11661
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
11662
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
11663
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
11664
+
11665
+ qs += 8; qh += 4;
11666
+ }
11667
+
11668
+ const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
11669
+
11670
+ accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
11671
+ accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
11672
+ }
11673
+
11674
+ *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
11675
+
11140
11676
  #else
11141
11677
 
11142
11678
  int sum1[2], sum2[2], delta[4];
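
IQ1_M stores a per-half-block sign for its delta in qh (bits 0x08 and 0x80), and the AVX block above materialises it as byte vectors of 0x01 or 0xFF so the same mul_add_epi8_sse helper computes sum(±q8); those sums feed the separate accum2, which is scaled by IQ1M_DELTA only at the very end. Scalar view of one half-block's delta contribution (illustrative):

    #include <stdint.h>

    /* accum2 contribution of one 8-value half-block:
     * ls * sign * sum(q8), with sign taken from a single qh bit. */
    static int delta_dot_model(int qh_bit_set, int ls, const int8_t q8[8]) {
        const int sign = qh_bit_set ? -1 : 1;
        int sum = 0;
        for (int j = 0; j < 8; ++j) sum += q8[j];
        return ls * sign * sum;
    }
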
@@ -11267,8 +11803,47 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11267
11803
 
11268
11804
  *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
11269
11805
 
11806
+ #elif defined __AVX__
11807
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
11808
+ const __m128i m4b = _mm_set1_epi8(0x0f);
11809
+ const __m128i mone = _mm_set1_epi16(1);
11810
+
11811
+ __m256 accum1 = _mm256_setzero_ps();
11812
+ __m256 accum2 = _mm256_setzero_ps();
11813
+ for (int ib = 0; ib < nb; ib += 2) {
11814
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
11815
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
11816
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
11817
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
11818
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
11819
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
11820
+
11821
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
11822
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
11823
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
11824
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
11825
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
11826
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
11827
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
11828
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
11829
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
11830
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
11831
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
11832
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
11833
+ accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
11834
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
11835
+ accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
11836
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
11837
+
11838
+ y += 2;
11839
+ x += 2;
11840
+ }
11841
+
11842
+ *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
11843
+
11270
11844
  #elif defined(__POWER9_VECTOR__)
11271
11845
  const vector signed char lowMask = vec_splats((signed char)0xF);
11846
+ const vector signed int v0 = vec_splats((int32_t)0);
11272
11847
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11273
11848
 
11274
11849
  vector float vsumf0 = vec_splats(0.0f);
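
The values128 shuffle in the iq4_nl AVX path is a 16-way parallel table lookup: _mm_shuffle_epi8 indexes the 16-entry kvalues_iq4nl codebook with each masked nibble, turning dequantisation into one instruction per 16 values. Scalar equivalent, with the table contents copied from the IQ4_NL levels in ggml-common.h (names here are illustrative):

    #include <stdint.h>

    static const int8_t kvalues_iq4nl_model[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10,
           1,   13,  25,  38,  53,  69,  89, 113,
    };

    /* One byte of x[].qs holds two codes; hi selects the nibble. */
    static int8_t iq4nl_dequant_model(uint8_t byte, int hi) {
        return kvalues_iq4nl_model[hi ? (byte >> 4) : (byte & 0x0f)];
    }
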
@@ -11299,8 +11874,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
11299
11874
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
11300
11875
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
11301
11876
 
11302
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
11303
- vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
11877
+ vector signed int vsumi0 = v0;
11878
+ vector signed int vsumi1 = v0;
11879
+
11880
+ vsumi0 = vec_sum4s(qv0, vsumi0);
11881
+ vsumi1 = vec_sum4s(qv1, vsumi1);
11304
11882
 
11305
11883
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11306
11884
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
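
On POWER9 the old unpackh/unpackl/add sequence is replaced by vec_sum4s, which for halfword inputs adds adjacent int16 pairs into the int32 accumulator directly; seeding it with the hoisted v0 zero vector yields the same totals in fewer instructions. A scalar model, ignoring the instruction's saturating behaviour, which these magnitudes do not reach (illustrative only):

    #include <stdint.h>

    /* acc[i] += a[2i] + a[2i+1] -- vec_sum4s over eight int16 lanes. */
    static void vec_sum4s_model(const int16_t a[8], int32_t acc[4]) {
        for (int i = 0; i < 4; ++i) {
            acc[i] += (int32_t)a[2*i + 0] + a[2*i + 1];
        }
    }
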
@@ -11453,8 +12031,57 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11453
12031
 
11454
12032
  *s = hsum_float_8(accum);
11455
12033
 
12034
+ #elif defined __AVX__
12035
+ const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
12036
+ const __m128i m4b = _mm_set1_epi8(0x0f);
12037
+
12038
+ __m256 accum = _mm256_setzero_ps();
12039
+ for (int ibl = 0; ibl < nb; ++ibl) {
12040
+ const uint8_t * qs = x[ibl].qs;
12041
+ const int8_t * q8 = y[ibl].qs;
12042
+ uint16_t sh = x[ibl].scales_h;
12043
+ __m128i sumi1_0 = _mm_setzero_si128();
12044
+ __m128i sumi1_1 = _mm_setzero_si128();
12045
+ __m128i sumi2_0 = _mm_setzero_si128();
12046
+ __m128i sumi2_1 = _mm_setzero_si128();
12047
+ for (int ib = 0; ib < QK_K/32; ib += 2) {
12048
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
12049
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
12050
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
12051
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
12052
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
12053
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
12054
+ const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
12055
+ const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
12056
+ const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
12057
+ const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
12058
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
12059
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
12060
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
12061
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
12062
+ const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
12063
+ const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
12064
+ sh >>= 4;
12065
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
12066
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
12067
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
12068
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
12069
+ sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
12070
+ sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
12071
+ sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
12072
+ sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
12073
+ }
12074
+ __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
12075
+ __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
12076
+ accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
12077
+ _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
12078
+ }
12079
+
12080
+ *s = hsum_float_8(accum);
12081
+
11456
12082
  #elif defined(__POWER9_VECTOR__)
11457
12083
  const vector signed char lowMask = vec_splats((signed char)0xF);
12084
+ const vector int v0 = vec_splats((int32_t)0);
11458
12085
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
11459
12086
 
11460
12087
  vector float vsumf0 = vec_splats(0.0f);
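
The sh >>= 4 walk in the iq4_xs AVX loop reassembles IQ4_XS's 6-bit block scales, whose 4 low bits live in the scales_l nibbles and 2 high bits in scales_h, then re-centres them by -32. A direct scalar decode of block ib, assuming the field layout shown in the loop above (illustrative):

    #include <stdint.h>

    /* 6-bit scale of block ib: low nibble from scales_l[ib/2], two
     * high bits from scales_h at bit position 2*ib, then minus 32. */
    static int iq4xs_scale_model(const uint8_t * scales_l,
                                 uint16_t scales_h, int ib) {
        const int lo = (ib & 1) ? (scales_l[ib/2] >> 4)
                                : (scales_l[ib/2] & 0x0f);
        const int hi = (scales_h >> (2*ib)) & 0x03;
        return (lo | (hi << 4)) - 32;
    }
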
@@ -11470,14 +12097,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11470
12097
  vector float vyd = vec_splats(y[ibl].d);
11471
12098
  vector float vd = vec_mul(vxd, vyd);
11472
12099
 
11473
- vector signed int vsumi0 = vec_splats((int32_t)0);
11474
- vector signed int vsumi1 = vec_splats((int32_t)0);
11475
- vector signed int vsumi2 = vec_splats((int32_t)0);
11476
- vector signed int vsumi3 = vec_splats((int32_t)0);
11477
- vector signed int vsumi4 = vec_splats((int32_t)0);
11478
- vector signed int vsumi5 = vec_splats((int32_t)0);
11479
- vector signed int vsumi6 = vec_splats((int32_t)0);
11480
- vector signed int vsumi7 = vec_splats((int32_t)0);
12100
+ vector signed int vsumi0 = v0;
12101
+ vector signed int vsumi1 = v0;
12102
+ vector signed int vsumi2 = v0;
12103
+ vector signed int vsumi3 = v0;
11481
12104
 
11482
12105
  uint16_t h = x[ibl].scales_h;
11483
12106
 
@@ -11522,21 +12145,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
11522
12145
  vector signed short vscales01 = vec_splats((int16_t)ls0);
11523
12146
  vector signed short vscales23 = vec_splats((int16_t)ls1);
11524
12147
 
11525
- vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
11526
- vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
11527
- vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
11528
- vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
11529
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
11530
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
11531
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
11532
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
12148
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
12149
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
12150
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
12151
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
11533
12152
  }
11534
12153
 
11535
- vsumi0 = vec_add(vsumi0, vsumi4);
11536
- vsumi1 = vec_add(vsumi1, vsumi5);
11537
- vsumi2 = vec_add(vsumi2, vsumi6);
11538
- vsumi3 = vec_add(vsumi3, vsumi7);
11539
-
11540
12154
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
11541
12155
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
11542
12156
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -13139,7 +13753,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
13139
13753
  const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
13140
13754
  int num_neighbors = neighbours[0];
13141
13755
  GGML_ASSERT(num_neighbors > 0);
13142
- float best_score = 0;
13756
+ float best_score = -FLT_MAX;
13143
13757
  int grid_index = -1;
13144
13758
  for (int j = 1; j <= num_neighbors; ++j) {
13145
13759
  const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
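
The -FLT_MAX change matters because the neighbour scores can legitimately all be negative: seeding the running maximum with 0 means no candidate ever wins and grid_index is left at -1 for the caller. (The two quantize_row hunks below use -FLT_MIN instead — the negated smallest positive normal float, a value just under zero rather than the most negative float.) Minimal sketch of the argmax pattern, not ggml code:

    #include <float.h>

    /* The seed must be the identity for max over floats; seeding
     * with 0 silently rejects all-negative inputs and returns -1. */
    static int argmax_model(const float * score, int n) {
        float best = -FLT_MAX;
        int best_i = -1;
        for (int i = 0; i < n; ++i) {
            if (score[i] > best) { best = score[i]; best_i = i; }
        }
        return best_i;
    }
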
@@ -13337,7 +13951,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
13337
13951
  sumw[j+1] = sumw[j] + weight[i];
13338
13952
  }
13339
13953
  }
13340
- float best_score = 0, scale = max;
13954
+ float best_score = -FLT_MIN, scale = max;
13341
13955
  int besti1 = -1, besti2 = -1, best_shift = 0;
13342
13956
  for (int i1 = 0; i1 <= block_size; ++i1) {
13343
13957
  for (int i2 = i1; i2 <= block_size; ++i2) {
@@ -13513,7 +14127,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
13513
14127
  idx[2*j] = j;
13514
14128
  }
13515
14129
  qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
13516
- float best_score = 0, scale = max;
14130
+ float best_score = -FLT_MIN, scale = max;
13517
14131
  int besti1 = -1, besti2 = -1, best_k = -1;
13518
14132
  // 0: +, +
13519
14133
  // 1: +, -