llama_cpp 0.16.1 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
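The hunks below appear to come from data/vendor/tmp/llama.cpp/ggml-quants.c, the largest change in this release. Two patterns dominate: the POWER9 dot-product kernels drop the old vec_mule/vec_mulo + vec_unpackh/vec_unpackl accumulation in favour of vec_msum into accumulators initialised from a shared zero vector v0, and new __AVX__ fallbacks are added for the iq2 formats. A minimal sketch of the accumulation change (hypothetical helper names, assuming <altivec.h> on a POWER9/VSX target; not the exact kernel code):

#include <altivec.h>

// Old shape: widen the byte products to 16-bit halves, unpack them to
// 32-bit lanes, then add into the accumulator.
static inline vector signed int dot_old(vector signed char q8,
                                        vector signed char qx,
                                        vector signed int acc) {
    vector signed short qv = vec_add(vec_mule(qx, q8), vec_mulo(qx, q8));
    return vec_add(vec_add(vec_unpackh(qv), vec_unpackl(qv)), acc);
}

// New shape: vec_msum multiply-sums four byte products per 32-bit lane
// directly into the accumulator, which is the pattern the updated kernels use.
static inline vector signed int dot_new(vector signed char q8,
                                        vector unsigned char qx,
                                        vector signed int acc) {
    return vec_msum(q8, qx, acc);
}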
@@ -4,8 +4,6 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"

-#define GGML_COMMON_IMPL_C
-#include "ggml-common.h"

 #include <math.h>
 #include <string.h>
@@ -1078,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
 }
 vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
 vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+}

 #elif defined(__loongarch_asx)
 for (int i = 0; i < nb; i++) {
@@ -1437,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
 accv = vec_add(accv, vec_sld(accv, accv, 4));
 accv = vec_add(accv, vec_sld(accv, accv, 8));
 y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+}

 #elif defined(__loongarch_asx)
 for (int i = 0; i < nb; i++) {
@@ -4113,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector signed int v0 = vec_splats((int32_t)0);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);
 const vector signed char v8 = vec_splats((signed char)0x8);

 vector float vsumf0 = vec_splats(0.0f);

-#pragma GCC unroll
+#pragma GCC unroll 8
 for (int i = 0; i < nb; i++) {
 __builtin_prefetch(x[i].qs, 0, 1);
 __builtin_prefetch(y[i].qs, 0, 1);
@@ -4140,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
 vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

-
+vector signed int vsumi0 = v0;

-
+vsumi0 = vec_sum4s(qv0, vsumi0);
+vsumi0 = vec_sum4s(qv1, vsumi0);

 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 }
@@ -4516,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector signed int v0 = vec_splats((int32_t)0);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);

 vector float vsumf0 = vec_splats(0.0f);
@@ -4537,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 vector signed char q8y0 = vec_xl( 0, y[i].qs);
 vector signed char q8y1 = vec_xl(16, y[i].qs);

-vector
-vector
+vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);

-vector signed
-vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+vector signed int vsumi0 = v0;

-
-
-vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+vsumi0 = vec_msum(q8y1, q4x1, vsumi0);

 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 }
@@ -5247,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector signed int v0 = vec_splats((int32_t)0);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);

 vector float vsumf0 = vec_splats(0.0f);
@@ -5272,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

 vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);

-vector
-vector
+vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);

 vector signed char q8y0 = vec_xl( 0, y[i].qs);
 vector signed char q8y1 = vec_xl( 16, y[i].qs);

-vector signed
-vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+vector signed int vsumi0 = v0;

-
-
-vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+vsumi0 = vec_msum(q8y1, q5x1, vsumi0);

 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 }
@@ -5523,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 *s = sumf;

 #elif defined(__POWER9_VECTOR__)
+const vector signed int v0 = vec_splats((int32_t)0);
 vector float vsumf0 = vec_splats(0.0f);

-#pragma GCC unroll
+#pragma GCC unroll 8
 for (int i = 0; i < nb; i++) {
 __builtin_prefetch(x[i].qs, 0, 1);
 __builtin_prefetch(y[i].qs, 0, 1);
@@ -5544,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 vector signed short qv2 = vec_mule(q8x1, q8y1);
 vector signed short qv3 = vec_mulo(q8x1, q8y1);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
-vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;

-vsumi0 =
-vsumi1 =
+vsumi0 = vec_sum4s(qv0, vsumi0);
+vsumi1 = vec_sum4s(qv1, vsumi1);
+vsumi0 = vec_sum4s(qv2, vsumi0);
+vsumi1 = vec_sum4s(qv3, vsumi1);

 vsumi0 = vec_add(vsumi0, vsumi1);

@@ -5938,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0x3);
 const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+const vector int v0 = vec_splats((int32_t)0);
 const vector unsigned char v2 = vec_splats((unsigned char)0x2);
 const vector unsigned char v6 = vec_splats((unsigned char)0x6);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -5975,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
 vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 =
-vector signed int vsumi5 =
-vector signed int vsumi6 =
-vector signed int vsumi7 =
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;
+vector signed int vsumi4 = v0;
+vector signed int vsumi5 = v0;
+vector signed int vsumi6 = v0;
+vector signed int vsumi7 = v0;

+const uint8_t * restrict q2 = x[i].qs;
+const int8_t * restrict q8 = y[i].qs;

 for (int j = 0; j < QK_K/128; ++j) {
 __builtin_prefetch(q2, 0, 1);
@@ -5993,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
 q2 += 32;

-vector
-vector
-vector
-vector
-vector
-vector
-vector
-vector
+vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);

 vector signed char q8y00 = vec_xl( 0, q8);
 vector signed char q8y10 = vec_xl( 16, q8);
@@ -6012,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char q8y13 = vec_xl(112, q8);
 q8 += 128;

-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-
-vector signed short
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
+vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+vector signed short vscales_07 = vec_unpackh(vscales);
+vector signed int vscales_03 = vec_unpackh(vscales_07);
+vector signed int vscales_47 = vec_unpackl(vscales_07);
+vector signed int vs0 = vec_splat(vscales_03, 0);
+vector signed int vs1 = vec_splat(vscales_03, 1);
+vector signed int vs2 = vec_splat(vscales_03, 2);
+vector signed int vs3 = vec_splat(vscales_03, 3);
+vector signed int vs4 = vec_splat(vscales_47, 0);
+vector signed int vs5 = vec_splat(vscales_47, 1);
+vector signed int vs6 = vec_splat(vscales_47, 2);
+vector signed int vs7 = vec_splat(vscales_47, 3);
 vscales = vec_sld(vscales, vscales, 8);

-
-
-
-
-
-
-
-
-qv3 = vec_madd(qv7, vs7, qv3);
-
-vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
-vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
-vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
-vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
-
-vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
-vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
-vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
-vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
 }

 vsumi0 = vec_add(vsumi0, vsumi4);
@@ -6641,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0x3);
+const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+const vector int v0 = vec_splats((int32_t)0);
 const vector signed char v1 = vec_splats((signed char)0x1);
 const vector unsigned char v2 = vec_splats((unsigned char)0x2);
 const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -6658,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-
-
+UNUSED(kmask1);
+UNUSED(kmask2);

-
-
-
-
-
+vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+vector signed char u1 = vec_and(u0, lowMask1);
+vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+vector signed char u31 = vec_and(u3, lowMask2);

-
+u1 = vec_or(u1, u30);
+u2 = vec_or(vec_sr(u0, v4), u31);
+
+vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
 vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
 vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);

 vscales = vec_sub(vscales, off);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 =
-vector signed int vsumi5 =
-vector signed int vsumi6 =
-vector signed int vsumi7 =
-
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;
+vector signed int vsumi4 = v0;
+vector signed int vsumi5 = v0;
+vector signed int vsumi6 = v0;
+vector signed int vsumi7 = v0;

 const uint8_t * restrict q3 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
@@ -6755,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
 vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));

-
-
-
-
-
-
-
-
-
-vsumi0 = vec_add(vsum0, vsumi0);
-vsumi1 = vec_add(vsum1, vsumi1);
-vsumi2 = vec_add(vsum2, vsumi2);
-vsumi3 = vec_add(vsum3, vsumi3);
-vsumi4 = vec_add(vsum4, vsumi4);
-vsumi5 = vec_add(vsum5, vsumi5);
-vsumi6 = vec_add(vsum6, vsumi6);
-vsumi7 = vec_add(vsum7, vsumi7);
+vsumi0 = vec_msum(qv00, vs0, vsumi0);
+vsumi1 = vec_msum(qv01, vs2, vsumi1);
+vsumi2 = vec_msum(qv02, vs4, vsumi2);
+vsumi3 = vec_msum(qv03, vs6, vsumi3);
+vsumi4 = vec_msum(qv10, vs1, vsumi4);
+vsumi5 = vec_msum(qv11, vs3, vsumi5);
+vsumi6 = vec_msum(qv12, vs5, vsumi6);
+vsumi7 = vec_msum(qv13, vs7, vsumi7);
 }

 vsumi0 = vec_add(vsumi0, vsumi4);
@@ -7270,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+const vector int v0 = vec_splats((int32_t)0);
+const vector unsigned char v2 = vec_splats((uint8_t)2);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);

 vector float vsumf0 = vec_splats(0.0f);
@@ -7288,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
 vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

-
+UNUSED(kmask1);
+UNUSED(kmask2);
+UNUSED(kmask3);
+UNUSED(utmp);

-
-
-
-
-
+vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+vector signed char u3 = vec_sr(u2, v4);
+
+vector signed char u30 = u1;
+vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+u1 = vec_and(u0, lowMask1);
+u2 = vec_or(u30, u31);
+
+vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

-vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
 vector signed short vscales = vec_unpackh(utmps);
 vector signed short q4xmins = vec_unpackl(utmps);
 vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
@@ -7312,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
 vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 = vec_splats((int32_t)0);
-vector signed int vsumi5 = vec_splats((int32_t)0);
-vector signed int vsumi6 = vec_splats((int32_t)0);
-vector signed int vsumi7 = vec_splats((int32_t)0);
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;

 const uint8_t * restrict q4 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
@@ -7334,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
 q4 += 64;

-vector
-vector
-vector
-vector
-vector
-vector
-vector
-vector
+vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);

 vector signed char q8y00 = vec_xl( 0, q8);
 vector signed char q8y10 = vec_xl( 16, q8);
@@ -7353,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char q8y31 = vec_xl(112, q8);
 q8 += 128;

-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-vector signed
-
-vector signed
-vector signed
-vector signed
-vector signed
+vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+vector signed int vscales_h = vec_unpackh(vscales);
+vector signed int vs0 = vec_splat(vscales_h, 0);
+vector signed int vs1 = vec_splat(vscales_h, 1);
+vector signed int vs2 = vec_splat(vscales_h, 2);
+vector signed int vs3 = vec_splat(vscales_h, 3);
 vscales = vec_sld(vscales, vscales, 8);

-
-
-
-
+vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);

-vsumi0 = vec_add(
-vsumi1 = vec_add(
-vsumi2 = vec_add(
-vsumi3 = vec_add(
-vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
-vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
-vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
-vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
 }

-vsumi0 = vec_add(vsumi0, vsumi4);
-vsumi1 = vec_add(vsumi1, vsumi5);
-vsumi2 = vec_add(vsumi2, vsumi6);
-vsumi3 = vec_add(vsumi3, vsumi7);
-
 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
 vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -7889,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+const vector int v0 = vec_splats((int32_t)0);
 const vector unsigned char v1 = vec_splats((unsigned char)0x1);
 const vector unsigned char v2 = vec_splats((unsigned char)0x2);
 const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -7907,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
 vector float vdmin = vec_mul(vxmin, vyd);

-
+UNUSED(kmask1);
+UNUSED(kmask2);
+UNUSED(kmask3);
+UNUSED(utmp);

-
-
-
-
-
+vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+vector signed char u3 = vec_sr(u2, v4);
+
+vector signed char u30 = u1;
+vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+u1 = vec_and(u0, lowMask1);
+u2 = vec_or(u30, u31);
+
+vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

 vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
 vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

-vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
 vector signed short vscales = vec_unpackh(utmps);

 vector signed short q5xmins = vec_unpackl(utmps);
@@ -7938,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
 vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;

 const uint8_t * restrict q5 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
@@ -7966,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 qxhs0 = vec_sr(qxhs0, v2);
 qxhs1 = vec_sr(qxhs1, v2);

-vector
-vector
-vector
-vector
+vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);

 vector signed char q8y00 = vec_xl( 0, q8);
 vector signed char q8y10 = vec_xl(16, q8);
@@ -7977,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed char q8y11 = vec_xl(48, q8);
 q8 += 64;

-vector signed
-vector signed
-vector signed
-vector signed
+vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+vector signed int qv11 = vec_msum(q8y11, q5x11, v0);

-vector signed
-vector signed
+vector signed int vscales_h = vec_unpackh(vscales);
+vector signed int vs0 = vec_splat(vscales_h, 0);
+vector signed int vs1 = vec_splat(vscales_h, 1);
 vscales = vec_sld(vscales, vscales, 12);

-
-
-
-
-vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
-vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
-vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
 }

 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
@@ -8553,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 #elif defined(__POWER9_VECTOR__)
 const vector signed char lowMask = vec_splats((signed char)0xF);
+const vector int v0 = vec_splats((int32_t)0);
 const vector unsigned char v2 = vec_splats((unsigned char)0x2);
 const vector unsigned char v3 = vec_splats((unsigned char)0x3);
 const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -8569,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 =
-vector signed int vsumi5 =
-vector signed int vsumi6 =
-vector signed int vsumi7 =
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;
+vector signed int vsumi4 = v0;
+vector signed int vsumi5 = v0;
+vector signed int vsumi6 = v0;
+vector signed int vsumi7 = v0;

 const uint8_t * restrict q6 = x[i].ql;
 const uint8_t * restrict qh = x[i].qh;
@@ -8656,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 vector signed short vs6 = vec_splat(vscales, 6);
 vector signed short vs7 = vec_splat(vscales, 7);

-vsumi0 =
-vsumi1 =
-vsumi2 =
-vsumi3 =
-vsumi4 =
-vsumi5 =
-vsumi6 =
-vsumi7 =
-
-vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
-vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
-vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
-vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
-vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
-vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
-vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
-vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+vsumi0 = vec_msum(qv00, vs0, vsumi0);
+vsumi1 = vec_msum(qv01, vs4, vsumi1);
+vsumi2 = vec_msum(qv10, vs1, vsumi2);
+vsumi3 = vec_msum(qv11, vs5, vsumi3);
+vsumi4 = vec_msum(qv20, vs2, vsumi4);
+vsumi5 = vec_msum(qv21, vs6, vsumi5);
+vsumi6 = vec_msum(qv30, vs3, vsumi6);
+vsumi7 = vec_msum(qv31, vs7, vsumi7);
 }

 vsumi0 = vec_add(vsumi0, vsumi4);
@@ -8819,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 #endif
 }

-#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+#if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
 static const int8_t keven_signs_q2xs[1024] = {
 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8952,7 +8947,63 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void

 *s = 0.125f * hsum_float_8(accumf);

+#elif defined(__AVX__)
+const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+uint32_t aux32[4];
+const uint8_t * aux8 = (const uint8_t *)aux32;
+
+__m256 accumf = _mm256_setzero_ps();
+for (int i = 0; i < nb; ++i) {
+const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const uint16_t * restrict q2 = x[i].qs;
+const int8_t * restrict q8 = y[i].qs;
+__m128i sumi1_0 = _mm_setzero_si128();
+__m128i sumi1_1 = _mm_setzero_si128();
+__m128i sumi2_0 = _mm_setzero_si128();
+__m128i sumi2_1 = _mm_setzero_si128();
+for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+const uint16_t ls1 = aux32[1] >> 28;
+const uint16_t ls2 = aux32[3] >> 28;
+const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+}
+
+accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+}
+
+*s = 0.125f * hsum_float_8(accumf);
+
 #elif defined(__POWER9_VECTOR__)
+const vector int v0 = vec_splats((int32_t)0);
 vector float vsumf0 = vec_splats(0.0f);
 vector float vsumf1 = vec_splats(0.0f);
 vector float vsumf2 = vec_splats(0.0f);
@@ -8965,14 +9016,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 = vec_splats((int32_t)0);
-vector signed int vsumi5 = vec_splats((int32_t)0);
-vector signed int vsumi6 = vec_splats((int32_t)0);
-vector signed int vsumi7 = vec_splats((int32_t)0);
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;

 const uint16_t * restrict q2 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;
@@ -9019,21 +9066,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
 vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));

-vsumi0 =
-vsumi1 =
-vsumi2 =
-vsumi3 =
-vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
-vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
-vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
-vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+vsumi3 = vec_msum(qv3, vscales23, vsumi3);
 }

-vsumi0 = vec_add(vsumi0, vsumi4);
-vsumi1 = vec_add(vsumi1, vsumi5);
-vsumi2 = vec_add(vsumi2, vsumi6);
-vsumi3 = vec_add(vsumi3, vsumi7);
-
 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
 vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9307,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 }

 *s = 0.125f * hsum_float_8(accumf);
+
+#elif defined(__AVX__)
+const __m128i mone = _mm_set1_epi8(1);
+static const char block_sign_shuffle_mask_1[32] = {
+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+};
+static const char block_sign_shuffle_mask_2[32] = {
+0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+};
+static const uint8_t bit_selector_mask_bytes[32] = {
+0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+};
+
+const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+static const uint8_t k_bit_helper[32] = {
+0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+};
+const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+const __m128i m511 = _mm_set1_epi16(511);
+const __m128i m4 = _mm_set1_epi8(0xf);
+const __m128i m1 = _mm_set1_epi8(1);
+
+uint64_t aux64;
+
+// somewhat hacky, but gives a significant boost in performance
+__m256i aux_gindex;
+const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+__m256 accumf = _mm256_setzero_ps();
+for (int i = 0; i < nb; ++i) {
+const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+const uint16_t * restrict q2 = x[i].qs;
+const int8_t * restrict q8 = y[i].qs;
+
+memcpy(&aux64, x[i].scales, 8);
+__m128i stmp = _mm_set1_epi64x(aux64);
+stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+__m128i sumi1_0 = _mm_setzero_si128();
+__m128i sumi1_1 = _mm_setzero_si128();
+__m128i sumi2_0 = _mm_setzero_si128();
+__m128i sumi2_1 = _mm_setzero_si128();
+for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
+aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+// AVX2 full_signs_1 is full_sign_bits_0 here
+// AVX2 full_signs_2 is full_sign_bits_1 here
+__m128i signs_0, signs_1;
+signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+__m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+}
+
+accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+}
+
+*s = 0.125f * hsum_float_8(accumf);
+
 #elif defined(__loongarch_asx)

 const __m256i mone = __lasx_xvreplgr2vr_b(1);
@@ -9425,6 +9622,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *

 *s = 0.125f * hsum_float_8(accumf);
 #elif defined(__POWER9_VECTOR__)
+const vector int v0 = vec_splats((int32_t)0);
 vector float vsumf0 = vec_splats(0.0f);
 vector float vsumf1 = vec_splats(0.0f);
 vector float vsumf2 = vec_splats(0.0f);
@@ -9437,14 +9635,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 vector float vyd = vec_splats(y[i].d);
 vector float vd = vec_mul(vxd, vyd);

-vector signed int vsumi0 =
-vector signed int vsumi1 =
-vector signed int vsumi2 =
-vector signed int vsumi3 =
-vector signed int vsumi4 = vec_splats((int32_t)0);
-vector signed int vsumi5 = vec_splats((int32_t)0);
-vector signed int vsumi6 = vec_splats((int32_t)0);
-vector signed int vsumi7 = vec_splats((int32_t)0);
+vector signed int vsumi0 = v0;
+vector signed int vsumi1 = v0;
+vector signed int vsumi2 = v0;
+vector signed int vsumi3 = v0;

 const uint16_t * restrict q2 = x[i].qs;
 const uint8_t * restrict sc = x[i].scales;
@@ -9492,21 +9686,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
 vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

-vsumi0 =
-vsumi1 =
-vsumi2 =
-vsumi3 =
-vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
-vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
-vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
-vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+vsumi3 = vec_msum(qv3, vscales3, vsumi3);
 }

-vsumi0 = vec_add(vsumi0, vsumi4);
-vsumi1 = vec_add(vsumi1, vsumi5);
-vsumi2 = vec_add(vsumi2, vsumi6);
-vsumi3 = vec_add(vsumi3, vsumi7);
-
 vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
 vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
 vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9722,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9722
9907
|
|
9723
9908
|
*s = 0.125f * hsum_float_8(accumf);
|
9724
9909
|
|
9910
|
+
#elif defined(__AVX__)
|
9911
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9912
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
9913
|
+
};
|
9914
|
+
|
9915
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9916
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
9917
|
+
};
|
9918
|
+
|
9919
|
+
const __m128i m4 = _mm_set1_epi8(0xf);
|
9920
|
+
const __m128i m1 = _mm_set1_epi8(1);
|
9921
|
+
|
9922
|
+
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
|
9923
|
+
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
|
9924
|
+
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
|
9925
|
+
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
|
9926
|
+
|
9927
|
+
uint64_t aux64;
|
9928
|
+
|
9929
|
+
__m256 accumf = _mm256_setzero_ps();
|
9930
|
+
for (int i = 0; i < nb; ++i) {
|
9931
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
9932
|
+
const uint8_t * restrict qs = x[i].qs;
|
9933
|
+
const uint8_t * restrict qh = x[i].qh;
|
9934
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
9935
|
+
const int8_t * restrict q8 = y[i].qs;
|
9936
|
+
|
9937
|
+
memcpy(&aux64, x[i].scales, 8);
|
9938
|
+
const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
|
9939
|
+
const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
|
9940
|
+
const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
|
9941
|
+
|
9942
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
9943
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
9944
|
+
__m128i sumi2_0 = _mm_setzero_si128();
|
9945
|
+
__m128i sumi2_1 = _mm_setzero_si128();
|
9946
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
9947
|
+
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
9948
|
+
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
9949
|
+
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
9950
|
+
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
9951
|
+
const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
|
9952
|
+
iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
|
9953
|
+
const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
|
9954
|
+
iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
|
9955
|
+
const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
|
9956
|
+
iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
|
9957
|
+
const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
|
9958
|
+
iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
|
9959
|
+
qs += 8;
|
9960
|
+
|
9961
|
+
__m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
|
9962
|
+
__m128i aux128_1 = aux128_0;
|
9963
|
+
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
9964
|
+
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
9965
|
+
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
9966
|
+
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
9967
|
+
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
|
9968
|
+
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
|
9969
|
+
|
9970
|
+
aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
|
9971
|
+
aux128_1 = aux128_0;
|
9972
|
+
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
9973
|
+
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
9974
|
+
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
9975
|
+
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
9976
|
+
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
|
9977
|
+
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
|
9978
|
+
|
9979
|
+
signs += 4;
|
9980
|
+
|
9981
|
+
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
9982
|
+
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
9983
|
+
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
9984
|
+
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
9985
|
+
|
9986
|
+
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
|
9987
|
+
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
|
9988
|
+
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
|
9989
|
+
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
|
9990
|
+
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
9991
|
+
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
9992
|
+
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
9993
|
+
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
9994
|
+
}
|
9995
|
+
|
9996
|
+
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
9997
|
+
|
9998
|
+
}
|
9999
|
+
|
10000
|
+
*s = 0.125f * hsum_float_8(accumf);
|
10001
|
+
|
9725
10002
|
#elif defined(__POWER9_VECTOR__)
|
9726
10003
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
9727
10004
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
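In the new SSE path above, a 32-bit sign word is spread across 16 bytes: _mm_shuffle_epi8 with k_mask1 broadcasts each source byte to eight lanes, the AND with k_mask2 isolates one bit per lane, and _mm_cmpeq_epi8 turns a set bit into an all-ones byte; (q8 ^ s) - s then negates exactly the selected bytes. A minimal scalar sketch of that conditional negation (illustrative only; one 16-bit sign group per 16 q8 bytes, helper name invented here):

    #include <stdint.h>

    /* Scalar model of the k_mask1/k_mask2/_mm_cmpeq_epi8 trick: bit k of the sign word
     * decides whether q8 byte k is negated, via s = 0x00 or 0xff and (q8 ^ s) - s. */
    static void apply_signs16(const int8_t q8[16], uint16_t sign_bits, int8_t out[16]) {
        for (int k = 0; k < 16; ++k) {
            const int8_t s = ((sign_bits >> k) & 1) ? (int8_t)0xff : 0; /* what _mm_cmpeq_epi8 yields */
            out[k] = (int8_t)((q8[k] ^ s) - s);                         /* q8[k] when s == 0, -q8[k] when s == -1 */
        }
    }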
@@ -9729,6 +10006,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9729
10006
|
|
9730
10007
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
9731
10008
|
|
10009
|
+
const vector int v0 = vec_splats((int32_t)0);
|
10010
|
+
|
9732
10011
|
vector float vsumf0 = vec_splats(0.0f);
|
9733
10012
|
vector float vsumf1 = vec_splats(0.0f);
|
9734
10013
|
vector float vsumf2 = vec_splats(0.0f);
|
@@ -9743,14 +10022,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9743
10022
|
vector float vyd = vec_splats(y[i].d);
|
9744
10023
|
vector float vd = vec_mul(vxd, vyd);
|
9745
10024
|
|
9746
|
-
vector signed int vsumi0 = vec_splats((int32_t)0);
|
9747
|
-
vector signed int vsumi1 = vec_splats((int32_t)0);
|
9748
|
-
vector signed int vsumi2 = vec_splats((int32_t)0);
|
9749
|
-
vector signed int vsumi3 = vec_splats((int32_t)0);
|
9750
|
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
9751
|
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
9752
|
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
9753
|
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10025
|
+
vector signed int vsumi0 = v0;
|
10026
|
+
vector signed int vsumi1 = v0;
|
10027
|
+
vector signed int vsumi2 = v0;
|
10028
|
+
vector signed int vsumi3 = v0;
|
9754
10029
|
|
9755
10030
|
const uint8_t * restrict q2 = x[i].qs;
|
9756
10031
|
const uint8_t * restrict qh = x[i].qh;
|
@@ -9810,21 +10085,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
9810
10085
|
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
9811
10086
|
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
9812
10087
|
|
9813
|
-
vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
|
9814
|
-
vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
|
9815
|
-
vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
|
9816
|
-
vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
|
9817
|
-
vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
|
9818
|
-
vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
|
9819
|
-
vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
|
9820
|
-
vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
|
10088
|
+
vsumi0 = vec_msum(qv0, vscales0, vsumi0);
|
10089
|
+
vsumi1 = vec_msum(qv1, vscales1, vsumi1);
|
10090
|
+
vsumi2 = vec_msum(qv2, vscales2, vsumi2);
|
10091
|
+
vsumi3 = vec_msum(qv3, vscales3, vsumi3);
|
9821
10092
|
}
|
9822
10093
|
|
9823
|
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
9824
|
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
9825
|
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
9826
|
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
9827
|
-
|
9828
10094
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
9829
10095
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
9830
10096
|
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
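The POWER9 hunks above fold the former vec_mule/vec_mulo accumulator pairs (vsumi0..7) into vec_msum, which multiplies corresponding int16 lanes and adds each adjacent pair of products into one int32 lane of the accumulator. A scalar sketch of that equivalence for a single 8-lane vector (helper name invented here):

    #include <stdint.h>

    /* Scalar model of vec_msum(qv, vscales, acc) for int16 inputs: each of the four
     * int32 lanes gains the sum of two adjacent int16 products. */
    static void vec_msum_model(const int16_t qv[8], const int16_t vscales[8], int32_t acc[4]) {
        for (int lane = 0; lane < 4; ++lane) {
            acc[lane] += (int32_t)qv[2*lane + 0] * vscales[2*lane + 0]   /* the "even" half, formerly vec_mule */
                       + (int32_t)qv[2*lane + 1] * vscales[2*lane + 1];  /* the "odd" half, formerly vec_mulo  */
        }
    }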
@@ -10059,9 +10325,68 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
10059
10325
|
|
10060
10326
|
*s = 0.25f * hsum_float_8(accumf);
|
10061
10327
|
|
10328
|
+
#elif defined(__AVX__)
|
10329
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10330
|
+
|
10331
|
+
uint32_t aux32[2];
|
10332
|
+
|
10333
|
+
__m256 accumf = _mm256_setzero_ps();
|
10334
|
+
for (int i = 0; i < nb; ++i) {
|
10335
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10336
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10337
|
+
const uint8_t * restrict gas = x[i].qs + QK_K/4;
|
10338
|
+
const int8_t * restrict q8 = y[i].qs;
|
10339
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
10340
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
10341
|
+
__m128i sumi2_0 = _mm_setzero_si128();
|
10342
|
+
__m128i sumi2_1 = _mm_setzero_si128();
|
10343
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10344
|
+
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10345
|
+
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10346
|
+
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10347
|
+
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10348
|
+
const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10349
|
+
const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
10350
|
+
q3 += 8;
|
10351
|
+
const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
|
10352
|
+
const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
|
10353
|
+
q3 += 8;
|
10354
|
+
memcpy(aux32, gas, 8); gas += 8;
|
10355
|
+
const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
|
10356
|
+
const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
|
10357
|
+
const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
10358
|
+
const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
|
10359
|
+
const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
|
10360
|
+
const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
|
10361
|
+
const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
|
10362
|
+
const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
|
10363
|
+
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
10364
|
+
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
10365
|
+
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
10366
|
+
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
10367
|
+
const uint16_t ls1 = aux32[0] >> 28;
|
10368
|
+
const uint16_t ls2 = aux32[1] >> 28;
|
10369
|
+
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
10370
|
+
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
10371
|
+
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
10372
|
+
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
10373
|
+
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
10374
|
+
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
10375
|
+
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
10376
|
+
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
10377
|
+
}
|
10378
|
+
|
10379
|
+
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
10380
|
+
|
10381
|
+
}
|
10382
|
+
|
10383
|
+
*s = 0.25f * hsum_float_8(accumf);
|
10384
|
+
|
10062
10385
|
#elif defined(__POWER9_VECTOR__)
|
10063
10386
|
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10064
10387
|
|
10388
|
+
const vector int v0 = vec_splats((int32_t)0);
|
10389
|
+
|
10065
10390
|
vector float vsumf0 = vec_splats(0.0f);
|
10066
10391
|
vector float vsumf1 = vec_splats(0.0f);
|
10067
10392
|
vector float vsumf2 = vec_splats(0.0f);
|
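The 7-bit fields that the AVX iq3_xxs code extracts from aux32 index keven_signs_q2xs, a 128-entry table of 8-byte ±1 patterns; _mm_sign_epi8 then applies the selected pattern to the q8 bytes. Assuming the usual even-parity construction of that table (an assumption, not something stated in this diff), one entry can be modelled as:

    #include <stdint.h>

    /* Hypothetical scalar construction of one sign pattern: bit k of the 7-bit code makes
     * byte k equal to 0xff (-1), and the eighth byte is picked so the count of -1s is even. */
    static uint64_t even_sign_pattern(unsigned code7) {
        uint64_t pattern = 0;
        unsigned parity = 0;
        for (int k = 0; k < 7; ++k) {
            const unsigned neg = (code7 >> k) & 1;
            parity ^= neg;
            pattern |= (uint64_t)(neg ? 0xffu : 0x01u) << (8 * k);
        }
        pattern |= (uint64_t)(parity ? 0xffu : 0x01u) << 56;  /* restore even parity */
        return pattern;
    }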
@@ -10072,14 +10397,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
10072
10397
|
vector float vyd = vec_splats(y[i].d);
|
10073
10398
|
vector float vd = vec_mul(vxd, vyd);
|
10074
10399
|
|
10075
|
-
vector signed int vsumi0 = vec_splats((int32_t)0);
|
10076
|
-
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10077
|
-
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10078
|
-
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10079
|
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10080
|
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10081
|
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10082
|
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10400
|
+
vector signed int vsumi0 = v0;
|
10401
|
+
vector signed int vsumi1 = v0;
|
10402
|
+
vector signed int vsumi2 = v0;
|
10403
|
+
vector signed int vsumi3 = v0;
|
10083
10404
|
|
10084
10405
|
const uint8_t * restrict q3 = x[i].qs;
|
10085
10406
|
const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
|
@@ -10124,21 +10445,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
10124
10445
|
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
10125
10446
|
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
10126
10447
|
|
10127
|
-
vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
|
10128
|
-
vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
|
10129
|
-
vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
|
10130
|
-
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
|
10131
|
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
10132
|
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
10133
|
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
10134
|
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
10448
|
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
10449
|
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
10450
|
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
10451
|
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
10135
10452
|
}
|
10136
10453
|
|
10137
|
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
10138
|
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
10139
|
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
10140
|
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
10141
|
-
|
10142
10454
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10143
10455
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10144
10456
|
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
@@ -10421,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
10421
10733
|
|
10422
10734
|
*s = hsum_float_8(accumf);
|
10423
10735
|
|
10736
|
+
#elif defined(__AVX__)
|
10737
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10738
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10739
|
+
};
|
10740
|
+
|
10741
|
+
static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
10742
|
+
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
|
10743
|
+
};
|
10744
|
+
|
10745
|
+
const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
|
10746
|
+
const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
|
10747
|
+
const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
|
10748
|
+
const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
|
10749
|
+
|
10750
|
+
const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
|
10751
|
+
const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
|
10752
|
+
const __m128i idx_mask = _mm_set1_epi32(256);
|
10753
|
+
|
10754
|
+
typedef union {
|
10755
|
+
__m128i vec[4];
|
10756
|
+
uint32_t index[16];
|
10757
|
+
} index_t;
|
10758
|
+
|
10759
|
+
index_t idx;
|
10760
|
+
|
10761
|
+
__m256 accumf = _mm256_setzero_ps();
|
10762
|
+
for (int i = 0; i < nb; ++i) {
|
10763
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
10764
|
+
const uint8_t * restrict qs = x[i].qs;
|
10765
|
+
const uint8_t * restrict qh = x[i].qh;
|
10766
|
+
const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
|
10767
|
+
const int8_t * restrict q8 = y[i].qs;
|
10768
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
10769
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
10770
|
+
__m128i sumi2_0 = _mm_setzero_si128();
|
10771
|
+
__m128i sumi2_1 = _mm_setzero_si128();
|
10772
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
10773
|
+
const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10774
|
+
const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10775
|
+
const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10776
|
+
const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
10777
|
+
const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
|
10778
|
+
const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
|
10779
|
+
const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
|
10780
|
+
idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
|
10781
|
+
idx.vec[1] = idx.vec[0];
|
10782
|
+
idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
|
10783
|
+
idx.vec[3] = idx.vec[2];
|
10784
|
+
|
10785
|
+
idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
|
10786
|
+
idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
|
10787
|
+
idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
|
10788
|
+
idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
|
10789
|
+
|
10790
|
+
idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
|
10791
|
+
idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
|
10792
|
+
idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
|
10793
|
+
idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
|
10794
|
+
|
10795
|
+
const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
|
10796
|
+
const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
|
10797
|
+
const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
|
10798
|
+
const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
|
10799
|
+
|
10800
|
+
__m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
|
10801
|
+
__m128i aux128_1 = aux128_0;
|
10802
|
+
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
10803
|
+
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
10804
|
+
const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
10805
|
+
const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
10806
|
+
const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
|
10807
|
+
const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
|
10808
|
+
|
10809
|
+
aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
|
10810
|
+
aux128_1 = aux128_0;
|
10811
|
+
aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
|
10812
|
+
aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
|
10813
|
+
const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
|
10814
|
+
const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
|
10815
|
+
const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
|
10816
|
+
const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
|
10817
|
+
|
10818
|
+
signs += 4;
|
10819
|
+
|
10820
|
+
const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
|
10821
|
+
const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
|
10822
|
+
const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
|
10823
|
+
const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
|
10824
|
+
const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
|
10825
|
+
const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
|
10826
|
+
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
|
10827
|
+
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
|
10828
|
+
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
|
10829
|
+
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
|
10830
|
+
sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
|
10831
|
+
sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
|
10832
|
+
sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
|
10833
|
+
sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
|
10834
|
+
}
|
10835
|
+
|
10836
|
+
accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
|
10837
|
+
|
10838
|
+
}
|
10839
|
+
|
10840
|
+
*s = hsum_float_8(accumf);
|
10841
|
+
|
10424
10842
|
#elif defined(__POWER9_VECTOR__)
|
10425
10843
|
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10426
10844
|
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
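In the AVX iq3_s hunk above, the idx_mul_0/idx_mul_1 multipliers and the idx_mask AND rebuild a 9-bit grid index per byte: bit j of the qh byte is moved to bit position 8 and OR-ed with the zero-extended low byte qs[j]. A scalar sketch of what that arithmetic computes (helper name invented here):

    #include <stdint.h>

    /* Scalar form of the idx_mul/idx_mask computation: for byte j of an 8-byte group,
     * the 9-bit index into iq3s_grid is qs[j] plus bit j of qh promoted to bit 8. */
    static uint32_t iq3s_index(uint8_t qs_j, uint8_t qh_byte, int j /* 0..7 */) {
        const uint32_t hi = ((uint32_t)qh_byte >> j) & 1;
        return (hi << 8) | qs_j;
    }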
@@ -10428,6 +10846,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
10428
10846
|
|
10429
10847
|
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
10430
10848
|
|
10849
|
+
const vector int v0 = vec_splats((int32_t)0);
|
10850
|
+
|
10431
10851
|
vector float vsumf0 = vec_splats(0.0f);
|
10432
10852
|
vector float vsumf1 = vec_splats(0.0f);
|
10433
10853
|
vector float vsumf2 = vec_splats(0.0f);
|
@@ -10448,14 +10868,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
10448
10868
|
const uint8_t * restrict sc = x[i].scales;
|
10449
10869
|
const int8_t * restrict q8 = y[i].qs;
|
10450
10870
|
|
10451
|
-
vector signed int vsumi0 = vec_splats((int32_t)0);
|
10452
|
-
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10453
|
-
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10454
|
-
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10455
|
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10456
|
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10457
|
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10458
|
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10871
|
+
vector signed int vsumi0 = v0;
|
10872
|
+
vector signed int vsumi1 = v0;
|
10873
|
+
vector signed int vsumi2 = v0;
|
10874
|
+
vector signed int vsumi3 = v0;
|
10459
10875
|
|
10460
10876
|
for (int j = 0; j < QK_K/32; j += 2) {
|
10461
10877
|
__builtin_prefetch(q3, 0, 1);
|
@@ -10509,21 +10925,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
10509
10925
|
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
10510
10926
|
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
10511
10927
|
|
10512
|
-
vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
|
10513
|
-
vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
|
10514
|
-
vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
|
10515
|
-
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
|
10516
|
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
10517
|
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
10518
|
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
10519
|
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
10928
|
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
10929
|
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
10930
|
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
10931
|
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
10520
10932
|
}
|
10521
10933
|
|
10522
|
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
10523
|
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
10524
|
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
10525
|
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
10526
|
-
|
10527
10934
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10528
10935
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10529
10936
|
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
@@ -10669,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
10669
11076
|
}
|
10670
11077
|
|
10671
11078
|
|
11079
|
+
#if defined(__AVX__)
|
11080
|
+
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
11081
|
+
const __m128i ax = _mm_sign_epi8(x, x);
|
11082
|
+
const __m128i sy = _mm_sign_epi8(y, x);
|
11083
|
+
return _mm_maddubs_epi16(ax, sy);
|
11084
|
+
}
|
11085
|
+
#endif
|
11086
|
+
|
10672
11087
|
#if defined(__AVX2__)
|
10673
11088
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
10674
11089
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
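The new mul_add_epi8_sse helper works around _mm_maddubs_epi16 requiring an unsigned first operand: _mm_sign_epi8 takes |x| and moves the sign of x onto y, which leaves every product x*y unchanged. Ignoring the saturation corner cases of maddubs (irrelevant for the small grid values used here), each int16 lane it returns can be modelled as:

    #include <stdint.h>

    /* Scalar model of mul_add_epi8_sse(x, y): lane k holds the sum of two adjacent
     * signed 8-bit products. */
    static void mul_add_epi8_model(const int8_t x[16], const int8_t y[16], int16_t out[8]) {
        for (int k = 0; k < 8; ++k) {
            out[k] = (int16_t)(x[2*k] * y[2*k] + x[2*k + 1] * y[2*k + 1]);
        }
    }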
@@ -10786,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
10786
11201
|
|
10787
11202
|
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
10788
11203
|
|
11204
|
+
#elif defined __AVX__
|
11205
|
+
__m256 accum = _mm256_setzero_ps();
|
11206
|
+
float accum1 = 0;
|
11207
|
+
for (int i = 0; i < nb; ++i) {
|
11208
|
+
|
11209
|
+
const int8_t * q8 = y[i].qs;
|
11210
|
+
const uint8_t * qs = x[i].qs;
|
11211
|
+
const uint16_t * qh = x[i].qh;
|
11212
|
+
|
11213
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
11214
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
11215
|
+
int sumi1 = 0;
|
11216
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
11217
|
+
const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
|
11218
|
+
const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
|
11219
|
+
const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
|
11220
|
+
const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
|
11221
|
+
qs += 8;
|
11222
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11223
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11224
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11225
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11226
|
+
|
11227
|
+
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
11228
|
+
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
11229
|
+
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
11230
|
+
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
11231
|
+
const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
|
11232
|
+
const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
|
11233
|
+
const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
|
11234
|
+
const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
|
11235
|
+
const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
|
11236
|
+
const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
|
11237
|
+
|
11238
|
+
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
11239
|
+
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
11240
|
+
sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
|
11241
|
+
+ (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
|
11242
|
+
}
|
11243
|
+
|
11244
|
+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
|
11245
|
+
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
|
11246
|
+
accum1 += d * sumi1;
|
11247
|
+
|
11248
|
+
}
|
11249
|
+
|
11250
|
+
*s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
|
11251
|
+
|
10789
11252
|
#elif defined(__POWER9_VECTOR__)
|
10790
11253
|
const vector unsigned char v0 = vec_splats((unsigned char)0x0);
|
10791
11254
|
const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
|
@@ -10804,10 +11267,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
10804
11267
|
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10805
11268
|
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10806
11269
|
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10807
|
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10808
|
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10809
|
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10810
|
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10811
11270
|
vector signed int vsumi8 = vec_splats((int32_t)0);
|
10812
11271
|
|
10813
11272
|
const uint8_t * restrict q1 = x[i].qs;
|
@@ -10849,14 +11308,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
10849
11308
|
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
10850
11309
|
vector signed short vscales = vec_sld(vscales23, vscales01, 8);
|
10851
11310
|
|
10852
|
-
vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
|
10853
|
-
vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
|
10854
|
-
vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
|
10855
|
-
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
|
10856
|
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
10857
|
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
10858
|
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
10859
|
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
11311
|
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
11312
|
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
11313
|
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
11314
|
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
10860
11315
|
|
10861
11316
|
vector signed short q8ysums = vec_xl_len(qs, 8);
|
10862
11317
|
qs += 4;
|
@@ -10871,11 +11326,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
|
|
10871
11326
|
vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
|
10872
11327
|
}
|
10873
11328
|
|
10874
|
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
10875
|
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
10876
|
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
10877
|
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
10878
|
-
|
10879
11329
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10880
11330
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10881
11331
|
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
@@ -11137,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void
|
|
11137
11587
|
|
11138
11588
|
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
11139
11589
|
|
11590
|
+
#elif defined __AVX__
|
11591
|
+
const __m128i mask = _mm_set1_epi16(0x7);
|
11592
|
+
const __m128i mone = _mm_set1_epi16(1);
|
11593
|
+
|
11594
|
+
__m256 accum1 = _mm256_setzero_ps();
|
11595
|
+
__m256 accum2 = _mm256_setzero_ps();
|
11596
|
+
for (int i = 0; i < nb; ++i) {
|
11597
|
+
|
11598
|
+
const int8_t * q8 = y[i].qs;
|
11599
|
+
const uint8_t * qs = x[i].qs;
|
11600
|
+
const uint8_t * qh = x[i].qh;
|
11601
|
+
const uint16_t * sc = (const uint16_t *)x[i].scales;
|
11602
|
+
|
11603
|
+
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
11604
|
+
|
11605
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
11606
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
11607
|
+
__m128i sumi2_0 = _mm_setzero_si128();
|
11608
|
+
__m128i sumi2_1 = _mm_setzero_si128();
|
11609
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
11610
|
+
const __m128i q1b_1_0 = _mm_set_epi64x(
|
11611
|
+
iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
|
11612
|
+
const __m128i q1b_1_1 = _mm_set_epi64x(
|
11613
|
+
iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
|
11614
|
+
const __m128i q1b_2_0 = _mm_set_epi64x(
|
11615
|
+
iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
|
11616
|
+
const __m128i q1b_2_1 = _mm_set_epi64x(
|
11617
|
+
iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
|
11618
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11619
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11620
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11621
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
11622
|
+
|
11623
|
+
const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
|
11624
|
+
const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
|
11625
|
+
const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
|
11626
|
+
const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
|
11627
|
+
|
11628
|
+
const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
11629
|
+
qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
11630
|
+
const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
11631
|
+
qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
11632
|
+
const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
11633
|
+
qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
11634
|
+
const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
|
11635
|
+
qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
|
11636
|
+
|
11637
|
+
const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
|
11638
|
+
const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
|
11639
|
+
const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
|
11640
|
+
const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
|
11641
|
+
|
11642
|
+
__m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
|
11643
|
+
__m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
|
11644
|
+
__m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
|
11645
|
+
__m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
|
11646
|
+
|
11647
|
+
scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
|
11648
|
+
scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
|
11649
|
+
scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
|
11650
|
+
scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
|
11651
|
+
const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
|
11652
|
+
const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
|
11653
|
+
const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
|
11654
|
+
const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
|
11655
|
+
const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
|
11656
|
+
const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
|
11657
|
+
const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
|
11658
|
+
const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
|
11659
|
+
|
11660
|
+
sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
|
11661
|
+
sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
|
11662
|
+
sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
|
11663
|
+
sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
|
11664
|
+
|
11665
|
+
qs += 8; qh += 4;
|
11666
|
+
}
|
11667
|
+
|
11668
|
+
const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
|
11669
|
+
|
11670
|
+
accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
|
11671
|
+
accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
|
11672
|
+
}
|
11673
|
+
|
11674
|
+
*s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
|
11675
|
+
|
11140
11676
|
#else
|
11141
11677
|
|
11142
11678
|
int sum1[2], sum2[2], delta[4];
|
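In the iq1_m block above, the 0x08 and 0x80 bits of each qh byte decide whether a half-block contributes +1 or -1 deltas: the delta vectors are all 0x01 or all 0xff bytes, their dot product with q8 is kept in the second accumulator (accum2), and only at the end is that accumulator scaled by IQ1M_DELTA. A scalar sketch of one half-block's delta term (helper name invented here):

    #include <stdint.h>

    /* Scalar model of the accum2 contribution for one 8-byte half: the plain sum of the
     * q8 bytes, negated when the corresponding qh bit is set, weighted by the block scale. */
    static int32_t iq1m_delta_term(const int8_t q8[8], int qh_bit_set, int16_t ls) {
        int32_t sum = 0;
        for (int k = 0; k < 8; ++k) sum += q8[k];
        return (qh_bit_set ? -sum : sum) * ls;
    }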
@@ -11267,8 +11803,47 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
11267
11803
|
|
11268
11804
|
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
11269
11805
|
|
11806
|
+
#elif defined __AVX__
|
11807
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
11808
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
11809
|
+
const __m128i mone = _mm_set1_epi16(1);
|
11810
|
+
|
11811
|
+
__m256 accum1 = _mm256_setzero_ps();
|
11812
|
+
__m256 accum2 = _mm256_setzero_ps();
|
11813
|
+
for (int ib = 0; ib < nb; ib += 2) {
|
11814
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
|
11815
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
|
11816
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
|
11817
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
|
11818
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
|
11819
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
|
11820
|
+
|
11821
|
+
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
|
11822
|
+
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
11823
|
+
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
11824
|
+
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
11825
|
+
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
11826
|
+
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
11827
|
+
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
11828
|
+
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
11829
|
+
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
11830
|
+
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
11831
|
+
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
11832
|
+
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
11833
|
+
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
|
11834
|
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
11835
|
+
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
|
11836
|
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
11837
|
+
|
11838
|
+
y += 2;
|
11839
|
+
x += 2;
|
11840
|
+
}
|
11841
|
+
|
11842
|
+
*s = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
11843
|
+
|
11270
11844
|
#elif defined(__POWER9_VECTOR__)
|
11271
11845
|
const vector signed char lowMask = vec_splats((signed char)0xF);
|
11846
|
+
const vector signed int v0 = vec_splats((int32_t)0);
|
11272
11847
|
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
11273
11848
|
|
11274
11849
|
vector float vsumf0 = vec_splats(0.0f);
|
@@ -11299,8 +11874,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|
11299
11874
|
vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
|
11300
11875
|
vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
|
11301
11876
|
|
11302
|
-
vector signed int vsumi0 =
|
11303
|
-
vector signed int vsumi1 =
|
11877
|
+
vector signed int vsumi0 = v0;
|
11878
|
+
vector signed int vsumi1 = v0;
|
11879
|
+
|
11880
|
+
vsumi0 = vec_sum4s(qv0, vsumi0);
|
11881
|
+
vsumi1 = vec_sum4s(qv1, vsumi1);
|
11304
11882
|
|
11305
11883
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
11306
11884
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
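The iq4_nl path decodes its 4-bit codes with a single _mm_shuffle_epi8: values128 holds the 16-entry kvalues_iq4nl table and each nibble selects one signed value from it. A scalar sketch of the same decode (the table is passed in as a parameter to keep the sketch self-contained):

    #include <stdint.h>

    /* Scalar form of the pshufb decode: the low nibble of each packed byte feeds the first
     * 16 outputs, the high nibble the next 16, mirroring the m4b AND and the 4-bit shift. */
    static void iq4_decode_block(const uint8_t packed[16], const int8_t table[16], int8_t out[32]) {
        for (int k = 0; k < 16; ++k) {
            out[k]      = table[packed[k] & 0x0f];
            out[k + 16] = table[packed[k] >> 4];
        }
    }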
@@ -11453,8 +12031,57 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
11453
12031
|
|
11454
12032
|
*s = hsum_float_8(accum);
|
11455
12033
|
|
12034
|
+
#elif defined __AVX__
|
12035
|
+
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
12036
|
+
const __m128i m4b = _mm_set1_epi8(0x0f);
|
12037
|
+
|
12038
|
+
__m256 accum = _mm256_setzero_ps();
|
12039
|
+
for (int ibl = 0; ibl < nb; ++ibl) {
|
12040
|
+
const uint8_t * qs = x[ibl].qs;
|
12041
|
+
const int8_t * q8 = y[ibl].qs;
|
12042
|
+
uint16_t sh = x[ibl].scales_h;
|
12043
|
+
__m128i sumi1_0 = _mm_setzero_si128();
|
12044
|
+
__m128i sumi1_1 = _mm_setzero_si128();
|
12045
|
+
__m128i sumi2_0 = _mm_setzero_si128();
|
12046
|
+
__m128i sumi2_1 = _mm_setzero_si128();
|
12047
|
+
for (int ib = 0; ib < QK_K/32; ib += 2) {
|
12048
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
12049
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
|
12050
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
12051
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
12052
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
12053
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
|
12054
|
+
const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
|
12055
|
+
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
12056
|
+
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
12057
|
+
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
12058
|
+
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
12059
|
+
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
12060
|
+
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
12061
|
+
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
12062
|
+
const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
|
12063
|
+
const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
|
12064
|
+
sh >>= 4;
|
12065
|
+
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
|
12066
|
+
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
|
12067
|
+
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
|
12068
|
+
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
|
12069
|
+
sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
|
12070
|
+
sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
|
12071
|
+
sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
|
12072
|
+
sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
|
12073
|
+
}
|
12074
|
+
__m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
|
12075
|
+
__m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
|
12076
|
+
accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
|
12077
|
+
_mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
|
12078
|
+
}
|
12079
|
+
|
12080
|
+
*s = hsum_float_8(accum);
|
12081
|
+
|
11456
12082
|
#elif defined(__POWER9_VECTOR__)
|
11457
12083
|
const vector signed char lowMask = vec_splats((signed char)0xF);
|
12084
|
+
const vector int v0 = vec_splats((int32_t)0);
|
11458
12085
|
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
11459
12086
|
|
11460
12087
|
vector float vsumf0 = vec_splats(0.0f);
|
@@ -11470,14 +12097,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
11470
12097
|
vector float vyd = vec_splats(y[ibl].d);
|
11471
12098
|
vector float vd = vec_mul(vxd, vyd);
|
11472
12099
|
|
11473
|
-
vector signed int vsumi0 = vec_splats((int32_t)0);
|
11474
|
-
vector signed int vsumi1 = vec_splats((int32_t)0);
|
11475
|
-
vector signed int vsumi2 = vec_splats((int32_t)0);
|
11476
|
-
vector signed int vsumi3 = vec_splats((int32_t)0);
|
11477
|
-
vector signed int vsumi4 = vec_splats((int32_t)0);
|
11478
|
-
vector signed int vsumi5 = vec_splats((int32_t)0);
|
11479
|
-
vector signed int vsumi6 = vec_splats((int32_t)0);
|
11480
|
-
vector signed int vsumi7 = vec_splats((int32_t)0);
|
12100
|
+
vector signed int vsumi0 = v0;
|
12101
|
+
vector signed int vsumi1 = v0;
|
12102
|
+
vector signed int vsumi2 = v0;
|
12103
|
+
vector signed int vsumi3 = v0;
|
11481
12104
|
|
11482
12105
|
uint16_t h = x[ibl].scales_h;
|
11483
12106
|
|
@@ -11522,21 +12145,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
11522
12145
|
vector signed short vscales01 = vec_splats((int16_t)ls0);
|
11523
12146
|
vector signed short vscales23 = vec_splats((int16_t)ls1);
|
11524
12147
|
|
11525
|
-
vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
|
11526
|
-
vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
|
11527
|
-
vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
|
11528
|
-
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
|
11529
|
-
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
11530
|
-
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
11531
|
-
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
11532
|
-
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
12148
|
+
vsumi0 = vec_msum(qv0, vscales01, vsumi0);
|
12149
|
+
vsumi1 = vec_msum(qv1, vscales01, vsumi1);
|
12150
|
+
vsumi2 = vec_msum(qv2, vscales23, vsumi2);
|
12151
|
+
vsumi3 = vec_msum(qv3, vscales23, vsumi3);
|
11533
12152
|
}
|
11534
12153
|
|
11535
|
-
vsumi0 = vec_add(vsumi0, vsumi4);
|
11536
|
-
vsumi1 = vec_add(vsumi1, vsumi5);
|
11537
|
-
vsumi2 = vec_add(vsumi2, vsumi6);
|
11538
|
-
vsumi3 = vec_add(vsumi3, vsumi7);
|
11539
|
-
|
11540
12154
|
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
11541
12155
|
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
11542
12156
|
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
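For every pair of 32-value sub-blocks, the AVX iq4_xs loop rebuilds a 6-bit scale from a 4-bit field in scales_l plus two bits of scales_h and recentres it by subtracting 32; these are the same ls1/ls2 values the POWER9 path feeds into vec_msum. A scalar sketch of that decode (helper name invented here; ib is the loop counter stepping by 2 as above):

    #include <stdint.h>

    /* Scalar form of the ls1/ls2 computation for the sub-block pair starting at ib. */
    static void iq4_xs_scales(uint8_t scales_l_byte, uint16_t scales_h, int ib,
                              int16_t *ls1, int16_t *ls2) {
        const uint16_t sh = scales_h >> (2 * ib);   /* four bits of scales_h are consumed per pair */
        *ls1 = (int16_t)(((scales_l_byte & 0x0f) | ((sh << 4) & 0x30)) - 32);
        *ls2 = (int16_t)(((scales_l_byte >>  4) | ((sh << 2) & 0x30)) - 32);
    }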
@@ -13139,7 +13753,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
|
|
13139
13753
|
const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
|
13140
13754
|
int num_neighbors = neighbours[0];
|
13141
13755
|
GGML_ASSERT(num_neighbors > 0);
|
13142
|
-
float best_score = 0;
|
13756
|
+
float best_score = -FLT_MAX;
|
13143
13757
|
int grid_index = -1;
|
13144
13758
|
for (int j = 1; j <= num_neighbors; ++j) {
|
13145
13759
|
const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
|
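This hunk and the two that follow change how best_score is seeded: starting the neighbour and scale searches from a large negative sentinel ensures a candidate is still selected even when every score is zero or negative (for example on an all-zero block). A minimal sketch of the pattern (names invented here):

    #include <float.h>

    /* Seed the search below any achievable score so even a zero-score candidate wins. */
    static int pick_best(const float *score, int n) {
        float best_score = -FLT_MAX;
        int best = -1;
        for (int j = 0; j < n; ++j) {
            if (score[j] > best_score) { best_score = score[j]; best = j; }
        }
        return best;
    }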
@@ -13337,7 +13951,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
13337
13951
|
sumw[j+1] = sumw[j] + weight[i];
|
13338
13952
|
}
|
13339
13953
|
}
|
13340
|
-
float best_score = 0, scale = max;
|
13954
|
+
float best_score = -FLT_MIN, scale = max;
|
13341
13955
|
int besti1 = -1, besti2 = -1, best_shift = 0;
|
13342
13956
|
for (int i1 = 0; i1 <= block_size; ++i1) {
|
13343
13957
|
for (int i2 = i1; i2 <= block_size; ++i2) {
|
@@ -13513,7 +14127,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
|
|
13513
14127
|
idx[2*j] = j;
|
13514
14128
|
}
|
13515
14129
|
qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
|
13516
|
-
float best_score = 0, scale = max;
|
14130
|
+
float best_score = -FLT_MIN, scale = max;
|
13517
14131
|
int besti1 = -1, besti2 = -1, best_k = -1;
|
13518
14132
|
// 0: +, +
|
13519
14133
|
// 1: +, -
|