llama_cpp 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
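
Note on the hunks below: most of the POWER9 changes in ggml-quants.c replace the old vec_mule/vec_mulo + vec_unpackh/vec_unpackl widening-and-add accumulation with a single vec_msum into a zeroed 32-bit accumulator (v0). As a rough scalar model of that multiply-sum step (a sketch for orientation only, not code from the package; the function name is illustrative):

    #include <stdint.h>

    // Scalar model of one vec_msum lane: multiply four signed*unsigned byte
    // pairs and add them into one 32-bit accumulator, which is what the new
    // POWER9 paths do per 4-byte group instead of widening with
    // vec_mule/vec_mulo and then unpacking to 32 bits.
    static int32_t msum4(const int8_t q8[4], const uint8_t qx[4], int32_t acc) {
        for (int k = 0; k < 4; ++k) {
            acc += (int32_t)q8[k] * (int32_t)qx[k];
        }
        return acc;
    }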
@@ -4,8 +4,6 @@
  #include "ggml-quants.h"
  #include "ggml-impl.h"

- #define GGML_COMMON_IMPL_C
- #include "ggml-common.h"

  #include <math.h>
  #include <string.h>
@@ -1078,6 +1076,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
  }
  vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
  vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+ }

  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -1437,6 +1436,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
  accv = vec_add(accv, vec_sld(accv, accv, 4));
  accv = vec_add(accv, vec_sld(accv, accv, 8));
  y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+ }

  #elif defined(__loongarch_asx)
  for (int i = 0; i < nb; i++) {
@@ -4113,12 +4113,13 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
  const vector signed char v8 = vec_splats((signed char)0x8);

  vector float vsumf0 = vec_splats(0.0f);

- #pragma GCC unroll
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -4140,9 +4141,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
  vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

-
+ vector signed int vsumi0 = v0;

-
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi0 = vec_sum4s(qv1, vsumi0);

  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -4516,6 +4518,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);

  vector float vsumf0 = vec_splats(0.0f);
@@ -4537,15 +4540,13 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl(16, y[i].qs);

- vector
- vector
+ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, lowMask);
+ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4);

- vector signed
- vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+ vector signed int vsumi0 = v0;

-
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q4x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q4x1, vsumi0);

  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -5247,6 +5248,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed int v0 = vec_splats((int32_t)0);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);

  vector float vsumf0 = vec_splats(0.0f);
@@ -5272,18 +5274,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r

  vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);

- vector
- vector
+ vector unsigned char q5x0 = (vector unsigned char)vec_or(vec_and(qxs, lowMask), qh0);
+ vector unsigned char q5x1 = (vector unsigned char)vec_or(vec_sr(qxs, v4), qh1);

  vector signed char q8y0 = vec_xl( 0, y[i].qs);
  vector signed char q8y1 = vec_xl( 16, y[i].qs);

- vector signed
- vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+ vector signed int vsumi0 = v0;

-
-
- vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+ vsumi0 = vec_msum(q8y0, q5x0, vsumi0);
+ vsumi0 = vec_msum(q8y1, q5x1, vsumi0);

  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  }
@@ -5523,9 +5523,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  *s = sumf;

  #elif defined(__POWER9_VECTOR__)
+ const vector signed int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);

- #pragma GCC unroll
+ #pragma GCC unroll 8
  for (int i = 0; i < nb; i++) {
  __builtin_prefetch(x[i].qs, 0, 1);
  __builtin_prefetch(y[i].qs, 0, 1);
@@ -5544,13 +5545,13 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv2 = vec_mule(q8x1, q8y1);
  vector signed short qv3 = vec_mulo(q8x1, q8y1);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
- vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;

- vsumi0 =
- vsumi1 =
+ vsumi0 = vec_sum4s(qv0, vsumi0);
+ vsumi1 = vec_sum4s(qv1, vsumi1);
+ vsumi0 = vec_sum4s(qv2, vsumi0);
+ vsumi1 = vec_sum4s(qv3, vsumi1);

  vsumi0 = vec_add(vsumi0, vsumi1);

@@ -5938,6 +5939,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
  const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v6 = vec_splats((unsigned char)0x6);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -5975,15 +5977,17 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 =
- vector signed int vsumi5 =
- vector signed int vsumi6 =
- vector signed int vsumi7 =
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;

+ const uint8_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;

  for (int j = 0; j < QK_K/128; ++j) {
  __builtin_prefetch(q2, 0, 1);
@@ -5993,14 +5997,14 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
  q2 += 32;

- vector
- vector
- vector
- vector
- vector
- vector
- vector
- vector
+ vector unsigned char q2x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q2x01 = (vector unsigned char)vec_and(vec_sr(qxs0, v2), lowMask);
+ vector unsigned char q2x02 = (vector unsigned char)vec_and(vec_sr(qxs0, v4), lowMask);
+ vector unsigned char q2x03 = (vector unsigned char)vec_and(vec_sr(qxs0, v6), lowMask);
+ vector unsigned char q2x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q2x11 = (vector unsigned char)vec_and(vec_sr(qxs1, v2), lowMask);
+ vector unsigned char q2x12 = (vector unsigned char)vec_and(vec_sr(qxs1, v4), lowMask);
+ vector unsigned char q2x13 = (vector unsigned char)vec_and(vec_sr(qxs1, v6), lowMask);

  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -6012,45 +6016,36 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y13 = vec_xl(112, q8);
  q8 += 128;

- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
-
- vector signed short
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
+ vector signed int qv0 = vec_msum(q8y00, q2x00, v0);
+ vector signed int qv1 = vec_msum(q8y01, q2x01, v0);
+ vector signed int qv2 = vec_msum(q8y02, q2x02, v0);
+ vector signed int qv3 = vec_msum(q8y03, q2x03, v0);
+ vector signed int qv4 = vec_msum(q8y10, q2x10, v0);
+ vector signed int qv5 = vec_msum(q8y11, q2x11, v0);
+ vector signed int qv6 = vec_msum(q8y12, q2x12, v0);
+ vector signed int qv7 = vec_msum(q8y13, q2x13, v0);
+
+ vector signed short vscales_07 = vec_unpackh(vscales);
+ vector signed int vscales_03 = vec_unpackh(vscales_07);
+ vector signed int vscales_47 = vec_unpackl(vscales_07);
+ vector signed int vs0 = vec_splat(vscales_03, 0);
+ vector signed int vs1 = vec_splat(vscales_03, 1);
+ vector signed int vs2 = vec_splat(vscales_03, 2);
+ vector signed int vs3 = vec_splat(vscales_03, 3);
+ vector signed int vs4 = vec_splat(vscales_47, 0);
+ vector signed int vs5 = vec_splat(vscales_47, 1);
+ vector signed int vs6 = vec_splat(vscales_47, 2);
+ vector signed int vs7 = vec_splat(vscales_47, 3);
  vscales = vec_sld(vscales, vscales, 8);

-
-
-
-
-
-
-
-
- qv3 = vec_madd(qv7, vs7, qv3);
-
- vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
- vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
- vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
- vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
-
- vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
- vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
- vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
- vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv0, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv1, vs2), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv2, vs4), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv3, vs6), vsumi3);
+ vsumi4 = vec_add(vec_mul(qv4, vs1), vsumi4);
+ vsumi5 = vec_add(vec_mul(qv5, vs3), vsumi5);
+ vsumi6 = vec_add(vec_mul(qv6, vs5), vsumi6);
+ vsumi7 = vec_add(vec_mul(qv7, vs7), vsumi7);
  }

  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -6641,6 +6636,9 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0x3);
+ const vector signed char lowMask1 = vec_splats((int8_t)0xf);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector signed char v1 = vec_splats((signed char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -6658,30 +6656,33 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

-
-
+ UNUSED(kmask1);
+ UNUSED(kmask2);

-
-
-
-
-
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(u0, lowMask1);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = (vector signed char)vec_mergeh((vector signed int)u2, (vector signed int)vec_sr(u2, v2));
+ vector signed char u30 = vec_sl(vec_and(u3, lowMask), v4);
+ vector signed char u31 = vec_and(u3, lowMask2);

-
+ u1 = vec_or(u1, u30);
+ u2 = vec_or(vec_sr(u0, v4), u31);
+
+ vector signed char vscales = (vector signed char)vec_mergeh((vector signed long long)u1, (vector signed long long)u2);
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);

  vscales = vec_sub(vscales, off);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 =
- vector signed int vsumi5 =
- vector signed int vsumi6 =
- vector signed int vsumi7 =
-
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;

  const uint8_t * restrict q3 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -6755,23 +6756,14 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
  vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));

-
-
-
-
-
-
-
-
-
- vsumi0 = vec_add(vsum0, vsumi0);
- vsumi1 = vec_add(vsum1, vsumi1);
- vsumi2 = vec_add(vsum2, vsumi2);
- vsumi3 = vec_add(vsum3, vsumi3);
- vsumi4 = vec_add(vsum4, vsumi4);
- vsumi5 = vec_add(vsum5, vsumi5);
- vsumi6 = vec_add(vsum6, vsumi6);
- vsumi7 = vec_add(vsum7, vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs2, vsumi1);
+ vsumi2 = vec_msum(qv02, vs4, vsumi2);
+ vsumi3 = vec_msum(qv03, vs6, vsumi3);
+ vsumi4 = vec_msum(qv10, vs1, vsumi4);
+ vsumi5 = vec_msum(qv11, vs3, vsumi5);
+ vsumi6 = vec_msum(qv12, vs5, vsumi6);
+ vsumi7 = vec_msum(qv13, vs7, vsumi7);
  }

  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -7270,6 +7262,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
+ const vector unsigned char v2 = vec_splats((uint8_t)2);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);

  vector float vsumf0 = vec_splats(0.0f);
@@ -7288,15 +7284,24 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

-
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);

-
-
-
-
-
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);
  vector signed short q4xmins = vec_unpackl(utmps);
  vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
@@ -7312,14 +7317,10 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
  vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint8_t * restrict q4 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7334,14 +7335,14 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
  q4 += 64;

- vector
- vector
- vector
- vector
- vector
- vector
- vector
- vector
+ vector unsigned char q4x00 = (vector unsigned char)vec_and(qxs0, lowMask);
+ vector unsigned char q4x01 = (vector unsigned char)vec_sr(qxs0, v4);
+ vector unsigned char q4x10 = (vector unsigned char)vec_and(qxs1, lowMask);
+ vector unsigned char q4x11 = (vector unsigned char)vec_sr(qxs1, v4);
+ vector unsigned char q4x20 = (vector unsigned char)vec_and(qxs2, lowMask);
+ vector unsigned char q4x21 = (vector unsigned char)vec_sr(qxs2, v4);
+ vector unsigned char q4x30 = (vector unsigned char)vec_and(qxs3, lowMask);
+ vector unsigned char q4x31 = (vector unsigned char)vec_sr(qxs3, v4);

  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl( 16, q8);
@@ -7353,41 +7354,33 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y31 = vec_xl(112, q8);
  q8 += 128;

- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
- vector signed
-
- vector signed
- vector signed
- vector signed
- vector signed
+ vector signed int qv00 = vec_msum(q8y00, q4x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q4x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q4x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q4x11, v0);
+ vector signed int qv20 = vec_msum(q8y20, q4x20, v0);
+ vector signed int qv21 = vec_msum(q8y21, q4x21, v0);
+ vector signed int qv30 = vec_msum(q8y30, q4x30, v0);
+ vector signed int qv31 = vec_msum(q8y31, q4x31, v0);
+
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
+ vector signed int vs2 = vec_splat(vscales_h, 2);
+ vector signed int vs3 = vec_splat(vscales_h, 3);
  vscales = vec_sld(vscales, vscales, 8);

-
-
-
-
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv01, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv20, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv21, vs3), vsumi3);

- vsumi0 = vec_add(
- vsumi1 = vec_add(
- vsumi2 = vec_add(
- vsumi3 = vec_add(
- vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
- vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+ vsumi0 = vec_add(vec_mul(qv10, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv11, vs1), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv30, vs2), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv31, vs3), vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -7889,6 +7882,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector signed char lowMask1 = vec_splats((int8_t)0x3f);
+ const vector signed char lowMask2 = vec_splats((int8_t)0x30);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v1 = vec_splats((unsigned char)0x1);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
@@ -7907,18 +7903,27 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);

-
+ UNUSED(kmask1);
+ UNUSED(kmask2);
+ UNUSED(kmask3);
+ UNUSED(utmp);

-
-
-
-
-
+ vector signed char u0 = (vector signed char)vec_xl_len(x[i].scales, 8);
+ vector signed char u1 = vec_and(vec_sr(u0, v2), lowMask2);
+ vector signed char u2 = (vector signed char)vec_xl_len(x[i].scales + 8, 4);
+ vector signed char u3 = vec_sr(u2, v4);
+
+ vector signed char u30 = u1;
+ vector signed char u31 = (vector signed char)vec_mergeh((vector signed int)vec_and(u2, lowMask), (vector signed int)u3);
+
+ u1 = vec_and(u0, lowMask1);
+ u2 = vec_or(u30, u31);
+
+ vector signed char utmps = (vector signed char)vec_mergeh((vector signed int)u1, (vector signed int)u2);

  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
  vector signed short q8ysums1 = vec_xl(16, y[i].bsums);

- vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
  vector signed short vscales = vec_unpackh(utmps);

  vector signed short q5xmins = vec_unpackl(utmps);
@@ -7938,10 +7943,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
  vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint8_t * restrict q5 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -7966,10 +7971,10 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  qxhs0 = vec_sr(qxhs0, v2);
  qxhs1 = vec_sr(qxhs1, v2);

- vector
- vector
- vector
- vector
+ vector unsigned char q5x00 = (vector unsigned char)vec_or(q5h00, qxs00);
+ vector unsigned char q5x01 = (vector unsigned char)vec_or(q5h01, qxs01);
+ vector unsigned char q5x10 = (vector unsigned char)vec_or(q5h10, qxs10);
+ vector unsigned char q5x11 = (vector unsigned char)vec_or(q5h11, qxs11);

  vector signed char q8y00 = vec_xl( 0, q8);
  vector signed char q8y10 = vec_xl(16, q8);
@@ -7977,22 +7982,20 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed char q8y11 = vec_xl(48, q8);
  q8 += 64;

- vector signed
- vector signed
- vector signed
- vector signed
+ vector signed int qv00 = vec_msum(q8y00, q5x00, v0);
+ vector signed int qv01 = vec_msum(q8y01, q5x01, v0);
+ vector signed int qv10 = vec_msum(q8y10, q5x10, v0);
+ vector signed int qv11 = vec_msum(q8y11, q5x11, v0);

- vector signed
- vector signed
+ vector signed int vscales_h = vec_unpackh(vscales);
+ vector signed int vs0 = vec_splat(vscales_h, 0);
+ vector signed int vs1 = vec_splat(vscales_h, 1);
  vscales = vec_sld(vscales, vscales, 12);

-
-
-
-
- vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
- vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+ vsumi0 = vec_add(vec_mul(qv00, vs0), vsumi0);
+ vsumi1 = vec_add(vec_mul(qv10, vs0), vsumi1);
+ vsumi2 = vec_add(vec_mul(qv01, vs1), vsumi2);
+ vsumi3 = vec_add(vec_mul(qv11, vs1), vsumi3);
  }

  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
@@ -8553,6 +8556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r

  #elif defined(__POWER9_VECTOR__)
  const vector signed char lowMask = vec_splats((signed char)0xF);
+ const vector int v0 = vec_splats((int32_t)0);
  const vector unsigned char v2 = vec_splats((unsigned char)0x2);
  const vector unsigned char v3 = vec_splats((unsigned char)0x3);
  const vector unsigned char v4 = vec_splats((unsigned char)0x4);
@@ -8569,14 +8573,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 =
- vector signed int vsumi5 =
- vector signed int vsumi6 =
- vector signed int vsumi7 =
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;
+ vector signed int vsumi4 = v0;
+ vector signed int vsumi5 = v0;
+ vector signed int vsumi6 = v0;
+ vector signed int vsumi7 = v0;

  const uint8_t * restrict q6 = x[i].ql;
  const uint8_t * restrict qh = x[i].qh;
@@ -8656,23 +8660,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  vector signed short vs6 = vec_splat(vscales, 6);
  vector signed short vs7 = vec_splat(vscales, 7);

- vsumi0 =
- vsumi1 =
- vsumi2 =
- vsumi3 =
- vsumi4 =
- vsumi5 =
- vsumi6 =
- vsumi7 =
-
- vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
- vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
- vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
- vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
- vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
- vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+ vsumi0 = vec_msum(qv00, vs0, vsumi0);
+ vsumi1 = vec_msum(qv01, vs4, vsumi1);
+ vsumi2 = vec_msum(qv10, vs1, vsumi2);
+ vsumi3 = vec_msum(qv11, vs5, vsumi3);
+ vsumi4 = vec_msum(qv20, vs2, vsumi4);
+ vsumi5 = vec_msum(qv21, vs6, vsumi5);
+ vsumi6 = vec_msum(qv30, vs3, vsumi6);
+ vsumi7 = vec_msum(qv31, vs7, vsumi7);
  }

  vsumi0 = vec_add(vsumi0, vsumi4);
@@ -8819,7 +8814,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
  #endif
  }

- #if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
+ #if defined (__AVX__) || defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__) || defined(__loongarch_asx)
  static const int8_t keven_signs_q2xs[1024] = {
  1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
  1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
@@ -8952,7 +8947,63 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void

  *s = 0.125f * hsum_float_8(accumf);

+ #elif defined(__AVX__)
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+ uint32_t aux32[4];
+ const uint8_t * aux8 = (const uint8_t *)aux32;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xxs_grid[aux8[3]], iq2xxs_grid[aux8[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]]);
+ const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+ const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+ const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
+ const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127]);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const uint16_t ls1 = aux32[1] >> 28;
+ const uint16_t ls2 = aux32[3] >> 28;
+ const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+ const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+ const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+ const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+ sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+ sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+ sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -8965,14 +9016,10 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint16_t * restrict q2 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;
@@ -9019,21 +9066,12 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
  vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
  vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));

- vsumi0 =
- vsumi1 =
- vsumi2 =
- vsumi3 =
- vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales23, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9307,6 +9345,165 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  }

  *s = 0.125f * hsum_float_8(accumf);
+
+ #elif defined(__AVX__)
+ const __m128i mone = _mm_set1_epi8(1);
+ static const char block_sign_shuffle_mask_1[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+ };
+ static const char block_sign_shuffle_mask_2[32] = {
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a,
+ 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e,
+ };
+ static const uint8_t bit_selector_mask_bytes[32] = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ };
+
+ const __m128i bit_selector_mask_0 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes);
+ const __m128i bit_selector_mask_1 = _mm_loadu_si128((const __m128i*)bit_selector_mask_bytes + 1);
+ const __m128i block_sign_shuffle_1_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1);
+ const __m128i block_sign_shuffle_1_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_1 + 1);
+ const __m128i block_sign_shuffle_2_0 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2);
+ const __m128i block_sign_shuffle_2_1 = _mm_loadu_si128((const __m128i*)block_sign_shuffle_mask_2 + 1);
+
+ static const uint8_t k_bit_helper[32] = {
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x00, 0x80, 0x80, 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00, 0x80, 0x00, 0x80, 0x80, 0x00,
+ };
+ const __m128i bit_helper_0 = _mm_loadu_si128((const __m128i*)k_bit_helper);
+ const __m128i bit_helper_1 = _mm_loadu_si128((const __m128i*)k_bit_helper + 1);
+ const __m128i m511 = _mm_set1_epi16(511);
+ const __m128i m4 = _mm_set1_epi8(0xf);
+ const __m128i m1 = _mm_set1_epi8(1);
+
+ uint64_t aux64;
+
+ // somewhat hacky, but gives a significant boost in performance
+ __m256i aux_gindex;
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex;
+
+ __m256 accumf = _mm256_setzero_ps();
+ for (int i = 0; i < nb; ++i) {
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const uint16_t * restrict q2 = x[i].qs;
+ const int8_t * restrict q8 = y[i].qs;
+
+ memcpy(&aux64, x[i].scales, 8);
+ __m128i stmp = _mm_set1_epi64x(aux64);
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);
+
+ __m128i sumi1_0 = _mm_setzero_si128();
+ __m128i sumi1_1 = _mm_setzero_si128();
+ __m128i sumi2_0 = _mm_setzero_si128();
+ __m128i sumi2_1 = _mm_setzero_si128();
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 4) {
+
+ const __m128i q2_data_0 = _mm_loadu_si128((const __m128i*)q2);
+ const __m128i q2_data_1 = _mm_loadu_si128((const __m128i*)q2 + 1); q2 += 16;
+ aux_gindex = MM256_SET_M128I(_mm_and_si128(q2_data_1, m511), _mm_and_si128(q2_data_0, m511));
+
+ const __m128i partial_sign_bits_0 = _mm_srli_epi16(q2_data_0, 9);
+ const __m128i partial_sign_bits_1 = _mm_srli_epi16(q2_data_1, 9);
+ const __m128i partial_sign_bits_upper_0 = _mm_srli_epi16(q2_data_0, 13);
+ const __m128i partial_sign_bits_upper_1 = _mm_srli_epi16(q2_data_1, 13);
+ const __m128i partial_sign_bits_for_counting_0 = _mm_xor_si128(partial_sign_bits_0, partial_sign_bits_upper_0);
+ const __m128i partial_sign_bits_for_counting_1 = _mm_xor_si128(partial_sign_bits_1, partial_sign_bits_upper_1);
+
+ const __m128i odd_bits_0 = _mm_shuffle_epi8(bit_helper_0, partial_sign_bits_for_counting_0);
+ const __m128i odd_bits_1 = _mm_shuffle_epi8(bit_helper_1, partial_sign_bits_for_counting_1);
+ const __m128i full_sign_bits_0 = _mm_or_si128(partial_sign_bits_0, odd_bits_0);
+ const __m128i full_sign_bits_1 = _mm_or_si128(partial_sign_bits_1, odd_bits_1);
+
+ const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_3_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+ const __m128i q8_4_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+ const __m128i q2_1_0 = _mm_set_epi64x(iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
+ const __m128i q2_1_1 = _mm_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]]);
+ const __m128i q2_2_0 = _mm_set_epi64x(iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
+ const __m128i q2_2_1 = _mm_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]]);
+ const __m128i q2_3_0 = _mm_set_epi64x(iq2xs_grid[gindex[9]], iq2xs_grid[gindex[8]]);
+ const __m128i q2_3_1 = _mm_set_epi64x(iq2xs_grid[gindex[11]], iq2xs_grid[gindex[10]]);
+ const __m128i q2_4_0 = _mm_set_epi64x(iq2xs_grid[gindex[13]], iq2xs_grid[gindex[12]]);
+ const __m128i q2_4_1 = _mm_set_epi64x(iq2xs_grid[gindex[15]], iq2xs_grid[gindex[14]]);
+
+ // AVX2 full_signs_1 is full_sign_bits_0 here
+ // AVX2 full_signs_2 is full_sign_bits_1 here
+ __m128i signs_0, signs_1;
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_0, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_1_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_3_0 = _mm_sign_epi8(q8_3_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_3_1 = _mm_sign_epi8(q8_3_1, _mm_or_si128(signs_1, mone));
+
+ signs_0 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_0);
+ signs_1 = _mm_shuffle_epi8(full_sign_bits_1, block_sign_shuffle_2_1);
+ signs_0 = _mm_cmpeq_epi8(_mm_and_si128(signs_0, bit_selector_mask_0), bit_selector_mask_0);
+ signs_1 = _mm_cmpeq_epi8(_mm_and_si128(signs_1, bit_selector_mask_1), bit_selector_mask_1);
+ const __m128i q8s_4_0 = _mm_sign_epi8(q8_4_0, _mm_or_si128(signs_0, mone));
+ const __m128i q8s_4_1 = _mm_sign_epi8(q8_4_1, _mm_or_si128(signs_1, mone));
+
+ const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+ const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+ const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+ const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+ const __m128i dot3_0 = _mm_maddubs_epi16(q2_3_0, q8s_3_0);
+ const __m128i dot3_1 = _mm_maddubs_epi16(q2_3_1, q8s_3_1);
+ const __m128i dot4_0 = _mm_maddubs_epi16(q2_4_0, q8s_4_0);
+ const __m128i dot4_1 = _mm_maddubs_epi16(q2_4_1, q8s_4_1);
+
+ __m128i sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0));
+ const __m128i sc1_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc1_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1));
+ const __m128i sc2_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc2_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+2));
+ const __m128i sc3_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc3_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+ sc_tmp = _mm_shuffle_epi8(scales, get_scale_shuffle(ib32+3));
+ const __m128i sc4_0 = _mm_cvtepi8_epi16(sc_tmp);
+ const __m128i sc4_1 = _mm_cvtepi8_epi16(_mm_srli_si128(sc_tmp, 8));
+
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot1_0, sc1_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot1_1, sc1_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot2_0, sc2_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot2_1, sc2_1));
+ sumi1_0 = _mm_add_epi32(sumi1_0, _mm_madd_epi16(dot3_0, sc3_0));
+ sumi1_1 = _mm_add_epi32(sumi1_1, _mm_madd_epi16(dot3_1, sc3_1));
+ sumi2_0 = _mm_add_epi32(sumi2_0, _mm_madd_epi16(dot4_0, sc4_0));
+ sumi2_1 = _mm_add_epi32(sumi2_1, _mm_madd_epi16(dot4_1, sc4_1));
+ }
+
+ accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+ }
+
+ *s = 0.125f * hsum_float_8(accumf);
+
  #elif defined(__loongarch_asx)

  const __m256i mone = __lasx_xvreplgr2vr_b(1);
@@ -9425,6 +9622,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *

  *s = 0.125f * hsum_float_8(accumf);
  #elif defined(__POWER9_VECTOR__)
+ const vector int v0 = vec_splats((int32_t)0);
  vector float vsumf0 = vec_splats(0.0f);
  vector float vsumf1 = vec_splats(0.0f);
  vector float vsumf2 = vec_splats(0.0f);
@@ -9437,14 +9635,10 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector signed int vsumi0 =
- vector signed int vsumi1 =
- vector signed int vsumi2 =
- vector signed int vsumi3 =
- vector signed int vsumi4 = vec_splats((int32_t)0);
- vector signed int vsumi5 = vec_splats((int32_t)0);
- vector signed int vsumi6 = vec_splats((int32_t)0);
- vector signed int vsumi7 = vec_splats((int32_t)0);
+ vector signed int vsumi0 = v0;
+ vector signed int vsumi1 = v0;
+ vector signed int vsumi2 = v0;
+ vector signed int vsumi3 = v0;

  const uint16_t * restrict q2 = x[i].qs;
  const uint8_t * restrict sc = x[i].scales;
@@ -9492,21 +9686,12 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
  vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
  vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

- vsumi0 =
- vsumi1 =
- vsumi2 =
- vsumi3 =
- vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
- vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
- vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
- vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+ vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+ vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+ vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+ vsumi3 = vec_msum(qv3, vscales3, vsumi3);
  }

- vsumi0 = vec_add(vsumi0, vsumi4);
- vsumi1 = vec_add(vsumi1, vsumi5);
- vsumi2 = vec_add(vsumi2, vsumi6);
- vsumi3 = vec_add(vsumi3, vsumi7);
-
  vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
  vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
  vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -9722,6 +9907,98 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

     *s = 0.125f * hsum_float_8(accumf);

+#elif defined(__AVX__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i m4 = _mm_set1_epi8(0xf);
+    const __m128i m1 = _mm_set1_epi8(1);
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    uint64_t aux64;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const int8_t * restrict q8 = y[i].qs;
+
+        memcpy(&aux64, x[i].scales, 8);
+        const __m128i scales8 = _mm_add_epi8(_mm_slli_epi16(_mm_and_si128(_mm_set_epi64x(aux64 >> 4, aux64), m4), 1), m1);
+        const __m128i scales16_0 = _mm_cvtepi8_epi16(scales8);
+        const __m128i scales16_1 = _mm_cvtepi8_epi16(_mm_srli_si128(scales8, 8));
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi64x(iq2s_grid[qs[1] | ((qh[ib32+0] << 6) & 0x300)],
+                                                  iq2s_grid[qs[0] | ((qh[ib32+0] << 8) & 0x300)]);
+            const __m128i q2_1_1 = _mm_set_epi64x(iq2s_grid[qs[3] | ((qh[ib32+0] << 2) & 0x300)],
+                                                  iq2s_grid[qs[2] | ((qh[ib32+0] << 4) & 0x300)]);
+            const __m128i q2_2_0 = _mm_set_epi64x(iq2s_grid[qs[5] | ((qh[ib32+1] << 6) & 0x300)],
+                                                  iq2s_grid[qs[4] | ((qh[ib32+1] << 8) & 0x300)]);
+            const __m128i q2_2_1 = _mm_set_epi64x(iq2s_grid[qs[7] | ((qh[ib32+1] << 2) & 0x300)],
+                                                  iq2s_grid[qs[6] | ((qh[ib32+1] << 4) & 0x300)]);
+            qs += 8;
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | ((uint32_t) signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | ((uint32_t) signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 0)));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+0), 1)));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_shuffle_epi8(scales16_0, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 0)));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_shuffle_epi8(scales16_1, _mm256_extractf128_si256(get_scale_shuffle_k4(ib32+1), 1)));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.125f * hsum_float_8(accumf);
+
 #elif defined(__POWER9_VECTOR__)
     static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
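
The __AVX__ block added above expands each packed 16-bit sign bitmap into per-byte sign masks with a shuffle/compare trick: k_mask1 routes the byte that holds a lane's bit to that lane, k_mask2 isolates the bit, and (x ^ s) - s conditionally negates the q8 bytes. The standalone sketch below illustrates the same idea on a single 128-bit register and checks it against a scalar reference; it is my illustration under the assumption of an SSSE3-capable x86 toolchain, not code from the diff.

    /* illustration of the sign-expansion trick; builds with e.g. "cc -O2 -mssse3" on x86-64 */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        static const uint8_t k_mask1[16] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
                                            0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x01};
        static const uint8_t k_mask2[16] = {0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,
                                            0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80};
        const __m128i mask1 = _mm_loadu_si128((const __m128i *)k_mask1);
        const __m128i mask2 = _mm_loadu_si128((const __m128i *)k_mask2);

        const uint16_t signs = 0xA5C3;                 /* example sign bitmap, one bit per byte */
        int8_t q8[16];
        for (int i = 0; i < 16; ++i) q8[i] = (int8_t)(i + 1);

        /* broadcast the bitmap, pick the byte holding each lane's bit, isolate the bit,
           and turn "bit set" into an all-ones byte */
        __m128i aux = _mm_set1_epi32(signs);
        aux = _mm_and_si128(_mm_shuffle_epi8(aux, mask1), mask2);
        const __m128i s = _mm_cmpeq_epi8(aux, mask2);

        /* (x ^ s) - s == -x where s is all ones, x where s is zero */
        const __m128i x  = _mm_loadu_si128((const __m128i *)q8);
        const __m128i xs = _mm_sub_epi8(_mm_xor_si128(s, x), s);

        int8_t out[16];
        _mm_storeu_si128((__m128i *)out, xs);
        for (int i = 0; i < 16; ++i) {
            const int ref = (signs >> i) & 1 ? -q8[i] : q8[i];
            printf("%2d: %4d (expected %4d)\n", i, out[i], ref);
        }
        return 0;
    }
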
@@ -9729,6 +10006,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *

     static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

+    const vector int v0 = vec_splats((int32_t)0);
+
     vector float vsumf0 = vec_splats(0.0f);
     vector float vsumf1 = vec_splats(0.0f);
     vector float vsumf2 = vec_splats(0.0f);
@@ -9743,14 +10022,10 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector signed int vsumi0 = vec_splats((int32_t)0);
-        vector signed int vsumi1 = vec_splats((int32_t)0);
-        vector signed int vsumi2 = vec_splats((int32_t)0);
-        vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi4 = vec_splats((int32_t)0);
-        vector signed int vsumi5 = vec_splats((int32_t)0);
-        vector signed int vsumi6 = vec_splats((int32_t)0);
-        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;

         const uint8_t * restrict q2 = x[i].qs;
         const uint8_t * restrict qh = x[i].qh;
@@ -9810,21 +10085,12 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
             vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
             vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));

-            vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
-            vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
-            vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
-            vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
-            vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
-            vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
-            vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
-            vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+            vsumi0 = vec_msum(qv0, vscales0, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales1, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales2, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales3, vsumi3);
         }

-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
         vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
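
The POWER9 hunks above collapse the split even/odd accumulation (vec_mule products into vsumi0-3, vec_mulo products into vsumi4-7, added together after the loop) into a single vec_msum per vector, which multiplies adjacent int16 pairs and adds them straight into the four int32 lanes. A plain-C model of the per-lane arithmetic, written for illustration only (it is not VSX code from the diff):

    #include <stdint.h>
    #include <stdio.h>

    /* model of one vec_msum(q, s, acc) step: each 32-bit lane gains the sum of the two
       adjacent int16 products, which equals the old vec_mule + vec_mulo contributions */
    static void msum_model(const int16_t q[8], const int16_t s[8], int32_t acc[4]) {
        for (int lane = 0; lane < 4; ++lane) {
            acc[lane] += (int32_t)q[2*lane + 0] * s[2*lane + 0]   /* "even" product (vec_mule) */
                       + (int32_t)q[2*lane + 1] * s[2*lane + 1];  /* "odd" product  (vec_mulo) */
        }
    }

    int main(void) {
        const int16_t q[8] = {1, -2, 3, -4, 5, -6, 7, -8};
        const int16_t s[8] = {3,  3, 5,  5, 7,  7, 9,  9};
        int32_t acc[4] = {0, 0, 0, 0};
        msum_model(q, s, acc);
        for (int i = 0; i < 4; ++i) printf("acc[%d] = %d\n", i, (int)acc[i]);
        return 0;
    }
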
@@ -10059,9 +10325,68 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void

     *s = 0.25f * hsum_float_8(accumf);

+#elif defined(__AVX__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    uint32_t aux32[2];
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint8_t * restrict gas = x[i].qs + QK_K/4;
+        const int8_t * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q2_1_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            const __m128i q2_2_0 = _mm_set_epi32(iq3xxs_grid[q3[3]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[0]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3xxs_grid[q3[7]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[4]]);
+            q3 += 8;
+            memcpy(aux32, gas, 8); gas += 8;
+            const __m128i s2_1_0 = _mm_set_epi64x(signs64[(aux32[0] >> 7) & 127], signs64[(aux32[0] >> 0) & 127]);
+            const __m128i s2_1_1 = _mm_set_epi64x(signs64[(aux32[0] >> 21) & 127], signs64[(aux32[0] >> 14) & 127]);
+            const __m128i s2_2_0 = _mm_set_epi64x(signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
+            const __m128i s2_2_1 = _mm_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127]);
+            const __m128i q8s_1_0 = _mm_sign_epi8(q8_1_0, s2_1_0);
+            const __m128i q8s_1_1 = _mm_sign_epi8(q8_1_1, s2_1_1);
+            const __m128i q8s_2_0 = _mm_sign_epi8(q8_2_0, s2_2_0);
+            const __m128i q8s_2_1 = _mm_sign_epi8(q8_2_1, s2_2_1);
+            const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = aux32[0] >> 28;
+            const uint16_t ls2 = aux32[1] >> 28;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = 0.25f * hsum_float_8(accumf);
+
 #elif defined(__POWER9_VECTOR__)
     const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

+    const vector int v0 = vec_splats((int32_t)0);
+
     vector float vsumf0 = vec_splats(0.0f);
     vector float vsumf1 = vec_splats(0.0f);
     vector float vsumf2 = vec_splats(0.0f);
@@ -10072,14 +10397,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
         vector float vyd = vec_splats(y[i].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector signed int vsumi0 = vec_splats((int32_t)0);
-        vector signed int vsumi1 = vec_splats((int32_t)0);
-        vector signed int vsumi2 = vec_splats((int32_t)0);
-        vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi4 = vec_splats((int32_t)0);
-        vector signed int vsumi5 = vec_splats((int32_t)0);
-        vector signed int vsumi6 = vec_splats((int32_t)0);
-        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;

         const uint8_t * restrict q3 = x[i].qs;
         const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
@@ -10124,21 +10445,12 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
             vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
             vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

-            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
-            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
-            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
-            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
-            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
-            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
-            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
-            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
         }

-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
         vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10421,6 +10733,112 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *

     *s = hsum_float_8(accumf);

+#elif defined(__AVX__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+                                        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+    };
+
+    const __m128i mask1_0 = _mm_loadu_si128((const __m128i*)k_mask1);
+    const __m128i mask1_1 = _mm_loadu_si128((const __m128i*)k_mask1 + 1);
+    const __m128i mask2_0 = _mm_loadu_si128((const __m128i*)k_mask2);
+    const __m128i mask2_1 = _mm_loadu_si128((const __m128i*)k_mask2 + 1);
+
+    const __m128i idx_mul_0 = _mm_set_epi32(32, 64, 128, 256);
+    const __m128i idx_mul_1 = _mm_set_epi32(2, 4, 8, 16);
+    const __m128i idx_mask = _mm_set1_epi32(256);
+
+    typedef union {
+        __m128i vec[4];
+        uint32_t index[16];
+    } index_t;
+
+    index_t idx;
+
+    __m256 accumf = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const uint8_t * restrict qs = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)x[i].signs;
+        const int8_t * restrict q8 = y[i].qs;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
+            const __m128i q8_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i qs_tmp = _mm_loadu_si128((const __m128i *)qs);
+            const __m128i idx_l_0 = _mm_cvtepu8_epi16(qs_tmp);
+            const __m128i idx_l_1 = _mm_cvtepu8_epi16(_mm_srli_si128(qs_tmp, 8)); qs += 16;
+            idx.vec[0] = _mm_set1_epi32(qh[ib32+0]);
+            idx.vec[1] = idx.vec[0];
+            idx.vec[2] = _mm_set1_epi32(qh[ib32+1]);
+            idx.vec[3] = idx.vec[2];
+
+            idx.vec[0] = _mm_and_si128(_mm_mullo_epi32(idx.vec[0], idx_mul_0), idx_mask);
+            idx.vec[1] = _mm_and_si128(_mm_mullo_epi32(idx.vec[1], idx_mul_1), idx_mask);
+            idx.vec[2] = _mm_and_si128(_mm_mullo_epi32(idx.vec[2], idx_mul_0), idx_mask);
+            idx.vec[3] = _mm_and_si128(_mm_mullo_epi32(idx.vec[3], idx_mul_1), idx_mask);
+
+            idx.vec[0] = _mm_or_si128(idx.vec[0], _mm_cvtepi16_epi32(idx_l_0));
+            idx.vec[1] = _mm_or_si128(idx.vec[1], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_0, 8)));
+            idx.vec[2] = _mm_or_si128(idx.vec[2], _mm_cvtepi16_epi32(idx_l_1));
+            idx.vec[3] = _mm_or_si128(idx.vec[3], _mm_cvtepi16_epi32(_mm_srli_si128(idx_l_1, 8)));
+
+            const __m128i q2_1_0 = _mm_set_epi32(iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]]);
+            const __m128i q2_1_1 = _mm_set_epi32(iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]]);
+            const __m128i q2_2_0 = _mm_set_epi32(iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[9]], iq3s_grid[idx.index[8]]);
+            const __m128i q2_2_1 = _mm_set_epi32(iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]]);
+
+            __m128i aux128_0 = _mm_set1_epi32(signs[0] | (signs[1] << 16));
+            __m128i aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_1_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_1_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_1_0 = _mm_sub_epi8(_mm_xor_si128(s2_1_0, q8_1_0), s2_1_0);
+            const __m128i q8s_1_1 = _mm_sub_epi8(_mm_xor_si128(s2_1_1, q8_1_1), s2_1_1);
+
+            aux128_0 = _mm_set1_epi32(signs[2] | (signs[3] << 16));
+            aux128_1 = aux128_0;
+            aux128_0 = _mm_and_si128(_mm_shuffle_epi8(aux128_0,mask1_0), mask2_0);
+            aux128_1 = _mm_and_si128(_mm_shuffle_epi8(aux128_1,mask1_1), mask2_1);
+            const __m128i s2_2_0 = _mm_cmpeq_epi8(aux128_0, mask2_0);
+            const __m128i s2_2_1 = _mm_cmpeq_epi8(aux128_1, mask2_1);
+            const __m128i q8s_2_0 = _mm_sub_epi8(_mm_xor_si128(s2_2_0, q8_2_0), s2_2_0);
+            const __m128i q8s_2_1 = _mm_sub_epi8(_mm_xor_si128(s2_2_1, q8_2_1), s2_2_1);
+
+            signs += 4;
+
+            const __m128i dot1_0 = _mm_maddubs_epi16(q2_1_0, q8s_1_0);
+            const __m128i dot1_1 = _mm_maddubs_epi16(q2_1_1, q8s_1_1);
+            const __m128i dot2_0 = _mm_maddubs_epi16(q2_2_0, q8s_2_0);
+            const __m128i dot2_1 = _mm_maddubs_epi16(q2_2_1, q8s_2_1);
+            const uint16_t ls1 = x[i].scales[ib32/2] & 0xf;
+            const uint16_t ls2 = x[i].scales[ib32/2] >> 4;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(2*ls1+1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(2*ls1+1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(2*ls2+1));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(2*ls2+1));
+            sumi1_0 = _mm_add_epi32(sumi1_0, p1_0);
+            sumi1_1 = _mm_add_epi32(sumi1_1, p1_1);
+            sumi2_0 = _mm_add_epi32(sumi2_0, p2_0);
+            sumi2_1 = _mm_add_epi32(sumi2_1, p2_1);
+        }
+
+        accumf = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_add_epi32(sumi1_1, sumi2_1), _mm_add_epi32(sumi1_0, sumi2_0)))), accumf);
+
+    }
+
+    *s = hsum_float_8(accumf);
+
 #elif defined(__POWER9_VECTOR__)
     static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
                                         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
@@ -10428,6 +10846,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *

     static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};

+    const vector int v0 = vec_splats((int32_t)0);
+
     vector float vsumf0 = vec_splats(0.0f);
     vector float vsumf1 = vec_splats(0.0f);
     vector float vsumf2 = vec_splats(0.0f);
@@ -10448,14 +10868,10 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
         const uint8_t * restrict sc = x[i].scales;
         const int8_t * restrict q8 = y[i].qs;

-        vector signed int vsumi0 = vec_splats((int32_t)0);
-        vector signed int vsumi1 = vec_splats((int32_t)0);
-        vector signed int vsumi2 = vec_splats((int32_t)0);
-        vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi4 = vec_splats((int32_t)0);
-        vector signed int vsumi5 = vec_splats((int32_t)0);
-        vector signed int vsumi6 = vec_splats((int32_t)0);
-        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;

         for (int j = 0; j < QK_K/32; j += 2) {
             __builtin_prefetch(q3, 0, 1);
@@ -10509,21 +10925,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
             vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
             vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));

-            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
-            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
-            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
-            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
-            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
-            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
-            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
-            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
         }

-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
         vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -10669,6 +11076,14 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
 }


+#if defined(__AVX__)
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+#endif
+
 #if defined(__AVX2__)
 static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
     const __m256i ax = _mm256_sign_epi8(x, x);
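
mul_add_epi8_sse(), introduced above, obtains a signed-by-signed byte dot product from _mm_maddubs_epi16, which natively multiplies unsigned bytes by signed bytes: the signed operand x is replaced by its absolute value while its sign is pushed onto y via _mm_sign_epi8. The standalone check below compares the helper against a scalar reference; it is my illustration (assuming an SSSE3-capable x86 toolchain), not part of the diff.

    /* builds with e.g. "cc -O2 -mssse3" on x86-64 */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
        const __m128i ax = _mm_sign_epi8(x, x);   /* |x| */
        const __m128i sy = _mm_sign_epi8(y, x);   /* y with x's sign applied (0 where x == 0) */
        return _mm_maddubs_epi16(ax, sy);         /* pairwise |x|*sy summed into int16 lanes */
    }

    int main(void) {
        int8_t x[16], y[16];
        for (int i = 0; i < 16; ++i) { x[i] = (int8_t)(i - 8); y[i] = (int8_t)(3*i - 20); }

        int16_t out[8];
        _mm_storeu_si128((__m128i *)out,
                         mul_add_epi8_sse(_mm_loadu_si128((const __m128i *)x),
                                          _mm_loadu_si128((const __m128i *)y)));

        for (int i = 0; i < 8; ++i) {
            const int ref = x[2*i]*y[2*i] + x[2*i+1]*y[2*i+1];
            printf("lane %d: %d (expected %d)\n", i, out[i], ref);
        }
        return 0;
    }
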
@@ -10786,6 +11201,54 @@ void ggml_vec_dot_iq1_s_q8_K  (int n, float * restrict s, size_t bs, const void

     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

+#elif defined __AVX__
+    __m256 accum = _mm256_setzero_ps();
+    float accum1 = 0;
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint16_t * qh = x[i].qh;
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        int sumi1 = 0;
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)]);
+            qs += 8;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+            const int16_t ls1 = 2*((qh[ib+0] >> 12) & 7) + 1;
+            const int16_t ls2 = 2*((qh[ib+1] >> 12) & 7) + 1;
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, _mm_set1_epi16(ls1));
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, _mm_set1_epi16(ls1));
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, _mm_set1_epi16(ls2));
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, _mm_set1_epi16(ls2));
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi1 += (y[i].bsums[2*ib+0] + y[i].bsums[2*ib+1]) * (qh[ib+0] & 0x8000 ? -1 : 1) * ls1
+                   + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
+        }
+
+        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
+        accum1 += d * sumi1;
+
+    }
+
+    *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
+
 #elif defined(__POWER9_VECTOR__)
     const vector unsigned char v0 = vec_splats((unsigned char)0x0);
     const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
@@ -10804,10 +11267,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
         vector signed int vsumi1 = vec_splats((int32_t)0);
         vector signed int vsumi2 = vec_splats((int32_t)0);
         vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi4 = vec_splats((int32_t)0);
-        vector signed int vsumi5 = vec_splats((int32_t)0);
-        vector signed int vsumi6 = vec_splats((int32_t)0);
-        vector signed int vsumi7 = vec_splats((int32_t)0);
         vector signed int vsumi8 = vec_splats((int32_t)0);

         const uint8_t * restrict q1 = x[i].qs;
@@ -10849,14 +11308,10 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
             vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
             vector signed short vscales = vec_sld(vscales23, vscales01, 8);

-            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
-            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
-            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
-            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
-            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
-            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
-            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
-            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);

             vector signed short q8ysums = vec_xl_len(qs, 8);
             qs += 4;
@@ -10871,11 +11326,6 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
             vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
         }

-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
         vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -11137,6 +11587,92 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void

     *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);

+#elif defined __AVX__
+    const __m128i mask = _mm_set1_epi16(0x7);
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int i = 0; i < nb; ++i) {
+
+        const int8_t * q8 = y[i].qs;
+        const uint8_t * qs = x[i].qs;
+        const uint8_t * qh = x[i].qh;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q1b_1_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]);
+            const __m128i q1b_1_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)]);
+            const __m128i q1b_2_0 = _mm_set_epi64x(
+                    iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]);
+            const __m128i q1b_2_1 = _mm_set_epi64x(
+                    iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)]);
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+
+            const __m128i dot1_0 = mul_add_epi8_sse(q1b_1_0, q8b_1_0);
+            const __m128i dot1_1 = mul_add_epi8_sse(q1b_1_1, q8b_1_1);
+            const __m128i dot2_0 = mul_add_epi8_sse(q1b_2_0, q8b_2_0);
+            const __m128i dot2_1 = mul_add_epi8_sse(q1b_2_1, q8b_2_1);
+
+            const __m128i delta1_0 = _mm_set_epi64x(qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                    qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta1_1 = _mm_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                    qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_0 = _mm_set_epi64x(qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                    qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+            const __m128i delta2_1 = _mm_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
+                                                    qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+
+            const __m128i dot3_0 = mul_add_epi8_sse(delta1_0, q8b_1_0);
+            const __m128i dot3_1 = mul_add_epi8_sse(delta1_1, q8b_1_1);
+            const __m128i dot4_0 = mul_add_epi8_sse(delta2_0, q8b_2_0);
+            const __m128i dot4_1 = mul_add_epi8_sse(delta2_1, q8b_2_1);
+
+            __m128i scale1_0 = _mm_set1_epi16(sc[ib/2] >> 0);
+            __m128i scale1_1 = _mm_set1_epi16(sc[ib/2] >> 3);
+            __m128i scale2_0 = _mm_set1_epi16(sc[ib/2] >> 6);
+            __m128i scale2_1 = _mm_set1_epi16(sc[ib/2] >> 9);
+
+            scale1_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_0, mask), 1), mone);
+            scale1_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale1_1, mask), 1), mone);
+            scale2_0 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_0, mask), 1), mone);
+            scale2_1 = _mm_add_epi16(_mm_slli_epi16(_mm_and_si128(scale2_1, mask), 1), mone);
+            const __m128i p1_0 = _mm_madd_epi16(dot1_0, scale1_0);
+            const __m128i p1_1 = _mm_madd_epi16(dot1_1, scale1_1);
+            const __m128i p2_0 = _mm_madd_epi16(dot2_0, scale2_0);
+            const __m128i p2_1 = _mm_madd_epi16(dot2_1, scale2_1);
+            const __m128i p3_0 = _mm_madd_epi16(dot3_0, scale1_0);
+            const __m128i p3_1 = _mm_madd_epi16(dot3_1, scale1_1);
+            const __m128i p4_0 = _mm_madd_epi16(dot4_0, scale2_0);
+            const __m128i p4_1 = _mm_madd_epi16(dot4_1, scale2_1);
+
+            sumi1_0 = _mm_add_epi32(sumi1_0, _mm_add_epi32(p1_0, p2_0));
+            sumi1_1 = _mm_add_epi32(sumi1_1, _mm_add_epi32(p1_1, p2_1));
+            sumi2_0 = _mm_add_epi32(sumi2_0, _mm_add_epi32(p3_0, p4_0));
+            sumi2_1 = _mm_add_epi32(sumi2_1, _mm_add_epi32(p3_1, p4_1));
+
+            qs += 8; qh += 4;
+        }
+
+        const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+
+        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
+    }
+
+    *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2);
+
 #else

     int sum1[2], sum2[2], delta[4];
@@ -11267,8 +11803,47 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *

     *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b = _mm_set1_epi8(0x0f);
+    const __m128i mone = _mm_set1_epi16(1);
+
+    __m256 accum1 = _mm256_setzero_ps();
+    __m256 accum2 = _mm256_setzero_ps();
+    for (int ib = 0; ib < nb; ib += 2) {
+        const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
+        const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[1].qs);
+        const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
+        const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[0].qs + 1);
+        const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[1].qs);
+        const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[1].qs + 1);
+
+        const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+        const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+        const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+        const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[0].d)*GGML_FP16_TO_FP32(x[0].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
+        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[1].d)*GGML_FP16_TO_FP32(x[1].d)),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        y += 2;
+        x += 2;
+    }
+
+    *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
+
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector signed int v0 = vec_splats((int32_t)0);
     const vector unsigned char v4 = vec_splats((unsigned char)0x4);

     vector float vsumf0 = vec_splats(0.0f);
@@ -11299,8 +11874,11 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
         vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));

-        vector signed int vsumi0 =
-        vector signed int vsumi1 =
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+
+        vsumi0 = vec_sum4s(qv0, vsumi0);
+        vsumi1 = vec_sum4s(qv1, vsumi1);

         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
@@ -11453,8 +12031,57 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *

     *s = hsum_float_8(accum);

+#elif defined __AVX__
+    const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
+    const __m128i m4b = _mm_set1_epi8(0x0f);
+
+    __m256 accum = _mm256_setzero_ps();
+    for (int ibl = 0; ibl < nb; ++ibl) {
+        const uint8_t * qs = x[ibl].qs;
+        const int8_t * q8 = y[ibl].qs;
+        uint16_t sh = x[ibl].scales_h;
+        __m128i sumi1_0 = _mm_setzero_si128();
+        __m128i sumi1_1 = _mm_setzero_si128();
+        __m128i sumi2_0 = _mm_setzero_si128();
+        __m128i sumi2_1 = _mm_setzero_si128();
+        for (int ib = 0; ib < QK_K/32; ib += 2) {
+            const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)qs); qs += 16;
+            const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)q8); q8 += 16;
+            const __m128i q4b_1_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b));
+            const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
+            const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
+            const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
+            const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+            const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+            const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+            const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+            const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
+            const int16_t ls2 = ((x[ibl].scales_l[ib/2] >> 4) | ((sh << 2) & 0x30)) - 32;
+            sh >>= 4;
+            const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, _mm_set1_epi16(ls1));
+            const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, _mm_set1_epi16(ls1));
+            const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, _mm_set1_epi16(ls2));
+            const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, _mm_set1_epi16(ls2));
+            sumi1_0 = _mm_add_epi32(p_1_0, sumi1_0);
+            sumi1_1 = _mm_add_epi32(p_1_1, sumi1_1);
+            sumi2_0 = _mm_add_epi32(p_2_0, sumi2_0);
+            sumi2_1 = _mm_add_epi32(p_2_1, sumi2_1);
+        }
+        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
+        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
+        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
+    }
+
+    *s = hsum_float_8(accum);
+
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector int v0 = vec_splats((int32_t)0);
     const vector unsigned char v4 = vec_splats((unsigned char)0x4);

     vector float vsumf0 = vec_splats(0.0f);
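
The new __AVX__ iq4_nl/iq4_xs paths above decode packed 4-bit indices with _mm_shuffle_epi8 used as a 16-entry table lookup into kvalues_iq4nl: the low nibbles index the table directly and the high nibbles do so after a 4-bit shift. The sketch below shows the same pattern with a placeholder table (the real kvalues_iq4nl contents are deliberately not reproduced here); it is an illustration, not code from the diff.

    /* builds with e.g. "cc -O2 -mssse3" on x86-64 */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        static const int8_t table[16] = {-64, -56, -48, -40, -32, -24, -16, -8,
                                           0,   8,  16,  24,  32,  40,  48, 56};  /* placeholder values */
        const __m128i values128 = _mm_loadu_si128((const __m128i *)table);
        const __m128i m4b       = _mm_set1_epi8(0x0f);

        uint8_t packed[16];                     /* 32 packed nibbles */
        for (int i = 0; i < 16; ++i) packed[i] = (uint8_t)(i | ((15 - i) << 4));

        const __m128i bits = _mm_loadu_si128((const __m128i *)packed);
        const __m128i lo   = _mm_shuffle_epi8(values128, _mm_and_si128(bits, m4b));
        const __m128i hi   = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(bits, 4), m4b));

        int8_t out_lo[16], out_hi[16];
        _mm_storeu_si128((__m128i *)out_lo, lo);
        _mm_storeu_si128((__m128i *)out_hi, hi);
        for (int i = 0; i < 16; ++i)
            printf("%2d: low nibble -> %4d, high nibble -> %4d\n", i, out_lo[i], out_hi[i]);
        return 0;
    }
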
@@ -11470,14 +12097,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         vector float vyd = vec_splats(y[ibl].d);
         vector float vd = vec_mul(vxd, vyd);

-        vector signed int vsumi0 = vec_splats((int32_t)0);
-        vector signed int vsumi1 = vec_splats((int32_t)0);
-        vector signed int vsumi2 = vec_splats((int32_t)0);
-        vector signed int vsumi3 = vec_splats((int32_t)0);
-        vector signed int vsumi4 = vec_splats((int32_t)0);
-        vector signed int vsumi5 = vec_splats((int32_t)0);
-        vector signed int vsumi6 = vec_splats((int32_t)0);
-        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi0 = v0;
+        vector signed int vsumi1 = v0;
+        vector signed int vsumi2 = v0;
+        vector signed int vsumi3 = v0;

         uint16_t h = x[ibl].scales_h;

@@ -11522,21 +12145,12 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
             vector signed short vscales01 = vec_splats((int16_t)ls0);
             vector signed short vscales23 = vec_splats((int16_t)ls1);

-            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
-            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
-            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
-            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
-            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
-            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
-            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
-            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+            vsumi0 = vec_msum(qv0, vscales01, vsumi0);
+            vsumi1 = vec_msum(qv1, vscales01, vsumi1);
+            vsumi2 = vec_msum(qv2, vscales23, vsumi2);
+            vsumi3 = vec_msum(qv3, vscales23, vsumi3);
         }

-        vsumi0 = vec_add(vsumi0, vsumi4);
-        vsumi1 = vec_add(vsumi1, vsumi5);
-        vsumi2 = vec_add(vsumi2, vsumi6);
-        vsumi3 = vec_add(vsumi3, vsumi7);
-
         vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
         vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
         vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
@@ -13139,7 +13753,7 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u
        const float * restrict xval, const float * restrict weight, float * scale, int8_t * restrict L, int ngrid) {
     int num_neighbors = neighbours[0];
     GGML_ASSERT(num_neighbors > 0);
-    float best_score = 0;
+    float best_score = -FLT_MAX;
     int grid_index = -1;
     for (int j = 1; j <= num_neighbors; ++j) {
         const int8_t * pg = (const int8_t *)(grid + neighbours[j]);
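
The hunk above reseeds best_score with -FLT_MAX so that a neighbour whose score is negative can still be selected; with a non-negative seed, grid_index stays at -1 whenever every candidate scores below the seed. A minimal illustration of the difference (not code from the diff):

    #include <float.h>
    #include <stdio.h>

    int main(void) {
        const float score[4] = {-3.5f, -1.25f, -7.0f, -2.0f};   /* all candidates negative */

        float best_bad = 0.0f, best_good = -FLT_MAX;
        int idx_bad = -1, idx_good = -1;
        for (int j = 0; j < 4; ++j) {
            if (score[j] > best_bad)  { best_bad  = score[j]; idx_bad  = j; }
            if (score[j] > best_good) { best_good = score[j]; idx_good = j; }
        }
        printf("seed 0        -> index %d\n", idx_bad);    /* -1: nothing selected */
        printf("seed -FLT_MAX -> index %d\n", idx_good);   /*  1: the true argmax  */
        return 0;
    }
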
@@ -13337,7 +13951,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
                 sumw[j+1] = sumw[j] + weight[i];
             }
         }
-        float best_score = 0, scale = max;
+        float best_score = -FLT_MIN, scale = max;
         int besti1 = -1, besti2 = -1, best_shift = 0;
         for (int i1 = 0; i1 <= block_size; ++i1) {
             for (int i2 = i1; i2 <= block_size; ++i2) {
@@ -13513,7 +14127,7 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
             idx[2*j] = j;
         }
         qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper);
-        float best_score = 0, scale = max;
+        float best_score = -FLT_MIN, scale = max;
         int besti1 = -1, besti2 = -1, best_k = -1;
         // 0: +, +
         // 1: +, -