llama_cpp 0.15.1 → 0.15.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
data/vendor/tmp/llama.cpp/ggml-quants.c

@@ -14,6 +14,12 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h> // for GGML_ASSERT

+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #define UNUSED GGML_UNUSED

 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -235,7 +241,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
 #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -637,6 +643,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         // store result
         __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
     }
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -892,6 +930,46 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
         y[i].s = GGML_FP32_TO_FP16(sum*d);
     }
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vector int accv = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+
+            accv = vec_add(accv, vi[j]);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])), 0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+        accv = vec_add(accv, vec_sld(accv, accv, 4));
+        accv = vec_add(accv, vec_sld(accv, accv, 8));
+        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -1908,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri

         for (int j = 0; j < QK_K/16; ++j) {
             if (quant_weights) {
-                const float * qw = quant_weights
+                const float * qw = quant_weights + QK_K * i + 16*j;
                 for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
             } else {
                 for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
@@ -3409,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = vx + bx;
-
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -3446,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-
-
-
-
+            float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                    GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            float32x4_t scale = vld1q_f32(_scale);

             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3734,6 +3813,46 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector signed char v8 = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_sub(q4x0, v8);
+        q4x1 = vec_sub(q4x1, v8);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -3776,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
         const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = vy + by;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);
@@ -3789,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
             const block_q8_1 * restrict b_y0 = &vy0[i];
             const block_q8_1 * restrict b_y1 = &vy1[i];

-
-
-
-
-            summs0
+            float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                    GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                                    GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+            summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));

             const uint8x16_t m4b = vdupq_n_u8(0x0F);

|
|
3813
3932
|
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
3814
3933
|
|
3815
3934
|
// mmla into int32x4_t
|
3816
|
-
|
3817
|
-
|
3818
|
-
|
3819
|
-
|
3935
|
+
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
3936
|
+
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
3937
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
3938
|
+
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
3939
|
+
float32x4_t scale = vld1q_f32(_scale);
|
3820
3940
|
|
3821
3941
|
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
3822
3942
|
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
@@ -3835,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = sumv2
+        sumv2 = vaddq_f32(sumv2, summs0);

         vst1_f32(s, vget_low_f32(sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
@@ -3952,6 +4072,46 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.0f, 0.0f, 0.0f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -4237,6 +4397,49 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[i].qh[0]]), (uint64_t)(table_b2b_1[x[i].qh[1]])};
+        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[i].qh[2]]), (uint64_t)(table_b2b_1[x[i].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -4541,6 +4744,53 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.f, 0.f, 0.f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[i].qh[0]]), (uint64_t)(table_b2b_0[x[i].qh[1]])};
+        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[i].qh[2]]), (uint64_t)(table_b2b_0[x[i].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+        vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -4589,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -4613,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
             const int8x16_t y1_l = vld1q_s8(b_y1->qs);
             const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-
-
-
-
+            float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                   GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+            float32x4_t scale = vld1q_f32(_scale);

             int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
             int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4716,6 +4967,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char q8x0 = vec_xl( 0, x[i].qs);
+        vector signed char q8x1 = vec_xl(16, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed short qv0 = vec_mule(q8x0, q8y0);
+        vector signed short qv1 = vec_mulo(q8x0, q8y0);
+        vector signed short qv2 = vec_mule(q8x1, q8y1);
+        vector signed short qv3 = vec_mulo(q8x1, q8y1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
+        vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
+        vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
+        vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+
+        vsumi0 = vec_add(vsumi0, vsumi2);
+        vsumi1 = vec_add(vsumi1, vsumi3);
+
+        vsumi0 = vec_add(vsumi0, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -5071,6 +5361,147 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh(q2xmins);
+        vector signed short q2xmins1 = vec_unpackl(q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
+        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
+            q2 += 32;
+
+            vector signed char q2x00 = vec_and(qxs0, lowMask);
+            vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char q2x10 = vec_and(qxs1, lowMask);
+            vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
+            vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
+            vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
+            vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
+            vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
+            vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
+            vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
+            vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            qv0 = vec_mul(qv0, vs0);
+            qv1 = vec_mul(qv1, vs2);
+            qv2 = vec_mul(qv2, vs4);
+            qv3 = vec_mul(qv3, vs6);
+
+            qv0 = vec_madd(qv4, vs1, qv0);
+            qv1 = vec_madd(qv5, vs3, qv1);
+            qv2 = vec_madd(qv6, vs5, qv2);
+            qv3 = vec_madd(qv7, vs7, qv3);
+
+            vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
+            vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
+            vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
+            vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
+
+            vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
+            vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
+            vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
+            vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else

     float sumf = 0;
@@ -5341,6 +5772,87 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl_len(y[i].bsums, 8);
+
+        vector signed char q2xmins = (vector signed char)vec_xl_len(x[i].scales, 4);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh((vector signed char)q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q2x00 = vec_and(qxs0, lowMask);
+        vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
+        vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
+        vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y01 = vec_xl( 16, y[i].qs);
+        vector signed char q8y02 = vec_xl( 32, y[i].qs);
+        vector signed char q8y03 = vec_xl( 48, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
+        vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
+        vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
+        vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
+
+        vector signed short vscales_h = vec_unpackh(vscales);
+        vector signed short vs0 = vec_splat(vscales_h, 0);
+        vector signed short vs1 = vec_splat(vscales_h, 1);
+        vector signed short vs2 = vec_splat(vscales_h, 2);
+        vector signed short vs3 = vec_splat(vscales_h, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv0, vs0), vec_mulo(qv0, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv1, vs1), vec_mulo(qv1, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv2, vs2), vec_mulo(qv2, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv3, vs3), vec_mulo(qv3, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else

     float sumf = 0;
@@ -5835,6 +6347,160 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint32_t aux[3];
+        uint32_t utmp[4];
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
+            q3 += 32;
+
+            //the low 2 bits
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            //the 3rd bit
+            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
+            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
+            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
+            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
+            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
+            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
+            qxhs0 = vec_sr(qxhs0, v4);
+            qxhs1 = vec_sr(qxhs1, v4);
+
+            vector signed char q3x00 = vec_sub(qxs00, qxh00);
+            vector signed char q3x01 = vec_sub(qxs01, qxh01);
+            vector signed char q3x02 = vec_sub(qxs02, qxh02);
+            vector signed char q3x03 = vec_sub(qxs03, qxh03);
+            vector signed char q3x10 = vec_sub(qxs10, qxh10);
+            vector signed char q3x11 = vec_sub(qxs11, qxh11);
+            vector signed char q3x12 = vec_sub(qxs12, qxh12);
+            vector signed char q3x13 = vec_sub(qxs13, qxh13);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
+            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
+            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
+            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
+
+            vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+            vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+            vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
+            vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
+            vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+            vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+            vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
+            vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
+
+            vsumi0 = vec_add(vsum0, vsumi0);
+            vsumi1 = vec_add(vsum1, vsumi1);
+            vsumi2 = vec_add(vsum2, vsumi2);
+            vsumi3 = vec_add(vsum3, vsumi3);
+            vsumi4 = vec_add(vsum4, vsumi4);
+            vsumi5 = vec_add(vsum5, vsumi5);
+            vsumi6 = vec_add(vsum6, vsumi6);
+            vsumi7 = vec_add(vsum7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar version
     // This function is written like this so the compiler can manage to vectorize most of it
@@ -6201,6 +6867,95 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint16_t aux16[2];
+        int8_t * scales = (int8_t *)aux16;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        vector signed char vscales = (vector signed char)vec_xl_len(scales, 8);
+        vector signed char qxhs0 = (vector signed char)vec_xl_len(x[i].hmask, 8);
+        qxhs0 = vec_or(qxhs0, vec_sr(vec_sld(qxhs0, qxhs0, 8), (vector unsigned char)v1));
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs00 = vec_and(qxs0, lowMask);
+        vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+        vector signed char qxs10 = vec_and(vec_sr(qxs0, v4), lowMask);
+        vector signed char qxs11 = vec_and(vec_sr(qxs0, v6), lowMask);
+
+        //the 3rd bit
+        vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+        vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+        vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v4)), v2);
+        vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v6)), v2);
+        qxhs0 = vec_sr(qxhs0, v4);
+
+        vector signed char q3x00 = vec_sub(qxs00, qxh00);
+        vector signed char q3x01 = vec_sub(qxs01, qxh01);
+        vector signed char q3x10 = vec_sub(qxs10, qxh02);
+        vector signed char q3x11 = vec_sub(qxs11, qxh03);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y01 = vec_xl( 16, y[i].qs);
+        vector signed char q8y10 = vec_xl( 32, y[i].qs);
+        vector signed char q8y11 = vec_xl( 48, y[i].qs);
+
+        vector signed short vscales_h = vec_unpackh(vscales);
+        vector signed short vs0 = vec_splat(vscales_h, 0);
+        vector signed short vs1 = vec_splat(vscales_h, 1);
+        vector signed short vs2 = vec_splat(vscales_h, 2);
+        vector signed short vs3 = vec_splat(vscales_h, 3);
+
+        vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+        vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+        vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+        vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else

     int8_t aux8[QK_K];
@@ -6553,6 +7308,142 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        memcpy(utmp, x[i].scales, 12);
+
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
+        vector signed short vscales = vec_unpackh(utmps);
+        vector signed short q4xmins = vec_unpackl(utmps);
+        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
+        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; j+=2) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
+            q4 += 64;
+
+            vector signed char q4x00 = vec_and(qxs0, lowMask);
+            vector signed char q4x01 = vec_sr(qxs0, v4);
+            vector signed char q4x10 = vec_and(qxs1, lowMask);
+            vector signed char q4x11 = vec_sr(qxs1, v4);
+            vector signed char q4x20 = vec_and(qxs2, lowMask);
+            vector signed char q4x21 = vec_sr(qxs2, v4);
+            vector signed char q4x30 = vec_and(qxs3, lowMask);
+            vector signed char q4x31 = vec_sr(qxs3, v4);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y20 = vec_xl( 64, q8);
+            vector signed char q8y30 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
+            vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
+            vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
+            vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
+            vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
+            vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            qv00 = vec_add(qv00, qv10);
+            qv10 = vec_add(qv01, qv11);
+            qv20 = vec_add(qv20, qv30);
+            qv30 = vec_add(qv21, qv31);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else


@@ -6819,6 +7710,87 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d[1]));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint16_t s16[2];
+        const uint8_t * scales = (const uint8_t *)s16;
+
+        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        s16[0] = b[0] & 0x0f0f;
+        s16[1] = (b[0] >> 4) & 0x0f0f;
+
+        vector signed char utmps = (vector signed char)vec_xl_len(scales, 4);
+        vector signed short vscales = (vector signed short)vec_unpackh(utmps);
+        vector signed short q4xmins0 = vec_mergeh(vscales, vscales);
+        q4xmins0 = vec_sld(q4xmins0, q4xmins0, 8);
+
+        vector signed short q8ysums0 = vec_xl_len((const int16_t *)(y[i].bsums), 8);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q4xmins0, q8ysums0);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vd, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vd, vsumf1);
+
+        vd = vec_mul(vyd, vec_splats(GGML_FP16_TO_FP32(x[i].d[0])));
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
+        vector signed char q4x00 = vec_and(qxs0, lowMask);
+        vector signed char q4x01 = vec_sr(qxs0, v4);
+        vector signed char q4x10 = vec_and(qxs1, lowMask);
+        vector signed char q4x11 = vec_sr(qxs1, v4);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
+        vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
+        vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
+        vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
+
+        vector signed short vs0 = vec_splat(vscales, 0);
+        vector signed short vs1 = vec_splat(vscales, 1);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs0), vec_mulo(qv10, vs0));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs1), vec_mulo(qv01, vs1));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs1), vec_mulo(qv11, vs1));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     uint8_t aux8[QK_K];
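The q4_K block above, like every block that follows, builds its integer dot products from the AltiVec widening multiplies: vec_mule and vec_mulo multiply the even and odd int8 lanes into int16, and adding the two gives pairwise sums of products, which stay inside int16 for the quant ranges used here. A scalar sketch of what one vec_add(vec_mule(x, y), vec_mulo(x, y)) produces, as an illustration only:

    #include <stdint.h>

    /* Scalar model of vec_add(vec_mule(x, y), vec_mulo(x, y)) on two 16-byte
     * int8 vectors: lane k of the 8-lane int16 result is
     * x[2k]*y[2k] + x[2k+1]*y[2k+1]. */
    static void mule_mulo_add_i8(const int8_t x[16], const int8_t y[16], int16_t out[8]) {
        for (int k = 0; k < 8; ++k) {
            out[k] = (int16_t)(x[2*k] * y[2*k] + x[2*k + 1] * y[2*k + 1]);
        }
    }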
@@ -7220,6 +8192,130 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf+sums;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        memcpy(utmp, x[i].scales, 12);
+
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
+        vector signed short vscales = vec_unpackh(utmps);
+
+        vector signed short q5xmins = vec_unpackl(utmps);
+        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
+        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
+
+        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q5, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
+            q5 += 32;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+
+            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
+            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
+            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
+            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
+            qxhs0 = vec_sr(qxhs0, v2);
+            qxhs1 = vec_sr(qxhs1, v2);
+
+            vector signed char q5x00 = vec_or(q5h00, qxs00);
+            vector signed char q5x01 = vec_or(q5h01, qxs01);
+            vector signed char q5x10 = vec_or(q5h10, qxs10);
+            vector signed char q5x11 = vec_or(q5h11, qxs11);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl(16, q8);
+            vector signed char q8y01 = vec_xl(32, q8);
+            vector signed char q8y11 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
+            vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vscales = vec_sld(vscales, vscales, 12);
+
+            qv00 = vec_add(qv00, qv10);
+            qv01 = vec_add(qv01, qv11);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
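In the q5_K block above the per-sub-block minimums never enter the inner loop: the vec_nmsub lines subtract dmin times the min-weighted sum of the q8_K block sums up front, using the 16-entry bsums array that q8_K blocks already carry, so the loop only accumulates quant-times-quant products. A scalar sketch of that pre-pass under the same block layout; the names are illustrative:

    #include <stdint.h>

    /* Scalar sketch of the vec_nmsub pre-pass: each of the 8 sub-blocks of 32
     * values covers two adjacent bsums entries (sums of 16 q8 values), so the
     * mins can be folded out once per 256-value block. */
    static float fold_out_mins(float dmin, const uint8_t mins[8], const int16_t bsums[16]) {
        int32_t acc = 0;
        for (int j = 0; j < 8; ++j) {
            acc += (int32_t)mins[j] * (bsums[2*j] + bsums[2*j + 1]);
        }
        return -dmin * (float)acc; /* added to the running float accumulator */
    }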
@@ -7517,6 +8613,83 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
+        vector signed char qxs00 = (vector signed char)vec_and(qxs0, lowMask);
+        vector signed char qxs01 = (vector signed char)vec_sr(qxs0, v4);
+        vector signed char qxs10 = (vector signed char)vec_and(qxs1, lowMask);
+        vector signed char qxs11 = (vector signed char)vec_sr(qxs1, v4);
+
+        vector signed char qxhs = (vector signed char)vec_xl_len(x[i].qh, 8);
+        vector signed char qxhs0 = vec_or(qxhs, vec_sr(vec_sld(qxhs, qxhs, 8), v1));
+        vector signed char qxhs1 = vec_sr(qxhs0, v2);
+        vector signed char qxh00 = vec_sl(vec_andc((vector signed char)v1, qxhs0), v4);
+        vector signed char qxh10 = vec_sl(vec_andc((vector signed char)v1, qxhs1), v4);
+        vector signed char qxh01 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs0, v4)), v4);
+        vector signed char qxh11 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs1, v4)), v4);
+
+        vector signed char q5x00 = vec_sub(qxs00, qxh00);
+        vector signed char q5x10 = vec_sub(qxs10, qxh10);
+        vector signed char q5x01 = vec_sub(qxs01, qxh01);
+        vector signed char q5x11 = vec_sub(qxs11, qxh11);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
+        vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
+        vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
+        vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+
+        vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
+        vector signed short vs0 = vec_splat(vs, 0);
+        vector signed short vs1 = vec_splat(vs, 1);
+        vector signed short vs2 = vec_splat(vs, 2);
+        vector signed short vs3 = vec_splat(vs, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
@@ -7947,6 +9120,151 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict qs = x[i].scales;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q6, 0, 0);
+            __builtin_prefetch(qh, 0, 0);
+            __builtin_prefetch(q8, 0, 0);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
+            q6 += 64;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+            vector signed char qxs20 = vec_and(qxs2, lowMask);
+            vector signed char qxs21 = vec_sr(qxs2, v4);
+            vector signed char qxs30 = vec_and(qxs3, lowMask);
+            vector signed char qxs31 = vec_sr(qxs3, v4);
+
+            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
+            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
+            qh += 32;
+
+            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
+            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
+            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
+            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
+
+            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
+            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
+            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
+            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y20 = vec_xl( 32, q8);
+            vector signed char q8y30 = vec_xl( 48, q8);
+            vector signed char q8y01 = vec_xl( 64, q8);
+            vector signed char q8y11 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
+            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
+            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
+            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
+
+            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
+            qs += 8;
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vector signed short vs4 = vec_splat(vscales, 4);
+            vector signed short vs5 = vec_splat(vscales, 5);
+            vector signed short vs6 = vec_splat(vscales, 6);
+            vector signed short vs7 = vec_splat(vscales, 7);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
+
+            vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
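The q6_K block above rebuilds each 6-bit quant from its split storage before multiplying: the low four bits come from ql, two high bits from qh, and subtracting the 0x20 splat recenters the unsigned 0..63 code onto -32..31. A scalar sketch of the same reassembly for a single value, with the indexing simplified for illustration:

    #include <stdint.h>

    /* Scalar model of vec_sub(vec_or(vec_sl(vec_and(qh, 3), 4), low_nibble), 32). */
    static int8_t unpack_q6(uint8_t ql_byte, uint8_t qh_byte, int use_high_nibble, int qh_shift) {
        const uint8_t lo = use_high_nibble ? (uint8_t)(ql_byte >> 4) : (uint8_t)(ql_byte & 0x0F);
        const uint8_t hi = (uint8_t)((qh_byte >> qh_shift) & 0x03);
        return (int8_t)(((hi << 4) | lo) - 32);
    }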
@@ -8253,6 +9571,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].ql, 0, 1);
+        __builtin_prefetch(x[i].qh, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].ql);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].ql);
+        vector signed char qxs00 = vec_and(qxs0, lowMask);
+        vector signed char qxs01 = vec_sr(qxs0, v4);
+        vector signed char qxs10 = vec_and(qxs1, lowMask);
+        vector signed char qxs11 = vec_sr(qxs1, v4);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+
+        vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+        vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+        vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+        vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+
+        vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+        vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+        vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+        vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+        vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+        vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+        vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+
+        vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
+        vector signed short vs0 = vec_splat(vs, 0);
+        vector signed short vs1 = vec_splat(vs, 1);
+        vector signed short vs2 = vec_splat(vs, 2);
+        vector signed short vs3 = vec_splat(vs, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
@@ -8294,7 +9691,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #endif
 
-#if defined (__AVX2__) || defined (__ARM_NEON)
+#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__)
 static const int8_t keven_signs_q2xs[1024] = {
     1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
     1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
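Widening this guard only makes the existing keven_signs_q2xs table visible to the new POWER9 paths below; the table itself is unchanged. Each 7-bit index expands to eight ±1 bytes with an even number of -1 entries (the eighth sign is the parity of the first seven), which matches the rows shown above. A sketch of how one row can be derived, not how the table in the file was generated:

    #include <stdint.h>

    /* Derive one 8-sign row from a 7-bit index: bits 0..6 give the first seven
     * signs, and the eighth sign restores even parity. */
    static void keven_signs_row(uint32_t idx, int8_t signs[8]) {
        uint32_t parity = 0;
        for (int j = 0; j < 7; ++j) {
            const uint32_t bit = (idx >> j) & 1u;
            signs[j] = bit ? -1 : 1;
            parity  ^= bit;
        }
        signs[7] = parity ? -1 : 1;
    }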
@@ -8427,6 +9824,103 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 
     *s = 0.125f * hsum_float_8(accumf);
 
+#elif defined(__POWER9_VECTOR__)
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            uint32_t aux32[4];
+            const uint8_t * aux8 = (const uint8_t *)aux32;
+
+            memcpy(aux32, q2, 4*sizeof(uint32_t));
+            q2 += 8;
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 7) & 127))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >> 0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 7) & 127))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = aux32[1] >> 28;
+            const uint16_t ls1 = aux32[3] >> 28;
+
+            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
 #else
 
     uint32_t aux32[2];
@@ -8702,6 +10196,104 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
8702
10196
|
*s = 0.125f * hsum_float_8(accumf);
|
8703
10197
|
#endif
|
8704
10198
|
|
10199
|
+
#elif defined(__POWER9_VECTOR__)
|
10200
|
+
vector float vsumf0 = vec_splats(0.0f);
|
10201
|
+
vector float vsumf1 = vec_splats(0.0f);
|
10202
|
+
vector float vsumf2 = vec_splats(0.0f);
|
10203
|
+
vector float vsumf3 = vec_splats(0.0f);
|
10204
|
+
|
10205
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10206
|
+
|
10207
|
+
for (int i = 0; i < nb; ++i) {
|
10208
|
+
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
|
10209
|
+
vector float vyd = vec_splats(y[i].d);
|
10210
|
+
vector float vd = vec_mul(vxd, vyd);
|
10211
|
+
|
10212
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
10213
|
+
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10214
|
+
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10215
|
+
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10216
|
+
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10217
|
+
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10218
|
+
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10219
|
+
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10220
|
+
|
10221
|
+
const uint16_t * restrict q2 = x[i].qs;
|
10222
|
+
const uint8_t * restrict sc = x[i].scales;
|
10223
|
+
const int8_t * restrict q8 = y[i].qs;
|
10224
|
+
|
10225
|
+
for (int j = 0; j < QK_K/64; ++j) {
|
10226
|
+
__builtin_prefetch(q2, 0, 1);
|
10227
|
+
__builtin_prefetch(q8, 0, 1);
|
10228
|
+
|
10229
|
+
vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
|
10230
|
+
vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
|
10231
|
+
vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
|
10232
|
+
vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
|
10233
|
+
|
10234
|
+
vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
|
10235
|
+
vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
|
10236
|
+
vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
|
10237
|
+
vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
|
10238
|
+
q2 += 8;
|
10239
|
+
|
10240
|
+
vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
|
10241
|
+
vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
|
10242
|
+
vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
|
10243
|
+
vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
|
10244
|
+
|
10245
|
+
vector signed char q8y0 = vec_xl( 0, q8);
|
10246
|
+
vector signed char q8y1 = vec_xl(16, q8);
|
10247
|
+
vector signed char q8y2 = vec_xl(32, q8);
|
10248
|
+
vector signed char q8y3 = vec_xl(48, q8);
|
10249
|
+
q8 += 64;
|
10250
|
+
|
10251
|
+
vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
|
10252
|
+
vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
|
10253
|
+
vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
|
10254
|
+
vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
|
10255
|
+
|
10256
|
+
const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
|
10257
|
+
const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
|
10258
|
+
const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
|
10259
|
+
const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
|
10260
|
+
sc += 2;
|
10261
|
+
|
10262
|
+
vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
|
10263
|
+
vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
|
10264
|
+
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
10265
|
+
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
10266
|
+
|
10267
|
+
vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
|
10268
|
+
vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
|
10269
|
+
vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
|
10270
|
+
vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
|
10271
|
+
vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
|
10272
|
+
vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
|
10273
|
+
vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
|
10274
|
+
vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
|
10275
|
+
}
|
10276
|
+
|
10277
|
+
vsumi0 = vec_add(vsumi0, vsumi4);
|
10278
|
+
vsumi1 = vec_add(vsumi1, vsumi5);
|
10279
|
+
vsumi2 = vec_add(vsumi2, vsumi6);
|
10280
|
+
vsumi3 = vec_add(vsumi3, vsumi7);
|
10281
|
+
|
10282
|
+
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10283
|
+
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10284
|
+
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
10285
|
+
vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
|
10286
|
+
}
|
10287
|
+
|
10288
|
+
vsumf0 = vec_add(vsumf0, vsumf2);
|
10289
|
+
vsumf1 = vec_add(vsumf1, vsumf3);
|
10290
|
+
|
10291
|
+
vsumf0 = vec_add(vsumf0, vsumf1);
|
10292
|
+
|
10293
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
|
10294
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
|
10295
|
+
|
10296
|
+
*s = 0.125f * vec_extract(vsumf0, 0);
|
8705
10297
|
#else
|
8706
10298
|
|
8707
10299
|
float sumf = 0.f;
|
@@ -8902,6 +10494,124 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
|
|
8902
10494
|
|
8903
10495
|
*s = 0.125f * hsum_float_8(accumf);
|
8904
10496
|
|
10497
|
+
#elif defined(__POWER9_VECTOR__)
|
10498
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
10499
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
10500
|
+
};
|
10501
|
+
|
10502
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
10503
|
+
|
10504
|
+
vector float vsumf0 = vec_splats(0.0f);
|
10505
|
+
vector float vsumf1 = vec_splats(0.0f);
|
10506
|
+
vector float vsumf2 = vec_splats(0.0f);
|
10507
|
+
vector float vsumf3 = vec_splats(0.0f);
|
10508
|
+
|
10509
|
+
const vector unsigned char mask0 = vec_xl( 0, k_mask1);
|
10510
|
+
const vector unsigned char mask1 = vec_xl(16, k_mask1);
|
10511
|
+
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
|
10512
|
+
|
10513
|
+
for (int i = 0; i < nb; ++i) {
|
10514
|
+
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
|
10515
|
+
vector float vyd = vec_splats(y[i].d);
|
10516
|
+
vector float vd = vec_mul(vxd, vyd);
|
10517
|
+
|
10518
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
10519
|
+
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10520
|
+
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10521
|
+
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10522
|
+
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10523
|
+
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10524
|
+
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10525
|
+
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10526
|
+
|
10527
|
+
const uint8_t * restrict q2 = x[i].qs;
|
10528
|
+
const uint8_t * restrict qh = x[i].qh;
|
10529
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
|
10530
|
+
const uint8_t * restrict sc = x[i].scales;
|
10531
|
+
const int8_t * restrict q8 = y[i].qs;
|
10532
|
+
|
10533
|
+
for (int j = 0; j < QK_K/32; j += 2) {
|
10534
|
+
__builtin_prefetch(q2, 0, 1);
|
10535
|
+
__builtin_prefetch(q8, 0, 1);
|
10536
|
+
|
10537
|
+
vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
|
10538
|
+
vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
|
10539
|
+
vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
|
10540
|
+
vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
|
10541
|
+
q2 += 8;
|
10542
|
+
qh += 2;
|
10543
|
+
|
10544
|
+
vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
|
10545
|
+
vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
|
10546
|
+
signs += 4;
|
10547
|
+
|
10548
|
+
vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
|
10549
|
+
vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
|
10550
|
+
vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
|
10551
|
+
vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
|
10552
|
+
|
10553
|
+
vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
|
10554
|
+
vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
|
10555
|
+
vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
|
10556
|
+
vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
|
10557
|
+
|
10558
|
+
vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
|
10559
|
+
vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
|
10560
|
+
vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
|
10561
|
+
vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
|
10562
|
+
|
10563
|
+
vector signed char q8y0 = vec_xl( 0, q8);
|
10564
|
+
vector signed char q8y1 = vec_xl(16, q8);
|
10565
|
+
vector signed char q8y2 = vec_xl(32, q8);
|
10566
|
+
vector signed char q8y3 = vec_xl(48, q8);
|
10567
|
+
q8 += 64;
|
10568
|
+
|
10569
|
+
vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
|
10570
|
+
vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
|
10571
|
+
vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
|
10572
|
+
vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
|
10573
|
+
|
10574
|
+
const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
|
10575
|
+
const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
|
10576
|
+
const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
|
10577
|
+
const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
|
10578
|
+
sc += 2;
|
10579
|
+
|
10580
|
+
vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
|
10581
|
+
vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
|
10582
|
+
vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
|
10583
|
+
vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
|
10584
|
+
|
10585
|
+
vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
|
10586
|
+
vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
|
10587
|
+
vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
|
10588
|
+
vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
|
10589
|
+
vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
|
10590
|
+
vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
|
10591
|
+
vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
|
10592
|
+
vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
|
10593
|
+
}
|
10594
|
+
|
10595
|
+
vsumi0 = vec_add(vsumi0, vsumi4);
|
10596
|
+
vsumi1 = vec_add(vsumi1, vsumi5);
|
10597
|
+
vsumi2 = vec_add(vsumi2, vsumi6);
|
10598
|
+
vsumi3 = vec_add(vsumi3, vsumi7);
|
10599
|
+
|
10600
|
+
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10601
|
+
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10602
|
+
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
10603
|
+
vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
|
10604
|
+
}
|
10605
|
+
|
10606
|
+
vsumf0 = vec_add(vsumf0, vsumf2);
|
10607
|
+
vsumf1 = vec_add(vsumf1, vsumf3);
|
10608
|
+
|
10609
|
+
vsumf0 = vec_add(vsumf0, vsumf1);
|
10610
|
+
|
10611
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
|
10612
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
|
10613
|
+
|
10614
|
+
*s = 0.125f * vec_extract(vsumf0, 0);
|
8905
10615
|
#else
|
8906
10616
|
|
8907
10617
|
float sumf = 0;
|
@@ -9046,6 +10756,101 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
9046
10756
|
|
9047
10757
|
*s = 0.25f * hsum_float_8(accumf);
|
9048
10758
|
|
10759
|
+
#elif defined(__POWER9_VECTOR__)
|
10760
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
10761
|
+
|
10762
|
+
vector float vsumf0 = vec_splats(0.0f);
|
10763
|
+
vector float vsumf1 = vec_splats(0.0f);
|
10764
|
+
vector float vsumf2 = vec_splats(0.0f);
|
10765
|
+
vector float vsumf3 = vec_splats(0.0f);
|
10766
|
+
|
10767
|
+
for (int i = 0; i < nb; ++i) {
|
10768
|
+
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
|
10769
|
+
vector float vyd = vec_splats(y[i].d);
|
10770
|
+
vector float vd = vec_mul(vxd, vyd);
|
10771
|
+
|
10772
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
10773
|
+
vector signed int vsumi1 = vec_splats((int32_t)0);
|
10774
|
+
vector signed int vsumi2 = vec_splats((int32_t)0);
|
10775
|
+
vector signed int vsumi3 = vec_splats((int32_t)0);
|
10776
|
+
vector signed int vsumi4 = vec_splats((int32_t)0);
|
10777
|
+
vector signed int vsumi5 = vec_splats((int32_t)0);
|
10778
|
+
vector signed int vsumi6 = vec_splats((int32_t)0);
|
10779
|
+
vector signed int vsumi7 = vec_splats((int32_t)0);
|
10780
|
+
|
10781
|
+
const uint8_t * restrict q3 = x[i].qs;
|
10782
|
+
const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
|
10783
|
+
const int8_t * restrict q8 = y[i].qs;
|
10784
|
+
|
10785
|
+
#pragma GCC unroll 1
|
10786
|
+
for (int j = 0; j < QK_K/32; j += 2) {
|
10787
|
+
__builtin_prefetch(q3, 0, 1);
|
10788
|
+
__builtin_prefetch(q8, 0, 1);
|
10789
|
+
|
10790
|
+
vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
|
10791
|
+
vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
|
10792
|
+
vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
|
10793
|
+
vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
|
10794
|
+
q3 += 16;
|
10795
|
+
|
10796
|
+
vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >> 0) & 127]), (uint64_t)(signs64[(signs[0] >> 7) & 127])};
|
10797
|
+
vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
|
10798
|
+
vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >> 0) & 127]), (uint64_t)(signs64[(signs[1] >> 7) & 127])};
|
10799
|
+
vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
|
10800
|
+
|
10801
|
+
vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
|
10802
|
+
vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
|
10803
|
+
vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
|
10804
|
+
vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
|
10805
|
+
|
10806
|
+
vector signed char q8y0 = vec_xl( 0, q8);
|
10807
|
+
vector signed char q8y1 = vec_xl(16, q8);
|
10808
|
+
vector signed char q8y2 = vec_xl(32, q8);
|
10809
|
+
vector signed char q8y3 = vec_xl(48, q8);
|
10810
|
+
q8 += 64;
|
10811
|
+
|
10812
|
+
vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
|
10813
|
+
vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
|
10814
|
+
vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
|
10815
|
+
vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
|
10816
|
+
|
10817
|
+
const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
|
10818
|
+
const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
|
10819
|
+
signs += 2;
|
10820
|
+
|
10821
|
+
vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
|
10822
|
+
vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
|
10823
|
+
|
10824
|
+
vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
|
10825
|
+
vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
|
10826
|
+
vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
|
10827
|
+
vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
|
10828
|
+
vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
|
10829
|
+
vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
|
10830
|
+
vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
|
10831
|
+
vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
|
10832
|
+
}
|
10833
|
+
|
10834
|
+
vsumi0 = vec_add(vsumi0, vsumi4);
|
10835
|
+
vsumi1 = vec_add(vsumi1, vsumi5);
|
10836
|
+
vsumi2 = vec_add(vsumi2, vsumi6);
|
10837
|
+
vsumi3 = vec_add(vsumi3, vsumi7);
|
10838
|
+
|
10839
|
+
vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
|
10840
|
+
vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
|
10841
|
+
vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
|
10842
|
+
vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
|
10843
|
+
}
|
10844
|
+
|
10845
|
+
vsumf0 = vec_add(vsumf0, vsumf2);
|
10846
|
+
vsumf1 = vec_add(vsumf1, vsumf3);
|
10847
|
+
|
10848
|
+
vsumf0 = vec_add(vsumf0, vsumf1);
|
10849
|
+
|
10850
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
|
10851
|
+
vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
|
10852
|
+
|
10853
|
+
*s = 0.25f * vec_extract(vsumf0, 0);
|
9049
10854
|
#else
|
9050
10855
|
|
9051
10856
|
uint32_t aux32;
|
@@ -9273,6 +11078,124 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
9273
11078
|
|
9274
11079
|
*s = hsum_float_8(accumf);
|
9275
11080
|
|
11081
|
+
#elif defined(__POWER9_VECTOR__)
|
11082
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
11083
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
11084
|
+
};
|
11085
|
+
|
11086
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
11087
|
+
|
11088
|
+
vector float vsumf0 = vec_splats(0.0f);
|
11089
|
+
vector float vsumf1 = vec_splats(0.0f);
|
11090
|
+
vector float vsumf2 = vec_splats(0.0f);
|
11091
|
+
vector float vsumf3 = vec_splats(0.0f);
|
11092
|
+
|
11093
|
+
const vector unsigned char mask0 = vec_xl( 0, k_mask1);
|
11094
|
+
const vector unsigned char mask1 = vec_xl(16, k_mask1);
|
11095
|
+
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
|
11096
|
+
|
11097
|
+
for (int i = 0; i < nb; ++i) {
|
11098
|
+
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
|
11099
|
+
vector float vyd = vec_splats(y[i].d);
|
11100
|
+
vector float vd = vec_mul(vxd, vyd);
|
11101
|
+
|
11102
|
+
const uint8_t * restrict q3 = x[i].qs;
|
11103
|
+
const uint8_t * restrict qh = x[i].qh;
|
11104
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
|
11105
|
+
const uint8_t * restrict sc = x[i].scales;
|
11106
|
+
const int8_t * restrict q8 = y[i].qs;
|
11107
|
+
|
11108
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
11109
|
+
vector signed int vsumi1 = vec_splats((int32_t)0);
|
11110
|
+
vector signed int vsumi2 = vec_splats((int32_t)0);
|
11111
|
+
vector signed int vsumi3 = vec_splats((int32_t)0);
|
11112
|
+
vector signed int vsumi4 = vec_splats((int32_t)0);
|
11113
|
+
vector signed int vsumi5 = vec_splats((int32_t)0);
|
11114
|
+
vector signed int vsumi6 = vec_splats((int32_t)0);
|
11115
|
+
vector signed int vsumi7 = vec_splats((int32_t)0);
|
11116
|
+
|
11117
|
+
for (int j = 0; j < QK_K/32; j += 2) {
|
11118
|
+
__builtin_prefetch(q3, 0, 1);
|
11119
|
+
__builtin_prefetch(q8, 0, 1);
|
11120
|
+
|
11121
|
+
vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
|
11122
|
+
iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
|
11123
|
+
vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
|
11124
|
+
iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
|
11125
|
+
vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
|
11126
|
+
+                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
+            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
+                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
+            q3 += 16;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
+            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
+            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
+            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
+            sc ++;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
 
     float sumf = 0.f;
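
Note: the q3x0..q3x3 lines in the iq3_s hunk above apply the packed sign bits with a branch-free trick: vec_cmpeq turns each sign bit into a per-byte mask of 0x00 or 0xFF, and (q ^ mask) - mask then either keeps or negates the byte. A minimal scalar C sketch of the same idea (illustrative only, not part of the vendored diff):

    /* sign_trick.c - scalar model of the vec_xor/vec_sub conditional negation above */
    #include <stdint.h>
    #include <stdio.h>

    /* m is 0x00 or 0xFF (all bits set): (q ^ m) - m == q when m == 0, and == -q when m == -1 */
    static int8_t apply_sign(int8_t q, int8_t m) {
        return (int8_t)((q ^ m) - m);
    }

    int main(void) {
        printf("%d %d\n", apply_sign(5, 0), apply_sign(5, (int8_t)0xFF)); /* prints: 5 -5 */
        return 0;
    }
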
@@ -9427,6 +11350,113 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void
 
     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
+    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi8 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q1 = x[i].qs;
+        const uint16_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+        const int16_t * restrict qs = y[i].bsums;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q1, 0, 1);
+            __builtin_prefetch(qh, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
+            q1 += 8;
+
+            vector signed char q1x0 = (vector signed char)aux64x2_0;
+            vector signed char q1x1 = (vector signed char)aux64x2_1;
+            vector signed char q1x2 = (vector signed char)aux64x2_2;
+            vector signed char q1x3 = (vector signed char)aux64x2_3;
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
+            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+            vector signed short vscales = vec_sld(vscales23, vscales01, 8);
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+
+            vector signed short q8ysums = vec_xl_len(qs, 8);
+            qs += 4;
+            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
+
+            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
+            qh += 2;
+            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
+
+            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
+
+            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
 
     float sumf = 0;
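
Note: in the iq1_s hunk above, each aux64x2_* element is fetched from iq1s_grid with an 11-bit index built from one qs byte (low 8 bits) plus 3 bits taken from the 16-bit qh word; the shifts (<< 8, << 5, << 2, >> 1) select which 3-bit field feeds each of the four lookups per qh entry. A small scalar C sketch of the index math, with a hypothetical helper name (the vendored code inlines the expression):

    /* iq1s_index.c - scalar model of the grid-index expression q1[k] | ((qh << s) & 0x700) */
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t iq1s_index(uint8_t qs_byte, uint16_t qh, int k /* 0..3 within one qh word */) {
        static const int shift[4] = { 8, 5, 2, -1 };     /* -1 stands for a right shift by 1 */
        uint32_t hi = (shift[k] >= 0) ? ((uint32_t)qh << shift[k]) : ((uint32_t)qh >> 1);
        return qs_byte | (hi & 0x700);                   /* 8 low bits from qs, bits 8..10 from qh */
    }

    int main(void) {
        printf("0x%03x\n", iq1s_index(0x12, 0x0124, 0)); /* prints: 0x412 */
        return 0;
    }
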
@@ -9783,6 +11813,51 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 
     *s = hsum_float_8(_mm256_add_ps(accum1, accum2));
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+#pragma GCC unroll 4
+    for (int ib = 0; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
+        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+        vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     float sumf = 0;
     for (int ib = 0; ib < nb; ++ib) {
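
Note: the iq4_nl hunk above unpacks each byte of x[ib].qs into two 4-bit indices (vec_and with 0xF and vec_sr by 4) and uses vec_perm with the kvalues_iq4nl table as a 16-entry lookup, so the low nibble of byte j maps to element j of the 32-value block and the high nibble to element j+16. A scalar C sketch of that lookup; the table values are copied from kvalues_iq4nl in ggml-quants.c and should be treated as illustrative:

    /* iq4nl_lut.c - scalar model of the nibble-unpack + table lookup done with vec_perm above */
    #include <stdint.h>
    #include <stdio.h>

    static const int8_t kvalues[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
    };

    /* byte j of a 16-byte block expands to elements j (low nibble) and j+16 (high nibble) */
    static void unpack_byte(uint8_t b, int8_t *lo, int8_t *hi) {
        *lo = kvalues[b & 0x0F];
        *hi = kvalues[b >> 4];
    }

    int main(void) {
        int8_t lo, hi;
        unpack_byte(0xF0, &lo, &hi);
        printf("%d %d\n", lo, hi); /* prints: -127 113 */
        return 0;
    }
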
@@ -9894,6 +11969,105 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
     *s = hsum_float_8(accum);
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+        vector float vyd = vec_splats(y[ibl].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        uint16_t h = x[ibl].scales_h;
+
+        const uint8_t * restrict q4 = x[ibl].qs;
+        const uint8_t * restrict sc = x[ibl].scales_l;
+        const int8_t * restrict q8 = y[ibl].qs;
+
+        for (int ib = 0; ib < QK_K/64; ib ++ ) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            q4 += 32;
+
+            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
+            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
+            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
+            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
+
+            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
+            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
+            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
+            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
+            const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32);
+            h >>= 4;
+            sc ++;
+
+            vector signed short vscales01 = vec_splats((int16_t)ls0);
+            vector signed short vscales23 = vec_splats((int16_t)ls1);
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
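
Note: in the iq4_xs hunk above, each 6-bit block scale is rebuilt from 4 low bits in scales_l plus 2 high bits shifted in from the rolling scales_h word, then re-biased by -32 (the ls0/ls1 and h >>= 4 lines). A scalar C sketch of that decode, under the same bit layout, with a hypothetical helper name:

    /* iq4xs_scales.c - scalar model of the ls0/ls1 scale reconstruction above */
    #include <stdint.h>
    #include <stdio.h>

    /* decodes the two scales packed into one scales_l byte; *h is consumed 4 bits at a time */
    static void decode_scales(uint8_t sc_byte, uint16_t *h, int *ls0, int *ls1) {
        *ls0 = (int)((sc_byte & 0x0F) | ((*h << 4) & 0x30)) - 32; /* bits 0..1 of h extend ls0 */
        *ls1 = (int)((sc_byte >> 4)   | ((*h << 2) & 0x30)) - 32; /* bits 2..3 of h extend ls1 */
        *h >>= 4;                                                 /* advance to the next two fields */
    }

    int main(void) {
        uint16_t h = 0x000B; /* low nibble 0b1011: high bits for ls0 = 3, for ls1 = 2 */
        int ls0, ls1;
        decode_scales(0x21, &h, &ls0, &ls1);
        printf("%d %d\n", ls0, ls1); /* prints: 17 2 */
        return 0;
    }
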