llama_cpp 0.15.1 → 0.15.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/vendor/tmp/llama.cpp/Makefile +3 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +2 -3
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +15 -7
- data/vendor/tmp/llama.cpp/ggml-impl.h +7 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +9 -3
- data/vendor/tmp/llama.cpp/ggml-metal.m +114 -125
- data/vendor/tmp/llama.cpp/ggml-metal.metal +86 -109
- data/vendor/tmp/llama.cpp/ggml-quants.c +2202 -28
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +1032 -0
- data/vendor/tmp/llama.cpp/ggml-rpc.h +24 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +24 -143
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +4 -2
- data/vendor/tmp/llama.cpp/ggml.c +726 -646
- data/vendor/tmp/llama.cpp/ggml.h +28 -17
- data/vendor/tmp/llama.cpp/llama.cpp +478 -281
- data/vendor/tmp/llama.cpp/llama.h +3 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +6969 -2169
- data/vendor/tmp/llama.cpp/unicode-data.h +15 -12
- data/vendor/tmp/llama.cpp/unicode.cpp +89 -111
- data/vendor/tmp/llama.cpp/unicode.h +44 -12
- metadata +4 -2
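Most of the growth in ggml-quants.c comes from new `#elif defined(__POWER9_VECTOR__)` branches that add POWER9 VSX paths to the quantization and dot-product kernels shown in the hunks below. As a reading aid, here is a minimal scalar sketch of the q8_0 quantization step those branches vectorize; the block size of 32 and the scale formula are taken from the diff itself, while the struct layout is a simplification (ggml stores `d` as fp16, not float):

```c
#include <math.h>
#include <stdint.h>

// Simplified stand-in for ggml's block_q8_0 (the real d field is fp16).
typedef struct { float d; int8_t qs[32]; } block_q8_0_sketch;

static void quantize_block_q8_0_ref(const float * x, block_q8_0_sketch * y) {
    float amax = 0.0f;                        // largest |x| in the 32-float block
    for (int j = 0; j < 32; j++) {
        const float ax = fabsf(x[j]);
        if (ax > amax) amax = ax;
    }
    const float d  = amax / ((1 << 7) - 1);   // scale so amax maps to 127
    const float id = d ? 1.0f/d : 0.0f;       // guard against an all-zero block
    y->d = d;
    for (int j = 0; j < 32; j++) {
        y->qs[j] = (int8_t) roundf(x[j] * id); // mirrors vec_round + vec_cts
    }
}
```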
@@ -14,6 +14,12 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT

+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid warnings for hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
 #define UNUSED GGML_UNUSED

 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -235,7 +241,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
 #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -637,6 +643,38 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
         // store result
         __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
     }
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
@@ -892,6 +930,46 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
         int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
         y[i].s = GGML_FP32_TO_FP16(sum*d);
     }
+#elif defined(__POWER9_VECTOR__)
+    for (int i = 0; i < nb; i++) {
+        vector float srcv [8];
+        vector float asrcv[8];
+        vector float amaxv[8];
+        vector signed int vi[8];
+
+        for (int j = 0; j < 8; j++) srcv[j]  = vec_xl(0, x + i*32 + 4*j);
+        for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
+
+        for (int j = 0; j < 4; j++) amaxv[2*j] = vec_max(asrcv[2*j], asrcv[2*j+1]);
+        for (int j = 0; j < 2; j++) amaxv[4*j] = vec_max(amaxv[4*j], amaxv[4*j+2]);
+        for (int j = 0; j < 1; j++) amaxv[8*j] = vec_max(amaxv[8*j], amaxv[8*j+4]);
+
+        const float amax = MAX(MAX(vec_extract(amaxv[0], 0),
+                                   vec_extract(amaxv[0], 1)),
+                               MAX(vec_extract(amaxv[0], 2),
+                                   vec_extract(amaxv[0], 3)));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        const vector float vid = vec_splats(id);
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vector int accv = vec_splats(0);
+
+        for (int j = 0; j < 8; j++) {
+            const vector float v = vec_round(vec_mul(srcv[j], vid));
+            vi[j] = vec_cts(v, 0);
+
+            accv = vec_add(accv, vi[j]);
+        }
+        vec_xst(vec_pack(vec_pack(vi[0], vi[1]), vec_pack(vi[2], vi[3])),  0, &y[i].qs[0]);
+        vec_xst(vec_pack(vec_pack(vi[4], vi[5]), vec_pack(vi[6], vi[7])), 16, &y[i].qs[0]);
+
+        accv = vec_add(accv, vec_sld(accv, accv, 4));
+        accv = vec_add(accv, vec_sld(accv, accv, 8));
+        y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+    }
 #else
     GGML_UNUSED(nb);
     // scalar
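The q8_1 variant above additionally stores `s = d * sum(q)`: `accv` accumulates the int32 lanes of each `vi[j]`, the two `vec_sld` rotations reduce across lanes, and the result is scaled by `d`. A scalar model of that extra step (the 32-element block size comes from the code above):

```c
#include <stdint.h>

// s = d * sum of the 32 quantized values -- what accv + vec_sld compute above
static float q8_1_block_sum(const int8_t q[32], float d) {
    int sum = 0;
    for (int j = 0; j < 32; j++) {
        sum += q[j];
    }
    return d * (float)sum;
}
```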
@@ -1908,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri

     for (int j = 0; j < QK_K/16; ++j) {
         if (quant_weights) {
-            const float * qw = quant_weights
+            const float * qw = quant_weights + QK_K * i + 16*j;
             for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
         } else {
             for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
@@ -3409,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_0 * restrict vx0 = vx;
-        const block_q4_0 * restrict vx1 = vx + bx;
-
+        const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

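The replaced `vx + bx` / `vy + by` lines did arithmetic on `const void *` pointers, which only works as byte addressing under the GNU extension; on a typed pointer the offset would be scaled by the element size. Casting through `const uint8_t *` makes the byte-stride intent explicit and standard C. A small sketch of the difference, using a hypothetical `blk` type for illustration:

```c
#include <stddef.h>
#include <stdint.h>

typedef struct { uint8_t bytes[18]; } blk;   // stand-in for a quant block type

// Advances by `stride` *elements*, i.e. 18*stride bytes here -- not what the
// kernels want when the stride is a byte distance between rows.
static const blk * next_row_scaled(const blk * p, size_t stride) {
    return p + stride;
}

// Advances by `stride` *bytes*, then reinterprets -- the form the diff adopts.
static const blk * next_row_bytes(const void * p, size_t stride) {
    return (const blk *) ((const uint8_t *) p + stride);
}
```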
@@ -3446,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const int8x16_t y1_l = vld1q_s8(b_y1->qs);
         const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-
-
-
-
+        float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+        float32x4_t scale = vld1q_f32(_scale);

         int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
         int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3734,6 +3813,46 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector signed char v8 = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_sub(q4x0, v8);
+        q4x1 = vec_sub(q4x1, v8);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
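A recurring idiom in these POWER9 blocks is `vec_add(vec_mule(a, b), vec_mulo(a, b))`: int8×int8 products do not fit in int8, so VSX widens them into even- and odd-lane int16 vectors, and adding the two gives eight int16 sums of adjacent products. A scalar model of one such step (the even/odd lane split follows the intrinsics' documented behavior):

```c
#include <stdint.h>

// out[k] = a[2k]*b[2k] + a[2k+1]*b[2k+1], for k = 0..7
static void widen_mul_pairs(const int8_t a[16], const int8_t b[16], int16_t out[8]) {
    for (int k = 0; k < 8; k++) {
        const int16_t even = (int16_t) a[2*k]     * b[2*k];      // vec_mule lane k
        const int16_t odd  = (int16_t) a[2*k + 1] * b[2*k + 1];  // vec_mulo lane k
        out[k] = (int16_t)(even + odd);                          // vec_add
    }
}
```

The worst case, 127 * 127 * 2 = 32258, still fits in int16, which is why the kernels can defer the widening to int32 (`vec_unpackh`/`vec_unpackl`) until after this step.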
@@ -3776,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q4_1 * restrict vx0 = vx;
-        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
         const block_q8_1 * restrict vy0 = vy;
-        const block_q8_1 * restrict vy1 = vy + by;
+        const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);
         float32x4_t summs0 = vdupq_n_f32(0.0f);
@@ -3789,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const block_q8_1 * restrict b_y0 = &vy0[i];
         const block_q8_1 * restrict b_y1 = &vy1[i];

-
-
-
-
-summs0
+        float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
+                                GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
+                                GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
+        summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));

         const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -3813,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
         const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

         // mmla into int32x4_t
-
-
-
-
+        float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
+                               GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
+                               GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
+                               GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
+        float32x4_t scale = vld1q_f32(_scale);

         int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
         int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -3835,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r

         float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
         float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
-        sumv2 = sumv2
+        sumv2 = vaddq_f32(sumv2, summs0);

         vst1_f32(s,      vget_low_f32(sumv2));
         vst1_f32(s + bs, vget_high_f32(sumv2));
@@ -3952,6 +4072,46 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.0f, 0.0f, 0.0f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -4237,6 +4397,49 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[i].qh[0]]), (uint64_t)(table_b2b_1[x[i].qh[1]])};
+        vector signed long long aux64x2_1 = {(uint64_t)(table_b2b_1[x[i].qh[2]]), (uint64_t)(table_b2b_1[x[i].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+        vector signed char q5x0 = vec_sub(vec_and (qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_sub(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[i].qs);
+        vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
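The `table_b2b_0`/`table_b2b_1` lookups above expand one byte of packed high bits (`qh`) into eight bytes, one per quant, so the 5th bit can be combined lane-wise with the low 4-bit values. A scalar sketch of the general bit-spreading shape; the real ggml tables also shift the bit into position and, for the `_1` variant, invert it as the q5_0/q5_1 formats require, so treat those details as assumptions:

```c
#include <stdint.h>

// Broadcast bit b of qh into the low bit of byte b of the result.
static uint64_t spread_bits(uint8_t qh) {
    uint64_t out = 0;
    for (int b = 0; b < 8; b++) {
        out |= (uint64_t)((qh >> b) & 1u) << (8 * b);
    }
    return out;
}
```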
@@ -4541,6 +4744,53 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].m));
+        vector float vys = {GGML_FP16_TO_FP32(y[i].s), 0.f, 0.f, 0.f};
+        vsumf0 = vec_madd(vxmin, vys, vsumf0);
+
+        vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[i].qh[0]]), (uint64_t)(table_b2b_0[x[i].qh[1]])};
+        vector unsigned long long aux64x2_1 = {(uint64_t)(table_b2b_0[x[i].qh[2]]), (uint64_t)(table_b2b_0[x[i].qh[3]])};
+
+        vector signed char qh0 = (vector signed char)aux64x2_0;
+        vector signed char qh1 = (vector signed char)aux64x2_1;
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[i].qs);
+
+        vector signed char q5x0 = vec_or(vec_and(qxs, lowMask), qh0);
+        vector signed char q5x1 = vec_or(vec_sr(qxs, v4), qh1);
+
+        vector signed char q8y0 = vec_xl(  0, y[i].qs);
+        vector signed char q8y1 = vec_xl( 16, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q5x0, q8y0), vec_mulo(q5x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q5x1, q8y1), vec_mulo(q5x1, q8y1));
+
+        qv0 = vec_add(qv0, qv1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
@@ -4589,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 #if defined(__ARM_FEATURE_MATMUL_INT8)
     if (nrc == 2) {
         const block_q8_0 * restrict vx0 = vx;
-        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
         const block_q8_0 * restrict vy0 = vy;
-        const block_q8_0 * restrict vy1 = vy + by;
+        const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);

         float32x4_t sumv0 = vdupq_n_f32(0.0f);

@@ -4613,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const int8x16_t y1_l = vld1q_s8(b_y1->qs);
         const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);

-
-
-
-
+        float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                               GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                               GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                               GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+        float32x4_t scale = vld1q_f32(_scale);

         int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
         int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
@@ -4716,6 +4967,45 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }

     *s = sumf;
+#elif defined(__POWER9_VECTOR__)
+    vector float vsumf0 = vec_splats(0.0f);
+
+#pragma GCC unroll 4
+    for (int i = 0; i < nb; i++) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[i].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char q8x0 = vec_xl( 0, x[i].qs);
+        vector signed char q8x1 = vec_xl(16, x[i].qs);
+        vector signed char q8y0 = vec_xl( 0, y[i].qs);
+        vector signed char q8y1 = vec_xl(16, y[i].qs);
+
+        vector signed short qv0 = vec_mule(q8x0, q8y0);
+        vector signed short qv1 = vec_mulo(q8x0, q8y0);
+        vector signed short qv2 = vec_mule(q8x1, q8y1);
+        vector signed short qv3 = vec_mulo(q8x1, q8y1);
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackh(qv1));
+        vector signed int vsumi1 = vec_add(vec_unpackl(qv0), vec_unpackl(qv1));
+        vector signed int vsumi2 = vec_add(vec_unpackh(qv2), vec_unpackh(qv3));
+        vector signed int vsumi3 = vec_add(vec_unpackl(qv2), vec_unpackl(qv3));
+
+        vsumi0 = vec_add(vsumi0, vsumi2);
+        vsumi1 = vec_add(vsumi1, vsumi3);
+
+        vsumi0 = vec_add(vsumi0, vsumi1);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar
     float sumf = 0.0;
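Every one of these kernels closes with the same two-step reduction: `vec_sld` rotates the 4-lane float accumulator by 4 and then 8 bytes, adding each time, which leaves the full total in every lane for `vec_extract(..., 0)` to read. A scalar model (the rotation direction differs between endiannesses, but the sum does not):

```c
// Horizontal sum of a 4-lane float accumulator, mirroring the vec_sld/vec_add pair.
static float hsum_f32x4(const float v[4]) {
    float t[4];
    float u[4];
    for (int k = 0; k < 4; k++) t[k] = v[k] + v[(k + 1) % 4]; // rotate by 4 bytes, add
    for (int k = 0; k < 4; k++) u[k] = t[k] + t[(k + 2) % 4]; // rotate by 8 bytes, add
    return u[0];                                              // vec_extract lane 0
}
```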
@@ -5071,6 +5361,147 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char q2xmins = (vector signed char)vec_xl( 0, x[i].scales);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh(q2xmins);
+        vector signed short q2xmins1 = vec_unpackl(q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+        vector signed int prod2 = vec_mule(q2xmins1, q8ysums1);
+        vector signed int prod3 = vec_mulo(q2xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q2);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q2);
+            q2 += 32;
+
+            vector signed char q2x00 = vec_and(qxs0, lowMask);
+            vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char q2x10 = vec_and(qxs1, lowMask);
+            vector signed char q2x11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char q2x12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char q2x13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
+            vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
+            vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
+            vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
+            vector signed short qv4 = vec_add(vec_mule(q2x10, q8y10), vec_mulo(q2x10, q8y10));
+            vector signed short qv5 = vec_add(vec_mule(q2x11, q8y11), vec_mulo(q2x11, q8y11));
+            vector signed short qv6 = vec_add(vec_mule(q2x12, q8y12), vec_mulo(q2x12, q8y12));
+            vector signed short qv7 = vec_add(vec_mule(q2x13, q8y13), vec_mulo(q2x13, q8y13));
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            qv0 = vec_mul(qv0, vs0);
+            qv1 = vec_mul(qv1, vs2);
+            qv2 = vec_mul(qv2, vs4);
+            qv3 = vec_mul(qv3, vs6);
+
+            qv0 = vec_madd(qv4, vs1, qv0);
+            qv1 = vec_madd(qv5, vs3, qv1);
+            qv2 = vec_madd(qv6, vs5, qv2);
+            qv3 = vec_madd(qv7, vs7, qv3);
+
+            vsumi0 = vec_add(vec_unpackh(qv0), vsumi0);
+            vsumi1 = vec_add(vec_unpackh(qv1), vsumi1);
+            vsumi2 = vec_add(vec_unpackh(qv2), vsumi2);
+            vsumi3 = vec_add(vec_unpackh(qv3), vsumi3);
+
+            vsumi4 = vec_add(vec_unpackl(qv0), vsumi4);
+            vsumi5 = vec_add(vec_unpackl(qv1), vsumi5);
+            vsumi6 = vec_add(vec_unpackl(qv2), vsumi6);
+            vsumi7 = vec_add(vec_unpackl(qv3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else

     float sumf = 0;
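In the q2_K path above, each source byte carries four 2-bit quants; the `vec_sr` shifts by 2, 4, and 6 together with the `vec_and` against `lowMask` (0x3) peel them off 16 bytes at a time. Scalar model for one byte:

```c
#include <stdint.h>

// Extract the four 2-bit fields of q, lowest bits first.
static void unpack_q2_byte(uint8_t q, uint8_t out[4]) {
    for (int s = 0; s < 4; s++) {
        out[s] = (uint8_t)((q >> (2 * s)) & 0x3);
    }
}
```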
@@ -5341,6 +5772,87 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char lowScaleMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl_len(y[i].bsums, 8);
+
+        vector signed char q2xmins = (vector signed char)vec_xl_len(x[i].scales, 4);
+        vector signed char vscales = vec_and(q2xmins, lowScaleMask);
+
+        q2xmins = vec_sr(q2xmins, v4);
+        vector signed short q2xmins0 = vec_unpackh((vector signed char)q2xmins);
+
+        vector signed int prod0 = vec_mule(q2xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q2xmins0, q8ysums0);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char q2x00 = vec_and(qxs0, lowMask);
+        vector signed char q2x01 = vec_and(vec_sr(qxs0, v2), lowMask);
+        vector signed char q2x02 = vec_and(vec_sr(qxs0, v4), lowMask);
+        vector signed char q2x03 = vec_and(vec_sr(qxs0, v6), lowMask);
+
+        vector signed char q8y00 = vec_xl(  0, y[i].qs);
+        vector signed char q8y01 = vec_xl( 16, y[i].qs);
+        vector signed char q8y02 = vec_xl( 32, y[i].qs);
+        vector signed char q8y03 = vec_xl( 48, y[i].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q2x00, q8y00), vec_mulo(q2x00, q8y00));
+        vector signed short qv1 = vec_add(vec_mule(q2x01, q8y01), vec_mulo(q2x01, q8y01));
+        vector signed short qv2 = vec_add(vec_mule(q2x02, q8y02), vec_mulo(q2x02, q8y02));
+        vector signed short qv3 = vec_add(vec_mule(q2x03, q8y03), vec_mulo(q2x03, q8y03));
+
+        vector signed short vscales_h = vec_unpackh(vscales);
+        vector signed short vs0 = vec_splat(vscales_h, 0);
+        vector signed short vs1 = vec_splat(vscales_h, 1);
+        vector signed short vs2 = vec_splat(vscales_h, 2);
+        vector signed short vs3 = vec_splat(vscales_h, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv0, vs0), vec_mulo(qv0, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv1, vs1), vec_mulo(qv1, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv2, vs2), vec_mulo(qv2, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv3, vs3), vec_mulo(qv3, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else

     float sumf = 0;
@@ -5835,6 +6347,160 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint32_t aux[3];
+        uint32_t utmp[4];
+
+        memcpy(aux, x[i].scales, 12);
+        utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
+        utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
+        utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
+        utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
+
+        vector signed char vscales = (vector signed char)vec_xl( 0, utmp);
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].hmask);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].hmask);
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q3);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q3);
+            q3 += 32;
+
+            //the low 2 bits
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+            vector signed char qxs02 = vec_and(vec_sr(qxs0, v4), lowMask);
+            vector signed char qxs03 = vec_and(vec_sr(qxs0, v6), lowMask);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_and(vec_sr(qxs1, v2), lowMask);
+            vector signed char qxs12 = vec_and(vec_sr(qxs1, v4), lowMask);
+            vector signed char qxs13 = vec_and(vec_sr(qxs1, v6), lowMask);
+
+            //the 3rd bit
+            vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+            vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, (vector unsigned char)v1)), v2);
+            vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+            vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v3)), v2);
+            vector signed char qxh10 = vec_sl(vec_andc(v1, qxhs1), v2);
+            vector signed char qxh11 = vec_sl(vec_andc(v1, vec_sr(qxhs1, (vector unsigned char)v1)), v2);
+            vector signed char qxh12 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v2)), v2);
+            vector signed char qxh13 = vec_sl(vec_andc(v1, vec_sr(qxhs1, v3)), v2);
+            qxhs0 = vec_sr(qxhs0, v4);
+            qxhs1 = vec_sr(qxhs1, v4);
+
+            vector signed char q3x00 = vec_sub(qxs00, qxh00);
+            vector signed char q3x01 = vec_sub(qxs01, qxh01);
+            vector signed char q3x02 = vec_sub(qxs02, qxh02);
+            vector signed char q3x03 = vec_sub(qxs03, qxh03);
+            vector signed char q3x10 = vec_sub(qxs10, qxh10);
+            vector signed char q3x11 = vec_sub(qxs11, qxh11);
+            vector signed char q3x12 = vec_sub(qxs12, qxh12);
+            vector signed char q3x13 = vec_sub(qxs13, qxh13);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y02 = vec_xl( 64, q8);
+            vector signed char q8y12 = vec_xl( 80, q8);
+            vector signed char q8y03 = vec_xl( 96, q8);
+            vector signed char q8y13 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short vscales_h = vec_unpackh(vscales);
+            vector signed short vs0 = vec_splat(vscales_h, 0);
+            vector signed short vs1 = vec_splat(vscales_h, 1);
+            vector signed short vs2 = vec_splat(vscales_h, 2);
+            vector signed short vs3 = vec_splat(vscales_h, 3);
+            vector signed short vs4 = vec_splat(vscales_h, 4);
+            vector signed short vs5 = vec_splat(vscales_h, 5);
+            vector signed short vs6 = vec_splat(vscales_h, 6);
+            vector signed short vs7 = vec_splat(vscales_h, 7);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+            vector signed short qv02 = vec_add(vec_mule(q3x02, q8y02), vec_mulo(q3x02, q8y02));
+            vector signed short qv03 = vec_add(vec_mule(q3x03, q8y03), vec_mulo(q3x03, q8y03));
+            vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+            vector signed short qv12 = vec_add(vec_mule(q3x12, q8y12), vec_mulo(q3x12, q8y12));
+            vector signed short qv13 = vec_add(vec_mule(q3x13, q8y13), vec_mulo(q3x13, q8y13));
+
+            vector signed int vsum0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+            vector signed int vsum1 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+            vector signed int vsum2 = vec_add(vec_mule(qv02, vs4), vec_mulo(qv02, vs4));
+            vector signed int vsum3 = vec_add(vec_mule(qv03, vs6), vec_mulo(qv03, vs6));
+            vector signed int vsum4 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+            vector signed int vsum5 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+            vector signed int vsum6 = vec_add(vec_mule(qv12, vs5), vec_mulo(qv12, vs5));
+            vector signed int vsum7 = vec_add(vec_mule(qv13, vs7), vec_mulo(qv13, vs7));
+
+            vsumi0 = vec_add(vsum0, vsumi0);
+            vsumi1 = vec_add(vsum1, vsumi1);
+            vsumi2 = vec_add(vsum2, vsumi2);
+            vsumi3 = vec_add(vsum3, vsumi3);
+            vsumi4 = vec_add(vsum4, vsumi4);
+            vsumi5 = vec_add(vsum5, vsumi5);
+            vsumi6 = vec_add(vsum6, vsumi6);
+            vsumi7 = vec_add(vsum7, vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     // scalar version
     // This function is written like this so the compiler can manage to vectorize most of it
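The q3_K block above reconstructs signed 3-bit quants in two parts: the low 2 bits come from `qs`, and `hmask` holds the 3rd bit. `vec_andc(v1, qxhs)` selects quants whose high bit is clear, shifts that flag left by 2 (giving 4), and subtracts it, so a set high bit leaves the value in 0..3 while a clear one shifts it down to -4..-1. Scalar model for a single quant (the bit extraction is an assumption about the packing):

```c
#include <stdint.h>

// two_bits in 0..3, hbit in {0,1}: subtract 4 exactly when hbit == 0.
static int8_t q3_value(uint8_t two_bits, uint8_t hbit) {
    return (int8_t)(two_bits - (uint8_t)((hbit ^ 1u) << 2));
}
```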
@@ -6201,6 +6867,95 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0x3);
+    const vector signed char v1 = vec_splats((signed char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x8);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint16_t aux16[2];
+        int8_t * scales = (int8_t *)aux16;
+
+        const uint16_t a = *(const uint16_t *)x[i].scales;
+        aux16[0] = a & 0x0f0f;
+        aux16[1] = (a >> 4) & 0x0f0f;
+
+        vector signed char vscales = (vector signed char)vec_xl_len(scales, 8);
+        vector signed char qxhs0 = (vector signed char)vec_xl_len(x[i].hmask, 8);
+        qxhs0 = vec_or(qxhs0, vec_sr(vec_sld(qxhs0, qxhs0, 8), (vector unsigned char)v1));
+
+        vscales = vec_sub(vscales, off);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs00 = vec_and(qxs0, lowMask);
+        vector signed char qxs01 = vec_and(vec_sr(qxs0, v2), lowMask);
+        vector signed char qxs10 = vec_and(vec_sr(qxs0, v4), lowMask);
+        vector signed char qxs11 = vec_and(vec_sr(qxs0, v6), lowMask);
+
+        //the 3rd bit
+        vector signed char qxh00 = vec_sl(vec_andc(v1, qxhs0), v2);
+        vector signed char qxh01 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v2)), v2);
+        vector signed char qxh02 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v4)), v2);
+        vector signed char qxh03 = vec_sl(vec_andc(v1, vec_sr(qxhs0, v6)), v2);
+        qxhs0 = vec_sr(qxhs0, v4);
+
+        vector signed char q3x00 = vec_sub(qxs00, qxh00);
+        vector signed char q3x01 = vec_sub(qxs01, qxh01);
+        vector signed char q3x10 = vec_sub(qxs10, qxh02);
+        vector signed char q3x11 = vec_sub(qxs11, qxh03);
+
+        vector signed char q8y00 = vec_xl(  0, y[i].qs);
+        vector signed char q8y01 = vec_xl( 16, y[i].qs);
+        vector signed char q8y10 = vec_xl( 32, y[i].qs);
+        vector signed char q8y11 = vec_xl( 48, y[i].qs);
+
+        vector signed short vscales_h = vec_unpackh(vscales);
+        vector signed short vs0 = vec_splat(vscales_h, 0);
+        vector signed short vs1 = vec_splat(vscales_h, 1);
+        vector signed short vs2 = vec_splat(vscales_h, 2);
+        vector signed short vs3 = vec_splat(vscales_h, 3);
+
+        vector signed short qv00 = vec_add(vec_mule(q3x00, q8y00), vec_mulo(q3x00, q8y00));
+        vector signed short qv10 = vec_add(vec_mule(q3x10, q8y10), vec_mulo(q3x10, q8y10));
+        vector signed short qv01 = vec_add(vec_mule(q3x01, q8y01), vec_mulo(q3x01, q8y01));
+        vector signed short qv11 = vec_add(vec_mule(q3x11, q8y11), vec_mulo(q3x11, q8y11));
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else

     int8_t aux8[QK_K];
@@ -6553,6 +7308,142 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

     *s = sumf;

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        memcpy(utmp, x[i].scales, 12);
+
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
+        vector signed short vscales = vec_unpackh(utmps);
+        vector signed short q4xmins = vec_unpackl(utmps);
+        vector signed short q4xmins0 = vec_mergeh(q4xmins, q4xmins);
+        vector signed short q4xmins1 = vec_mergel(q4xmins, q4xmins);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q4xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q4xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q4xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q4 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; j+=2) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q4);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q4);
+            q4 += 64;
+
+            vector signed char q4x00 = vec_and(qxs0, lowMask);
+            vector signed char q4x01 = vec_sr(qxs0, v4);
+            vector signed char q4x10 = vec_and(qxs1, lowMask);
+            vector signed char q4x11 = vec_sr(qxs1, v4);
+            vector signed char q4x20 = vec_and(qxs2, lowMask);
+            vector signed char q4x21 = vec_sr(qxs2, v4);
+            vector signed char q4x30 = vec_and(qxs3, lowMask);
+            vector signed char q4x31 = vec_sr(qxs3, v4);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y01 = vec_xl( 32, q8);
+            vector signed char q8y11 = vec_xl( 48, q8);
+            vector signed char q8y20 = vec_xl( 64, q8);
+            vector signed char q8y30 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
+            vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
+            vector signed short qv20 = vec_add(vec_mule(q4x20, q8y20), vec_mulo(q4x20, q8y20));
+            vector signed short qv21 = vec_add(vec_mule(q4x21, q8y21), vec_mulo(q4x21, q8y21));
+            vector signed short qv30 = vec_add(vec_mule(q4x30, q8y30), vec_mulo(q4x30, q8y30));
+            vector signed short qv31 = vec_add(vec_mule(q4x31, q8y31), vec_mulo(q4x31, q8y31));
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vscales = vec_sld(vscales, vscales, 8);
+
+            qv00 = vec_add(qv00, qv10);
+            qv10 = vec_add(qv01, qv11);
+            qv20 = vec_add(qv20, qv30);
+            qv30 = vec_add(qv21, qv31);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv10, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv10, vs1), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv20, vs2), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv20, vs2), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv30, vs3), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv30, vs3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else


@@ -6819,6 +7710,87 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d[1]));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        uint16_t s16[2];
+        const uint8_t * scales = (const uint8_t *)s16;
+
+        const uint16_t * restrict b = (const uint16_t *)x[i].scales;
+        s16[0] = b[0] & 0x0f0f;
+        s16[1] = (b[0] >> 4) & 0x0f0f;
+
+        vector signed char utmps = (vector signed char)vec_xl_len(scales, 4);
+        vector signed short vscales = (vector signed short)vec_unpackh(utmps);
+        vector signed short q4xmins0 = vec_mergeh(vscales, vscales);
+        q4xmins0 = vec_sld(q4xmins0, q4xmins0, 8);
+
+        vector signed short q8ysums0 = vec_xl_len((const int16_t *)(y[i].bsums), 8);
+
+        vector signed int prod0 = vec_mule(q4xmins0, q8ysums0);
+        vector signed int prod1 = vec_mulo(q4xmins0, q8ysums0);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vd, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vd, vsumf1);
+
+        vd = vec_mul(vyd, vec_splats(GGML_FP16_TO_FP32(x[i].d[0])));
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
+        vector signed char q4x00 = vec_and(qxs0, lowMask);
+        vector signed char q4x01 = vec_sr(qxs0, v4);
+        vector signed char q4x10 = vec_and(qxs1, lowMask);
+        vector signed char q4x11 = vec_sr(qxs1, v4);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q4x00, q8y00), vec_mulo(q4x00, q8y00));
+        vector signed short qv01 = vec_add(vec_mule(q4x01, q8y01), vec_mulo(q4x01, q8y01));
+        vector signed short qv10 = vec_add(vec_mule(q4x10, q8y10), vec_mulo(q4x10, q8y10));
+        vector signed short qv11 = vec_add(vec_mule(q4x11, q8y11), vec_mulo(q4x11, q8y11));
+
+        vector signed short vs0 = vec_splat(vscales, 0);
+        vector signed short vs1 = vec_splat(vscales, 1);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs0), vec_mulo(qv10, vs0));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs1), vec_mulo(qv01, vs1));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs1), vec_mulo(qv11, vs1));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     uint8_t aux8[QK_K];
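This second q4_K hunk also shows how per-subblock minimums are charged against the result without touching individual weights: y[i].bsums already holds per-16-element sums of the q8 side, so vec_nmsub subtracts min times bsum once per subblock. A scalar sketch of that accounting, under an assumed flat layout (the arrays below are hypothetical, not the ggml structs):

    #include <stdint.h>

    // d, dmin: super-block scales; sc/mins: per-subblock scales and mins;
    // qprod[j]: integer dot product of subblock j; bsums[j]: precomputed sum
    // of the q8 values of subblock j.
    static float dot_with_mins(float d, float dmin,
                               const uint8_t *sc, const uint8_t *mins,
                               const int32_t *qprod, const int16_t *bsums, int nsub) {
        int32_t acc = 0, macc = 0;
        for (int j = 0; j < nsub; ++j) {
            acc  += sc[j]   * qprod[j];  // vec_madd side of the SIMD path
            macc += mins[j] * bsums[j];  // vec_nmsub side of the SIMD path
        }
        return d * acc - dmin * macc;
    }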
@@ -7220,6 +8192,130 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf+sums;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+        vector float vdmin = vec_mul(vxmin, vyd);
+
+        memcpy(utmp, x[i].scales, 12);
+
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
+
+        vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
+        vector signed short q8ysums1 = vec_xl(16, y[i].bsums);
+
+        vector signed char utmps = (vector signed char)vec_xl( 0, utmp);
+        vector signed short vscales = vec_unpackh(utmps);
+
+        vector signed short q5xmins = vec_unpackl(utmps);
+        vector signed short q5xmins0 = vec_mergeh(q5xmins, q5xmins);
+        vector signed short q5xmins1 = vec_mergel(q5xmins, q5xmins);
+
+        vector signed int prod0 = vec_mule(q5xmins0, q8ysums0);
+        vector signed int prod1 = vec_mule(q5xmins1, q8ysums1);
+        vector signed int prod2 = vec_mulo(q5xmins0, q8ysums0);
+        vector signed int prod3 = vec_mulo(q5xmins1, q8ysums1);
+
+        vsumf0 = vec_nmsub(vec_ctf(prod0, 0), vdmin, vsumf0);
+        vsumf1 = vec_nmsub(vec_ctf(prod1, 0), vdmin, vsumf1);
+        vsumf2 = vec_nmsub(vec_ctf(prod2, 0), vdmin, vsumf2);
+        vsumf3 = vec_nmsub(vec_ctf(prod3, 0), vdmin, vsumf3);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+        vector signed char qxhs1 = (vector signed char)vec_xl(16, x[i].qh);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q5 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q5, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q5);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q5);
+            q5 += 32;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+
+            vector signed char q5h00 = vec_sl(vec_and((vector signed char)v1, qxhs0), v4);
+            vector signed char q5h01 = vec_sl(vec_and((vector signed char)v2, qxhs0), v3);
+            vector signed char q5h10 = vec_sl(vec_and((vector signed char)v1, qxhs1), v4);
+            vector signed char q5h11 = vec_sl(vec_and((vector signed char)v2, qxhs1), v3);
+            qxhs0 = vec_sr(qxhs0, v2);
+            qxhs1 = vec_sr(qxhs1, v2);
+
+            vector signed char q5x00 = vec_or(q5h00, qxs00);
+            vector signed char q5x01 = vec_or(q5h01, qxs01);
+            vector signed char q5x10 = vec_or(q5h10, qxs10);
+            vector signed char q5x11 = vec_or(q5h11, qxs11);
+
+            vector signed char q8y00 = vec_xl( 0, q8);
+            vector signed char q8y10 = vec_xl(16, q8);
+            vector signed char q8y01 = vec_xl(32, q8);
+            vector signed char q8y11 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
+            vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
+            vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
+            vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vscales = vec_sld(vscales, vscales, 12);
+
+            qv00 = vec_add(qv00, qv10);
+            qv01 = vec_add(qv01, qv11);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv01, vs1), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv01, vs1), vsumi3);
+        }
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
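The q5_K hunk rebuilds 5-bit weights by pulling one bit per weight out of qh and shifting it above the stored nibble; the same vec_and/vec_sl/vec_or dance runs twice per 32 weights, with qh shifted right by two bits between subblocks. Scalar equivalent of one unpack step (illustrative helper, not a ggml function):

    #include <stdint.h>

    // qs holds two 4-bit weights; qh_bits holds their two high-bit planes.
    static inline void unpack_q5_pair(uint8_t qs, uint8_t qh_bits,
                                      uint8_t *lo, uint8_t *hi) {
        *lo = (uint8_t)((qs & 0x0F) | ((qh_bits & 1) << 4)); // (qh & 1) << 4
        *hi = (uint8_t)((qs >>  4)  | ((qh_bits & 2) << 3)); // (qh & 2) << 3
    }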
@@ -7517,6 +8613,83 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v1 = vec_splats((unsigned char)0x1);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].qs, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].qs);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].qs);
+        vector signed char qxs00 = (vector signed char)vec_and(qxs0, lowMask);
+        vector signed char qxs01 = (vector signed char)vec_sr(qxs0, v4);
+        vector signed char qxs10 = (vector signed char)vec_and(qxs1, lowMask);
+        vector signed char qxs11 = (vector signed char)vec_sr(qxs1, v4);
+
+        vector signed char qxhs = (vector signed char)vec_xl_len(x[i].qh, 8);
+        vector signed char qxhs0 = vec_or(qxhs, vec_sr(vec_sld(qxhs, qxhs, 8), v1));
+        vector signed char qxhs1 = vec_sr(qxhs0, v2);
+        vector signed char qxh00 = vec_sl(vec_andc((vector signed char)v1, qxhs0), v4);
+        vector signed char qxh10 = vec_sl(vec_andc((vector signed char)v1, qxhs1), v4);
+        vector signed char qxh01 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs0, v4)), v4);
+        vector signed char qxh11 = vec_sl(vec_andc((vector signed char)v1, vec_sr(qxhs1, v4)), v4);
+
+        vector signed char q5x00 = vec_sub(qxs00, qxh00);
+        vector signed char q5x10 = vec_sub(qxs10, qxh10);
+        vector signed char q5x01 = vec_sub(qxs01, qxh01);
+        vector signed char q5x11 = vec_sub(qxs11, qxh11);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q5x00, q8y00), vec_mulo(q5x00, q8y00));
+        vector signed short qv01 = vec_add(vec_mule(q5x01, q8y01), vec_mulo(q5x01, q8y01));
+        vector signed short qv10 = vec_add(vec_mule(q5x10, q8y10), vec_mulo(q5x10, q8y10));
+        vector signed short qv11 = vec_add(vec_mule(q5x11, q8y11), vec_mulo(q5x11, q8y11));
+
+        vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
+        vector signed short vs0 = vec_splat(vs, 0);
+        vector signed short vs1 = vec_splat(vs, 1);
+        vector signed short vs2 = vec_splat(vs, 2);
+        vector signed short vs3 = vec_splat(vs, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
@@ -7947,6 +9120,151 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q6 = x[i].ql;
+        const uint8_t * restrict qh = x[i].qh;
+        const int8_t * restrict qs = x[i].scales;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/128; ++j) {
+            __builtin_prefetch(q6, 0, 0);
+            __builtin_prefetch(qh, 0, 0);
+            __builtin_prefetch(q8, 0, 0);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q6);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q6);
+            vector signed char qxs2 = (vector signed char)vec_xl(32, q6);
+            vector signed char qxs3 = (vector signed char)vec_xl(48, q6);
+            q6 += 64;
+
+            vector signed char qxs00 = vec_and(qxs0, lowMask);
+            vector signed char qxs01 = vec_sr(qxs0, v4);
+            vector signed char qxs10 = vec_and(qxs1, lowMask);
+            vector signed char qxs11 = vec_sr(qxs1, v4);
+            vector signed char qxs20 = vec_and(qxs2, lowMask);
+            vector signed char qxs21 = vec_sr(qxs2, v4);
+            vector signed char qxs30 = vec_and(qxs3, lowMask);
+            vector signed char qxs31 = vec_sr(qxs3, v4);
+
+            vector signed char qxhs0 = (vector signed char)vec_xl( 0, qh);
+            vector signed char qxhs1 = (vector signed char)vec_xl(16, qh);
+            qh += 32;
+
+            vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+            vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+            vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, qxhs1), v4);
+            vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v4)), v4);
+            vector signed char qxh20 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+            vector signed char qxh21 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+            vector signed char qxh30 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v2)), v4);
+            vector signed char qxh31 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs1, v6)), v4);
+
+            vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+            vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+            vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+            vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+            vector signed char q6x20 = vec_sub(vec_or(qxh20, qxs20), off);
+            vector signed char q6x21 = vec_sub(vec_or(qxh21, qxs21), off);
+            vector signed char q6x30 = vec_sub(vec_or(qxh30, qxs30), off);
+            vector signed char q6x31 = vec_sub(vec_or(qxh31, qxs31), off);
+
+            vector signed char q8y00 = vec_xl(  0, q8);
+            vector signed char q8y10 = vec_xl( 16, q8);
+            vector signed char q8y20 = vec_xl( 32, q8);
+            vector signed char q8y30 = vec_xl( 48, q8);
+            vector signed char q8y01 = vec_xl( 64, q8);
+            vector signed char q8y11 = vec_xl( 80, q8);
+            vector signed char q8y21 = vec_xl( 96, q8);
+            vector signed char q8y31 = vec_xl(112, q8);
+            q8 += 128;
+
+            vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+            vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+            vector signed short qv20 = vec_add(vec_mule(q6x20, q8y20), vec_mulo(q6x20, q8y20));
+            vector signed short qv30 = vec_add(vec_mule(q6x30, q8y30), vec_mulo(q6x30, q8y30));
+            vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+            vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+            vector signed short qv21 = vec_add(vec_mule(q6x21, q8y21), vec_mulo(q6x21, q8y21));
+            vector signed short qv31 = vec_add(vec_mule(q6x31, q8y31), vec_mulo(q6x31, q8y31));
+
+            vector signed short vscales = vec_unpackh(vec_xl_len(qs, 8));
+            qs += 8;
+
+            vector signed short vs0 = vec_splat(vscales, 0);
+            vector signed short vs1 = vec_splat(vscales, 1);
+            vector signed short vs2 = vec_splat(vscales, 2);
+            vector signed short vs3 = vec_splat(vscales, 3);
+            vector signed short vs4 = vec_splat(vscales, 4);
+            vector signed short vs5 = vec_splat(vscales, 5);
+            vector signed short vs6 = vec_splat(vscales, 6);
+            vector signed short vs7 = vec_splat(vscales, 7);
+
+            vsumi0 = vec_add(vec_mule(qv00, vs0), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv00, vs0), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv01, vs4), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv01, vs4), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv10, vs1), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv10, vs1), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv11, vs5), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv11, vs5), vsumi7);
+
+            vsumi0 = vec_add(vec_mule(qv20, vs2), vsumi0);
+            vsumi1 = vec_add(vec_mulo(qv20, vs2), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv21, vs6), vsumi2);
+            vsumi3 = vec_add(vec_mulo(qv21, vs6), vsumi3);
+            vsumi4 = vec_add(vec_mule(qv30, vs3), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv30, vs3), vsumi5);
+            vsumi6 = vec_add(vec_mule(qv31, vs7), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv31, vs7), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
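The q6_K unpack above assembles each 6-bit weight from a 4-bit plane in ql and a 2-bit plane in qh, then removes the bias of 32 in one vector subtract so the result is already a signed value in [-32, 31]. Scalar equivalent (illustrative helper, not a ggml function):

    #include <stdint.h>

    // ql_nibble in [0, 15], qh_2bits in [0, 3]; mirrors the
    // vec_sl/vec_or/vec_sub(off) sequence in the diff.
    static inline int8_t unpack_q6(uint8_t ql_nibble, uint8_t qh_2bits) {
        return (int8_t)((ql_nibble | (qh_2bits << 4)) - 32);
    }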
@@ -8253,6 +9571,85 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
     *s = sumf;
 
+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v2 = vec_splats((unsigned char)0x2);
+    const vector unsigned char v3 = vec_splats((unsigned char)0x3);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+    const vector unsigned char v6 = vec_splats((unsigned char)0x6);
+    const vector signed char off = vec_splats((signed char)0x20);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+#pragma GCC unroll 2
+    for (int i = 0; i < nb; ++i) {
+        __builtin_prefetch(x[i].ql, 0, 1);
+        __builtin_prefetch(x[i].qh, 0, 1);
+        __builtin_prefetch(y[i].qs, 0, 1);
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs0 = (vector signed char)vec_xl( 0, x[i].ql);
+        vector signed char qxs1 = (vector signed char)vec_xl(16, x[i].ql);
+        vector signed char qxs00 = vec_and(qxs0, lowMask);
+        vector signed char qxs01 = vec_sr(qxs0, v4);
+        vector signed char qxs10 = vec_and(qxs1, lowMask);
+        vector signed char qxs11 = vec_sr(qxs1, v4);
+
+        vector signed char qxhs0 = (vector signed char)vec_xl( 0, x[i].qh);
+
+        vector signed char qxh00 = vec_sl(vec_and((vector signed char)v3, qxhs0), v4);
+        vector signed char qxh01 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v4)), v4);
+        vector signed char qxh10 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v2)), v4);
+        vector signed char qxh11 = vec_sl(vec_and((vector signed char)v3, vec_sr(qxhs0, v6)), v4);
+
+        vector signed char q6x00 = vec_sub(vec_or(qxh00, qxs00), off);
+        vector signed char q6x01 = vec_sub(vec_or(qxh01, qxs01), off);
+        vector signed char q6x10 = vec_sub(vec_or(qxh10, qxs10), off);
+        vector signed char q6x11 = vec_sub(vec_or(qxh11, qxs11), off);
+
+        vector signed char q8y00 = vec_xl( 0, y[i].qs);
+        vector signed char q8y10 = vec_xl(16, y[i].qs);
+        vector signed char q8y01 = vec_xl(32, y[i].qs);
+        vector signed char q8y11 = vec_xl(48, y[i].qs);
+
+        vector signed short qv00 = vec_add(vec_mule(q6x00, q8y00), vec_mulo(q6x00, q8y00));
+        vector signed short qv10 = vec_add(vec_mule(q6x10, q8y10), vec_mulo(q6x10, q8y10));
+        vector signed short qv01 = vec_add(vec_mule(q6x01, q8y01), vec_mulo(q6x01, q8y01));
+        vector signed short qv11 = vec_add(vec_mule(q6x11, q8y11), vec_mulo(q6x11, q8y11));
+
+        vector signed short vs = (vector signed short)vec_unpackh(vec_xl_len(x[i].scales, 4));
+        vector signed short vs0 = vec_splat(vs, 0);
+        vector signed short vs1 = vec_splat(vs, 1);
+        vector signed short vs2 = vec_splat(vs, 2);
+        vector signed short vs3 = vec_splat(vs, 3);
+
+        vector signed int vsumi0 = vec_add(vec_mule(qv00, vs0), vec_mulo(qv00, vs0));
+        vector signed int vsumi1 = vec_add(vec_mule(qv10, vs1), vec_mulo(qv10, vs1));
+        vector signed int vsumi2 = vec_add(vec_mule(qv01, vs2), vec_mulo(qv01, vs2));
+        vector signed int vsumi3 = vec_add(vec_mule(qv11, vs3), vec_mulo(qv11, vs3));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
+
 #else
 
     int8_t aux8[QK_K];
@@ -8294,7 +9691,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #endif
 
-#if defined (__AVX2__) || defined (__ARM_NEON)
+#if defined (__AVX2__) || defined (__ARM_NEON) || defined (__POWER9_VECTOR__)
 static const int8_t keven_signs_q2xs[1024] = {
     1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,
     1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,
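The guard change is all this hunk does, but it is what lets the iq2/iq3 POWER9 paths below index keven_signs_q2xs. The table enumerates the 128 sign patterns of eight ±1 values whose count of minus signs is even, so a 7-bit selector addresses a full 8-byte sign mask and the eighth sign never needs to be stored. A sketch of how such a table could be generated (an assumption for illustration; ggml ships it precomputed):

    #include <stdint.h>

    static void make_even_signs(int8_t table[128][8]) {
        for (int idx = 0; idx < 128; ++idx) {
            int parity = 0;
            for (int bit = 0; bit < 7; ++bit) {
                int neg = (idx >> bit) & 1;
                table[idx][bit] = neg ? -1 : 1;
                parity ^= neg;
            }
            table[idx][7] = parity ? -1 : 1; // eighth sign restores even parity
        }
    }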
@@ -8427,6 +9824,103 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 
     *s = 0.125f * hsum_float_8(accumf);
 
+#elif defined(__POWER9_VECTOR__)
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            uint32_t aux32[4];
+            const uint8_t * aux8 = (const uint8_t *)aux32;
+
+            memcpy(aux32, q2, 4*sizeof(uint32_t));
+            q2 += 8;
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xxs_grid + aux8[ 0]), *(const int64_t *)(iq2xxs_grid + aux8[ 1])};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xxs_grid + aux8[ 2]), *(const int64_t *)(iq2xxs_grid + aux8[ 3])};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xxs_grid + aux8[ 8]), *(const int64_t *)(iq2xxs_grid + aux8[ 9])};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xxs_grid + aux8[10]), *(const int64_t *)(iq2xxs_grid + aux8[11])};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((aux32[1] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >>  7) & 127))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((aux32[1] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[1] >> 21) & 127))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((aux32[3] >>  0) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >>  7) & 127))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((aux32[3] >> 14) & 127)), *(const int64_t *)(signs64 + ((aux32[3] >> 21) & 127))};
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = aux32[1] >> 28;
+            const uint16_t ls1 = aux32[3] >> 28;
+
+            vector signed short vscales01 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales23 = vec_splats((int16_t)(2*ls1+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
 #else
 
     uint32_t aux32[2];
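One 32-weight iq2_xxs block, as the vector code above decodes it: four 8-bit indices select 8-weight codewords from iq2xxs_grid, a companion 32-bit word packs four 7-bit selectors into keven_signs_q2xs plus a 4-bit scale in its top bits, and the caller applies d * (2*ls + 1) and the final 0.125f. The scalar helper below is an illustrative sketch, not the ggml routine; the two tables are assumed in scope as in the diff, and byte order follows a little-endian read of the grid words.

    #include <stdint.h>

    static int32_t iq2xxs_block_ref(const uint8_t idx[4], uint32_t packed,
                                    const uint64_t *grid,   // iq2xxs_grid
                                    const int8_t  *signs,   // keven_signs_q2xs
                                    const int8_t  *q8) {
        int32_t sum = 0;
        for (int v = 0; v < 4; ++v) {
            const int8_t *w  = (const int8_t *)&grid[idx[v]];
            const int8_t *sg = signs + 8 * ((packed >> (7 * v)) & 127);
            for (int k = 0; k < 8; ++k)
                sum += w[k] * sg[k] * q8[8 * v + k];
        }
        return sum; // scale by d * y_d * (2*(packed >> 28) + 1), then by 0.125f
    }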
@@ -8702,6 +10196,104 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
     *s = 0.125f * hsum_float_8(accumf);
 #endif
 
+#elif defined(__POWER9_VECTOR__)
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint16_t * restrict q2 = x[i].qs;
+        const uint8_t * restrict sc = x[i].scales;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/64; ++j) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2xs_grid + (q2[0] & 511)), *(const int64_t *)(iq2xs_grid + (q2[1] & 511))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2xs_grid + (q2[2] & 511)), *(const int64_t *)(iq2xs_grid + (q2[3] & 511))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2xs_grid + (q2[4] & 511)), *(const int64_t *)(iq2xs_grid + (q2[5] & 511))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2xs_grid + (q2[6] & 511)), *(const int64_t *)(iq2xs_grid + (q2[7] & 511))};
+
+            vector signed long long vsigns0 = {*(const int64_t *)(signs64 + ((q2[0] >> 9))), *(const int64_t *)(signs64 + ((q2[1] >> 9)))};
+            vector signed long long vsigns1 = {*(const int64_t *)(signs64 + ((q2[2] >> 9))), *(const int64_t *)(signs64 + ((q2[3] >> 9)))};
+            vector signed long long vsigns2 = {*(const int64_t *)(signs64 + ((q2[4] >> 9))), *(const int64_t *)(signs64 + ((q2[5] >> 9)))};
+            vector signed long long vsigns3 = {*(const int64_t *)(signs64 + ((q2[6] >> 9))), *(const int64_t *)(signs64 + ((q2[7] >> 9)))};
+            q2 += 8;
+
+            vector signed char q2x0 = (vector signed char)vec_mul((vector signed char)vsigns0, (vector signed char)aux64x2_0);
+            vector signed char q2x1 = (vector signed char)vec_mul((vector signed char)vsigns1, (vector signed char)aux64x2_1);
+            vector signed char q2x2 = (vector signed char)vec_mul((vector signed char)vsigns2, (vector signed char)aux64x2_2);
+            vector signed char q2x3 = (vector signed char)vec_mul((vector signed char)vsigns3, (vector signed char)aux64x2_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
 #else
 
     float sumf = 0.f;
@@ -8902,6 +10494,124 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void *
 
     *s = 0.125f * hsum_float_8(accumf);
 
+#elif defined(__POWER9_VECTOR__)
+    static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+    };
+
+    static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector unsigned char mask0 = vec_xl( 0, k_mask1);
+    const vector unsigned char mask1 = vec_xl(16, k_mask1);
+    const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q2 = x[i].qs;
+        const uint8_t * restrict qh = x[i].qh;
+        const uint16_t * restrict signs = (const uint16_t *)(x[i].qs + QK_K/8);
+        const uint8_t * restrict sc = x[i].scales;
+        const int8_t * restrict q8 = y[i].qs;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q2, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq2s_grid + (q2[0] | ((qh[0] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[1] | ((qh[0] << 6) & 0x300)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq2s_grid + (q2[2] | ((qh[0] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[3] | ((qh[0] << 2) & 0x300)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq2s_grid + (q2[4] | ((qh[1] << 8) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[5] | ((qh[1] << 6) & 0x300)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq2s_grid + (q2[6] | ((qh[1] << 4) & 0x300))), *(const int64_t *)(iq2s_grid + (q2[7] | ((qh[1] << 2) & 0x300)))};
+            q2 += 8;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns23 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns23, vsigns23, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns23, vsigns23, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q2x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux64x2_0), vsigns0);
+            vector signed char q2x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux64x2_1), vsigns1);
+            vector signed char q2x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux64x2_2), vsigns2);
+            vector signed char q2x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux64x2_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q2x0, q8y0), vec_mulo(q2x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q2x1, q8y1), vec_mulo(q2x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q2x2, q8y2), vec_mulo(q2x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q2x3, q8y3), vec_mulo(q2x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
+            const uint16_t ls2 = (uint16_t)(sc[1] & 0xf);
+            const uint16_t ls3 = (uint16_t)(sc[1] >> 4);
+            sc += 2;
+
+            vector signed short vscales0 = vec_splats((int16_t)(2*ls0+1));
+            vector signed short vscales1 = vec_splats((int16_t)(2*ls1+1));
+            vector signed short vscales2 = vec_splats((int16_t)(2*ls2+1));
+            vector signed short vscales3 = vec_splats((int16_t)(2*ls3+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales0), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales1), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales2), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales3), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales0), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales1), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales2), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales3), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.125f * vec_extract(vsumf0, 0);
 #else
 
     float sumf = 0;
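The iq2_s hunk above expands a 16-bit sign field into per-lane byte masks: vec_perm over k_mask1 broadcasts each sign byte across eight lanes, the bit test against k_mask2 plus vec_cmpeq turns each lane into 0x00 or 0xFF, and xor-then-subtract applies the sign. Scalar view of the per-lane step (illustrative helper; the grid magnitudes are small, so the int8 arithmetic cannot overflow):

    #include <stdint.h>

    static inline int8_t apply_sign_bit(int8_t x, uint16_t sign_bits, int lane) {
        int8_t m = ((sign_bits >> lane) & 1) ? (int8_t)0xFF : 0; // vec_cmpeq mask
        return (int8_t)((x ^ m) - m);                            // -x when m is all-ones
    }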
@@ -9046,6 +10756,101 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
 
     *s = 0.25f * hsum_float_8(accumf);
 
+#elif defined(__POWER9_VECTOR__)
+    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q3 = x[i].qs;
+        const uint32_t * restrict signs = (const uint32_t *)(x[i].qs + QK_K/4);
+        const int8_t * restrict q8 = y[i].qs;
+
+#pragma GCC unroll 1
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q3, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector unsigned int aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
+            vector unsigned int aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
+            vector unsigned int aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
+            vector unsigned int aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+            q3 += 16;
+
+            vector unsigned long long aux64x2_0 = {(uint64_t)(signs64[(signs[0] >>  0) & 127]), (uint64_t)(signs64[(signs[0] >>  7) & 127])};
+            vector unsigned long long aux64x2_1 = {(uint64_t)(signs64[(signs[0] >> 14) & 127]), (uint64_t)(signs64[(signs[0] >> 21) & 127])};
+            vector unsigned long long aux64x2_2 = {(uint64_t)(signs64[(signs[1] >>  0) & 127]), (uint64_t)(signs64[(signs[1] >>  7) & 127])};
+            vector unsigned long long aux64x2_3 = {(uint64_t)(signs64[(signs[1] >> 14) & 127]), (uint64_t)(signs64[(signs[1] >> 21) & 127])};
+
+            vector signed char q3x0 = vec_mul((vector signed char)aux64x2_0, (vector signed char)aux32x4_0);
+            vector signed char q3x1 = vec_mul((vector signed char)aux64x2_1, (vector signed char)aux32x4_1);
+            vector signed char q3x2 = vec_mul((vector signed char)aux64x2_2, (vector signed char)aux32x4_2);
+            vector signed char q3x3 = vec_mul((vector signed char)aux64x2_3, (vector signed char)aux32x4_3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(signs[0] >> 28);
+            const uint16_t ls1 = (uint16_t)(signs[1] >> 28);
+            signs += 2;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = 0.25f * vec_extract(vsumf0, 0);
 #else
 
     uint32_t aux32;
@@ -9273,6 +11078,124 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void *
|
|
9273
11078
|
|
9274
11079
|
*s = hsum_float_8(accumf);
|
9275
11080
|
|
11081
|
+
#elif defined(__POWER9_VECTOR__)
|
11082
|
+
static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
|
11083
|
+
0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
|
11084
|
+
};
|
11085
|
+
|
11086
|
+
static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,};
|
11087
|
+
|
11088
|
+
vector float vsumf0 = vec_splats(0.0f);
|
11089
|
+
vector float vsumf1 = vec_splats(0.0f);
|
11090
|
+
vector float vsumf2 = vec_splats(0.0f);
|
11091
|
+
vector float vsumf3 = vec_splats(0.0f);
|
11092
|
+
|
11093
|
+
const vector unsigned char mask0 = vec_xl( 0, k_mask1);
|
11094
|
+
const vector unsigned char mask1 = vec_xl(16, k_mask1);
|
11095
|
+
const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);
|
11096
|
+
|
11097
|
+
for (int i = 0; i < nb; ++i) {
|
11098
|
+
vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
|
11099
|
+
vector float vyd = vec_splats(y[i].d);
|
11100
|
+
vector float vd = vec_mul(vxd, vyd);
|
11101
|
+
|
11102
|
+
const uint8_t * restrict q3 = x[i].qs;
|
11103
|
+
const uint8_t * restrict qh = x[i].qh;
|
11104
|
+
const uint16_t * restrict signs = (const uint16_t *)(x[i].signs);
|
11105
|
+
const uint8_t * restrict sc = x[i].scales;
|
11106
|
+
const int8_t * restrict q8 = y[i].qs;
|
11107
|
+
|
11108
|
+
vector signed int vsumi0 = vec_splats((int32_t)0);
|
11109
|
+
vector signed int vsumi1 = vec_splats((int32_t)0);
|
11110
|
+
vector signed int vsumi2 = vec_splats((int32_t)0);
|
11111
|
+
vector signed int vsumi3 = vec_splats((int32_t)0);
|
11112
|
+
vector signed int vsumi4 = vec_splats((int32_t)0);
|
11113
|
+
vector signed int vsumi5 = vec_splats((int32_t)0);
|
11114
|
+
vector signed int vsumi6 = vec_splats((int32_t)0);
|
11115
|
+
vector signed int vsumi7 = vec_splats((int32_t)0);
|
11116
|
+
|
11117
|
+
for (int j = 0; j < QK_K/32; j += 2) {
|
11118
|
+
__builtin_prefetch(q3, 0, 1);
|
11119
|
+
__builtin_prefetch(q8, 0, 1);
|
11120
|
+
|
11121
|
+
vector unsigned int aux32x4_0 = {iq3s_grid[q3[ 0] | ((qh[0] << 8) & 256)], iq3s_grid[q3[ 1] | ((qh[0] << 7) & 256)],
|
11122
|
+
iq3s_grid[q3[ 2] | ((qh[0] << 6) & 256)], iq3s_grid[q3[ 3] | ((qh[0] << 5) & 256)]};
|
11123
|
+
+            vector unsigned int aux32x4_1 = {iq3s_grid[q3[ 4] | ((qh[0] << 4) & 256)], iq3s_grid[q3[ 5] | ((qh[0] << 3) & 256)],
+                                             iq3s_grid[q3[ 6] | ((qh[0] << 2) & 256)], iq3s_grid[q3[ 7] | ((qh[0] << 1) & 256)]};
+            vector unsigned int aux32x4_2 = {iq3s_grid[q3[ 8] | ((qh[1] << 8) & 256)], iq3s_grid[q3[ 9] | ((qh[1] << 7) & 256)],
+                                             iq3s_grid[q3[10] | ((qh[1] << 6) & 256)], iq3s_grid[q3[11] | ((qh[1] << 5) & 256)]};
+            vector unsigned int aux32x4_3 = {iq3s_grid[q3[12] | ((qh[1] << 4) & 256)], iq3s_grid[q3[13] | ((qh[1] << 3) & 256)],
+                                             iq3s_grid[q3[14] | ((qh[1] << 2) & 256)], iq3s_grid[q3[15] | ((qh[1] << 1) & 256)]};
+            q3 += 16;
+            qh += 2;
+
+            vector signed char vsigns01 = (vector signed char)vec_splats(*(const uint32_t *)&signs[0]);
+            vector signed char vsigns02 = (vector signed char)vec_splats(*(const uint32_t *)&signs[2]);
+            signs += 4;
+
+            vector signed char vsigns0 = vec_perm(vsigns01, vsigns01, mask0);
+            vector signed char vsigns1 = vec_perm(vsigns01, vsigns01, mask1);
+            vector signed char vsigns2 = vec_perm(vsigns02, vsigns02, mask0);
+            vector signed char vsigns3 = vec_perm(vsigns02, vsigns02, mask1);
+
+            vsigns0 = (vector signed char)vec_cmpeq(vec_and(vsigns0, mask2), mask2);
+            vsigns1 = (vector signed char)vec_cmpeq(vec_and(vsigns1, mask2), mask2);
+            vsigns2 = (vector signed char)vec_cmpeq(vec_and(vsigns2, mask2), mask2);
+            vsigns3 = (vector signed char)vec_cmpeq(vec_and(vsigns3, mask2), mask2);
+
+            vector signed char q3x0 = vec_sub(vec_xor(vsigns0, (vector signed char)aux32x4_0), vsigns0);
+            vector signed char q3x1 = vec_sub(vec_xor(vsigns1, (vector signed char)aux32x4_1), vsigns1);
+            vector signed char q3x2 = vec_sub(vec_xor(vsigns2, (vector signed char)aux32x4_2), vsigns2);
+            vector signed char q3x3 = vec_sub(vec_xor(vsigns3, (vector signed char)aux32x4_3), vsigns3);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q3x0, q8y0), vec_mulo(q3x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q3x1, q8y1), vec_mulo(q3x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q3x2, q8y2), vec_mulo(q3x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q3x3, q8y3), vec_mulo(q3x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(sc[0] & 0xf);
+            const uint16_t ls1 = (uint16_t)(sc[0] >> 4);
+            sc ++;
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else

     float sumf = 0.f;
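Note on the hunk above: each vec_sub(vec_xor(vsignsN, x), vsignsN) pairing is the usual two's-complement conditional negate, applying the unpacked iq3_s sign bits to the grid values: lanes whose sign mask is all ones come out negated, lanes whose mask is zero pass through, since (x ^ -1) - (-1) equals ~x + 1, which is -x. A minimal scalar sketch of the idiom (not part of the diff; the helper name is made up):

static inline int8_t cond_negate_sketch(int8_t x, int8_t m) {
    /* m is 0 (keep x) or -1 (negate x): (x ^ -1) - (-1) == ~x + 1 == -x */
    return (int8_t)((x ^ m) - m);
}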
@@ -9427,6 +11350,113 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void

     *s = hsum_float_8(accum) + IQ1S_DELTA * accum1;

+#elif defined(__POWER9_VECTOR__)
+    const vector unsigned char v0 = vec_splats((unsigned char)0x0);
+    const vector unsigned short vsign = vec_splats((unsigned short)0x8000);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    for (int i = 0; i < nb; ++i) {
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+        vector float vyd = vec_splats(y[i].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+        vector signed int vsumi8 = vec_splats((int32_t)0);
+
+        const uint8_t * restrict q1 = x[i].qs;
+        const uint16_t * restrict qh = x[i].qh;
+        const int8_t * restrict q8 = y[i].qs;
+        const int16_t * restrict qs = y[i].bsums;
+
+        for (int j = 0; j < QK_K/32; j += 2) {
+            __builtin_prefetch(q1, 0, 1);
+            __builtin_prefetch(qh, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed long long aux64x2_0 = {*(const int64_t *)(iq1s_grid + (q1[0] | ((qh[0] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[1] | ((qh[0] << 5) & 0x700)))};
+            vector signed long long aux64x2_1 = {*(const int64_t *)(iq1s_grid + (q1[2] | ((qh[0] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[3] | ((qh[0] >> 1) & 0x700)))};
+            vector signed long long aux64x2_2 = {*(const int64_t *)(iq1s_grid + (q1[4] | ((qh[1] << 8) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[5] | ((qh[1] << 5) & 0x700)))};
+            vector signed long long aux64x2_3 = {*(const int64_t *)(iq1s_grid + (q1[6] | ((qh[1] << 2) & 0x700))), *(const int64_t *)(iq1s_grid + (q1[7] | ((qh[1] >> 1) & 0x700)))};
+            q1 += 8;
+
+            vector signed char q1x0 = (vector signed char)aux64x2_0;
+            vector signed char q1x1 = (vector signed char)aux64x2_1;
+            vector signed char q1x2 = (vector signed char)aux64x2_2;
+            vector signed char q1x3 = (vector signed char)aux64x2_3;
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q1x0, q8y0), vec_mulo(q1x0, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q1x1, q8y1), vec_mulo(q1x1, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q1x2, q8y2), vec_mulo(q1x2, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q1x3, q8y3), vec_mulo(q1x3, q8y3));
+
+            const uint16_t ls0 = (uint16_t)((qh[0] >> 12) & 7);
+            const uint16_t ls1 = (uint16_t)((qh[1] >> 12) & 7);
+
+            vector signed short vscales01 = (vector signed short)vec_splats((uint16_t)(2*ls0+1));
+            vector signed short vscales23 = (vector signed short)vec_splats((uint16_t)(2*ls1+1));
+            vector signed short vscales = vec_sld(vscales23, vscales01, 8);
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+
+            vector signed short q8ysums = vec_xl_len(qs, 8);
+            qs += 4;
+            q8ysums = vec_mergeh(q8ysums, (vector signed short)v0);
+
+            vector signed short qxh = (vector signed short)vec_sld(vec_splats(qh[1]), vec_splats(qh[0]), 8);
+            qh += 2;
+            vector __bool short vsel = vec_cmpge(qxh, (vector signed short)v0);
+
+            vector signed short q8ysum = vec_sel((vector signed short)vec_xor((vector unsigned short)q8ysums, vsign), q8ysums, vsel);
+
+            vsumi8 = vec_add(vec_mule(q8ysum, vscales), vsumi8);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi8, 0), vec_mul(vd, vec_splats(IQ1S_DELTA)), vsumf0);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else

     float sumf = 0;
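Note on the hunks above: both POWER9 paths end with the same vec_sld/vec_add pair, a log2-step horizontal sum of the four float lanes in vsumf0. Rotating the vector by 4 bytes and adding leaves each lane holding the sum of two neighbouring lanes; rotating by 8 and adding folds those into the total, which vec_extract then reads from lane 0. A scalar sketch with the same association (hypothetical helper, not part of the diff):

static inline float hsum4_sketch(const float v[4]) {
    /* pass 1 pairs neighbours, pass 2 folds the pairs: ((v0+v1)+(v2+v3)) */
    return (v[0] + v[1]) + (v[2] + v[3]);
}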
@@ -9783,6 +11813,51 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *

     *s = hsum_float_8(_mm256_add_ps(accum1, accum2));

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+#pragma GCC unroll 4
+    for (int ib = 0; ib < nb; ++ib) {
+        __builtin_prefetch(x[ib].qs, 0, 1);
+        __builtin_prefetch(y[ib].qs, 0, 1);
+
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
+        vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
+        vector signed char q4x0 = vec_and(qxs, lowMask);
+        vector signed char q4x1 = vec_sr(qxs, v4);
+
+        q4x0 = vec_perm(values, values, (vector unsigned char)q4x0);
+        q4x1 = vec_perm(values, values, (vector unsigned char)q4x1);
+
+        vector signed char q8y0 = vec_xl( 0, y[ib].qs);
+        vector signed char q8y1 = vec_xl(16, y[ib].qs);
+
+        vector signed short qv0 = vec_add(vec_mule(q4x0, q8y0), vec_mulo(q4x0, q8y0));
+        vector signed short qv1 = vec_add(vec_mule(q4x1, q8y1), vec_mulo(q4x1, q8y1));
+
+        vector signed int vsumi0 = vec_add(vec_unpackh(qv0), vec_unpackl(qv0));
+        vector signed int vsumi1 = vec_add(vec_unpackh(qv1), vec_unpackl(qv1));
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     float sumf = 0;
     for (int ib = 0; ib < nb; ++ib) {
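Note on the hunk above (and the iq4_xs hunk below): vec_perm(values, values, idx) serves as a 16-entry byte table lookup. With both source operands set to the same codebook vector, the low four bits of each index lane select one of the sixteen kvalues_iq4nl entries, turning each packed 4-bit quant into its signed codebook value. Roughly, in scalar form (hypothetical helper, not part of the diff):

static inline void iq4nl_lookup_sketch(int8_t dst[16], const uint8_t nibbles[16]) {
    for (int j = 0; j < 16; ++j) {
        dst[j] = kvalues_iq4nl[nibbles[j] & 0xF]; /* 4-bit index into the codebook */
    }
}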
@@ -9894,6 +11969,105 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *

     *s = hsum_float_8(accum);

+#elif defined(__POWER9_VECTOR__)
+    const vector signed char lowMask = vec_splats((signed char)0xF);
+    const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+
+    vector float vsumf0 = vec_splats(0.0f);
+    vector float vsumf1 = vec_splats(0.0f);
+    vector float vsumf2 = vec_splats(0.0f);
+    vector float vsumf3 = vec_splats(0.0f);
+
+    const vector signed char values = vec_xl( 0, kvalues_iq4nl);
+
+    for (int ibl = 0; ibl < nb; ++ibl) {
+
+        vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+        vector float vyd = vec_splats(y[ibl].d);
+        vector float vd = vec_mul(vxd, vyd);
+
+        vector signed int vsumi0 = vec_splats((int32_t)0);
+        vector signed int vsumi1 = vec_splats((int32_t)0);
+        vector signed int vsumi2 = vec_splats((int32_t)0);
+        vector signed int vsumi3 = vec_splats((int32_t)0);
+        vector signed int vsumi4 = vec_splats((int32_t)0);
+        vector signed int vsumi5 = vec_splats((int32_t)0);
+        vector signed int vsumi6 = vec_splats((int32_t)0);
+        vector signed int vsumi7 = vec_splats((int32_t)0);
+
+        uint16_t h = x[ibl].scales_h;
+
+        const uint8_t * restrict q4 = x[ibl].qs;
+        const uint8_t * restrict sc = x[ibl].scales_l;
+        const int8_t * restrict q8 = y[ibl].qs;
+
+        for (int ib = 0; ib < QK_K/64; ib ++ ) {
+            __builtin_prefetch(q4, 0, 1);
+            __builtin_prefetch(q8, 0, 1);
+
+            vector signed char qxs0 = (vector signed char)vec_xl( 0, q4);
+            vector signed char qxs1 = (vector signed char)vec_xl(16, q4);
+            q4 += 32;
+
+            vector signed char q4x00 = (vector signed char)vec_and(qxs0, lowMask);
+            vector signed char q4x01 = (vector signed char)vec_sr(qxs0, v4);
+            vector signed char q4x10 = (vector signed char)vec_and(qxs1, lowMask);
+            vector signed char q4x11 = (vector signed char)vec_sr(qxs1, v4);
+
+            q4x00 = vec_perm(values, values, (vector unsigned char)q4x00);
+            q4x01 = vec_perm(values, values, (vector unsigned char)q4x01);
+            q4x10 = vec_perm(values, values, (vector unsigned char)q4x10);
+            q4x11 = vec_perm(values, values, (vector unsigned char)q4x11);
+
+            vector signed char q8y0 = vec_xl( 0, q8);
+            vector signed char q8y1 = vec_xl(16, q8);
+            vector signed char q8y2 = vec_xl(32, q8);
+            vector signed char q8y3 = vec_xl(48, q8);
+            q8 += 64;
+
+            vector signed short qv0 = vec_add(vec_mule(q4x00, q8y0), vec_mulo(q4x00, q8y0));
+            vector signed short qv1 = vec_add(vec_mule(q4x01, q8y1), vec_mulo(q4x01, q8y1));
+            vector signed short qv2 = vec_add(vec_mule(q4x10, q8y2), vec_mulo(q4x10, q8y2));
+            vector signed short qv3 = vec_add(vec_mule(q4x11, q8y3), vec_mulo(q4x11, q8y3));
+
+            const uint16_t ls0 = (uint16_t)(((sc[0] & 0xf) | ((h << 4) & 0x30)) - 32);
+            const uint16_t ls1 = (uint16_t)(((sc[0] >> 4) | ((h << 2) & 0x30)) - 32);
+            h >>= 4;
+            sc ++;
+
+            vector signed short vscales01 = vec_splats((int16_t)ls0);
+            vector signed short vscales23 = vec_splats((int16_t)ls1);
+
+            vsumi0 = vec_add(vec_mule(qv0, vscales01), vsumi0);
+            vsumi1 = vec_add(vec_mule(qv1, vscales01), vsumi1);
+            vsumi2 = vec_add(vec_mule(qv2, vscales23), vsumi2);
+            vsumi3 = vec_add(vec_mule(qv3, vscales23), vsumi3);
+            vsumi4 = vec_add(vec_mulo(qv0, vscales01), vsumi4);
+            vsumi5 = vec_add(vec_mulo(qv1, vscales01), vsumi5);
+            vsumi6 = vec_add(vec_mulo(qv2, vscales23), vsumi6);
+            vsumi7 = vec_add(vec_mulo(qv3, vscales23), vsumi7);
+        }
+
+        vsumi0 = vec_add(vsumi0, vsumi4);
+        vsumi1 = vec_add(vsumi1, vsumi5);
+        vsumi2 = vec_add(vsumi2, vsumi6);
+        vsumi3 = vec_add(vsumi3, vsumi7);
+
+        vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0);
+        vsumf1 = vec_madd(vec_ctf(vsumi1, 0), vd, vsumf1);
+        vsumf2 = vec_madd(vec_ctf(vsumi2, 0), vd, vsumf2);
+        vsumf3 = vec_madd(vec_ctf(vsumi3, 0), vd, vsumf3);
+    }
+
+    vsumf0 = vec_add(vsumf0, vsumf2);
+    vsumf1 = vec_add(vsumf1, vsumf3);
+
+    vsumf0 = vec_add(vsumf0, vsumf1);
+
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4));
+    vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8));
+
+    *s = vec_extract(vsumf0, 0);
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {