llama_cpp 0.12.5 → 0.12.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
@@ -49,6 +49,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define UNUSED GGML_UNUSED
+
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
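The UNUSED alias added here is what the updated dot-product kernels later in this diff use to silence warnings about their new stride parameters on code paths that ignore them. Assuming GGML_UNUSED keeps its usual ggml.h definition as a cast to void, the idiom amounts to this small sketch (names are illustrative only):

    #include <stddef.h>

    /* Illustrative only: UNUSED/GGML_UNUSED is assumed to expand to a (void) cast. */
    #define UNUSED_EXAMPLE(x) (void)(x)

    void kernel_stub(int n, size_t bs) {
        UNUSED_EXAMPLE(bs);   /* parameter kept for API symmetry, not used on this path */
        UNUSED_EXAMPLE(n);
    }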
@@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif
+
 #if !defined(__aarch64__)
 
 // 64-bit compatibility
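The MSVC branch packs the four 32-bit lanes into two 64-bit halves; the apparent motivation is that MSVC's ARM64 NEON vector types are brace-initialized from 64-bit chunks rather than four 32-bit values, while GCC and Clang accept the four-element form directly. A plain-C sketch of the packing arithmetic, runnable without NEON:

    #include <assert.h>
    #include <stdint.h>

    /* Shows that (w) + ((uint64_t)(x) << 32) puts w in the low 32 bits and x in
       the high 32 bits of one 64-bit half, i.e. the same lane order as {w, x, y, z}. */
    int main(void) {
        const uint32_t w = 0x11111111u, x = 0x22222222u;
        const uint64_t half = (uint64_t)w + ((uint64_t)x << 32);
        assert((uint32_t)(half & 0xFFFFFFFFu) == w);
        assert((uint32_t)(half >> 32) == x);
        return 0;
    }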
@@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_0 * restrict vx0 = vx;
+        const block_q4_0 * restrict vx1 = vx + bx;
+
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_0 * restrict b_x0 = &vx0[i];
+            const block_q4_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+            const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // sub 8
+            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
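For orientation, here is a hedged scalar reference of what ggml_vec_dot_q4_0_q8_0 computes in the ordinary nrc == 1 case. The block layouts are assumed from upstream ggml and are not shown in this diff (q4_0: an fp16 scale plus 16 bytes holding 32 signed 4-bit weights stored with an offset of 8; q8_0: an fp16 scale plus 32 int8 weights); plain float scales keep the sketch self-contained:

    #include <stdint.h>

    /* Illustrative reference, not the SIMD kernel above. */
    typedef struct { float d; uint8_t qs[16]; } q4_0_ref;   /* assumed layout */
    typedef struct { float d; int8_t  qs[32]; } q8_0_ref;   /* assumed layout */

    float vec_dot_q4_0_q8_0_ref(int nblocks, const q4_0_ref *x, const q8_0_ref *y) {
        float sum = 0.0f;
        for (int i = 0; i < nblocks; ++i) {
            int isum = 0;
            for (int j = 0; j < 16; ++j) {
                const int xl = (x[i].qs[j] & 0x0F) - 8;   /* weights 0..15  */
                const int xh = (x[i].qs[j] >> 4)   - 8;   /* weights 16..31 */
                isum += xl * y[i].qs[j] + xh * y[i].qs[j + 16];
            }
            sum += x[i].d * y[i].d * (float) isum;        /* per-block scales */
        }
        return sum;   /* the real kernel writes this into *s */
    }

The new __ARM_FEATURE_MATMUL_INT8 branch produces the same per-row value, but for two x rows and two y rows at once, feeding the int8 2x2 matrix-multiply instruction behind vmmlaq_s32 and then scaling each lane with the matching product of block scales.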
@@ -3729,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
 
         // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
         const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
+        qx = _mm256_sub_epi8( qx, off );
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps( d, q, acc );
@@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
 #endif
 }
 
-void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_1 * restrict vx0 = vx;
+        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q8_1 * restrict vy0 = vy;
+        const block_q8_1 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+        float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_1 * restrict b_x0 = &vx0[i];
+            const block_q4_1 * restrict b_x1 = &vx1[i];
+            const block_q8_1 * restrict b_y0 = &vy0[i];
+            const block_q8_1 * restrict b_y1 = &vy1[i];
+
+            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
+                                   GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
+                                   GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
+                                   GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
+            summs0 += summs_t;
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            // mmla into int32x4_t
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+        sumv2 = sumv2 + summs0;
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
     // TODO: add WASM SIMD
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
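A matching scalar sketch for the q4_1 x q8_1 kernel in the nrc == 1 case. q4_1 stores unsigned 4-bit weights with a scale d and offset m (value = d*q + m), and q8_1 carries a precomputed s = d * sum(qs); that cached sum is exactly what the m*s terms in the new i8mm branch above rely on. Field widths are simplified to float to keep the sketch self-contained:

    #include <stdint.h>

    typedef struct { float d, m; uint8_t qs[16]; } q4_1_ref;   /* assumed layout */
    typedef struct { float d, s; int8_t  qs[32]; } q8_1_ref;   /* assumed layout */

    float vec_dot_q4_1_q8_1_ref(int nblocks, const q4_1_ref *x, const q8_1_ref *y) {
        float sum = 0.0f;
        for (int i = 0; i < nblocks; ++i) {
            int isum = 0;
            for (int j = 0; j < 16; ++j) {
                isum += (x[i].qs[j] & 0x0F) * y[i].qs[j]        /* weights 0..15  */
                      + (x[i].qs[j] >> 4)   * y[i].qs[j + 16];  /* weights 16..31 */
            }
            sum += x[i].d * y[i].d * (float) isum + x[i].m * y[i].s;
        }
        return sum;
    }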
@@ -4028,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+        const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
 
-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
 
         // Accumulate d0*d1*x*y
 #if defined(__AVX2__)
@@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -4245,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps(d, q, acc);
@@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -4544,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 
         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
 
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
@@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q8_0 * restrict vx0 = vx;
+        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q8_0 * restrict b_x0 = &vx0[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+
+            const block_q8_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
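The q8_0 x q8_0 variant makes the intent of the new parameters easiest to see: when nrc == 2, one call computes a 2x2 tile of dot products, reading a second x row bx bytes after vx and a second y row by bytes after vy, and writing the two output rows bs floats apart. Judging from the zip and store sequence above, the lane-to-output mapping appears to be s[j*bs + i] = dot(x row i, y row j); a hedged scalar sketch of that contract, with the same illustrative q8_0 layout as before:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { float d; int8_t qs[32]; } q8_0_ref;   /* assumed layout */

    /* Illustrative 2x2 tile reference; the i/j-to-s[] mapping is inferred from
       the NEON shuffles above, not stated anywhere in the diff itself. */
    void vec_dot_q8_0_q8_0_tile_ref(int nblocks, float *s, size_t bs,
                                    const q8_0_ref *x[2], const q8_0_ref *y[2]) {
        for (int j = 0; j < 2; ++j) {           /* y row -> output row    */
            for (int i = 0; i < 2; ++i) {       /* x row -> output column */
                float sum = 0.0f;
                for (int b = 0; b < nblocks; ++b) {
                    int isum = 0;
                    for (int k = 0; k < 32; ++k) {
                        isum += x[i][b].qs[k] * y[j][b].qs[k];
                    }
                    sum += x[i][b].d * y[j][b].d * (float) isum;
                }
                s[j * bs + i] = sum;
            }
        }
    }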
@@ -4731,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
 #if defined(__AVX2__)
@@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
 }
 
 #if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q2_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q2_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const uint32_t kmask1 = 0x03030303;
     const uint32_t kmask2 = 0x0f0f0f0f;
@@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q3_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }
 #else
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 
 #if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q6_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q6_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
     1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
 
-void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq2_xxs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
 #endif
 }
 
-void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq2_xs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
 }
 
 // TODO
-void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq3_xxs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
     for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
         q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
         memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-        const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-        const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-        const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-        const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+        const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+        const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+        const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+        const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
        q3 += 16;
        q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
        q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
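The iq3_xxs kernel is the other consumer of the new ggml_vld1q_u32 helper: each call gathers four 32-bit entries of the iq3xxs_grid lookup table, indexed by the packed q3 bytes, into one vector. In scalar terms the gather is simply the following (the grid element type is assumed to be uint32_t, as in upstream ggml):

    #include <stdint.h>

    /* Illustrative scalar equivalent; `grid` stands in for iq3xxs_grid and
       `idx` for four consecutive q3 index bytes. */
    void gather4_u32(uint32_t out[4], const uint32_t *grid, const uint8_t idx[4]) {
        for (int i = 0; i < 4; ++i) {
            out[i] = grid[idx[i]];   /* one 32-bit lane per index */
        }
    }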
@@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
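Taken together, the header changes give every quantized dot-product kernel the same extended shape: a destination stride bs (in floats), byte strides bx and by for the two operand rows, and a row count nrc. The .c changes above show that bs, bx, and by are ignored whenever nrc == 1, so a single-row caller can simply pass zeros. A hedged caller sketch (the wrapper names are made up; only the prototype comes from this diff, with GGML_RESTRICT elided):

    #include <stddef.h>

    /* Prototype as in the updated ggml-quants.h, restrict qualifiers dropped. */
    void ggml_vec_dot_q8_0_q8_0(int n, float *s, size_t bs, const void *vx, size_t bx,
                                const void *vy, size_t by, int nrc);

    /* Ordinary single dot product: stride arguments are ignored when nrc == 1. */
    void dot_one(int n, float *out, const void *x_row, const void *y_row) {
        ggml_vec_dot_q8_0_q8_0(n, out, 0, x_row, 0, y_row, 0, 1);
    }

    /* With __ARM_FEATURE_MATMUL_INT8 available, a caller may instead request a
       2x2 tile: two x rows x_stride bytes apart, two y rows y_stride bytes apart,
       and the two output rows out_stride floats apart. */
    void dot_2x2(int n, float *out, size_t out_stride,
                 const void *x_rows, size_t x_stride,
                 const void *y_rows, size_t y_stride) {
        ggml_vec_dot_q8_0_q8_0(n, out, out_stride, x_rows, x_stride, y_rows, y_stride, 2);
    }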