llama_cpp 0.12.5 → 0.12.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
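The bulk of the vendored llama.cpp changes below widen the ggml_vec_dot_* kernel signatures: each dot product now receives a result stride bs, operand strides bx/by, and a row count nrc, so one call can fill a small tile of the output matrix instead of a single value (the diff uses this with nrc == 2 on CPUs with the ARM int8 matmul extension). A minimal scalar sketch of the new contract, assuming a hypothetical float kernel in place of the real quantized ones (bx/by are element strides here; the real kernels take byte strides into the quantized row data):

#include <assert.h>
#include <stddef.h>

/* Hypothetical scalar model of the widened ggml_vec_dot contract:
 * computes an nrc x nrc tile of dot products, writing the result for
 * x-row i against y-row j at s[j*bs + i], matching how the nrc == 2
 * paths below store their 2x2 tile via s and s + bs. */
static void vec_dot_model(int n, float *s, size_t bs,
                          const float *vx, size_t bx,
                          const float *vy, size_t by, int nrc) {
    assert(nrc == 1 || nrc == 2);
    for (int j = 0; j < nrc; j++) {
        for (int i = 0; i < nrc; i++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++) {
                sum += vx[i*bx + k] * vy[j*by + k];
            }
            s[j*bs + i] = sum;
        }
    }
}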
@@ -49,6 +49,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define UNUSED GGML_UNUSED
+
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
 #if defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif
+
 #if !defined(__aarch64__)
 
 // 64-bit compatibility
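The new ggml_vld1q_u32 helper works around an MSVC quirk: MSVC's ARM64 headers define the NEON vector types as unions over 64-bit lanes and reject the per-element { w, x, y, z } initializer that GCC and Clang accept, so the MSVC branch packs each pair of 32-bit values into one 64-bit half instead. A small sketch of the two expansions producing the same lanes, assuming a little-endian AArch64 target:

#include <arm_neon.h>
#include <stdint.h>

#ifdef _MSC_VER
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
#else
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
#endif

static uint32x4_t make_lanes(void) {
    /* lanes 0..3 come out as 1, 2, 3, 4 with either expansion */
    const uint32x4_t v = ggml_vld1q_u32(1u, 2u, 3u, 4u);
    return v;
}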
@@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
 }
 #endif
 
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_0 * restrict vx0 = vx;
+        const block_q4_0 * restrict vx1 = vx + bx;
+
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_0 * restrict b_x0 = &vx0[i];
+            const block_q4_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+            const int8x16_t  s8b = vdupq_n_s8(0x8);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+            const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+            const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // sub 8
+            const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+            const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+            const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+            const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
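The nrc == 2 fast path above hinges on vmmlaq_s32 (the AArch64 SMMLA instruction from the __ARM_FEATURE_MATMUL_INT8 extension): it treats each int8x16_t operand as a 2x8 row-major matrix and accumulates the 2x2 int32 product of the first operand with the transpose of the second, which is why the vzip1q_s64/vzip2q_s64 shuffles pair up the matching 8-byte halves of the two rows. A hedged scalar model of that semantics:

#include <stdint.h>

/* Scalar model of SMMLA (vmmlaq_s32): a and b are 2x8 int8 matrices
 * stored row-major in 16 bytes each; acc is a row-major 2x2 int32
 * tile updated as acc += a * transpose(b). */
static void smmla_model(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            int32_t sum = 0;
            for (int k = 0; k < 8; k++) {
                sum += (int32_t)a[i*8 + k] * (int32_t)b[j*8 + k];
            }
            acc[i*2 + j] += sum;
        }
    }
}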
@@ -3729,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
 
         // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
         const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
+        qx = _mm256_sub_epi8( qx, off );
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps( d, q, acc );
@@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
 #endif
 }
 
-void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q4_1 * restrict vx0 = vx;
+        const block_q4_1 * restrict vx1 = vx + bx;
+        const block_q8_1 * restrict vy0 = vy;
+        const block_q8_1 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+        float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q4_1 * restrict b_x0 = &vx0[i];
+            const block_q4_1 * restrict b_x1 = &vx1[i];
+            const block_q8_1 * restrict b_y0 = &vy0[i];
+            const block_q8_1 * restrict b_y1 = &vy1[i];
+
+            float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
+                                   GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
+                                   GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
+                                   GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
+            summs0 += summs_t;
+
+            const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+            const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+            const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+            // 4-bit -> 8-bit
+            const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+            const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+            const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+            const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            // mmla into int32x4_t
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+        sumv2 = sumv2 + summs0;
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
     // TODO: add WASM SIMD
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
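The q4_1 variant carries one extra term: a block_q4_1 stores a scale d and a minimum m (value = d*q + m), and a block_q8_1 caches s = d*sum(qs), so each block's contribution decomposes as d_x*d_y*sum(qx*qy) + m_x*s_y. That is why the summs_t/summs0 accumulator above collects the offset term separately from the mmla tile. A hedged single-block reference in plain C (hypothetical helper, assuming the usual ggml nibble layout with the low nibbles holding the first 16 quants):

#include <stdint.h>

#define QK4_1 32

/* Hypothetical reference for one q4_1 x q8_1 block: xd/xm are the
 * block_q4_1 scale/min, yd the block_q8_1 scale and ys its cached
 * yd * sum(yqs). Result: xd*yd*sum(qx*qy) + xm*ys. */
static float q4_1_q8_1_block_dot(float xd, float xm, const uint8_t *xqs,
                                 float yd, float ys, const int8_t *yqs) {
    int32_t sumi = 0;
    for (int i = 0; i < QK4_1/2; i++) {
        const int lo = xqs[i] & 0x0F; /* quant i           (low nibble)  */
        const int hi = xqs[i] >> 4;   /* quant i + QK4_1/2 (high nibble) */
        sumi += lo * yqs[i] + hi * yqs[i + QK4_1/2];
    }
    return xd * yd * (float)sumi + xm * ys;
}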
@@ -4028,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+        const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );
 
-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+        const __m256 xy = mul_sum_us8_pairs_float(qx, qy);
 
         // Accumulate d0*d1*x*y
 #if defined(__AVX2__)
@@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -4245,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
         /* Compute combined scale for the block */
         const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps(d, q, acc);
@@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_1;
     const int nb = n / qk;
 
     assert(n % qk == 0);
     assert(qk == QK5_1);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_1 * restrict x = vx;
     const block_q8_1 * restrict y = vy;
@@ -4544,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 
         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        __m256i qx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
+        qx = _mm256_or_si256(qx, bxhi);
 
         const __m256 dy = _mm256_set1_ps(y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
+        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
 
         acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
@@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
 #endif
 }
 
-void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
 
     assert(n % qk == 0);
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    assert((nrc == 2) || (nrc == 1));
+#else
+    assert(nrc == 1);
+#endif
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q8_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    if (nrc == 2) {
+        const block_q8_0 * restrict vx0 = vx;
+        const block_q8_0 * restrict vx1 = vx + bx;
+        const block_q8_0 * restrict vy0 = vy;
+        const block_q8_0 * restrict vy1 = vy + by;
+
+        float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+        for (int i = 0; i < nb; i++) {
+            const block_q8_0 * restrict b_x0 = &vx0[i];
+            const block_q8_0 * restrict b_y0 = &vy0[i];
+
+            const block_q8_0 * restrict b_x1 = &vx1[i];
+            const block_q8_0 * restrict b_y1 = &vy1[i];
+
+            const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+            const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+            const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+            const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+            // load y
+            const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+            const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+            const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+            const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+            float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+                                 GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+            int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+            int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+            int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+            int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+            int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+            int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+            int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+            int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+                                                                                l1, r1)), l2, r2)), l3, r3))), scale);
+        }
+        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+        vst1_f32(s, vget_low_f32(sumv2));
+        vst1_f32(s + bs, vget_high_f32(sumv2));
+        return;
+    }
+#endif
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
@@ -4731,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
     for (int i = 0; i < nb; ++i) {
         // Compute combined scale for the block
        const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+        __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+        const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
 #if defined(__AVX2__)
@@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
 }
 
 #if QK_K == 256
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q2_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q2_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const uint32_t kmask1 = 0x03030303;
     const uint32_t kmask2 = 0x0f0f0f0f;
@@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q3_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 }
 #else
-void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q4_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 #endif
 
 #if QK_K == 256
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q5_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
 
 #if QK_K == 256
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q6_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
 #else
 
-void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_q6_K * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
     1,  1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
 };
 
-void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq2_xxs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
 #endif
 }
 
-void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq2_xs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
 }
 
 // TODO
-void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
     assert(n % QK_K == 0);
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
 
     const block_iq3_xxs * restrict x = vx;
     const block_q8_K * restrict y = vy;
@@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
     for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
         q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
         memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
-        const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
-        const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
-        const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
-        const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+        const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+        const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+        const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+        const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
         q3 += 16;
         q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
         q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
@@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
 void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
 //
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")