llama_cpp 0.12.5 → 0.12.6

This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -49,6 +49,8 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #define UNUSED GGML_UNUSED
+
  #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
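Note: UNUSED is aliased to the library's existing GGML_UNUSED macro so that the kernels below can mark the newly added bs, bx, by and nrc parameters as deliberately unused on builds that only take the single-result path.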
@@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

  #if defined(__ARM_NEON)
+
+ #ifdef _MSC_VER
+
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+ #else
+
+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+ #endif
+
  #if !defined(__aarch64__)

  // 64-bit compatibility
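Note: GCC and Clang expose NEON vector types as vector extensions that can be brace-initialized lane by lane, while MSVC's NEON types cannot be initialized as four 32-bit lanes; the MSVC variant of ggml_vld1q_u32 therefore packs each pair of 32-bit lanes into one 64-bit initializer element (assuming little-endian lane order). A minimal self-check of the intended equivalence, illustrative only and assuming an AArch64 target:

    #include <arm_neon.h>
    #include <assert.h>

    static void check_ggml_vld1q_u32(void) {
        // Both macro expansions should produce the lanes {1, 2, 3, 4}.
        const uint32x4_t v = ggml_vld1q_u32(1u, 2u, 3u, 4u);
        assert(vgetq_lane_u32(v, 0) == 1u);
        assert(vgetq_lane_u32(v, 1) == 2u);
        assert(vgetq_lane_u32(v, 2) == 3u);
        assert(vgetq_lane_u32(v, 3) == 4u);
    }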
@@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
  }
  #endif

- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q4_0 * restrict vx0 = vx;
+ const block_q4_0 * restrict vx1 = vx + bx;
+
+ const block_q8_0 * restrict vy0 = vy;
+ const block_q8_0 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q4_0 * restrict b_x0 = &vx0[i];
+ const block_q4_0 * restrict b_x1 = &vx1[i];
+ const block_q8_0 * restrict b_y0 = &vy0[i];
+ const block_q8_0 * restrict b_y1 = &vy1[i];
+
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
+ const int8x16_t s8b = vdupq_n_s8(0x8);
+
+ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+ // 4-bit -> 8-bit
+ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+ // sub 8
+ const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+ const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+ const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+ const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
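Note: __ARM_FEATURE_MATMUL_INT8 advertises the AArch64 i8mm extension. In the nrc == 2 path, each vmmlaq_s32 (SMMLA) multiplies two 8-byte segments taken from the x rows against two 8-byte segments taken from the y rows and accumulates all four pairwise dot products into a 2x2 int32 tile, so one pass over the blocks yields the dot products of two x rows with two y rows at once. A scalar reference for the result layout, as I read the final vext/vzip shuffle (a hypothetical helper for illustration, not part of the package):

    // Two results land at s[0..1]; the other two land bs floats later.
    static void vec_dot_2x2_ref(int n, float * s, size_t bs,
                                const float * x0, const float * x1,
                                const float * y0, const float * y1) {
        float s00 = 0.0f, s01 = 0.0f, s10 = 0.0f, s11 = 0.0f;
        for (int j = 0; j < n; j++) {
            s00 += x0[j] * y0[j];
            s01 += x0[j] * y1[j];
            s10 += x1[j] * y0[j];
            s11 += x1[j] * y1[j];
        }
        s[0]      = s00;   // x row 0 . y row 0
        s[1]      = s10;   // x row 1 . y row 0
        s[bs]     = s01;   // x row 0 . y row 1
        s[bs + 1] = s11;   // x row 1 . y row 1
    }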
@@ -3729,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
  /* Compute combined scale for the block */
  const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);

  // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
  const __m256i off = _mm256_set1_epi8( 8 );
- bx = _mm256_sub_epi8( bx, off );
+ qx = _mm256_sub_epi8( qx, off );

- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  /* Multiply q with scale and accumulate */
  acc = _mm256_fmadd_ps( d, q, acc );
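Note: the bx/by locals in the pre-existing SIMD bodies are renamed to qx/qy here and in the hunks below because bx and by now name the new size_t stride parameters; the renames are mechanical and change no behavior.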
@@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
  #endif
  }

- void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q4_1 * restrict vx0 = vx;
+ const block_q4_1 * restrict vx1 = vx + bx;
+ const block_q8_1 * restrict vy0 = vy;
+ const block_q8_1 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+ float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q4_1 * restrict b_x0 = &vx0[i];
+ const block_q4_1 * restrict b_x1 = &vx1[i];
+ const block_q8_1 * restrict b_y0 = &vy0[i];
+ const block_q8_1 * restrict b_y1 = &vy1[i];
+
+ float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
+ GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
+ GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
+ GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
+ summs0 += summs_t;
+
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+ // 4-bit -> 8-bit
+ const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+ const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+ const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+ const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ // mmla into int32x4_t
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+ sumv2 = sumv2 + summs0;
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  // TODO: add WASM SIMD
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
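Note: the q4_1 variant additionally carries the per-block bias term (the fp16 minimum m times the q8_1 block sum s). Its summs_t lanes are ordered (x0·y0, x1·y0, x0·y1, x1·y1), unlike the scale vector, because summs0 is added only after the final vext/vzip shuffle, when sumv2 is already in that lane order.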
@@ -4028,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );

  // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
- const __m256i bx = bytes_from_nibbles_32(x[i].qs);
- const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+ const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+ const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );

- const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+ const __m256 xy = mul_sum_us8_pairs_float(qx, qy);

  // Accumulate d0*d1*x*y
  #if defined(__AVX2__)
@@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
  assert(qk == QK5_0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -4245,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  /* Compute combined scale for the block */
  const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
  bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
- bx = _mm256_or_si256(bx, bxhi);
+ qx = _mm256_or_si256(qx, bxhi);

- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  /* Multiply q with scale and accumulate */
  acc = _mm256_fmadd_ps(d, q, acc);
@@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

  assert(n % qk == 0);
  assert(qk == QK5_1);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;
@@ -4544,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri

  summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
- bx = _mm256_or_si256(bx, bxhi);
+ qx = _mm256_or_si256(qx, bxhi);

  const __m256 dy = _mm256_set1_ps(y[i].d);
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_us8_pairs_float(bx, by);
+ const __m256 q = mul_sum_us8_pairs_float(qx, qy);

  acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
  }
@@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q8_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q8_0 * restrict vx0 = vx;
+ const block_q8_0 * restrict vx1 = vx + bx;
+ const block_q8_0 * restrict vy0 = vy;
+ const block_q8_0 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q8_0 * restrict b_x0 = &vx0[i];
+ const block_q8_0 * restrict b_y0 = &vy0[i];
+
+ const block_q8_0 * restrict b_x1 = &vx1[i];
+ const block_q8_0 * restrict b_y1 = &vy1[i];
+
+ const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+ const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+ const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+ const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
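Note: the q8_0 tile follows the same mmla pattern as the q4_0 one above, minus the nibble unpacking and the subtract-8 step: the int8 quants are loaded directly with vld1q_s8 and fed straight into the vmmlaq_s32 chain.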
@@ -4731,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  for (int i = 0; i < nb; ++i) {
  // Compute combined scale for the block
  const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
- __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  // Multiply q with scale and accumulate
  #if defined(__AVX2__)
@@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  }

  #if QK_K == 256
- void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q2_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q2_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const uint32_t kmask1 = 0x03030303;
  const uint32_t kmask2 = 0x0f0f0f0f;
@@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q3_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  #endif
  }
  #else
- void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri


  #if QK_K == 256
- void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q6_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q6_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
  1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
  };

- void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq2_xxs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
  #endif
  }

- void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq2_xs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
  }

  // TODO
- void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq3_xxs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
  memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
- const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
- const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
- const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
- const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
  q3 += 16;
  q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
  q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
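Note: this hunk is the consumer that motivated the ggml_vld1q_u32 macro above; the plain brace initializers it replaces rely on GCC/Clang vector-extension semantics and do not initialize MSVC's NEON vector types correctly.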
@@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);

  // Dot product
- void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-
- void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

  //
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
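Note: every dot-product prototype (these declarations appear to come from the package's quantization header) gains the same three strides plus a result count. As used in the kernels above, bx and by are byte offsets from vx and vy to a second input row (note the arithmetic on void pointers, a GNU C extension), bs is the offset in floats between the two result columns written to s, and nrc is the number of results to compute: always 1, except that the q4_0, q4_1 and q8_0 kernels accept 2 on __ARM_FEATURE_MATMUL_INT8 builds. A hedged caller sketch; x_rows, y_rows and row_size are illustrative placeholders, not names from the package:

    // One result at a time - valid for every kernel:
    float out1;
    ggml_vec_dot_q8_0_q8_0(n, &out1, 0, x_rows, 0, y_rows, 0, /*nrc=*/1);

    // A 2x2 tile in one call, on an i8mm build:
    float out2[4];
    ggml_vec_dot_q8_0_q8_0(n, out2, /*bs=*/2,
                           x_rows, /*bx=*/row_size,  // bytes to the next x row
                           y_rows, /*by=*/row_size,  // bytes to the next y row
                           /*nrc=*/2);
    // out2[0], out2[1]: x rows 0 and 1 against y row 0
    // out2[2], out2[3]: x rows 0 and 1 against y row 1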