llama_cpp 0.12.5 → 0.12.6

@@ -49,6 +49,8 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #define UNUSED GGML_UNUSED
+
  #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
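
Note on the hunk above: UNUSED is a local alias for GGML_UNUSED so that the widened ggml_vec_dot_* signatures further down can discard the parameters a given build does not use without triggering unused-parameter warnings. A minimal sketch of the idiom; the exact definition of GGML_UNUSED lives in the vendored ggml headers, and the cast-to-void form shown here is an assumption:

    /* assumed definition: the conventional cast-to-void suppression */
    #define GGML_UNUSED(x) (void)(x)
    #define UNUSED GGML_UNUSED

    static void sketch(int n, unsigned long bs) {
        UNUSED(bs);   /* expands to (void)(bs); silences the warning, emits no code */
        (void)n;
    }
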
@@ -268,6 +270,17 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

  #if defined(__ARM_NEON)
+
+ #ifdef _MSC_VER
+
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+ #else
+
+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+ #endif
+
  #if !defined(__aarch64__)

  // 64-bit compatibility
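
Note on the hunk above: MSVC's ARM64 headers do not accept a four-element 32-bit initializer list for the NEON vector types, so ggml_vld1q_u32 packs each pair of 32-bit words into one 64-bit initializer on that compiler and falls back to the plain four-lane initializer elsewhere. The sketch below, written with ordinary integers so it compiles anywhere, checks that the packed form reproduces the same 16-byte layout as four adjacent uint32_t values on a little-endian target (the values are arbitrary):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        uint32_t w = 0x11111111u, x = 0x22222222u, y = 0x33333333u, z = 0x44444444u;
        uint32_t lanes[4]  = { w, x, y, z };                /* non-MSVC initializer        */
        uint64_t packed[2] = { w + ((uint64_t)x << 32),     /* MSVC-style initializer      */
                               y + ((uint64_t)z << 32) };
        assert(memcmp(lanes, packed, sizeof lanes) == 0);   /* same little-endian layout   */
        return 0;
    }
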
@@ -3666,15 +3679,92 @@ static inline __m128i get_scale_shuffle(int i) {
  }
  #endif

- void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q4_0 * restrict vx0 = vx;
+ const block_q4_0 * restrict vx1 = vx + bx;
+
+ const block_q8_0 * restrict vy0 = vy;
+ const block_q8_0 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q4_0 * restrict b_x0 = &vx0[i];
+ const block_q4_0 * restrict b_x1 = &vx1[i];
+ const block_q8_0 * restrict b_y0 = &vy0[i];
+ const block_q8_0 * restrict b_y1 = &vy1[i];
+
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
+ const int8x16_t s8b = vdupq_n_s8(0x8);
+
+ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+ // 4-bit -> 8-bit
+ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+ // sub 8
+ const int8x16_t x0_l = vsubq_s8(v0_0l, s8b);
+ const int8x16_t x0_h = vsubq_s8(v0_0h, s8b);
+ const int8x16_t x1_l = vsubq_s8(v0_1l, s8b);
+ const int8x16_t x1_h = vsubq_s8(v0_1h, s8b);
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
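
Note on the hunk above: the dot-product kernels now take a destination stride bs, operand row strides bx and by, and a row count nrc. When the target advertises the ARM int8 matrix-multiply extension (__ARM_FEATURE_MATMUL_INT8), nrc == 2 computes all four combinations of two q4_0 rows against two q8_0 rows in one pass, feeding 2×8 int8 tiles into vmmlaq_s32 (SMMLA) and writing the results as a 2×2 tile into s, one output row bs floats apart. A scalar sketch of what one block pair contributes, assuming the usual q4_0/q8_0 block layout (an fp16 scale d plus 32 quantized values, with the q4_0 nibbles split low/high); the helper name is illustrative:

    /* scalar reference for one q4_0 block times one q8_0 block (assumed layout) */
    static float dot_block_q4_0_q8_0(const block_q4_0 * x, const block_q8_0 * y) {
        int sum = 0;
        for (int j = 0; j < 16; ++j) {
            sum += ((x->qs[j] & 0x0F) - 8) * y->qs[j];        /* low nibbles  -> elements 0..15  */
            sum += ((x->qs[j] >>   4) - 8) * y->qs[j + 16];   /* high nibbles -> elements 16..31 */
        }
        return GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d) * (float) sum;
    }

The nrc == 2 path accumulates these per-block terms for the pairs (x0,y0), (x0,y1), (x1,y0), (x1,y1), with the int8 products done 64 at a time by the SMMLA instruction.
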
@@ -3729,15 +3819,15 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
  /* Compute combined scale for the block */
  const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);

  // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
  const __m256i off = _mm256_set1_epi8( 8 );
- bx = _mm256_sub_epi8( bx, off );
+ qx = _mm256_sub_epi8( qx, off );

- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  /* Multiply q with scale and accumulate */
  acc = _mm256_fmadd_ps( d, q, acc );
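
Note on the hunk above: the AVX locals are renamed from bx/by to qx/qy only because the new parameters already use bx and by for the operand strides; keeping the old names would shadow the parameters inside the loop. There is no behavioral change. A minimal illustration of the shadowing hazard, with hypothetical names:

    void f(float * s, unsigned long bx) {   /* bx is now a stride parameter             */
        for (int i = 0; i < 4; ++i) {
            int bx = i;                     /* legal inside the loop, but it shadows     */
            s[i] = (float) bx;              /* the parameter, which becomes unreachable  */
        }
    }
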
@@ -3956,15 +4046,93 @@ void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q4_1 * restrict vx0 = vx;
+ const block_q4_1 * restrict vx1 = vx + bx;
+ const block_q8_1 * restrict vy0 = vy;
+ const block_q8_1 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+ float32x4_t summs0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q4_1 * restrict b_x0 = &vx0[i];
+ const block_q4_1 * restrict b_x1 = &vx1[i];
+ const block_q8_1 * restrict b_y0 = &vy0[i];
+ const block_q8_1 * restrict b_y1 = &vy1[i];
+
+ float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
+ GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
+ GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
+ GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
+ summs0 += summs_t;
+
+ const uint8x16_t m4b = vdupq_n_u8(0x0F);
+
+ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs);
+ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs);
+
+ // 4-bit -> 8-bit
+ const int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
+ const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
+ const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
+ const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ // mmla into int32x4_t
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+ sumv2 = sumv2 + summs0;
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  // TODO: add WASM SIMD
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
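
Note on the hunk above: q4_1 blocks carry an offset m in addition to the scale d, and q8_1 blocks cache the scaled sum of their quants in s, so the 2×2 SMMLA tile accumulates an extra m·s term per block pair (the summs0 vector) on top of the d·d·Σq·q term. A scalar sketch of one block pair, under the same layout assumptions as before and mirroring the conversions the kernel itself applies; the helper name is illustrative:

    /* scalar reference for one q4_1 block times one q8_1 block (assumed layout) */
    static float dot_block_q4_1_q8_1(const block_q4_1 * x, const block_q8_1 * y) {
        int sum = 0;
        for (int j = 0; j < 16; ++j) {
            sum += (x->qs[j] & 0x0F) * y->qs[j];        /* q4_1 stores unsigned nibbles, no -8 */
            sum += (x->qs[j] >>   4) * y->qs[j + 16];
        }
        return GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d) * (float) sum
             + GGML_FP16_TO_FP32(x->m) * y->s;          /* y->s caches d_y * sum of y->qs      */
    }
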
@@ -4028,10 +4196,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );

  // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
- const __m256i bx = bytes_from_nibbles_32(x[i].qs);
- const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
+ const __m256i qx = bytes_from_nibbles_32(x[i].qs);
+ const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[i].qs );

- const __m256 xy = mul_sum_us8_pairs_float(bx, by);
+ const __m256 xy = mul_sum_us8_pairs_float(qx, qy);

  // Accumulate d0*d1*x*y
  #if defined(__AVX2__)
@@ -4096,12 +4264,17 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
  assert(qk == QK5_0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;
@@ -4245,14 +4418,14 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  /* Compute combined scale for the block */
  const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
  bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
- bx = _mm256_or_si256(bx, bxhi);
+ qx = _mm256_or_si256(qx, bxhi);

- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  /* Multiply q with scale and accumulate */
  acc = _mm256_fmadd_ps(d, q, acc);
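
Note on the two hunks above: q5_0 (and, below, q5_1 and every k-quant and IQ kernel) only gains the widened signature; there is no SMMLA tile path for these formats yet, so they pin the row count to one and discard the stride arguments. A hypothetical caller-side fallback, derived only from the pointer arithmetic of the nrc == 2 kernels above (by as a byte stride between operand rows, bs as a float stride between output rows); names and orientation are assumptions:

    /* hypothetical fallback: one call per output row when a kernel only supports nrc == 1 */
    for (int r = 0; r < nrows; ++r) {
        ggml_vec_dot_q5_0_q8_0(n, s + (size_t) r * bs, bs,
                               x, bx,
                               (const char *) y + (size_t) r * by, by,
                               1);
    }
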
@@ -4382,12 +4555,17 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_1;
  const int nb = n / qk;

  assert(n % qk == 0);
  assert(qk == QK5_1);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_1 * restrict x = vx;
  const block_q8_1 * restrict y = vy;
@@ -4544,15 +4722,15 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri

  summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;

- __m256i bx = bytes_from_nibbles_32(x[i].qs);
+ __m256i qx = bytes_from_nibbles_32(x[i].qs);
  __m256i bxhi = bytes_from_bits_32(x[i].qh);
  bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
- bx = _mm256_or_si256(bx, bxhi);
+ qx = _mm256_or_si256(qx, bxhi);

  const __m256 dy = _mm256_set1_ps(y[i].d);
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_us8_pairs_float(bx, by);
+ const __m256 q = mul_sum_us8_pairs_float(qx, qy);

  acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
  }
@@ -4681,15 +4859,79 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
  #endif
  }

- void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;

  assert(n % qk == 0);
+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ assert((nrc == 2) || (nrc == 1));
+ #else
+ assert(nrc == 1);
+ #endif
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q8_0 * restrict x = vx;
  const block_q8_0 * restrict y = vy;

+ #if defined(__ARM_FEATURE_MATMUL_INT8)
+ if (nrc == 2) {
+ const block_q8_0 * restrict vx0 = vx;
+ const block_q8_0 * restrict vx1 = vx + bx;
+ const block_q8_0 * restrict vy0 = vy;
+ const block_q8_0 * restrict vy1 = vy + by;
+
+ float32x4_t sumv0 = vdupq_n_f32(0.0f);
+
+ for (int i = 0; i < nb; i++) {
+ const block_q8_0 * restrict b_x0 = &vx0[i];
+ const block_q8_0 * restrict b_y0 = &vy0[i];
+
+ const block_q8_0 * restrict b_x1 = &vx1[i];
+ const block_q8_0 * restrict b_y1 = &vy1[i];
+
+ const int8x16_t x0_l = vld1q_s8(b_x0->qs);
+ const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16);
+ const int8x16_t x1_l = vld1q_s8(b_x1->qs);
+ const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16);
+
+ // load y
+ const int8x16_t y0_l = vld1q_s8(b_y0->qs);
+ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16);
+ const int8x16_t y1_l = vld1q_s8(b_y1->qs);
+ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
+
+ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
+ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
+
+ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
+
+ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h)));
+
+ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l)));
+
+ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h)));
+
+ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
+ l1, r1)), l2, r2)), l3, r3))), scale);
+ }
+ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
+ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
+
+ vst1_f32(s, vget_low_f32(sumv2));
+ vst1_f32(s + bs, vget_high_f32(sumv2));
+ return;
+ }
+ #endif
  #if defined(__ARM_NEON)
  float32x4_t sumv0 = vdupq_n_f32(0.0f);
  float32x4_t sumv1 = vdupq_n_f32(0.0f);
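
Note on the hunk above: q8_0 rows are already stored as signed 8-bit values, so the nrc == 2 tile feeds the raw 32-byte blocks straight into the same zip-then-SMMLA pattern used for q4_0, with no nibble unpacking. Scalar reference for one block pair, under the same layout assumptions and with illustrative naming:

    /* scalar reference for one q8_0 block times one q8_0 block (assumed layout) */
    static float dot_block_q8_0_q8_0(const block_q8_0 * x, const block_q8_0 * y) {
        int sum = 0;
        for (int j = 0; j < 32; ++j) {
            sum += x->qs[j] * y->qs[j];
        }
        return GGML_FP16_TO_FP32(x->d) * GGML_FP16_TO_FP32(y->d) * (float) sum;
    }
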
@@ -4731,10 +4973,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  for (int i = 0; i < nb; ++i) {
  // Compute combined scale for the block
  const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d));
- __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
- __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+ __m256i qx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);

- const __m256 q = mul_sum_i8_pairs_float(bx, by);
+ const __m256 q = mul_sum_i8_pairs_float(qx, qy);

  // Multiply q with scale and accumulate
  #if defined(__AVX2__)
@@ -4784,7 +5026,12 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
  }

  #if QK_K == 256
- void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q2_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -5160,7 +5407,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q2_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -5418,8 +5670,13 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const uint32_t kmask1 = 0x03030303;
  const uint32_t kmask2 = 0x0f0f0f0f;
@@ -5938,8 +6195,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q3_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6281,8 +6543,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6637,8 +6904,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  #endif
  }
  #else
- void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q4_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -6880,8 +7152,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #if QK_K == 256
- void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7300,8 +7577,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q5_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7566,8 +7848,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri


  #if QK_K == 256
- void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q6_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -7998,8 +8285,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  #else

- void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_q6_K * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8328,8 +8620,13 @@ static const int8_t keven_signs_q2xs[1024] = {
  1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
  };

- void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq2_xxs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8451,8 +8748,13 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res
  #endif
  }

- void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq2_xs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8671,8 +8973,13 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
  }

  // TODO
- void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
  assert(n % QK_K == 0);
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);

  const block_iq3_xxs * restrict x = vx;
  const block_q8_K * restrict y = vy;
@@ -8698,10 +9005,10 @@ void ggml_vec_dot_iq3_xxs_q8_K(const int n, float * restrict s, const void * res
  for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
  q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
  memcpy(aux32, gas, 2*sizeof(uint32_t)); gas += 2*sizeof(uint32_t);
- const uint32x4_t aux32x4_0 = {iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]};
- const uint32x4_t aux32x4_1 = {iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]};
- const uint32x4_t aux32x4_2 = {iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]};
- const uint32x4_t aux32x4_3 = {iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]};
+ const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3xxs_grid[q3[ 0]], iq3xxs_grid[q3[ 1]], iq3xxs_grid[q3[ 2]], iq3xxs_grid[q3[ 3]]);
+ const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3xxs_grid[q3[ 4]], iq3xxs_grid[q3[ 5]], iq3xxs_grid[q3[ 6]], iq3xxs_grid[q3[ 7]]);
+ const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3xxs_grid[q3[ 8]], iq3xxs_grid[q3[ 9]], iq3xxs_grid[q3[10]], iq3xxs_grid[q3[11]]);
+ const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3xxs_grid[q3[12]], iq3xxs_grid[q3[13]], iq3xxs_grid[q3[14]], iq3xxs_grid[q3[15]]);
  q3 += 16;
  q3s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 7) & 127))));
  q3s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[0] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[0] >> 21) & 127))));
@@ -245,20 +245,20 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_
  void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);

  // Dot product
- void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
-
- void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
- void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy);
+ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

  //
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
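
Note on the header hunk above: every dot-product prototype now shares the same shape (destination stride bs, operand strides bx and by, row count nrc), which is what lets dispatch tables treat all kernels uniformly and opt selected ones into the two-row SMMLA path. A hedged sketch of the calling convention, derived only from the kernel bodies above; the typedef name and wrapper are illustrative, not the gem's API:

    /* shape shared by all the prototypes above */
    typedef void (*vec_dot_t)(int n, float * s, size_t bs,
                              const void * vx, size_t bx,
                              const void * vy, size_t by, int nrc);

    /* hypothetical single call computing a 2x2 tile with a kernel that accepts nrc == 2:
       rows vx and vx + bx of the first operand against rows vy and vy + by of the second;
       the four results land at s[0], s[1], s[bs], and s[bs + 1].                          */
    static void tile_2x2(vec_dot_t dot, int n, float * s, size_t bs,
                         const void * vx, size_t bx, const void * vy, size_t by) {
        dot(n, s, bs, vx, bx, vy, by, 2);
    }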