llama_cpp 0.9.2 → 0.9.3

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -14,26 +14,6 @@
  //
  #include <arm_neon.h>

- #if !defined(__aarch64__)
- inline static int32_t vaddvq_s16(int16x8_t v) {
-     return
-         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
- }
-
- inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-     return vcombine_s16(a0, b0);
- }
-
- inline static int32_t vaddvq_s32(int32x4_t v) {
-     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
- }
- #endif
-
  #else

  #ifdef __wasm_simd128__
@@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
- #if !defined(__riscv) && !defined(__s390__)
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+ #if !defined(__riscv)
  #include <immintrin.h>
  #endif
  #endif
  #endif
  #endif
  #endif
+ #endif

  #ifdef __riscv_v_intrinsic
  #include <riscv_vector.h>
@@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {

  #undef MIN
  #undef MAX
+
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

@@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

  #if defined(__ARM_NEON)
-
  #if !defined(__aarch64__)

+ // 64-bit compatibility
+
+ // vaddvq_s16
+ // vpaddq_s16
+ // vaddvq_s32
+ // vaddvq_f32
+ // vmaxvq_f32
+ // vcvtnq_s32_f32
+
+ inline static int32_t vaddvq_s16(int16x8_t v) {
+     return
+         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+     return vcombine_s16(a0, b0);
+ }
+
  inline static int32_t vaddvq_s32(int32x4_t v) {
      return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
  }
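
Note: this hunk re-lands the 32-bit ARM fallbacks that the first hunk removed from the top of the file. `vaddvq_s16` and `vaddvq_s32` are horizontal-add intrinsics that exist only on AArch64, so ARMv7 builds emulate them lane by lane. A minimal sketch of what a caller sees (the `main` below is illustrative, not part of the package; it builds natively on AArch64, or on 32-bit ARM NEON with the shims above in scope):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        const int32_t data[4] = {1, -2, 3, 4};
        const int32x4_t v = vld1q_s32(data);
        // On AArch64 this lowers to a single ADDV instruction; the ARMv7
        // shim computes the same value by extracting and summing each lane.
        printf("%d\n", (int)vaddvq_s32(v)); // prints 6
        return 0;
    }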
@@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
      return res;
  }

+ // vld1q_s16_x2
+ // vld1q_u8_x2
+ // vld1q_u8_x4
+ // vld1q_s8_x2
+ // vld1q_s8_x4
+ // TODO: double-check these work correctly
+
+ typedef struct ggml_int16x8x2_t {
+     int16x8_t val[2];
+ } ggml_int16x8x2_t;
+
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+     ggml_int16x8x2_t res;
+
+     res.val[0] = vld1q_s16(ptr + 0);
+     res.val[1] = vld1q_s16(ptr + 8);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x2_t {
+     uint8x16_t val[2];
+ } ggml_uint8x16x2_t;
+
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+     ggml_uint8x16x2_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x4_t {
+     uint8x16_t val[4];
+ } ggml_uint8x16x4_t;
+
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+     ggml_uint8x16x4_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+     res.val[2] = vld1q_u8(ptr + 32);
+     res.val[3] = vld1q_u8(ptr + 48);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x2_t {
+     int8x16_t val[2];
+ } ggml_int8x16x2_t;
+
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+     ggml_int8x16x2_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x4_t {
+     int8x16_t val[4];
+ } ggml_int8x16x4_t;
+
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+     ggml_int8x16x4_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+     res.val[2] = vld1q_s8(ptr + 32);
+     res.val[3] = vld1q_s8(ptr + 48);
+
+     return res;
+ }
+
+ #else
+
+ #define ggml_int16x8x2_t int16x8x2_t
+ #define ggml_uint8x16x2_t uint8x16x2_t
+ #define ggml_uint8x16x4_t uint8x16x4_t
+ #define ggml_int8x16x2_t int8x16x2_t
+ #define ggml_int8x16x4_t int8x16x4_t
+
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
+
  #endif
  #endif

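Note: the `ggml_vld1q_*` wrappers added above exist because the multi-vector load intrinsics (`vld1q_u8_x2`, `vld1q_s8_x4`, ...) are unavailable on 32-bit ARM targets with some toolchains; there they are emulated with consecutive single-vector loads, while on AArch64 the `#define`s forward straight to the native intrinsics. A hedged usage sketch (`sum_block` is illustrative, not part of the package, and assumes the wrapper block above is in scope):

    #include <arm_neon.h>

    // Sum 32 consecutive int8 values, using the same two-vector load
    // pattern as the quantized dot-product kernels changed below.
    static int32_t sum_block(const int8_t * p) {
        const ggml_int8x16x2_t v = ggml_vld1q_s8_x2(p); // two 16-byte vectors
        int16x8_t acc = vaddl_s8(vget_low_s8(v.val[0]), vget_high_s8(v.val[0]));
        acc = vaddq_s16(acc, vaddl_s8(vget_low_s8(v.val[1]), vget_high_s8(v.val[1])));
        return vaddvq_s16(acc); // shimmed on ARMv7, native on AArch64
    }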
@@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
      float max = x[0];
      float sum_w = weights[0];
      float sum_x = sum_w * x[0];
+ #ifdef HAVE_BUGGY_APPLE_LINKER
+     // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+     for (volatile int i = 1; i < n; ++i) {
+ #else
      for (int i = 1; i < n; ++i) {
+ #endif
          if (x[i] < min) min = x[i];
          if (x[i] > max) max = x[i];
          float w = weights[i];
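
Note: the `make_qkx2_quants` hunk above is a build workaround rather than a functional change: when this loop is unrolled/vectorized, the resulting object code trips a bug in Apple ld64 1015.7, so a `volatile` induction variable is used to keep the loop rolled. A minimal sketch of the same pattern in isolation (`vec_min` is illustrative, not part of the package; `HAVE_BUGGY_APPLE_LINKER` is assumed to be supplied by the build system, e.g. `-DHAVE_BUGGY_APPLE_LINKER`):

    static float vec_min(const float * x, int n) {
        float m = x[0];
    #ifdef HAVE_BUGGY_APPLE_LINKER
        // 'volatile' forces i through memory, which keeps the loop rolled.
        for (volatile int i = 1; i < n; ++i) {
    #else
        for (int i = 1; i < n; ++i) {
    #endif
            if (x[i] < m) m = x[i];
        }
        return m;
    }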
@@ -3557,7 +3657,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
      const int32x4_t vzero = vdupq_n_s32(0);
  #endif

-     int8x16x2_t q2bytes;
+     ggml_int8x16x2_t q2bytes;
      uint8_t aux[16];

      float sum = 0;
@@ -3576,8 +3676,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
          vst1q_u8(aux, scales);

          const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
-         const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
-         const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+         const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
          const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                         vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
          const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3605,7 +3705,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  #endif

  #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
-         q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+         q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
          q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
          q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
          MULTIPLY_ACCUM_WITH_SCALE((index));
@@ -3613,9 +3713,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

          for (int j = 0; j < QK_K/128; ++j) {

-             const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+             const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;

-             int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+             ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
              q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
              q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
              MULTIPLY_ACCUM_WITH_SCALE(0);
@@ -3949,7 +4049,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
      const int32x4_t vzero = vdupq_n_s32(0);
  #endif

-     int8x16x4_t q2bytes;
+     ggml_int8x16x4_t q2bytes;

      uint32_t aux32[2];
      const uint8_t * scales = (const uint8_t *)aux32;
@@ -3974,7 +4074,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

          const uint8x16_t q2bits = vld1q_u8(q2);

-         const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+         const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);

          q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
          q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
@@ -4238,7 +4338,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
      const uint8x16_t m3 = vshlq_n_u8(m0, 3);
      const int8_t m32 = 32;

-     int8x16x4_t q3bytes;
+     ggml_int8x16x4_t q3bytes;

      float sum = 0;

@@ -4250,9 +4350,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
          const uint8_t * restrict qh = x[i].hmask;
          const int8_t * restrict q8 = y[i].qs;

-         uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);

-         uint8x16x4_t q3h;
+         ggml_uint8x16x4_t q3h;

          int32_t isum = 0;

@@ -4268,9 +4368,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

          for (int j = 0; j < QK_K/128; ++j) {

-             const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
-             const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
-             const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+             const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+             const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+             const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;

              q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
              q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
@@ -4772,7 +4872,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
      const uint8x16_t m3b = vdupq_n_u8(0x3);
      const uint8x16_t mh = vdupq_n_u8(4);

-     int8x16x4_t q3bytes;
+     ggml_int8x16x4_t q3bytes;

      uint16_t aux16[2];
      int8_t * scales = (int8_t *)aux16;
@@ -4781,11 +4881,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

      for (int i = 0; i < nb; ++i) {

-         uint8x16x4_t q3h;
+         ggml_uint8x16x4_t q3h;

          const uint8x8_t hbits = vld1_u8(x[i].hmask);
          const uint8x16_t q3bits = vld1q_u8(x[i].qs);
-         const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+         const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);

          const uint16_t a = *(const uint16_t *)x[i].scales;
          aux16[0] = a & 0x0f0f;
@@ -5134,8 +5234,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
      const int32x4_t mzero = vdupq_n_s32(0);
  #endif

-     int8x16x2_t q4bytes;
-     int8x16x2_t q8bytes;
+     ggml_int8x16x2_t q4bytes;
+     ggml_int8x16x2_t q8bytes;

      float sumf = 0;

@@ -5170,17 +5270,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

          for (int j = 0; j < QK_K/64; ++j) {

-             const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+             const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;

  #ifdef __ARM_FEATURE_DOTPROD
-             q8bytes = vld1q_s8_x2(q8); q8 += 32;
+             q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
              q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
              q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

              const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
              sumi1 += vaddvq_s32(p1) * scales[2*j+0];

-             q8bytes = vld1q_s8_x2(q8); q8 += 32;
+             q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
              q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
              q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

@@ -5188,7 +5288,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

              sumi2 += vaddvq_s32(p2) * scales[2*j+1];
  #else
-             q8bytes = vld1q_s8_x2(q8); q8 += 32;
+             q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
              q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
              q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
              const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5197,7 +5297,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
                                             vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
              sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];

-             q8bytes = vld1q_s8_x2(q8); q8 += 32;
+             q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
              q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
              q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
              const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5512,8 +5612,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

      float sumf = 0;

-     int8x16x2_t q4bytes;
-     int8x16x4_t q8bytes;
+     ggml_int8x16x2_t q4bytes;
+     ggml_int8x16x4_t q8bytes;

      float sum_mins = 0.f;

@@ -5534,10 +5634,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

          const float d = y[i].d * (float)x[i].d[0];

-         const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+         const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

  #ifdef __ARM_FEATURE_DOTPROD
-         q8bytes = vld1q_s8_x4(q8);
+         q8bytes = ggml_vld1q_s8_x4(q8);
          q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
          q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

@@ -5551,7 +5651,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
          const int32_t sumi2 = vaddvq_s32(p2) * scales[1];

  #else
-         q8bytes = vld1q_s8_x4(q8);
+         q8bytes = ggml_vld1q_s8_x4(q8);
          q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
          q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
          const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5785,7 +5885,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
      const int32x4_t mzero = vdupq_n_s32(0);
  #endif

-     int8x16x4_t q5bytes;
+     ggml_int8x16x4_t q5bytes;

      float sumf = 0;

@@ -5815,16 +5915,16 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
          const uint8_t * restrict qh = x[i].qh;
          const int8_t * restrict q8 = y[i].qs;

-         uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+         ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);

-         uint8x16x4_t q5h;
+         ggml_uint8x16x4_t q5h;

          int32_t sumi = 0;

          for (int j = 0; j < QK_K/64; ++j) {

-             const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
-             const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+             const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+             const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;

              q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
              q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6218,8 +6318,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
      const int32x4_t mzero = vdupq_n_s32(0);
  #endif

-     int8x16x4_t q5bytes;
-     uint8x16x4_t q5h;
+     ggml_int8x16x4_t q5bytes;
+     ggml_uint8x16x4_t q5h;

      float sumf = 0;

@@ -6234,8 +6334,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

          const uint8x8_t qhbits = vld1_u8(qh);

-         const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
-         const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+         const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+         const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);

          const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
          q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
@@ -6511,8 +6611,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

      const uint8x16_t mone = vdupq_n_u8(3);

-     int8x16x4_t q6bytes;
-     uint8x16x4_t q6h;
+     ggml_int8x16x4_t q6bytes;
+     ggml_uint8x16x4_t q6h;

      for (int i = 0; i < nb; ++i) {

@@ -6524,9 +6624,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

          const int8_t * restrict scale = x[i].scales;

-         const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
          const int8x16_t scales = vld1q_s8(scale);
-         const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+         const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};

          const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                     vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6538,9 +6638,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

          for (int j = 0; j < QK_K/128; ++j) {

-             uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
-             uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
-             int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+             ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+             ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+             ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;

              q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
              q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6583,7 +6683,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
              scale += 2;
  #endif

-             q8bytes = vld1q_s8_x4(q8); q8 += 64;
+             q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;

              shifted = vshrq_n_u8(qhbits.val[0], 4);
              q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
@@ -6987,8 +7087,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

      const uint8x16_t mone = vdupq_n_u8(3);

-     int8x16x4_t q6bytes;
-     uint8x16x4_t q6h;
+     ggml_int8x16x4_t q6bytes;
+     ggml_uint8x16x4_t q6h;

      for (int i = 0; i < nb; ++i) {

@@ -7002,9 +7102,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

          int32_t isum = 0;

-         uint8x16_t qhbits = vld1q_u8(qh);
-         uint8x16x2_t q6bits = vld1q_u8_x2(q6);
-         int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+         uint8x16_t qhbits = vld1q_u8(qh);
+         ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
+         ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);

          q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
          uint8x16_t shifted = vshrq_n_u8(qhbits, 2);