llama_cpp 0.9.2 → 0.9.4

@@ -14,32 +14,12 @@
  //
  #include <arm_neon.h>
 
- #if !defined(__aarch64__)
- inline static int32_t vaddvq_s16(int16x8_t v) {
-     return
-         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
- }
-
- inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-     return vcombine_s16(a0, b0);
- }
-
- inline static int32_t vaddvq_s32(int32x4_t v) {
-     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
- }
- #endif
-
  #else
 
  #ifdef __wasm_simd128__
  #include <wasm_simd128.h>
  #else
- #ifdef __POWER9_VECTOR__
+ #if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
  #include <altivec.h>
  #undef bool
  #define bool _Bool
@@ -47,13 +27,15 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
- #if !defined(__riscv) && !defined(__s390__)
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+ #if !defined(__riscv)
  #include <immintrin.h>
  #endif
  #endif
  #endif
  #endif
  #endif
+ #endif
 
  #ifdef __riscv_v_intrinsic
  #include <riscv_vector.h>
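For readers tracing the preprocessor nesting: after this change, <immintrin.h> is only pulled in when an x86 AVX/SSE feature macro is defined, and the extra `#endif` appended to the run of closers balances the new outer check; the old `!defined(__s390__)` test presumably becomes unnecessary because those feature macros are never defined on s390x. A sketch of the resulting inner guard, reconstructed from the context lines above (the closing comments are mine; the surrounding ARM/WASM/POWER levels are omitted):

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif // !__riscv
#endif // AVX/SSE group
#endif // _MSC_VER || __MINGW32__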
@@ -61,6 +43,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 
  #undef MIN
  #undef MAX
+
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
@@ -283,9 +266,31 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
  #if defined(__ARM_NEON)
-
  #if !defined(__aarch64__)
 
+ // 64-bit compatibility
+
+ // vaddvq_s16
+ // vpaddq_s16
+ // vaddvq_s32
+ // vaddvq_f32
+ // vmaxvq_f32
+ // vcvtnq_s32_f32
+
+ inline static int32_t vaddvq_s16(int16x8_t v) {
+     return
+         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+     return vcombine_s16(a0, b0);
+ }
+
  inline static int32_t vaddvq_s32(int32x4_t v) {
      return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
  }
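As a concrete illustration of the pairwise-add shim above: AArch64's `vpaddq_s16` packs the pairwise sums of `a` into the low half of the result and the pairwise sums of `b` into the high half, which is exactly what the `vpadd_s16`/`vcombine_s16` fallback reproduces. A minimal check (the helper below is hypothetical, not part of ggml; it assumes either the native intrinsic or the shim above is in scope):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static void demo_vpaddq_s16(void) {
    const int16_t av[8] = { 1,  2,  3,  4,  5,  6,  7,  8};
    const int16_t bv[8] = {10, 20, 30, 40, 50, 60, 70, 80};

    int16_t out[8];
    vst1q_s16(out, vpaddq_s16(vld1q_s16(av), vld1q_s16(bv)));

    // Pairwise sums: 3 7 11 15 from av, then 30 70 110 150 from bv.
    for (int i = 0; i < 8; ++i) {
        printf("%d ", out[i]);
    }
    printf("\n");
}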
@@ -311,6 +316,96 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
      return res;
  }
 
+ // vld1q_s16_x2
+ // vld1q_u8_x2
+ // vld1q_u8_x4
+ // vld1q_s8_x2
+ // vld1q_s8_x4
+ // TODO: double-check these work correctly
+
+ typedef struct ggml_int16x8x2_t {
+     int16x8_t val[2];
+ } ggml_int16x8x2_t;
+
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+     ggml_int16x8x2_t res;
+
+     res.val[0] = vld1q_s16(ptr + 0);
+     res.val[1] = vld1q_s16(ptr + 8);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x2_t {
+     uint8x16_t val[2];
+ } ggml_uint8x16x2_t;
+
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+     ggml_uint8x16x2_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_uint8x16x4_t {
+     uint8x16_t val[4];
+ } ggml_uint8x16x4_t;
+
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+     ggml_uint8x16x4_t res;
+
+     res.val[0] = vld1q_u8(ptr + 0);
+     res.val[1] = vld1q_u8(ptr + 16);
+     res.val[2] = vld1q_u8(ptr + 32);
+     res.val[3] = vld1q_u8(ptr + 48);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x2_t {
+     int8x16_t val[2];
+ } ggml_int8x16x2_t;
+
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+     ggml_int8x16x2_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+
+     return res;
+ }
+
+ typedef struct ggml_int8x16x4_t {
+     int8x16_t val[4];
+ } ggml_int8x16x4_t;
+
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+     ggml_int8x16x4_t res;
+
+     res.val[0] = vld1q_s8(ptr + 0);
+     res.val[1] = vld1q_s8(ptr + 16);
+     res.val[2] = vld1q_s8(ptr + 32);
+     res.val[3] = vld1q_s8(ptr + 48);
+
+     return res;
+ }
+
+ #else
+
+ #define ggml_int16x8x2_t int16x8x2_t
+ #define ggml_uint8x16x2_t uint8x16x2_t
+ #define ggml_uint8x16x4_t uint8x16x4_t
+ #define ggml_int8x16x2_t int8x16x2_t
+ #define ggml_int8x16x4_t int8x16x4_t
+
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
+
  #endif
  #endif
 
@@ -1273,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
      float max = x[0];
      float sum_w = weights[0];
      float sum_x = sum_w * x[0];
+ #ifdef HAVE_BUGGY_APPLE_LINKER
+     // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+     for (volatile int i = 1; i < n; ++i) {
+ #else
      for (int i = 1; i < n; ++i) {
+ #endif
          if (x[i] < min) min = x[i];
          if (x[i] > max) max = x[i];
          float w = weights[i];
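The same workaround pattern in isolation, for reference (a minimal sketch; `HAVE_BUGGY_APPLE_LINKER` is expected to come from the build system, and `min_of` is a hypothetical helper rather than ggml code):

// A 'volatile' induction variable keeps the compiler from unrolling or
// vectorizing the loop, which is what the hunk above relies on to work
// around the Apple ld64 1015.7 bug mentioned in its comment.
static float min_of(const float * x, int n) {
    float m = x[0];
#ifdef HAVE_BUGGY_APPLE_LINKER
    for (volatile int i = 1; i < n; ++i) {
#else
    for (int i = 1; i < n; ++i) {
#endif
        if (x[i] < m) m = x[i];
    }
    return m;
}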
@@ -3557,7 +3657,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const int32x4_t vzero = vdupq_n_s32(0);
  #endif
 
- int8x16x2_t q2bytes;
+ ggml_int8x16x2_t q2bytes;
  uint8_t aux[16];
 
  float sum = 0;
@@ -3576,8 +3676,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  vst1q_u8(aux, scales);
 
  const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
- const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
- const int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
+ const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
  const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
  vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
  const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3605,7 +3705,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  #endif
 
  #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
- q8bytes = vld1q_s8_x2(q8); q8 += 32;\
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
  MULTIPLY_ACCUM_WITH_SCALE((index));
@@ -3613,9 +3713,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
  for (int j = 0; j < QK_K/128; ++j) {
 
- const uint8x16x2_t q2bits = vld1q_u8_x2(q2); q2 += 32;
+ const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
 
- int8x16x2_t q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
  MULTIPLY_ACCUM_WITH_SCALE(0);
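The hunks above, and the ones that follow, switch the NEON kernels from the `int8x16x2_t`/`vld1q_s8_x2` family to the `ggml_`-prefixed wrappers introduced earlier in this diff, so the same code builds on toolchains that lack the multi-register load intrinsics. A minimal usage sketch (the function below is hypothetical and assumes the wrapper definitions above are in scope, e.g. in the same translation unit):

// Hypothetical example, not ggml code: load 32 signed bytes and reduce them
// to a single int32 sum. On AArch64, ggml_vld1q_s8_x2 is just vld1q_s8_x2;
// on 32-bit ARM it expands to the two-load wrapper defined in this diff.
static int32_t sum_32_bytes(const int8_t * q8) {
    ggml_int8x16x2_t v = ggml_vld1q_s8_x2(q8);

    const int16x8_t lo = vaddl_s8(vget_low_s8 (v.val[0]), vget_low_s8 (v.val[1]));
    const int16x8_t hi = vaddl_s8(vget_high_s8(v.val[0]), vget_high_s8(v.val[1]));

    return vaddvq_s16(vaddq_s16(lo, hi)); // vaddvq_s16 is the 32-bit shim above
}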
@@ -3949,7 +4049,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const int32x4_t vzero = vdupq_n_s32(0);
  #endif
 
- int8x16x4_t q2bytes;
+ ggml_int8x16x4_t q2bytes;
 
  uint32_t aux32[2];
  const uint8_t * scales = (const uint8_t *)aux32;
@@ -3974,7 +4074,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
  const uint8x16_t q2bits = vld1q_u8(q2);
 
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits, m3));
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 2), m3));
@@ -4238,7 +4338,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8x16_t m3 = vshlq_n_u8(m0, 3);
  const int8_t m32 = 32;
 
- int8x16x4_t q3bytes;
+ ggml_int8x16x4_t q3bytes;
 
  float sum = 0;
 
@@ -4250,9 +4350,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8_t * restrict qh = x[i].hmask;
  const int8_t * restrict q8 = y[i].qs;
 
- uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
- uint8x16x4_t q3h;
+ ggml_uint8x16x4_t q3h;
 
  int32_t isum = 0;
 
@@ -4268,9 +4368,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
  for (int j = 0; j < QK_K/128; ++j) {
 
- const uint8x16x2_t q3bits = vld1q_u8_x2(q3); q3 += 32;
- const int8x16x4_t q8bytes_1 = vld1q_s8_x4(q8); q8 += 64;
- const int8x16x4_t q8bytes_2 = vld1q_s8_x4(q8); q8 += 64;
+ const ggml_uint8x16x2_t q3bits = ggml_vld1q_u8_x2(q3); q3 += 32;
+ const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64;
+ const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64;
 
  q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2);
  q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2);
@@ -4772,7 +4872,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8x16_t m3b = vdupq_n_u8(0x3);
  const uint8x16_t mh = vdupq_n_u8(4);
 
- int8x16x4_t q3bytes;
+ ggml_int8x16x4_t q3bytes;
 
  uint16_t aux16[2];
  int8_t * scales = (int8_t *)aux16;
@@ -4781,11 +4881,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
  for (int i = 0; i < nb; ++i) {
 
- uint8x16x4_t q3h;
+ ggml_uint8x16x4_t q3h;
 
  const uint8x8_t hbits = vld1_u8(x[i].hmask);
  const uint8x16_t q3bits = vld1q_u8(x[i].qs);
- const int8x16x4_t q8bytes = vld1q_s8_x4(y[i].qs);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(y[i].qs);
 
  const uint16_t a = *(const uint16_t *)x[i].scales;
  aux16[0] = a & 0x0f0f;
@@ -5134,8 +5234,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  const int32x4_t mzero = vdupq_n_s32(0);
  #endif
 
- int8x16x2_t q4bytes;
- int8x16x2_t q8bytes;
+ ggml_int8x16x2_t q4bytes;
+ ggml_int8x16x2_t q8bytes;
 
  float sumf = 0;
 
@@ -5170,17 +5270,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
  for (int j = 0; j < QK_K/64; ++j) {
 
- const uint8x16x2_t q4bits = vld1q_u8_x2(q4); q4 += 32;
+ const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
 
  #ifdef __ARM_FEATURE_DOTPROD
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
  const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
  sumi1 += vaddvq_s32(p1) * scales[2*j+0];
 
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
  q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
  q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
 
@@ -5188,7 +5288,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
  sumi2 += vaddvq_s32(p2) * scales[2*j+1];
  #else
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
  const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5197,7 +5297,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
  sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
 
- q8bytes = vld1q_s8_x2(q8); q8 += 32;
+ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
  q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
  q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
  const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5512,8 +5612,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
  float sumf = 0;
 
- int8x16x2_t q4bytes;
- int8x16x4_t q8bytes;
+ ggml_int8x16x2_t q4bytes;
+ ggml_int8x16x4_t q8bytes;
 
  float sum_mins = 0.f;
 
@@ -5534,10 +5634,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
  const float d = y[i].d * (float)x[i].d[0];
 
- const uint8x16x2_t q4bits = vld1q_u8_x2(q4);
+ const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
 
  #ifdef __ARM_FEATURE_DOTPROD
- q8bytes = vld1q_s8_x4(q8);
+ q8bytes = ggml_vld1q_s8_x4(q8);
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
@@ -5551,7 +5651,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
 
  #else
- q8bytes = vld1q_s8_x4(q8);
+ q8bytes = ggml_vld1q_s8_x4(q8);
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
  const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
@@ -5785,7 +5885,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const int32x4_t mzero = vdupq_n_s32(0);
  #endif
 
- int8x16x4_t q5bytes;
+ ggml_int8x16x4_t q5bytes;
 
  float sumf = 0;
 
@@ -5815,16 +5915,16 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8_t * restrict qh = x[i].qh;
  const int8_t * restrict q8 = y[i].qs;
 
- uint8x16x2_t qhbits = vld1q_u8_x2(qh);
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh);
 
- uint8x16x4_t q5h;
+ ggml_uint8x16x4_t q5h;
 
  int32_t sumi = 0;
 
  for (int j = 0; j < QK_K/64; ++j) {
 
- const uint8x16x2_t q5bits = vld1q_u8_x2(q5); q5 += 32;
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5); q5 += 32;
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
  q5h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
  q5h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6218,8 +6318,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const int32x4_t mzero = vdupq_n_s32(0);
  #endif
 
- int8x16x4_t q5bytes;
- uint8x16x4_t q5h;
+ ggml_int8x16x4_t q5bytes;
+ ggml_uint8x16x4_t q5h;
 
  float sumf = 0;
 
@@ -6234,8 +6334,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
  const uint8x8_t qhbits = vld1_u8(qh);
 
- const uint8x16x2_t q5bits = vld1q_u8_x2(q5);
- const int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ const ggml_uint8x16x2_t q5bits = ggml_vld1q_u8_x2(q5);
+ const ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
  const uint8x16_t htmp = vcombine_u8(qhbits, vshr_n_u8(qhbits, 1));
  q5h.val[0] = vbicq_u8(mh, vshlq_n_u8(htmp, 4));
@@ -6511,8 +6611,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
  const uint8x16_t mone = vdupq_n_u8(3);
 
- int8x16x4_t q6bytes;
- uint8x16x4_t q6h;
+ ggml_int8x16x4_t q6bytes;
+ ggml_uint8x16x4_t q6h;
 
  for (int i = 0; i < nb; ++i) {
 
@@ -6524,9 +6624,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
  const int8_t * restrict scale = x[i].scales;
 
- const int16x8x2_t q8sums = vld1q_s16_x2(y[i].bsums);
+ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
  const int8x16_t scales = vld1q_s8(scale);
- const int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+ const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
 
  const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
  vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6538,9 +6638,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
  for (int j = 0; j < QK_K/128; ++j) {
 
- uint8x16x2_t qhbits = vld1q_u8_x2(qh); qh += 32;
- uint8x16x4_t q6bits = vld1q_u8_x4(q6); q6 += 64;
- int8x16x4_t q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
+ ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
+ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
  q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
  q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
@@ -6583,7 +6683,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  scale += 2;
  #endif
 
- q8bytes = vld1q_s8_x4(q8); q8 += 64;
+ q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
 
  shifted = vshrq_n_u8(qhbits.val[0], 4);
  q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
@@ -6987,8 +7087,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
  const uint8x16_t mone = vdupq_n_u8(3);
 
- int8x16x4_t q6bytes;
- uint8x16x4_t q6h;
+ ggml_int8x16x4_t q6bytes;
+ ggml_uint8x16x4_t q6h;
 
  for (int i = 0; i < nb; ++i) {
 
@@ -7002,9 +7102,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
  int32_t isum = 0;
 
- uint8x16_t qhbits = vld1q_u8(qh);
- uint8x16x2_t q6bits = vld1q_u8_x2(q6);
- int8x16x4_t q8bytes = vld1q_s8_x4(q8);
+ uint8x16_t qhbits = vld1q_u8(qh);
+ ggml_uint8x16x2_t q6bits = ggml_vld1q_u8_x2(q6);
+ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8);
 
  q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits), 4);
  uint8x16_t shifted = vshrq_n_u8(qhbits, 2);