llama_cpp 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
  return;
  }

- cl_mem mem = (cl_mem)tensor->data;
+ cl_mem mem = (cl_mem)tensor->extra;
  clReleaseMemObject(mem);
  }

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
  size_t d_size;

  cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
- cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+ cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
  cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst


@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
  size_t d_size;
  cl_mem d_X;
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
- d_X = (cl_mem) src0->data;
+ d_X = (cl_mem) src0->extra;
  } else {
- d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+ d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
  }
  cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
  cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
  size_t d_size;
  cl_mem d_X;
  if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
- d_X = (cl_mem) src0->data;
+ d_X = (cl_mem) src0->extra;
  } else {
  d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
  }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
  events.emplace_back();
  CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
  } else if (src0->backend == GGML_BACKEND_GPU) {
- d_Q = (cl_mem) src0->data;
+ d_Q = (cl_mem) src0->extra;
  } else {
  GGML_ASSERT(false);
  }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

  CL_CHECK(clFinish(queue));

- tensor->data = dst;
+ tensor->extra = dst;
  GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
  }
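
Note on the block of OpenCL hunks above: GPU-backed tensors now keep their cl_mem device handle in tensor->extra instead of overloading tensor->data. A minimal sketch of the resulting access pattern (ggml_cl_get_mem is a hypothetical helper, not part of the package):

    // Hypothetical helper illustrating where the device buffer lives after this change.
    static cl_mem ggml_cl_get_mem(const struct ggml_tensor * t) {
        GGML_ASSERT(t->backend == GGML_BACKEND_GPU);
        return (cl_mem) t->extra;   // device handle; t->data no longer aliases it
    }
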
@@ -1,4 +1,3 @@
- #define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
  #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows

  #include "ggml.h"
@@ -47,6 +46,10 @@
  // disable "possible loss of data" to avoid hundreds of casts
  // we should just be careful :)
  #pragma warning(disable: 4244 4267)
+
+ // disable POSIX deprecation warnigns
+ // these functions are never going away, anyway
+ #pragma warning(disable: 4996)
  #endif

  #if defined(_WIN32)
@@ -103,6 +106,9 @@ typedef void * thread_ret_t;
  #include <sys/stat.h>
  #include <unistd.h>

+ #endif
+ #ifdef GGML_USE_CPU_HBM
+ #include <hbwmalloc.h>
  #endif

  // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +198,15 @@ typedef void * thread_ret_t;
  #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
  #else
  inline static void * ggml_aligned_malloc(size_t size) {
+ if (size == 0) {
+ GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+ return NULL;
+ }
  void * aligned_memory = NULL;
- #ifdef GGML_USE_METAL
- int result = posix_memalign(&aligned_memory, getpagesize(), size);
+ #ifdef GGML_USE_CPU_HBM
+ int result = hbw_posix_memalign(&aligned_memory, 16, size);
+ #elif GGML_USE_METAL
+ int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
  #else
  int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
  #endif
@@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
  return aligned_memory;
  }
  #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+ #ifdef GGML_USE_CPU_HBM
+ #define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+ #else
  #define GGML_ALIGNED_FREE(ptr) free(ptr)
  #endif
+ #endif

  #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
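
The two hunks above add a high-bandwidth-memory path for GGML's aligned allocations (GGML_USE_CPU_HBM) and a guard for zero-byte requests. A minimal usage sketch, assuming a build with GGML_USE_CPU_HBM defined and memkind's hbwmalloc available; the buffer size is a placeholder:

    size_t buf_size = 1u << 20;                  // placeholder: 1 MiB
    void * buf = GGML_ALIGNED_MALLOC(buf_size);  // routed to hbw_posix_memalign(&p, 16, size)
    if (buf != NULL) {
        // ... use the buffer ...
        GGML_ALIGNED_FREE(buf);                  // routed to hbw_free(ptr), with a NULL guard
    }
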
@@ -267,7 +283,7 @@ typedef double ggml_float;
  // 16-bit float
  // on Arm, we use __fp16
  // on x86, we use uint16_t
- #ifdef __ARM_NEON
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)

  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
  //
@@ -294,12 +310,14 @@ typedef double ggml_float;
  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
  #else
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
  #if !defined(__riscv)
  #include <immintrin.h>
  #endif
  #endif
  #endif
  #endif
+ #endif

  #ifdef __riscv_v_intrinsic
  #include <riscv_vector.h>
@@ -817,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

  #if !defined(__aarch64__)

- inline static uint16_t vaddvq_u8(uint8x16_t v) {
- return
- (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
- (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
- (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
- (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
- (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
- (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
- (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
- (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
- }
-
- inline static int16_t vaddvq_s8(int8x16_t v) {
- return
- (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
- (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
- (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
- (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
- (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
- (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
- (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
- (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
- }
-
- inline static int32_t vaddvq_s16(int16x8_t v) {
- return
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
- }
-
- inline static uint32_t vaddvq_u16(uint16x8_t v) {
- return
- (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
- (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
- (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
- (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
- }
-
  inline static int32_t vaddvq_s32(int32x4_t v) {
  return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
  }
@@ -865,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) {
  return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
  }

- inline static float vminvq_f32(float32x4_t v) {
- return
- MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
- MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
- }
-
  inline static float vmaxvq_f32(float32x4_t v) {
  return
  MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  return NULL;
  }

+ // allow to call ggml_init with 0 size
+ if (params.mem_size == 0) {
+ params.mem_size = GGML_MEM_ALIGN;
+ }
+
  const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

  *ctx = (struct ggml_context) {
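
The ggml_init hunk above makes a zero mem_size legal by bumping it to GGML_MEM_ALIGN. A minimal call sketch under the usual ggml_init_params layout:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,      // now accepted; rounded up to GGML_MEM_ALIGN internally
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... use ctx ...
    ggml_free(ctx);
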
@@ -4814,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(

  size_t obj_alloc_size = 0;

- if (view_src == NULL && ctx->no_alloc == false) {
+ if (view_src == NULL && !ctx->no_alloc) {
  if (ctx->scratch.data != NULL) {
  // allocate tensor data in the scratch buffer
  if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
  }

  if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
  }

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
  }

  if (inplace) {
- GGML_ASSERT(is_node == false);
+ GGML_ASSERT(!is_node);
  }

  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -18900,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
  // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
  return count;
  }
- return count;
  }
  }

@@ -20003,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

  struct ggml_tensor * data = NULL;

- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
  data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

  ok = ok && data != NULL;
@@ -20044,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
  }

  // point the data member to the appropriate location in the binary blob using the tensor infos
- if (params.no_alloc == false) {
+ if (!params.no_alloc) {
  //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
  cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
  }
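
The two gguf hunks only restyle the no_alloc checks, but they mark the path a caller takes when loading metadata without allocating the tensor data blob. A sketch of that call; the file name is a placeholder:

    struct ggml_context * meta = NULL;
    struct gguf_init_params gparams = {
        /*.no_alloc =*/ true,     // skip the data allocation guarded in the hunks above
        /*.ctx      =*/ &meta,
    };
    struct gguf_context * gctx = gguf_init_from_file("model.gguf", gparams);
    if (gctx != NULL) {
        // ... inspect keys and tensor infos ...
        gguf_free(gctx);
    }
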
@@ -270,7 +270,7 @@ extern "C" {

  #if defined(__ARM_NEON) && defined(__CUDACC__)
  typedef half ggml_fp16_t;
- #elif defined(__ARM_NEON)
+ #elif defined(__ARM_NEON) && !defined(_MSC_VER)
  typedef __fp16 ggml_fp16_t;
  #else
  typedef uint16_t ggml_fp16_t;
@@ -13,6 +13,26 @@
  //
  #include <arm_neon.h>

+ #if !defined(__aarch64__)
+ inline static int32_t vaddvq_s16(int16x8_t v) {
+ return
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(a0, b0);
+ }
+
+ inline static int32_t vaddvq_s32(int32x4_t v) {
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+ }
+ #endif
+
  #else

  #ifdef __wasm_simd128__
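
Together with the removals earlier in this diff, the hunk above moves the 32-bit-ARM fallbacks for a few missing AArch64 intrinsics next to their remaining users and drops the unused ones. A quick illustrative check (not package code) that a scalar fallback matches the intrinsic it stands in for:

    // vaddvq_s32 sums the four 32-bit lanes; the fallback above does the same via vgetq_lane_s32.
    int32x4_t v = vdupq_n_s32(3);
    int32_t sum = vaddvq_s32(v);   // 12 on both the native and the fallback path
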
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
  float ax = fabsf(x[i]);
  if (ax > amax) { amax = ax; max = x[i]; }
  }
- if (!amax) { // all zero
+ if (amax < 1e-30f) { // all zero
  for (int i = 0; i < n; ++i) {
  L[i] = 0;
  }
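
The change above swaps an exact zero test for a small threshold: a block whose largest magnitude is tiny but nonzero passed the old !amax check and then derived its scale by dividing by that tiny value, giving a degenerate result. Illustrative values only:

    float amax = 1e-38f;           // largest |x[i]| in a nearly-zero block
    // old: !amax is false, so the block is scaled by nmax/amax (astronomically large)
    // new: amax < 1e-30f, so the block is treated as all zero instead
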
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
  int ntry, float alpha) {
  float min = x[0];
  float max = x[0];
- float sum_x = 0;
- float sum_x2 = 0;
  for (int i = 1; i < n; ++i) {
  if (x[i] < min) min = x[i];
  if (x[i] > max) max = x[i];
- sum_x += x[i];
- sum_x2 += x[i]*x[i];
  }
  if (max == min) {
  for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict

  }

+ if (!max_abs_scale) {
+ memset(&y[i], 0, sizeof(block_q6_K));
+ y[i].d = ggml_fp32_to_fp16(0.f);
+ x += QK_K;
+ continue;
+ }
+
  float iscale = -128.f/max_scale;
  y[i].d = ggml_fp32_to_fp16(1/iscale);
  for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

  const uint8x16_t m3 = vdupq_n_u8(0x3);
  const uint8x16_t m4 = vdupq_n_u8(0xF);
+ #if defined(__ARM_FEATURE_DOTPROD)
  const int32x4_t vzero = vdupq_n_s32(0);
+ #endif

  int8x16x2_t q2bytes;
  uint8_t aux[16];
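
This hunk and the remaining NEON hunks below apply the same fix: the zero accumulator (vzero/mzero) is only read by the vdotq_s32 path, so declaring it unconditionally triggers unused-variable warnings on builds without dot-product support. The pattern, sketched out of context (illustrative, not a complete kernel):

    #if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t vzero = vdupq_n_s32(0);   // only the dot-product path reads it
    // ... isum += vaddvq_s32(vdotq_s32(vzero, q_lo, q8_lo)); ...
    #endif
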
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  #ifdef __ARM_NEON

  const uint8x16_t m3 = vdupq_n_u8(0x3);
+ #if defined(__ARM_FEATURE_DOTPROD)
  const int32x4_t vzero = vdupq_n_s32(0);
+ #endif

  int8x16x4_t q2bytes;

@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  __m256 acc = _mm256_setzero_ps();

- uint32_t *aux;
+ const uint32_t *aux;

  for (int i = 0; i < nb; ++i) {

@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const int8_t * restrict q8 = y[i].qs;

  // Set up scales
- aux = (uint32_t *)x[i].scales;
+ aux = (const uint32_t *)x[i].scales;
  __m128i scales128 = _mm_set_epi32(
  ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
  ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2582,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

  memcpy(utmp, x[i].scales, 12);

- const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
+ uint32x2_t mins8 = { 0 };
+ mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+ mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
  utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
  utmp[0] &= kmask1;

@@ -2596,8 +2626,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  const uint8_t * restrict q4 = x[i].qs;
  const int8_t * restrict q8 = y[i].qs;

- //int32x4_t isum = mzero;
-
  int32_t sumi1 = 0;
  int32_t sumi2 = 0;

@@ -3096,9 +3124,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  #ifdef __ARM_NEON

  const uint8x16_t m4b = vdupq_n_u8(0xf);
- const int32x4_t mzero = vdupq_n_s32(0);
  const uint8x16_t mone = vdupq_n_u8(1);
  const uint8x16_t mtwo = vdupq_n_u8(2);
+ #if defined(__ARM_FEATURE_DOTPROD)
+ const int32x4_t mzero = vdupq_n_s32(0);
+ #endif

  int8x16x4_t q5bytes;

@@ -3441,8 +3471,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  #ifdef __ARM_NEON

  const uint8x16_t m4b = vdupq_n_u8(0xf);
- const int32x4_t mzero = vdupq_n_s32(0);
  const uint8x16_t mh = vdupq_n_u8(16);
+ #if defined(__ARM_FEATURE_DOTPROD)
+ const int32x4_t mzero = vdupq_n_s32(0);
+ #endif

  int8x16x4_t q5bytes;
  uint8x16x4_t q5h;
@@ -3660,7 +3692,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  float sum = 0;

  const uint8x16_t m4b = vdupq_n_u8(0xF);
+ #if defined(__ARM_FEATURE_DOTPROD)
  const int32x4_t vzero = vdupq_n_s32(0);
+ #endif
  //const int8x16_t m32s = vdupq_n_s8(32);

  const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4083,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  float sum = 0;

  const uint8x16_t m4b = vdupq_n_u8(0xF);
- const int32x4_t vzero = vdupq_n_s32(0);
  const int8x16_t m32s = vdupq_n_s8(32);
+ #if defined(__ARM_FEATURE_DOTPROD)
+ const int32x4_t vzero = vdupq_n_s32(0);
+ #endif

  const uint8x16_t mone = vdupq_n_u8(3);