llama_cpp 0.5.0 → 0.5.2

@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
    cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 events.emplace_back();
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
             } else if (src0->backend == GGML_BACKEND_GPU) {
-                d_Q = (cl_mem) src0->data;
+                d_Q = (cl_mem) src0->extra;
             } else {
                 GGML_ASSERT(false);
             }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
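
The hunks above all make the same change: the OpenCL backend stops overloading tensor->data to hold the device buffer and keeps the cl_mem handle in tensor->extra instead, leaving data with its usual host-pointer meaning. A minimal sketch of the resulting convention (the helper below is illustrative only, not part of ggml):

#include <CL/cl.h>
#include "ggml.h"

// Illustrative helper: fetch the device buffer of a GPU-backed tensor.
// Assumes the tensor was uploaded with ggml_cl_transform_tensor, which
// now stores the cl_mem handle in tensor->extra.
static cl_mem tensor_device_buffer(const struct ggml_tensor * t) {
    GGML_ASSERT(t->backend == GGML_BACKEND_GPU);
    return (cl_mem) t->extra;
}
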
@@ -1,4 +1,3 @@
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
@@ -47,6 +46,10 @@
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
+
+// disable POSIX deprecation warnigns
+// these functions are never going away, anyway
+#pragma warning(disable: 4996)
 #endif
 
 #if defined(_WIN32)
@@ -103,6 +106,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +198,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr)    if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr)    free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
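
For reference, a hedged sketch of how the allocation hunks above are meant to combine when GGML_USE_CPU_HBM is defined: allocation and free must both go through the hbwmalloc API. The demo_* helpers below are hypothetical and only illustrate the pairing.

#include <stdlib.h>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

static void * demo_aligned_alloc(size_t size) {   // hypothetical helper, not in ggml
    void * p = NULL;
#ifdef GGML_USE_CPU_HBM
    if (hbw_posix_memalign(&p, 16, size) != 0) return NULL;   // HBM-backed allocation
#else
    if (posix_memalign(&p, 16, size) != 0) return NULL;       // regular heap
#endif
    return p;
}

static void demo_aligned_free(void * p) {          // must match the allocator above
#ifdef GGML_USE_CPU_HBM
    if (p != NULL) hbw_free(p);
#else
    free(p);
#endif
}
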
@@ -267,7 +283,7 @@ typedef double ggml_float;
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
@@ -294,12 +310,14 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
 #if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -817,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
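
With the ggml_init hunk above, passing mem_size 0 no longer produces a zero-byte arena; the size is silently bumped to GGML_MEM_ALIGN. A minimal usage sketch (illustrative, not taken from the gem):

#include <stdbool.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 0,      // bumped to GGML_MEM_ALIGN internally after this change
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor data is expected to live elsewhere
    };
    struct ggml_context * ctx = ggml_init(params);
    if (ctx == NULL) return 1;
    ggml_free(ctx);
    return 0;
}
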
@@ -4814,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -18900,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
                 // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
                 return count;
             }
-            return count;
         }
     }
 
@@ -20003,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         struct ggml_tensor * data = NULL;
 
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
             ok = ok && data != NULL;
@@ -20044,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
 
         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
            //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
            cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
        }
@@ -270,7 +270,7 @@ extern "C" {
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
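
Because ggml_fp16_t can now fall back to a plain uint16_t on MSVC ARM builds (per the hunk above), code that touches half-precision values should go through the public conversion helpers rather than casting. A small hedged example:

#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_fp16_t h = ggml_fp32_to_fp16(0.5f);   // storage type may be __fp16 or uint16_t
    printf("%f\n", ggml_fp16_to_fp32(h));      // prints 0.500000 either way
    return 0;
}
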
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
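
The epsilon in the hunk above replaces an exact zero test, presumably because a block whose largest magnitude is merely tiny (denormal range) would otherwise reach the 1/max scale computation and overflow. A hedged, self-contained illustration of that failure mode:

#include <math.h>
#include <stdio.h>

int main(void) {
    float amax = 1e-38f;             // tiny but nonzero, so "if (!amax)" would not trigger
    float iscale = 15.0f / amax;     // overflows float range -> inf
    printf("iscale = %g, isinf = %d\n", (double) iscale, isinf(iscale));
    return 0;
}
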
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2582,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         memcpy(utmp, x[i].scales, 12);
 
-        const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
         utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
         utmp[0] &= kmask1;
@@ -2596,8 +2626,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3124,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3471,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3692,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4083,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
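
All of the vzero/mzero hunks above follow one pattern: the zero accumulator is only consumed by vdotq_s32 on dot-product-capable CPUs, so it is now declared only when __ARM_FEATURE_DOTPROD is defined, presumably to avoid unused-variable warnings on the fallback path. A hedged sketch of the pattern, not taken from ggml:

#include <arm_neon.h>

// Illustrative 8-bit dot product with and without the dotprod extension.
static int32_t dot_s8x16(int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t vzero = vdupq_n_s32(0);                 // only needed on this path
    return vaddvq_s32(vdotq_s32(vzero, a, b));
#else
    // widening multiply + pairwise accumulate fallback;
    // vaddvq_s32 assumes AArch64 (ggml provides its own fallback on 32-bit ARM)
    const int16x8_t p0 = vmull_s8(vget_low_s8(a),  vget_low_s8(b));
    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vaddvq_s32(vpaddlq_s16(p0)) + vaddvq_s32(vpaddlq_s16(p1));
#endif
}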