@fugood/llama.node 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +37 -0
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
@@ -3,6 +3,7 @@
3
3
  #include "ggml-quants.h"
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
6
7
 
7
8
  #include "../../quants.h"
8
9
  #include "../../ggml-cpu-impl.h"
@@ -62,7 +63,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
62
63
  const float d = amax / ((1 << 7) - 1);
63
64
  const float id = d ? 1.0f/d : 0.0f;
64
65
 
65
- y[i].d = GGML_FP32_TO_FP16(d);
66
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
66
67
 
67
68
  for (int j = 0; j < 8; j++) {
68
69
  const float32x4_t v = vmulq_n_f32(srcv[j], id);
@@ -104,7 +105,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
104
105
  const float d = amax / ((1 << 7) - 1);
105
106
  const float id = d ? 1.0f/d : 0.0f;
106
107
 
107
- y[i].d = GGML_FP32_TO_FP16(d);
108
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
108
109
 
109
110
  int32x4_t accv = vdupq_n_s32(0);
110
111
 
@@ -120,7 +121,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
120
121
  accv = vaddq_s32(accv, vi);
121
122
  }
122
123
 
123
- y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
124
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * vaddvq_s32(accv));
124
125
  }
125
126
  #else
126
127
  GGML_UNUSED(nb);
@@ -194,10 +195,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
194
195
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
195
196
 
196
197
  float32_t _scale[4] = {
197
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
198
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
199
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
200
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
198
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
199
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
200
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
201
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
201
202
  };
202
203
  float32x4_t scale = vld1q_f32(_scale);
203
204
 
@@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
274
275
  // dot product
275
276
  sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
276
277
  svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
277
- svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
278
+ svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
278
279
  sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
279
280
  svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
280
- svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
281
+ svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
281
282
  }
282
283
 
283
284
  sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
313
314
 
314
315
  // dot product
315
316
  sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
316
- svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
317
+ svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
317
318
  sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
318
- svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
319
+ svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
319
320
  }
320
321
 
321
322
  sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
354
355
 
355
356
  // dot product
356
357
  sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
357
- svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
358
+ svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
358
359
  sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
359
- svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
360
+ svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
360
361
  }
361
362
 
362
363
  sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
@@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
404
405
  const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
405
406
  const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
406
407
 
407
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
408
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
408
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
409
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
409
410
  }
410
411
 
411
412
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
423
424
  }
424
425
 
425
426
  int sumi = sumi0 + sumi1;
426
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
427
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
427
428
  }
428
429
 
429
430
  *s = sumf;
@@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
464
465
  const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i];
465
466
 
466
467
  float32_t summs_t[4] = {
467
- GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
468
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
469
- GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
470
- GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)
468
+ GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
469
+ GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s),
470
+ GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s),
471
+ GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s)
471
472
  };
472
473
  summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
473
474
 
@@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
490
491
 
491
492
  // mmla into int32x4_t
492
493
  float32_t _scale[4] = {
493
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
494
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
495
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
496
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
494
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
495
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
496
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
497
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
497
498
  };
498
499
  float32x4_t scale = vld1q_f32(_scale);
499
500
 
@@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
539
540
  const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0];
540
541
  const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1];
541
542
 
542
- summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
543
+ summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
543
544
 
544
545
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
545
546
 
@@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
562
563
  const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
563
564
  const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
564
565
 
565
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
566
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
566
+ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
567
+ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
567
568
  }
568
569
 
569
570
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
582
583
  }
583
584
 
584
585
  int sumi = sumi0 + sumi1;
585
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
586
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
586
587
  }
587
588
 
588
589
  *s = sumf;
@@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
666
667
 
667
668
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
668
669
  ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
669
- ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
670
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
670
671
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
671
672
  ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
672
- ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
673
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
673
674
  }
674
675
 
675
676
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
694
695
  }
695
696
 
696
697
  int sumi = sumi0 + sumi1;
697
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
698
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
698
699
  }
699
700
 
700
701
  *s = sumf;
@@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
739
740
 
740
741
  const uint8x16_t m4b = vdupq_n_u8(0x0F);
741
742
 
742
- summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
743
- summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
743
+ summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
744
+ summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s);
744
745
 
745
746
  // extract the 5th bit via lookup table ((b) << 4)
746
747
  memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
784
785
 
785
786
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
786
787
  ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
787
- ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
788
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
788
789
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
789
790
  ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
790
- ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
791
+ ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
791
792
  }
792
793
 
793
794
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
812
813
  }
813
814
 
814
815
  int sumi = sumi0 + sumi1;
815
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
816
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
816
817
  }
817
818
 
818
819
  *s = sumf;
@@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
864
865
  const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
865
866
 
866
867
  float32_t _scale[4] = {
867
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
868
- GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
869
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
870
- GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)
868
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
869
+ GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d),
870
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d),
871
+ GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d)
871
872
  };
872
873
  float32x4_t scale = vld1q_f32(_scale);
873
874
 
@@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
934
935
 
935
936
  sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
936
937
  svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
937
- svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
938
+ svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
938
939
  sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
939
940
  svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
940
- svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
941
+ svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
941
942
  }
942
943
 
943
944
  sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
@@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
960
961
  const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
961
962
 
962
963
  sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
963
- svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
964
+ svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
964
965
  sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
965
- svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
966
+ svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
966
967
  }
967
968
 
968
969
  sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
@@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1002
1003
  qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
1003
1004
 
1004
1005
  // scale creation
1005
- const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
1006
- const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
1006
+ const float32_t deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d);
1007
+ const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d);
1007
1008
 
1008
1009
  // duplicate deq1 in first half of vector and deq2 in second half of vector
1009
1010
  const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
@@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1043
1044
 
1044
1045
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
1045
1046
  ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
1046
- ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
1047
+ ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d));
1047
1048
 
1048
1049
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
1049
1050
  ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
1050
- ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
1051
+ ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d));
1051
1052
  }
1052
1053
 
1053
1054
  sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1059
1060
  sumi += x[ib].qs[j]*y[ib].qs[j];
1060
1061
  }
1061
1062
 
1062
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
1063
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
1063
1064
  }
1064
1065
 
1065
1066
  *s = sumf;
@@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1217
1218
  const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
1218
1219
  const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
1219
1220
 
1220
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1221
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1221
1222
 
1222
1223
  #if defined(__ARM_FEATURE_DOTPROD)
1223
1224
  sumi0 = vaddq_s32(sumi0, sumi1);
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1269
1270
  }
1270
1271
  }
1271
1272
 
1272
- sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
1273
+ sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
1273
1274
  }
1274
1275
 
1275
1276
  *s = sumf;
@@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1362
1363
  const int16x8_t ysum0 = vld1q_s16(y[i].bsums);
1363
1364
  const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8);
1364
1365
 
1365
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1366
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1366
1367
 
1367
1368
  #if defined(__ARM_FEATURE_DOTPROD)
1368
1369
  sumi0 = vaddq_s32(sumi0, sumi1);
@@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
1393
1394
  }
1394
1395
  }
1395
1396
 
1396
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1397
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1397
1398
 
1398
1399
  sumf += (float) sumi * d;
1399
1400
  }
@@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1425
1426
  switch (vector_length) {
1426
1427
  case 128:
1427
1428
  for (int i = 0; i < nb; ++i) {
1428
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1429
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1429
1430
  svfloat32_t d_broad = svdup_n_f32((float32_t)d);
1430
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1431
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1431
1432
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
1432
1433
 
1433
1434
  const uint8_t * GGML_RESTRICT q2 = x[i].qs;
@@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1570
1571
  case 256:
1571
1572
  case 512:
1572
1573
  for (int i = 0; i < nb; ++i) {
1573
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1574
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1574
1575
  svfloat32_t d_broad = svdup_n_f32((float32_t)d);
1575
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1576
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1576
1577
  svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin);
1577
1578
 
1578
1579
  const uint8_t * GGML_RESTRICT q2 = x[i].qs;
@@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1671
1672
  float sum = 0;
1672
1673
 
1673
1674
  for (int i = 0; i < nb; ++i) {
1674
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1675
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1675
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1676
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1676
1677
 
1677
1678
  const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1678
1679
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1742
1743
  summs += y[i].bsums[j] * (sc[j] >> 4);
1743
1744
  }
1744
1745
 
1745
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1746
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1746
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1747
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1747
1748
 
1748
1749
  int isum = 0;
1749
1750
  int is = 0;
@@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1805
1806
 
1806
1807
  for (int i = 0; i < nb; ++i) {
1807
1808
 
1808
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1809
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1809
1810
 
1810
1811
  const uint8_t * GGML_RESTRICT q3_sv = x[i].qs;
1811
1812
  const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask;
@@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1981
1982
 
1982
1983
  for (int i = 0; i < nb; ++i) {
1983
1984
 
1984
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1985
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1985
1986
 
1986
1987
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1987
1988
  const uint8_t * GGML_RESTRICT qh = x[i].hmask;
@@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2112
2113
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
2113
2114
  q8 += 8; a += 8;
2114
2115
  }
2115
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2116
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2116
2117
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2117
2118
  }
2118
2119
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2258
2259
  bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)),
2259
2260
  vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins))));
2260
2261
  const float32x4_t dmins = {
2261
- GGML_FP16_TO_FP32(x0->dmin) * y0->d,
2262
- GGML_FP16_TO_FP32(x0->dmin) * y1->d,
2263
- GGML_FP16_TO_FP32(x1->dmin) * y0->d,
2264
- GGML_FP16_TO_FP32(x1->dmin) * y1->d,
2262
+ GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d,
2263
+ GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d,
2264
+ GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d,
2265
+ GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d,
2265
2266
  };
2266
2267
  vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins);
2267
2268
 
2268
2269
  const float32x4_t superblock_scale = {
2269
- GGML_FP16_TO_FP32(x0->d) * y0->d,
2270
- GGML_FP16_TO_FP32(x0->d) * y1->d,
2271
- GGML_FP16_TO_FP32(x1->d) * y0->d,
2272
- GGML_FP16_TO_FP32(x1->d) * y1->d,
2270
+ GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
2271
+ GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
2272
+ GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
2273
+ GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
2273
2274
  };
2274
2275
  vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale);
2275
2276
  }
@@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2289
2290
  float sumf = 0;
2290
2291
  for (int i = 0; i < nb; ++i) {
2291
2292
 
2292
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2293
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
2293
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2294
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
2294
2295
 
2295
2296
  const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
2296
2297
 
@@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2377
2378
 
2378
2379
  for (int i = 0; i < nb; ++i) {
2379
2380
 
2380
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2381
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
2381
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2382
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
2382
2383
 
2383
2384
  const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
2384
2385
 
@@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2478
2479
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2479
2480
  q8 += 8; a += 8;
2480
2481
  }
2481
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2482
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2482
2483
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2483
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
2484
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2484
2485
  sumf -= dmin * sumi;
2485
2486
  }
2486
2487
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2520
2521
 
2521
2522
  for (int i = 0; i < nb; ++i) {
2522
2523
 
2523
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2524
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
2524
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2525
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
2525
2526
 
2526
2527
  const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
2527
2528
 
@@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2630
2631
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2631
2632
  q8 += 8; a += 8;
2632
2633
  }
2633
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2634
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2634
2635
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2635
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
2636
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
2636
2637
  sumf -= dmin * sumi;
2637
2638
  }
2638
2639
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2827
2828
  const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32);
2828
2829
 
2829
2830
  const float32x4_t superblock_scale = {
2830
- GGML_FP16_TO_FP32(x0->d) * y0->d,
2831
- GGML_FP16_TO_FP32(x0->d) * y1->d,
2832
- GGML_FP16_TO_FP32(x1->d) * y0->d,
2833
- GGML_FP16_TO_FP32(x1->d) * y1->d,
2831
+ GGML_CPU_FP16_TO_FP32(x0->d) * y0->d,
2832
+ GGML_CPU_FP16_TO_FP32(x0->d) * y1->d,
2833
+ GGML_CPU_FP16_TO_FP32(x1->d) * y0->d,
2834
+ GGML_CPU_FP16_TO_FP32(x1->d) * y1->d,
2834
2835
  };
2835
2836
 
2836
2837
  visum = vsubq_s32(visum, vibias);
@@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2858
2859
  svuint8_t q6h_1, q6h_2, q6h_3, q6h_4;
2859
2860
 
2860
2861
  for (int i = 0; i < nb; ++i) {
2861
- const float d_all = GGML_FP16_TO_FP32(x[i].d);
2862
+ const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
2862
2863
 
2863
2864
  const uint8_t * GGML_RESTRICT q6 = x[i].ql;
2864
2865
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
3011
3012
 
3012
3013
  for (int i = 0; i < nb; ++i) {
3013
3014
 
3014
- const float d_all = GGML_FP16_TO_FP32(x[i].d);
3015
+ const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
3015
3016
 
3016
3017
  const uint8_t * GGML_RESTRICT q6 = x[i].ql;
3017
3018
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
3128
3129
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
3129
3130
  q8 += 8; a += 8;
3130
3131
  }
3131
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3132
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3132
3133
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
3133
3134
  }
3134
3135
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
3199
3200
 
3200
3201
  float sumf = 0;
3201
3202
  for (int i = 0; i < nb; ++i) {
3202
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3203
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3203
3204
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3204
3205
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
3205
3206
  float sumf1 = 0, sumf2 = 0;
@@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
3234
3235
 
3235
3236
  float sumf = 0.f;
3236
3237
  for (int i = 0; i < nb; ++i) {
3237
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3238
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3238
3239
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3239
3240
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
3240
3241
  int32_t bsum = 0;
@@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
3284
3285
 
3285
3286
  float sumf = 0;
3286
3287
  for (int i = 0; i < nb; ++i) {
3287
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3288
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3288
3289
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3289
3290
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
3290
3291
  const uint8x8_t scales8 = vld1_u8(x[i].scales);
@@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
3329
3330
 
3330
3331
  float sumf = 0.f;
3331
3332
  for (int i = 0; i < nb; ++i) {
3332
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3333
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3333
3334
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
3334
3335
  const uint8_t * GGML_RESTRICT sc = x[i].scales;
3335
3336
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3398
3399
  float sumf = 0;
3399
3400
  for (int i = 0; i < nb; ++i) {
3400
3401
 
3401
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3402
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3402
3403
 
3403
3404
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
3404
3405
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3458
3459
  float sumf = 0;
3459
3460
  for (int i = 0; i < nb; i++) {
3460
3461
 
3461
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3462
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3462
3463
  const int8_t * q8 = y[i].qs;
3463
3464
  const uint8_t * qs = x[i].qs;
3464
3465
  const uint8_t * qh = x[i].qh;
@@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
3521
3522
 
3522
3523
  float sumf = 0;
3523
3524
  for (int i = 0; i < nb; ++i) {
3524
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3525
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3525
3526
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3526
3527
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3527
3528
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
3557
3558
 
3558
3559
  float sumf = 0.f;
3559
3560
  for (int i = 0; i < nb; ++i) {
3560
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3561
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3561
3562
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
3562
3563
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
3563
3564
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3630
3631
 
3631
3632
  float sumf = 0;
3632
3633
  for (int i = 0; i < nb; ++i) {
3633
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3634
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3634
3635
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
3635
3636
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
3636
3637
  const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3691
3692
 
3692
3693
  float sumf = 0.f;
3693
3694
  for (int i = 0; i < nb; ++i) {
3694
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
3695
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
3695
3696
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
3696
3697
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
3697
3698
  const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3786
3787
 
3787
3788
  }
3788
3789
 
3789
- sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
3790
+ sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
3790
3791
  }
3791
3792
 
3792
3793
  *s = sumf;
@@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3817
3818
  qs += 4;
3818
3819
  }
3819
3820
 
3820
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
3821
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
3821
3822
  }
3822
3823
 
3823
3824
  *s = sumf;
@@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3905
3906
 
3906
3907
  }
3907
3908
 
3908
- sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
3909
+ sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
3909
3910
  }
3910
3911
 
3911
3912
  *s = sumf;
@@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
3952
3953
  qh += 2;
3953
3954
  }
3954
3955
 
3955
- sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
3956
+ sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
3956
3957
  }
3957
3958
 
3958
3959
  *s = sumf;
@@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
4003
4004
  prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
4004
4005
 
4005
4006
  sumf +=
4006
- GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
4007
- GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
4007
+ GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
4008
+ GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
4008
4009
  }
4009
4010
 
4010
4011
  #endif
4011
4012
  for (; ib < nb; ++ib) {
4012
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
4013
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
4013
4014
  int sumi1 = 0, sumi2 = 0;
4014
4015
  for (int j = 0; j < QK4_NL/2; ++j) {
4015
4016
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
4071
4072
 
4072
4073
  }
4073
4074
 
4074
- sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
4075
+ sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
4075
4076
  }
4076
4077
 
4077
4078
  *s = sumf;
@@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
4079
4080
  #else
4080
4081
  float sumf = 0;
4081
4082
  for (int ibl = 0; ibl < nb; ++ibl) {
4082
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
4083
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
4083
4084
  uint16_t h = x[ibl].scales_h;
4084
4085
  const uint8_t * qs = x[ibl].qs;
4085
4086
  const int8_t * q8 = y[ibl].qs;