@fugood/llama.node 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +37 -0
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
--- a/package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
+++ b/package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -3,6 +3,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
+#include "simd-mappings.h"
 
 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
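The new include is the only functional change in this file; every hunk that follows is a mechanical rename of the FP16/FP32 conversion macros to their CPU-backend variants. As a sketch of the layering this implies (illustrative only; the real definitions live in simd-mappings.h, are chosen per architecture, and are not part of this diff):

    // Hypothetical fallback, for illustration; simd-mappings.h may instead
    // map these to F16C intrinsics, NEON conversions, or a lookup table.
    #ifndef GGML_CPU_FP16_TO_FP32
    #define GGML_CPU_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #endif
    #ifndef GGML_CPU_FP32_TO_FP16
    #define GGML_CPU_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    #endif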
@@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_
 
 // quad fp16 delta calculation
 static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
-    // GGML_FP16_TO_FP32 is faster than Intel F16C
-    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
-                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
+    // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
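For reference, quad_fp16_delta_float broadcasts two fp16 scale products into the two 128-bit halves of a __m256. A scalar equivalent (the _ref name is ours, not part of the source):

    // out[0..3] = fp32(x0)*fp32(y0) (low half), out[4..7] = fp32(x1)*fp32(y1) (high half)
    static inline void quad_fp16_delta_float_ref(float out[8],
            ggml_fp16_t x0, ggml_fp16_t y0, ggml_fp16_t x1, ggml_fp16_t y1) {
        const float lo = GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0);
        const float hi = GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1);
        for (int i = 0; i < 4; ++i) { out[i] = lo; out[i + 4] = hi; }
    }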
@@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
        // Quantize these floats
        const float d = maxScalar / 127.f;
-       y[i].d = GGML_FP32_TO_FP16(d);
+       y[i].d = GGML_CPU_FP32_TO_FP16(d);
        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
        const __m256 mul = _mm256_set1_ps( id );
 
@@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
        // Quantize these floats
        const float d = max_scalar / 127.f;
-       y[i].d = GGML_FP32_TO_FP16(d);
+       y[i].d = GGML_CPU_FP32_TO_FP16(d);
        const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
        const __m256 mul = _mm256_set1_ps( id );
 
@@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__AVX2__)
        // Compute the sum of the quants and set y[i].s
-       y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
+       y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
 
        // Convert int32 to int16
        i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
@@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
        // Compute the sum of the quants and set y[i].s
        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-       y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
+       y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
 
        // Convert int32 to int16
        ni0 = _mm_packs_epi32( ni0, ni1 );
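quantize_row_q8_0 and quantize_row_q8_1 share the same scheme: per 32-value block, pick the scale d = max|x| / 127, store it as fp16, and round each value to x/d; q8_1 additionally caches s = d * sum(q) for the offset term used by the *_1 dot products. A scalar sketch of the q8_0 case (the _ref name is illustrative):

    #include <math.h>
    #include <stdint.h>

    static void quantize_block_q8_0_ref(const float * x, int8_t * qs, ggml_fp16_t * d_out) {
        float amax = 0.0f;                          // max absolute value in the block
        for (int j = 0; j < 32; ++j) {
            const float v = fabsf(x[j]);
            if (v > amax) amax = v;
        }
        const float d  = amax / 127.f;              // per-block scale
        const float id = d != 0.0f ? 1.0f / d : 0.0f;
        for (int j = 0; j < 32; ++j) {
            qs[j] = (int8_t) roundf(x[j] * id);     // quant in [-127, 127]
        }
        *d_out = GGML_CPU_FP32_TO_FP16(d);          // scale stored as fp16, as above
    }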
@@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    // Main loop
    for (; ib < nb; ++ib) {
        /* Compute combined scale for the block */
-       const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+       const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
 
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
 
@@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0);
 
        // Compute combined scale for the block 0 and 1
-       const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
+       const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
 
        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs);
 
@@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
 
        // Compute combined scale for the block 2 and 3
-       const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
+       const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
 
        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
 
@@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        }
 
        int sumi = sumi0 + sumi1;
-       sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+       sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
    }
 
    *s = sumf;
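The scalar tail above spells out what every q4_0 x q8_0 path in this file computes: each 32-value block contributes d_x * d_y * sum(qx_j * qy_j), with two 4-bit quants packed per byte and stored offset by 8. A self-contained sketch of one block (all _ref names are ours):

    static float q4_0_q8_0_block_ref(const uint8_t * x_qs, ggml_fp16_t x_d,
                                     const int8_t  * y_qs, ggml_fp16_t y_d) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            const int v0 = (x_qs[j] & 0x0F) - 8;    // low nibble, offset removed
            const int v1 = (x_qs[j] >>   4) - 8;    // high nibble
            sumi += v0 * y_qs[j] + v1 * y_qs[j + 16];
        }
        return sumi * GGML_CPU_FP16_TO_FP32(x_d) * GGML_CPU_FP16_TO_FP32(y_d);
    }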
@@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    // Main loop
    for (; ib < nb; ++ib) {
-       const float d0 = GGML_FP16_TO_FP32(x[ib].d);
-       const float d1 = GGML_FP16_TO_FP32(y[ib].d);
+       const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
+       const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
 
-       summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+       summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
 
        const __m256 d0v = _mm256_set1_ps( d0 );
        const __m256 d1v = _mm256_set1_ps( d1 );
@@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        }
 
        int sumi = sumi0 + sumi1;
-       sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+       sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }
 
    *s = sumf;
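The summs term is where q4_1 departs from q4_0: a q4_1 block stores a scale d and a minimum m, so x_j = d*qx_j + m (approximately), while a q8_1 block caches s = d_y * sum(qy_j). Expanding one block of the dot product shows why the tail adds m*s:

    sum_j (d*qx_j + m) * (d_y*qy_j) = d*d_y * sum_j qx_j*qy_j + m * (d_y * sum_j qy_j)
                                    = (d0*d1) * sumi          + m * s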
@@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    // Main loop
    for (; ib < nb; ++ib) {
        /* Compute combined scale for the block */
-       const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+       const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    // Main loop
    for (; ib < nb; ++ib) {
        /* Compute combined scale for the block */
-       const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+       const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        }
 
        int sumi = sumi0 + sumi1;
-       sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+       sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
    }
 
    *s = sumf;
@@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    // Main loop
    for (; ib < nb; ++ib) {
-       const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+       const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
 
-       summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+       summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
 
        __m256i qx = bytes_from_nibbles_32(x[ib].qs);
        __m256i bxhi = bytes_from_bits_32(x[ib].qh);
        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
        qx = _mm256_or_si256(qx, bxhi);
 
-       const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+       const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
        const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
 
        const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    // Main loop
    for (; ib < nb; ++ib) {
-       const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d));
+       const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d));
 
-       summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
+       summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
 
        __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs);
        const __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        bxh = _mm_or_si128(bxh, bxhih);
        bx_0 = MM256_SET_M128I(bxh, bxl);
 
-       const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d));
+       const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d));
        const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs);
 
        const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
@@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        }
 
        int sumi = sumi0 + sumi1;
-       sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+       sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
    }
 
    *s = sumf;
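q5_0 and q5_1 keep the low four bits of each quant in qs and pack the 32 fifth bits into qh; bytes_from_bits_32 above is the vectorized form of that bit extraction. A scalar sketch for one q5_1 block, matching the sumi0/sumi1 split in the tail (the _ref name is ours):

    #include <stdint.h>
    #include <string.h>

    static int q5_1_block_sumi_ref(const uint8_t * qs, const uint8_t * qh_packed, const int8_t * q8) {
        uint32_t qh;
        memcpy(&qh, qh_packed, sizeof(qh));    // one high bit per quant
        int sumi0 = 0, sumi1 = 0;
        for (int j = 0; j < 16; ++j) {
            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;  // bit 4 of quant j
            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;  // bit 4 of quant j+16
            sumi0 += ((qs[j] & 0x0F) | xh_0) * q8[j];
            sumi1 += ((qs[j] >>   4) | xh_1) * q8[j + 16];
        }
        return sumi0 + sumi1;    // caller scales by d*d_y and adds the m*s offset
    }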
@@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    // Main loop
    for (; ib < nb; ++ib) {
        // Compute combined scale for the block
-       const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
+       const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
        __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs);
        __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs);
 
@@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
            sumi += x[ib].qs[j]*y[ib].qs[j];
        }
 
-       sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+       sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
    }
 
    *s = sumf;
@@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
        }
 
        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
-       const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+       const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
 
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
        sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2));
@@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            }
        }
 
-       sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d);
+       sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d);
    }
 
    *s = sumf;
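The ysum subtraction in both ternary kernels appears to be an offset trick: the unpacked trits q_j take values in {0, 1, 2} and encode weights w_j = q_j - 1, so

    sum_j w_j*y_j = sum_j q_j*y_j - sum_j y_j

and the second term is what y[i].bsums caches per 16-value group, letting the SIMD path accumulate unsigned products and correct once per block.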
@@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
        }
 
        const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums);
-       const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d));
+       const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d));
 
        sumi0 = _mm256_add_epi16(sumi0, sumi1);
        sumi0 = _mm256_sub_epi16(sumi0, ysum);
@@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            }
        }
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
        sumf += (float) sumi * d;
    }
@@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        const uint8_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            summs += y[i].bsums[j] * (sc[j] >> 4);
        }
 
-       const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        int isum = 0;
        int is = 0;
@@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
            q8 += 8; a += 8;
        }
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-       const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+       const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
-       const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+       const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
 
        const uint8_t * GGML_RESTRICT q5 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
-       const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+       const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
 
    for (int i = 0; i < nb; ++i) {
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
 
        const uint8_t * GGML_RESTRICT q4 = x[i].ql;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
    }
    for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        __m256i sumi1 = _mm256_setzero_si256();
@@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        __m128i sumi1_0 = _mm_setzero_si128();
@@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        int32_t bsum = 0;
@@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
@@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
 
@@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * GGML_RESTRICT q2 = x[i].qs;
        const uint8_t * GGML_RESTRICT sc = x[i].scales;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
    float sumf = 0;
    for (int i = 0; i < nb; i++) {
 
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const int8_t * q8 = y[i].qs;
        const uint8_t * qs = x[i].qs;
        const uint8_t * qh = x[i].qh;
@@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
 
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT q3 = x[i].qs;
        const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
 
    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
-       const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+       const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint8_t * GGML_RESTRICT qs = x[i].qs;
        const uint8_t * GGML_RESTRICT qh = x[i].qh;
        const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
        }
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum);
        accum1 += d * sumi1;
 
@@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                    + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
        }
 
-       const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
        accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum);
        accum1 += d * sumi1;
 
@@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            qs += 4;
        }
 
-       sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+       sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
    }
 
    *s = sumf;
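Per superblock, the iq1_s result keeps the two-term shape visible in the scalar tail: sumi accumulates the grid quants against q8, sumi1 accumulates the signed per-group delta contribution, and the block evaluates to roughly d_x * d_y * (sumi + IQ1S_DELTA * sumi1), where IQ1S_DELTA is a small fixed constant from ggml-common.h (0.125f at the time of writing).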
@@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            qs += 8; qh += 4;
        }
 
-       const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+       const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
 
        accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1);
        accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2);
@@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            qs += 8; qh += 4;
        }
 
-       const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16));
+       const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16));
 
        accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1);
        accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2);
@@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
            qh += 2;
        }
 
-       sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+       sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
    }
 
    *s = sumf;
@@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
        const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
        const __m256i p_2 = _mm256_madd_epi16(p16_2, mone);
-       accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
+       accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
                _mm256_cvtepi32_ps(p_1), accum1);
-       accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
+       accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
                _mm256_cvtepi32_ps(p_2), accum2);
    }
 
@@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
 
 #endif
    for (; ib < nb; ++ib) {
-       const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+       const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
        int sumi1 = 0, sumi2 = 0;
        for (int j = 0; j < QK4_NL/2; ++j) {
            sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
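Unlike q4_0's fixed offset of 8, iq4_nl maps each 4-bit index through a non-uniform codebook, and kvalues_iq4nl is that 16-entry table. For reference (values as found in ggml-common.h at the time of writing; the _ref copy is illustrative):

    #include <stdint.h>

    static const int8_t kvalues_iq4nl_ref[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };
    // dequantized value = d * kvalues_iq4nl_ref[nibble]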
@@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
            sumi1 = _mm256_add_epi32(p_1, sumi1);
            sumi2 = _mm256_add_epi32(p_2, sumi2);
        }
-       accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+       accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum);
    }
 
@@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
        }
        __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0);
        __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1);
-       accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
+       accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
                _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum);
    }
 
@@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
    float sumf = 0;
    for (int ibl = 0; ibl < nb; ++ibl) {
-       const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+       const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
        uint16_t h = x[ibl].scales_h;
        const uint8_t * qs = x[ibl].qs;
        const int8_t * q8 = y[ibl].qs;