@fugood/llama.node 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +37 -0
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include <cmath>
@@ -51,7 +52,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
51
52
  const float d = amax / ((1 << 7) - 1);
52
53
  id[row_iter] = d ? 1.0f / d : 0.0f;
53
54
 
54
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
55
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
55
56
  }
56
57
 
57
58
  for (int j = 0; j < 8; j++) {
@@ -102,7 +103,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
102
103
  const float d = amax / ((1 << 7) - 1);
103
104
  id[row_iter] = d ? 1.0f / d : 0.0f;
104
105
 
105
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
106
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
106
107
  }
107
108
 
108
109
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -145,7 +146,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
145
146
  const float d = amax / ((1 << 7) - 1);
146
147
  id[row_iter] = d ? 1.0f / d : 0.0f;
147
148
 
148
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
149
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
149
150
  }
150
151
 
151
152
  for (int j = 0; j < 4; j++) {
@@ -221,7 +222,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
221
222
  const float d = amax / ((1 << 7) - 1);
222
223
  id[row_iter] = d ? 1.0f / d : 0.0f;
223
224
 
224
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
225
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
225
226
  }
226
227
 
227
228
  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -311,7 +312,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
311
312
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
312
313
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
313
314
  }
314
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
315
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
315
316
  }
316
317
  }
317
318
  }
@@ -399,7 +400,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
399
400
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
400
401
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
401
402
  }
402
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
403
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
403
404
  }
404
405
  }
405
406
  }
@@ -514,7 +515,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
514
515
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
515
516
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
516
517
  }
517
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
518
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
518
519
  }
519
520
  }
520
521
  }
@@ -608,7 +609,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
608
609
  const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
609
610
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
610
611
  }
611
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
612
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
612
613
  }
613
614
  }
614
615
  }
@@ -1117,7 +1118,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
1117
1118
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1118
1119
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1119
1120
  }
1120
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1121
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1121
1122
  }
1122
1123
  }
1123
1124
  }
@@ -1570,7 +1571,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
1570
1571
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
1571
1572
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
1572
1573
  }
1573
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
1574
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
1574
1575
  }
1575
1576
  }
1576
1577
  }
@@ -2039,7 +2040,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
2039
2040
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2040
2041
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
2041
2042
  }
2042
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
2043
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2043
2044
  }
2044
2045
  }
2045
2046
  }
@@ -2147,7 +2148,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
2147
2148
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
2148
2149
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
2149
2150
  }
2150
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
2151
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
2151
2152
  }
2152
2153
  }
2153
2154
  }
@@ -3,6 +3,7 @@
3
3
  #include "ggml-quants.h"
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
6
7
 
7
8
  #include "../../quants.h"
8
9
  #include "../../ggml-cpu-impl.h"
@@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
474
475
 
475
476
  // Quantize these floats
476
477
  const float d = max_scalar / 127.f;
477
- y[i].d = GGML_FP32_TO_FP16(d);
478
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
478
479
  const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
479
480
  const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id );
480
481
 
@@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
548
549
 
549
550
  // Quantize these floats
550
551
  const float d = max_scalar / 127.f;
551
- y[i].d = GGML_FP32_TO_FP16(d);
552
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
552
553
  const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f;
553
554
  const __m256 mul = __lasx_xvreplfr2vr_s( id );
554
555
 
@@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
576
577
  // Compute the sum of the quants and set y[i].s
577
578
  const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3));
578
579
  const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7));
579
- y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
580
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1)));
580
581
 
581
582
  // Convert int32 to int16
582
583
  ni0 = lsx_packs_w( ni0, ni1 );
@@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
667
668
  // Main loop
668
669
  for (; ib < nb; ++ib) {
669
670
  /* Compute combined scale for the block */
670
- const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
671
+ const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
671
672
 
672
673
  __m256i qx = bytes_from_nibbles_32(x[ib].qs);
673
674
 
@@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
699
700
  for (; ib + 1 < nb; ib += 2) {
700
701
 
701
702
  // Compute combined scale for the block 0 and 1
702
- const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) );
703
+ const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );
703
704
 
704
705
  const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);
705
706
 
@@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
717
718
  //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);
718
719
 
719
720
  // Compute combined scale for the block 2 and 3
720
- const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) );
721
+ const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );
721
722
 
722
723
  const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);
723
724
 
@@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
766
767
  }
767
768
 
768
769
  int sumi = sumi0 + sumi1;
769
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
770
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
770
771
  }
771
772
 
772
773
  *s = sumf;
@@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
797
798
 
798
799
  // Main loop
799
800
  for (; ib < nb; ++ib) {
800
- const float d0 = GGML_FP16_TO_FP32(x[ib].d);
801
- const float d1 = GGML_FP16_TO_FP32(y[ib].d);
801
+ const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
802
+ const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d);
802
803
 
803
- summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
804
+ summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
804
805
 
805
806
  const __m256 d0v = __lasx_xvreplfr2vr_s( d0 );
806
807
  const __m256 d1v = __lasx_xvreplfr2vr_s( d1 );
@@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
834
835
  }
835
836
 
836
837
  int sumi = sumi0 + sumi1;
837
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
838
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
838
839
  }
839
840
 
840
841
  *s = sumf;
@@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
865
866
  // Main loop
866
867
  for (; ib < nb; ++ib) {
867
868
  /* Compute combined scale for the block */
868
- const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); //FIXME
869
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME
869
870
 
870
871
  __m256i qx = bytes_from_nibbles_32(x[ib].qs);
871
872
  __m256i bxhi = bytes_from_bits_32(x[ib].qh);
@@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
902
903
  }
903
904
 
904
905
  int sumi = sumi0 + sumi1;
905
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
906
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
906
907
  }
907
908
 
908
909
  *s = sumf;
@@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
934
935
 
935
936
  // Main loop
936
937
  for (; ib < nb; ++ib) {
937
- const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d));
938
+ const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d));
938
939
 
939
- summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
940
+ summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
940
941
 
941
942
  __m256i qx = bytes_from_nibbles_32(x[ib].qs);
942
943
  __m256i bxhi = bytes_from_bits_32(x[ib].qh);
943
944
  bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10));
944
945
  qx = __lasx_xvor_v(qx, bxhi);
945
946
 
946
- const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d));
947
+ const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d));
947
948
  const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
948
949
 
949
950
  const __m256 q = mul_sum_us8_pairs_float(qx, qy);
@@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
973
974
  }
974
975
 
975
976
  int sumi = sumi0 + sumi1;
976
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
977
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
977
978
  }
978
979
 
979
980
  *s = sumf;
@@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1003
1004
  // Main loop
1004
1005
  for (; ib < nb; ++ib) {
1005
1006
  // Compute combined scale for the block
1006
- const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
1007
+ const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
1007
1008
  __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0);
1008
1009
  __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0);
1009
1010
 
@@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
1023
1024
  sumi += x[ib].qs[j]*y[ib].qs[j];
1024
1025
  }
1025
1026
 
1026
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
1027
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
1027
1028
  }
1028
1029
 
1029
1030
  *s = sumf;
@@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1047
1048
 
1048
1049
  for (int i = 0; i < nb; ++i) {
1049
1050
 
1050
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1051
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1051
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1052
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1052
1053
 
1053
1054
  const uint8_t * GGML_RESTRICT q2 = x[i].qs;
1054
1055
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1116
1117
  summs += y[i].bsums[j] * (sc[j] >> 4);
1117
1118
  }
1118
1119
 
1119
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1120
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1120
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1121
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1121
1122
 
1122
1123
  int isum = 0;
1123
1124
  int is = 0;
@@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1170
1171
 
1171
1172
  for (int i = 0; i < nb; ++i) {
1172
1173
 
1173
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1174
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1174
1175
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
1175
1176
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1176
1177
  // Set up scales
@@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1294
1295
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1295
1296
  q8 += 8; a += 8;
1296
1297
  }
1297
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1298
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1298
1299
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1299
1300
  }
1300
1301
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1330
1331
 
1331
1332
  for (int i = 0; i < nb; ++i) {
1332
1333
 
1333
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1334
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1334
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1335
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1335
1336
 
1336
1337
  memcpy(utmp, x[i].scales, 12);
1337
1338
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1438
1439
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1439
1440
  q8 += 8; a += 8;
1440
1441
  }
1441
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1442
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1442
1443
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1443
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
1444
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1444
1445
  sumf -= dmin * sumi;
1445
1446
  }
1446
1447
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1477
1478
  const uint8_t * GGML_RESTRICT q5 = x[i].qs;
1478
1479
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1479
1480
 
1480
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1481
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1481
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1482
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1482
1483
 
1483
1484
  memcpy(utmp, x[i].scales, 12);
1484
1485
  utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
@@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1593
1594
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1594
1595
  q8 += 8; a += 8;
1595
1596
  }
1596
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1597
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1597
1598
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1598
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
1599
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1599
1600
  sumf -= dmin * sumi;
1600
1601
  }
1601
1602
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1624
1625
 
1625
1626
  for (int i = 0; i < nb; ++i) {
1626
1627
 
1627
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1628
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1628
1629
 
1629
1630
  const uint8_t * GGML_RESTRICT q4 = x[i].ql;
1630
1631
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1713
1714
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1714
1715
  q8 += 8; a += 8;
1715
1716
  }
1716
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1717
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1717
1718
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1718
1719
  }
1719
1720
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
1780
1781
 
1781
1782
  __m256 accumf = (__m256)__lasx_xvldi(0);
1782
1783
  for (int i = 0; i < nb; ++i) {
1783
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1784
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1784
1785
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1785
1786
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1786
1787
  __m256i sumi1 = __lasx_xvldi(0);
@@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
1820
1821
 
1821
1822
  float sumf = 0.f;
1822
1823
  for (int i = 0; i < nb; ++i) {
1823
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1824
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1824
1825
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1825
1826
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1826
1827
  int32_t bsum = 0;
@@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1895
1896
 
1896
1897
  __m256 accumf = (__m256)__lasx_xvldi(0);
1897
1898
  for (int i = 0; i < nb; ++i) {
1898
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1899
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1899
1900
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1900
1901
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1901
1902
 
@@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1980
1981
 
1981
1982
  float sumf = 0.f;
1982
1983
  for (int i = 0; i < nb; ++i) {
1983
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1984
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1984
1985
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1985
1986
  const uint8_t * GGML_RESTRICT sc = x[i].scales;
1986
1987
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2049
2050
 
2050
2051
  __m256 accumf = (__m256)__lasx_xvldi(0);
2051
2052
  for (int i = 0; i < nb; ++i) {
2052
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2053
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2053
2054
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
2054
2055
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
2055
2056
  const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8);
@@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2108
2109
  float sumf = 0;
2109
2110
  for (int i = 0; i < nb; i++) {
2110
2111
 
2111
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2112
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2112
2113
  const int8_t * q8 = y[i].qs;
2113
2114
  const uint8_t * qs = x[i].qs;
2114
2115
  const uint8_t * qh = x[i].qh;
@@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
2168
2169
 
2169
2170
  __m256 accumf = (__m256)__lasx_xvldi(0);
2170
2171
  for (int i = 0; i < nb; ++i) {
2171
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2172
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2172
2173
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2173
2174
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2174
2175
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
2213
2214
 
2214
2215
  float sumf = 0.f;
2215
2216
  for (int i = 0; i < nb; ++i) {
2216
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2217
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2217
2218
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
2218
2219
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
2219
2220
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2279
2280
 
2280
2281
  __m256 accumf = (__m256)__lasx_xvldi(0);
2281
2282
  for (int i = 0; i < nb; ++i) {
2282
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2283
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2283
2284
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
2284
2285
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
2285
2286
  const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;
@@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2340
2341
 
2341
2342
  float sumf = 0.f;
2342
2343
  for (int i = 0; i < nb; ++i) {
2343
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2344
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2344
2345
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
2345
2346
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
2346
2347
  const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2451
2452
  + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2;
2452
2453
  }
2453
2454
 
2454
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
2455
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
2455
2456
  accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum);
2456
2457
  accum1 += d * sumi1;
2457
2458
  }
@@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
2484
2485
  qs += 4;
2485
2486
  }
2486
2487
 
2487
- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2488
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
2488
2489
  }
2489
2490
 
2490
2491
  *s = sumf;
@@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
2530
2531
  const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
2531
2532
  const __m256i p_1 = lasx_madd_h(p16_1, mone);
2532
2533
  const __m256i p_2 = lasx_madd_h(p16_2, mone);
2533
- accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
2534
+ accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)),
2534
2535
  __lasx_xvffint_s_w(p_1), accum1);
2535
- accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
2536
+ accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)),
2536
2537
  __lasx_xvffint_s_w(p_2), accum2);
2537
2538
  }
2538
2539
 
@@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
2540
2541
 
2541
2542
  #endif
2542
2543
  for (; ib < nb; ++ib) {
2543
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
2544
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
2544
2545
  int sumi1 = 0, sumi2 = 0;
2545
2546
  for (int j = 0; j < QK4_NL/2; ++j) {
2546
2547
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
2595
2596
  sumi1 = __lasx_xvadd_w(p_1, sumi1);
2596
2597
  sumi2 = __lasx_xvadd_w(p_2, sumi2);
2597
2598
  }
2598
- accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
2599
+ accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d),
2599
2600
  __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum);
2600
2601
  }
2601
2602
 
@@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
2604
2605
  #else
2605
2606
  float sumf = 0;
2606
2607
  for (int ibl = 0; ibl < nb; ++ibl) {
2607
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
2608
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
2608
2609
  uint16_t h = x[ibl].scales_h;
2609
2610
  const uint8_t * qs = x[ibl].qs;
2610
2611
  const int8_t * q8 = y[ibl].qs;