@fugood/llama.node 1.0.0-beta.6 → 1.0.0

This diff shows the content of publicly available package versions as published to their public registry, and is provided for informational purposes only.
Files changed (53)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +12 -0
  3. package/lib/index.js +10 -0
  4. package/lib/index.ts +17 -1
  5. package/package.json +14 -14
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +7 -3
  8. package/src/LlamaCompletionWorker.h +2 -0
  9. package/src/LlamaContext.cpp +49 -6
  10. package/src/LlamaContext.h +1 -0
  11. package/src/RerankWorker.h +26 -0
  12. package/src/common.hpp +1 -1
  13. package/src/llama.cpp/CMakeLists.txt +1 -1
  14. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  15. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  16. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  28. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  29. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  35. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  40. package/src/llama.cpp/include/llama.h +6 -3
  41. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  42. package/src/llama.cpp/src/llama-arch.h +17 -0
  43. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  44. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  45. package/src/llama.cpp/src/llama-context.cpp +0 -1
  46. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  47. package/src/llama.cpp/src/llama-graph.h +14 -2
  48. package/src/llama.cpp/src/llama-hparams.h +6 -0
  49. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  50. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  51. package/src/llama.cpp/src/llama-model.cpp +518 -1
  52. package/src/llama.cpp/src/llama-model.h +22 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +87 -5
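Only one of the 53 files is rendered below; the VSX intrinsics (vec_splats, vec_xl, vec_madd) suggest it is the PowerPC quants.c. The visible change is mechanical: the fp16/fp32 conversion macros gain a CPU-backend prefix (GGML_FP32_TO_FP16 → GGML_CPU_FP32_TO_FP16, GGML_FP16_TO_FP32 → GGML_CPU_FP16_TO_FP32), and the new #include "simd-mappings.h" at the top of the file is presumably what now provides them. As a rough reference for what such a conversion does on the scalar path, here is a self-contained sketch; the decoder below is an illustrative stand-in, not the macro's actual expansion (which may use hardware conversions or a lookup table):

/* Illustrative only: a portable binary16 -> float decode, standing in for
 * whatever GGML_CPU_FP16_TO_FP32 expands to on a given target. */
#include <stdint.h>
#include <string.h>

typedef uint16_t ggml_fp16_t;

float fp16_to_fp32_ref(ggml_fp16_t h) {
    const uint32_t sign = (uint32_t)(h >> 15) << 31;
    const uint32_t exp  = (h >> 10) & 0x1F;
    const uint32_t mant =  h        & 0x3FF;
    float out;
    uint32_t bits;

    if (exp == 0) {
        /* zero or subnormal: value = mant * 2^-24 */
        out = (float)mant / 16777216.0f;
        memcpy(&bits, &out, 4);
        bits |= sign;
    } else if (exp == 0x1F) {
        /* infinity / NaN */
        bits = sign | 0x7F800000u | (mant << 13);
    } else {
        /* normal: rebias exponent from 15 to 127, widen mantissa to 23 bits */
        bits = sign | ((exp + 112) << 23) | (mant << 13);
    }
    memcpy(&out, &bits, 4);
    return out;
}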
@@ -3,6 +3,7 @@
  #include "ggml-quants.h"
  #include "ggml-impl.h"
  #include "ggml-cpu.h"
+ #include "simd-mappings.h"

  #include "../../quants.h"
  #include "../../ggml-cpu-impl.h"
@@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
  const float id = d ? 1.0f/d : 0.0f;
  const vector float vid = vec_splats(id);

- y[i].d = GGML_FP32_TO_FP16(d);
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);

  for (int j = 0; j < 8; j++) {
  const vector float v = vec_round(vec_mul(srcv[j], vid));
@@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
  const float id = d ? 1.0f/d : 0.0f;
  const vector float vid = vec_splats(id);

- y[i].d = GGML_FP32_TO_FP16(d);
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);

  vector int accv = vec_splats(0);

@@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i

  accv = vec_add(accv, vec_sld(accv, accv, 4));
  accv = vec_add(accv, vec_sld(accv, accv, 8));
- y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0));
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0));
  }

  #else
@@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  __builtin_prefetch(x[ib].qs, 0, 1);
  __builtin_prefetch(y[ib].qs, 0, 1);

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

  vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  }

  int sumi = sumi0 + sumi1;
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
  }

  *s = sumf;
@@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  __builtin_prefetch(x[ib].qs, 0, 1);
  __builtin_prefetch(y[ib].qs, 0, 1);

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

- vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
- vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
+ vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+ vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f};
  vsumf0 = vec_madd(vxmin, vys, vsumf0);

  vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  }

  int sumi = sumi0 + sumi1;
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
  }

  *s = sumf;
@@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  __builtin_prefetch(x[ib].qs, 0, 1);
  __builtin_prefetch(y[ib].qs, 0, 1);

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

  vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])};
@@ -379,7 +380,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  }

  int sumi = sumi0 + sumi1;
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
  }

  *s = sumf;
@@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  __builtin_prefetch(x[ib].qs, 0, 1);
  __builtin_prefetch(y[ib].qs, 0, 1);

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

- vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m));
- vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
+ vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m));
+ vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f};
  vsumf0 = vec_madd(vxmin, vys, vsumf0);

  vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])};
@@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  }

  int sumi = sumi0 + sumi1;
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
  }

  *s = sumf;
@@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  __builtin_prefetch(x[ib].qs, 0, 1);
  __builtin_prefetch(y[ib].qs, 0, 1);

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

  vector signed char q8x0 = vec_xl( 0, x[ib].qs);
@@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  sumi += x[ib].qs[j]*y[ib].qs[j];
  }

- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
  }

  *s = sumf;
@@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+ vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);

  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  summs += y[i].bsums[j] * (sc[j] >> 4);
  }

- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);

  int isum = 0;
  int is = 0;
@@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
  q8 += 8; a += 8;
  }
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
  }
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+ vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);

  vector signed short q8ysums0 = vec_xl( 0, y[i].bsums);
@@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
  q8 += 8; a += 8;
  }
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
  sumf -= dmin * sumi;
  }
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

- vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin));
+ vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin));
  vector float vdmin = vec_mul(vxmin, vyd);

  UNUSED(kmask1);
@@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
  q8 += 8; a += 8;
  }
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
  sumf -= dmin * sumi;
  }
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
  q8 += 8; a += 8;
  }
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
  }
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

  float sumf = 0.f;
  for (int i = 0; i < nb; ++i) {
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
  int32_t bsum = 0;
@@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
  const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

  float sumf = 0.f;
  for (int i = 0; i < nb; ++i) {
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  const uint16_t * GGML_RESTRICT q2 = x[i].qs;
  const uint8_t * GGML_RESTRICT sc = x[i].scales;
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  float sumf = 0;
  for (int i = 0; i < nb; i++) {

- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  const int8_t * q8 = y[i].qs;
  const uint8_t * qs = x[i].qs;
  const uint8_t * qh = x[i].qh;
@@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const

  float sumf = 0.f;
  for (int i = 0; i < nb; ++i) {
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  const uint8_t * GGML_RESTRICT q3 = x[i].qs;
  const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4;
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
@@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo

  float sumf = 0.f;
  for (int i = 0; i < nb; ++i) {
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
  const uint8_t * GGML_RESTRICT qs = x[i].qs;
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
  const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  vector float vsumf3 = vec_splats(0.0f);

  for (int i = 0; i < nb; ++i) {
- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d));
  vector float vyd = vec_splats(y[i].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  qs += 4;
  }

- sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+ sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
  }

  *s = sumf;
@@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
  __builtin_prefetch(y[ib].qs, 0, 1);


- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d));
- vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d));
+ vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d));
  vector float vd = vec_mul(vxd, vyd);

  vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs);
@@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v

  #endif
  for (; ib < nb; ++ib) {
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
  int sumi1 = 0, sumi2 = 0;
  for (int j = 0; j < QK4_NL/2; ++j) {
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v

  for (int ibl = 0; ibl < nb; ++ibl) {

- vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d));
+ vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d));
  vector float vyd = vec_splats(y[ibl].d);
  vector float vd = vec_mul(vxd, vyd);

@@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
  #else
  float sumf = 0;
  for (int ibl = 0; ibl < nb; ++ibl) {
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
  uint16_t h = x[ibl].scales_h;
  const uint8_t * qs = x[ibl].qs;
  const int8_t * q8 = y[ibl].qs;
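The same rename runs through the scalar fallback paths of these dot products as well (see the q8_0 × q8_0 hunk at @@ -542,7 +543,7 above). For orientation, that fallback has roughly the following shape; the block layout and helper names here are simplified assumptions for illustration, not the package's definitions:

/* Sketch of the scalar q8_0 x q8_0 fallback pattern seen above: per block,
 * an integer dot product of 32 int8 weights, scaled by the product of the
 * two fp16 block scales. */
#include <stdint.h>

#define QK8_0 32

typedef uint16_t ggml_fp16_t;

typedef struct {
    ggml_fp16_t d;          /* per-block scale, stored as fp16 */
    int8_t      qs[QK8_0];  /* quantized weights */
} block_q8_0_sketch;

float fp16_to_fp32_ref(ggml_fp16_t h);  /* the illustrative decoder sketched earlier */

float vec_dot_q8_0_q8_0_ref(int nb, const block_q8_0_sketch *x, const block_q8_0_sketch *y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nb; ++ib) {
        int sumi = 0;
        for (int j = 0; j < QK8_0; ++j) {
            sumi += x[ib].qs[j] * y[ib].qs[j];   /* integer dot within the block */
        }
        /* scale the per-block integer sum by both block scales, mirroring:
         * sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); */
        sumf += sumi * (fp16_to_fp32_ref(x[ib].d) * fp16_to_fp32_ref(y[ib].d));
    }
    return sumf;
}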