@fugood/llama.node 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +37 -0
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
@@ -3,6 +3,7 @@
3
3
  #include "ggml-quants.h"
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
6
7
 
7
8
  #include "../../quants.h"
8
9
  #include "../../ggml-cpu-impl.h"
@@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
45
46
  const float d = amax / ((1 << 7) - 1);
46
47
  const float id = d ? 1.0f/d : 0.0f;
47
48
 
48
- y[i].d = GGML_FP32_TO_FP16(d);
49
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
49
50
 
50
51
  vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
51
52
 
@@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
85
86
  const float d = amax / ((1 << 7) - 1);
86
87
  const float id = d ? 1.0f/d : 0.0f;
87
88
 
88
- y[i].d = GGML_FP32_TO_FP16(d);
89
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
89
90
 
90
91
  vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl);
91
92
 
@@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
102
103
 
103
104
  // set y[i].s
104
105
  int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
105
- y[i].s = GGML_FP32_TO_FP16(sum*d);
106
+ y[i].s = GGML_CPU_FP32_TO_FP16(sum*d);
106
107
  }
107
108
 
108
109
  #else
@@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
160
161
 
161
162
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
162
163
 
163
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
164
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
164
165
  }
165
166
 
166
167
  #endif
@@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
177
178
  }
178
179
 
179
180
  int sumi = sumi0 + sumi1;
180
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
181
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
181
182
  }
182
183
 
183
184
  *s = sumf;
@@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
225
226
 
226
227
  int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
227
228
 
228
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
229
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
229
230
  }
230
231
 
231
232
  #endif
@@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
242
243
  }
243
244
 
244
245
  int sumi = sumi0 + sumi1;
245
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
246
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
246
247
  }
247
248
 
248
249
  *s = sumf;
@@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
293
294
  vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
294
295
  int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
295
296
 
296
- sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi;
297
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
297
298
  }
298
299
 
299
300
  #endif
@@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
316
317
  }
317
318
 
318
319
  int sumi = sumi0 + sumi1;
319
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
320
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi;
320
321
  }
321
322
 
322
323
  *s = sumf;
@@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
366
367
  vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl);
367
368
  int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum);
368
369
 
369
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
370
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
370
371
  }
371
372
 
372
373
  #endif
@@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
389
390
  }
390
391
 
391
392
  int sumi = sumi0 + sumi1;
392
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
393
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
393
394
  }
394
395
 
395
396
  *s = sumf;
@@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
427
428
 
428
429
  int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum);
429
430
 
430
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
431
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
431
432
  }
432
433
 
433
434
  #endif
@@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
438
439
  sumi += x[ib].qs[j]*y[ib].qs[j];
439
440
  }
440
441
 
441
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
442
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
442
443
  }
443
444
 
444
445
  *s = sumf;
@@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
465
466
  const uint8_t * q2 = x[i].qs;
466
467
  const int8_t * q8 = y[i].qs;
467
468
  const uint8_t * sc = x[i].scales;
468
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
469
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
469
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
470
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
470
471
  uint8_t *patmp = atmp;
471
472
  int vsums;
472
473
  int tmp;
@@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
569
570
  const int8_t * q8 = y[i].qs;
570
571
  const uint8_t * sc = x[i].scales;
571
572
 
572
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
573
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
573
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
574
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
574
575
 
575
576
  size_t vl = 16;
576
577
 
@@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
644
645
  const uint8_t * q2 = x[i].qs;
645
646
  const int8_t * q8 = y[i].qs;
646
647
  const uint8_t * sc = x[i].scales;
647
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
648
- const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
648
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
649
+ const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
649
650
  uint8_t *patmp = atmp;
650
651
  int vsums;
651
652
  int tmp;
@@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
750
751
  summs += y[i].bsums[j] * (sc[j] >> 4);
751
752
  }
752
753
 
753
- const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
754
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
754
+ const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
755
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
755
756
 
756
757
  int isum = 0;
757
758
  int is = 0;
@@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
916
917
  q3 += 32; q8 += 128; scale += 8;
917
918
  }
918
919
 
919
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
920
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
920
921
  sumf += d * isum;
921
922
  }
922
923
 
@@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1017
1018
 
1018
1019
  }
1019
1020
 
1020
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1021
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1021
1022
 
1022
1023
  sumf += d*sum_t;
1023
1024
 
@@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1134
1135
  q3 += 32; q8 += 128; scale += 8;
1135
1136
  }
1136
1137
 
1137
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1138
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1138
1139
  sumf += d * isum;
1139
1140
  }
1140
1141
  break;
@@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1202
1203
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
1203
1204
  q8 += 8; a += 8;
1204
1205
  }
1205
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1206
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1206
1207
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1207
1208
  }
1208
1209
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1239
1240
  float sumf = 0;
1240
1241
 
1241
1242
  for (int i = 0; i < nb; ++i) {
1242
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1243
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1243
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1244
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1244
1245
 
1245
1246
  int tmp, tmp2, sumi;
1246
1247
  __asm__ __volatile__(
@@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1361
1362
 
1362
1363
  size_t vl = 8;
1363
1364
 
1364
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1365
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1365
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1366
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1366
1367
 
1367
1368
  vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl);
1368
1369
  vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl);
@@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1422
1423
  break;
1423
1424
  case 128:
1424
1425
  for (int i = 0; i < nb; ++i) {
1425
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
1426
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
1426
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
1427
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
1427
1428
 
1428
1429
  int tmp, tmp2, sumi;
1429
1430
  __asm__ __volatile__(
@@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1580
1581
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1581
1582
  q8 += 8; a += 8;
1582
1583
  }
1583
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1584
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1584
1585
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1585
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
1586
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1586
1587
  sumf -= dmin * sumi;
1587
1588
  }
1588
1589
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1627
1628
  const uint8_t * GGML_RESTRICT hm = x[i].qh;
1628
1629
  const int8_t * GGML_RESTRICT q8 = y[i].qs;
1629
1630
 
1630
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1631
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
1631
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1632
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1632
1633
 
1633
1634
  vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl);
1634
1635
  vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl);
@@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1749
1750
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1750
1751
  q8 += 8; a += 8;
1751
1752
  }
1752
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1753
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1753
1754
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1754
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
1755
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
1755
1756
  sumf -= dmin * sumi;
1756
1757
  }
1757
1758
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1778
1779
 
1779
1780
  for (int i = 0; i < nb; ++i) {
1780
1781
 
1781
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1782
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1782
1783
 
1783
1784
  const uint8_t * restrict q6 = x[i].ql;
1784
1785
  const uint8_t * restrict qh = x[i].qh;
@@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1862
1863
  case 256:
1863
1864
  for (int i = 0; i < nb; ++i) {
1864
1865
 
1865
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1866
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1866
1867
 
1867
1868
  const uint8_t * GGML_RESTRICT q6 = x[i].ql;
1868
1869
  const uint8_t * GGML_RESTRICT qh = x[i].qh;
@@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1943
1944
  case 128:
1944
1945
  for (int i = 0; i < nb; ++i) {
1945
1946
 
1946
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1947
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1947
1948
 
1948
1949
  const uint8_t * restrict q6 = x[i].ql;
1949
1950
  const uint8_t * restrict qh = x[i].qh;
@@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
2058
2059
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
2059
2060
  q8 += 8; a += 8;
2060
2061
  }
2061
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
2062
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
2062
2063
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
2063
2064
  }
2064
2065
  for (int l = 0; l < 8; ++l) sumf += sums[l];
package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
@@ -6,6 +6,7 @@
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
8
  #include "ggml-cpu-impl.h"
9
+ #include "simd-mappings.h"
9
10
  #include "traits.h"
10
11
 
11
12
  #include <cmath>
@@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
90
91
  const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4);
91
92
 
92
93
  // vector version needs Zvfhmin extension
93
- const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
94
+ const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
94
95
  const float b_scales[8] = {
95
- GGML_FP16_TO_FP32(b_ptr[l].d[0]),
96
- GGML_FP16_TO_FP32(b_ptr[l].d[1]),
97
- GGML_FP16_TO_FP32(b_ptr[l].d[2]),
98
- GGML_FP16_TO_FP32(b_ptr[l].d[3]),
99
- GGML_FP16_TO_FP32(b_ptr[l].d[4]),
100
- GGML_FP16_TO_FP32(b_ptr[l].d[5]),
101
- GGML_FP16_TO_FP32(b_ptr[l].d[6]),
102
- GGML_FP16_TO_FP32(b_ptr[l].d[7])
96
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
97
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
98
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
99
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
100
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
101
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
102
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
103
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
103
104
  };
104
105
  const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
105
106
  const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4);
@@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
129
130
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
130
131
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
131
132
  }
132
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
133
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
133
134
  }
134
135
  }
135
136
  }
@@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
181
182
 
182
183
  // vector version needs Zvfhmin extension
183
184
  const float a_scales[4] = {
184
- GGML_FP16_TO_FP32(a_ptr[l].d[0]),
185
- GGML_FP16_TO_FP32(a_ptr[l].d[1]),
186
- GGML_FP16_TO_FP32(a_ptr[l].d[2]),
187
- GGML_FP16_TO_FP32(a_ptr[l].d[3])
185
+ GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]),
186
+ GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]),
187
+ GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]),
188
+ GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3])
188
189
  };
189
190
  const float b_scales[8] = {
190
- GGML_FP16_TO_FP32(b_ptr[l].d[0]),
191
- GGML_FP16_TO_FP32(b_ptr[l].d[1]),
192
- GGML_FP16_TO_FP32(b_ptr[l].d[2]),
193
- GGML_FP16_TO_FP32(b_ptr[l].d[3]),
194
- GGML_FP16_TO_FP32(b_ptr[l].d[4]),
195
- GGML_FP16_TO_FP32(b_ptr[l].d[5]),
196
- GGML_FP16_TO_FP32(b_ptr[l].d[6]),
197
- GGML_FP16_TO_FP32(b_ptr[l].d[7])
191
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]),
192
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]),
193
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]),
194
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]),
195
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]),
196
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]),
197
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]),
198
+ GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7])
198
199
  };
199
200
  const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);
200
201
 
@@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
382
383
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
383
384
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
384
385
  }
385
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
386
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
386
387
  }
387
388
  }
388
389
  }
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
@@ -3,6 +3,7 @@
3
3
  #include "ggml-quants.h"
4
4
  #include "ggml-impl.h"
5
5
  #include "ggml-cpu.h"
6
+ #include "simd-mappings.h"
6
7
 
7
8
  #include "../../quants.h"
8
9
  #include "../../ggml-cpu-impl.h"
@@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
49
50
  const float d = amax / ((1 << 7) - 1);
50
51
  const float id = d ? 1.0f / d : 0.0f;
51
52
 
52
- y[i].d = GGML_FP32_TO_FP16(d);
53
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
53
54
 
54
55
  for (int j = 0; j < 8; j++) {
55
56
  const __vector float v = vec_mul(srcv[j], vec_splats(id));
@@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
94
95
  const float d = amax / ((1 << 7) - 1);
95
96
  const float id = d ? 1.0f / d : 0.0f;
96
97
 
97
- y[i].d = GGML_FP32_TO_FP16(d);
98
+ y[i].d = GGML_CPU_FP32_TO_FP16(d);
98
99
 
99
100
  __vector int32_t acc = vec_splats(0);
100
101
 
@@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
110
111
  acc = vec_add(acc, vi);
111
112
  }
112
113
 
113
- y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
114
+ y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3]));
114
115
  }
115
116
  #else
116
117
  GGML_UNUSED(nb);
@@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
164
165
  __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
165
166
 
166
167
  const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
167
- const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
168
+ const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
168
169
 
169
170
  acc = vec_madd(v_xy, v_d, acc);
170
171
  }
@@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
185
186
  }
186
187
 
187
188
  int sumi = sumi0 + sumi1;
188
- sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
189
+ sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d);
189
190
  }
190
191
 
191
192
  *s = sumf;
@@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
219
220
  __builtin_prefetch(x[ib].qs, 0, 1);
220
221
  __builtin_prefetch(y[ib].qs, 0, 1);
221
222
 
222
- summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s);
223
+ summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
223
224
 
224
225
  const uint8x16_t v_x = vec_xl(0, x[ib].qs);
225
226
  const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
@@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
231
232
  const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
232
233
  const float32x4_t v_xy = vec_float(v_xy_);
233
234
 
234
- const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
235
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
235
236
 
236
237
  acc = vec_madd(v_xy, v_d, acc);
237
238
  }
@@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
252
253
  }
253
254
 
254
255
  int sumi = sumi0 + sumi1;
255
- sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
256
+ sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s);
256
257
  }
257
258
 
258
259
  *s = sumf;
@@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
290
291
 
291
292
  const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
292
293
  const float32x4_t v_xy = vec_float(v_xy_);
293
- const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d));
294
+ const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
294
295
 
295
296
  acc = vec_madd(v_xy, v_d, acc);
296
297
  }
@@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
305
306
  sumi += x[ib].qs[j]*y[ib].qs[j];
306
307
  }
307
308
 
308
- sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d));
309
+ sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d));
309
310
  }
310
311
 
311
312
  *s = sumf;
@@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
348
349
  float sum = 0;
349
350
 
350
351
  for (int i = 0; i < nb; ++i) {
351
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
352
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
352
353
 
353
354
  const uint8_t * restrict x0l = x[i].qs;
354
355
  const uint8_t * restrict x0h = x[i].hmask;
@@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
497
498
  for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l];
498
499
  q8 += 8; a += 8;
499
500
  }
500
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
501
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
501
502
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
502
503
  }
503
504
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
537
538
  float sumf = 0;
538
539
 
539
540
  for (int i = 0; i < nb; ++i) {
540
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
541
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
541
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
542
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
542
543
 
543
544
  const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
544
545
  const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
647
648
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
648
649
  q8 += 8; a += 8;
649
650
  }
650
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
651
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
651
652
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
652
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
653
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
653
654
  sumf -= dmin * sumi;
654
655
  }
655
656
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
698
699
  float sumf = 0;
699
700
 
700
701
  for (int i = 0; i < nb; ++i) {
701
- const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
702
- const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
702
+ const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
703
+ const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
703
704
 
704
705
  const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums);
705
706
  const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums);
@@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
819
820
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
820
821
  q8 += 8; a += 8;
821
822
  }
822
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
823
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
823
824
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
824
- const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
825
+ const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
825
826
  sumf -= dmin * sumi;
826
827
  }
827
828
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
859
860
  int8x16_t v_y[4];
860
861
 
861
862
  for (int i = 0; i < nb; ++i) {
862
- const float d_all = GGML_FP16_TO_FP32(x[i].d);
863
+ const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d);
863
864
 
864
865
  const uint8_t * GGML_RESTRICT x0l = x[i].ql;
865
866
  const uint8_t * GGML_RESTRICT x0h = x[i].qh;
@@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1004
1005
  for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
1005
1006
  q8 += 8; a += 8;
1006
1007
  }
1007
- const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1008
+ const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1008
1009
  for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
1009
1010
  }
1010
1011
  for (int l = 0; l < 8; ++l) sumf += sums[l];
@@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1071
1072
  // float sumf = 0;
1072
1073
 
1073
1074
  // for (int i = 0; i < nb; ++i) {
1074
- // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1075
+ // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1075
1076
  // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1076
1077
  // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1077
1078
 
@@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
1121
1122
 
1122
1123
  // float sumf = 0.f;
1123
1124
  // for (int i = 0; i < nb; ++i) {
1124
- // const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
1125
+ // const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
1125
1126
  // const uint16_t * GGML_RESTRICT q2 = x[i].qs;
1126
1127
  // const int8_t * GGML_RESTRICT q8 = y[i].qs;
1127
1128
  // int32_t bsum = 0;
@@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
1182
1183
  const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
1183
1184
  const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
1184
1185
 
1185
- sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
1186
+ sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
1186
1187
  }
1187
1188
 
1188
1189
  #endif
1189
1190
  for (; ib < nb; ++ib) {
1190
- const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
1191
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
1191
1192
  int sumi1 = 0, sumi2 = 0;
1192
1193
  for (int j = 0; j < QK4_NL/2; ++j) {
1193
1194
  sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1257
1258
  sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
1258
1259
  }
1259
1260
 
1260
- sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
1261
+ sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
1261
1262
  }
1262
1263
 
1263
1264
  *s = sumf;
@@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
1265
1266
  #else
1266
1267
  float sumf = 0;
1267
1268
  for (int ibl = 0; ibl < nb; ++ibl) {
1268
- const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1269
+ const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
1269
1270
  uint16_t h = x[ibl].scales_h;
1270
1271
  const uint8_t * qs = x[ibl].qs;
1271
1272
  const int8_t * q8 = y[ibl].qs;