@fugood/llama.node 1.0.0-beta.7 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +10 -0
  3. package/lib/index.js +8 -0
  4. package/lib/index.ts +14 -0
  5. package/package.json +14 -14
  6. package/src/LlamaContext.cpp +37 -0
  7. package/src/LlamaContext.h +1 -0
  8. package/src/RerankWorker.h +26 -0
  9. package/src/llama.cpp/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  11. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  12. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  25. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  31. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  32. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  34. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  35. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  36. package/src/llama.cpp/include/llama.h +6 -3
  37. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  38. package/src/llama.cpp/src/llama-arch.h +17 -0
  39. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  40. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  41. package/src/llama.cpp/src/llama-context.cpp +0 -1
  42. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  43. package/src/llama.cpp/src/llama-graph.h +14 -2
  44. package/src/llama.cpp/src/llama-hparams.h +6 -0
  45. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  46. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  47. package/src/llama.cpp/src/llama-model.cpp +518 -1
  48. package/src/llama.cpp/src/llama-model.h +22 -0
  49. package/src/llama.cpp/src/llama-quant.cpp +87 -5
@@ -6,6 +6,7 @@
  #include "ggml-impl.h"
  #include "ggml-cpu.h"
  #include "ggml-cpu-impl.h"
+ #include "simd-mappings.h"
  #include "traits.h"

  #include <cmath>
@@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) {
  float tmp[16];

  for (int i = 0; i < 8; i++) {
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
  }

  for (int i = 0; i < 8; i++) {
- tmp[i + 8] = GGML_FP16_TO_FP32(y[i]);
+ tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]);
  }

  return _mm512_loadu_ps(tmp);
@@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) {
  _mm_storeu_si128((__m128i*)tmphalf, x);

  for (int i = 0; i < 4; i++) {
- tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
- tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]);
- tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]);
- tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
+ tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
  }

  return _mm512_loadu_ps(tmp);
@@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
  float tmp[8];

  for (int i = 0; i < 8; i++) {
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
  }

  return _mm256_loadu_ps(tmp);
@@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) {
  float tmp[8];

  for (int i = 0; i < 4; i++) {
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
- tmp[i + 4] = GGML_FP16_TO_FP32(x[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
+ tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]);
  }

  return _mm256_loadu_ps(tmp);
@@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang

  _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask));
  for (int i = 0; i < 8; i++) {
- tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]);
  }

  return _mm256_loadu_ps(tmp);
@@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
  id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f;

  // Store the scale for the individual block
- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);

  // Store the values in blocks of eight values - Aim is to use these later for block interleaving
  srcv[row_iter][0] = v0;
@@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
  const float d = amax / ((1 << 7) - 1);
  id[row_iter] = d ? 1.0f / d : 0.0f;

- y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+ y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
  }

  for (int j = 0; j < QK8_0 * 4; j++) {
@@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
  const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);

  // Load and convert to FP32 scale from block_q8_0
- const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d));
+ const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d));

  // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector
  __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs));
@@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
  const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  }
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  }
  }
  }
@@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  sumi2 = sumi2 * scales_1[j];
  sumi += sumi1 + sumi2;
  }
- sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
+ sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  }
  }
  for (int sb = 0; sb < 8; sb++) {
  uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  for (int j = 0; j < ncols_interleaved; j++) {
- sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
+ sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  }
  }
  }
@@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
  sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  }
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  }
  }
  }
@@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  sumi2 = sumi2 * scales_1[j];
  sumi += sumi1 + sumi2;
  }
- sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
+ sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  }
  }
  }
@@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
  for(int m = 0; m < 4; m++) {
  const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  for(int j = 0; j < ncols_interleaved; j++) {
- sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
+ sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  }
  }
  }
@@ -4,6 +4,7 @@
  #include "traits.h"
  #include "ggml-cpu-impl.h"
  #include "ggml-impl.h"
+ #include "simd-mappings.h"

  #ifdef __cplusplus

@@ -12,11 +13,11 @@
  // convenience functions/macros for use in template calls
  // note: these won't be required after the 'traits' lookup table is used.
  static inline ggml_fp16_t f32_to_f16(float x) {
- return GGML_FP32_TO_FP16(x);
+ return GGML_CPU_FP32_TO_FP16(x);
  }

  static inline float f16_to_f32(ggml_fp16_t x) {
- return GGML_FP16_TO_FP32(x);
+ return GGML_CPU_FP16_TO_FP32(x);
  }

  static inline ggml_bf16_t f32_to_bf16(float x) {
@@ -62,11 +62,17 @@ struct ggml_compute_params {
  #if defined(__s390x__) && defined(__VEC__)
  #ifndef __VXE__
  #define __VXE__
- #endif
+ #endif // __VXE__
  #ifndef __VXE2__
  #define __VXE2__
- #endif
- #endif
+ #endif // __VXE2__
+ #endif // __s390x__ && __VEC__
+
+ #if defined(__s390x__) && defined(GGML_NNPA)
+ #ifndef __NNPA__
+ #define __NNPA__
+ #endif // __NNPA__
+ #endif // __s390x__ && GGML_NNPA

  #if defined(__ARM_FEATURE_SVE)
  #include <sys/prctl.h>
@@ -72,6 +72,9 @@
  #define UNUSED GGML_UNUSED
  #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0)

+ // precomputed f32 table for f16 (256 KB) (simd-mappings.h)
+ float ggml_table_f32_f16[1 << 16];
+
  #if defined(__ARM_ARCH)
  struct ggml_arm_arch_features_type {
  int sve_cnt;
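
Note: the ggml_table_f32_f16 array added above pairs with the table fill added to ggml_cpu_init() later in this diff, giving one precomputed float per 16-bit half-precision pattern. A minimal sketch of how a table-based FP16 to FP32 lookup works, assuming a table filled that way (the helper name lookup_fp16_to_fp32 is illustrative and not part of this package):

#include <stdint.h>
#include <string.h>

typedef uint16_t ggml_fp16_t;                /* ggml stores fp16 values as raw 16-bit patterns */

/* declared in this diff; ggml_cpu_init() fills it for all 65536 patterns */
extern float ggml_table_f32_f16[1 << 16];

/* illustrative helper: reinterpret the fp16 bits as an index and read the table */
static inline float lookup_fp16_to_fp32(ggml_fp16_t h) {
    uint16_t idx;
    memcpy(&idx, &h, sizeof(idx));           /* bit-level reinterpretation */
    return ggml_table_f32_f16[idx];          /* one 4-byte load, no FP math */
}

The trade-off is the 256 KB footprint noted in the comment above against avoiding per-element decode work on targets without a native conversion instruction.
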
@@ -736,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
  {
  assert(tensor->nb[0] == sizeof(ggml_fp16_t));
  for (int i = 0; i < n; i++) {
- ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
  }
  } break;
  case GGML_TYPE_BF16:
@@ -795,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
  {
  assert(tensor->nb[0] == sizeof(ggml_fp16_t));
  for (int i = 0; i < n; i++) {
- ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value));
+ ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value));
  }
  } break;
  case GGML_TYPE_BF16:
@@ -846,7 +849,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
  case GGML_TYPE_F16:
  {
  GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
  }
  case GGML_TYPE_BF16:
  {
@@ -891,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
  case GGML_TYPE_F16:
  {
  GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
- ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
  } break;
  case GGML_TYPE_BF16:
  {
@@ -920,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i
  case GGML_TYPE_I32:
  return ((int32_t *) data)[0];
  case GGML_TYPE_F16:
- return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
  case GGML_TYPE_BF16:
  return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
  case GGML_TYPE_F32:
@@ -947,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
  } break;
  case GGML_TYPE_F16:
  {
- ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
  } break;
  case GGML_TYPE_BF16:
  {
@@ -985,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
  }
  case GGML_TYPE_F16:
  {
- return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
  }
  case GGML_TYPE_BF16:
  {
@@ -1024,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
  } break;
  case GGML_TYPE_F16:
  {
- ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
+ ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value);
  } break;
  case GGML_TYPE_BF16:
  {
@@ -1051,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
  case GGML_TYPE_I32:
  return ((int32_t *) data)[0];
  case GGML_TYPE_F16:
- return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
+ return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]);
  case GGML_TYPE_BF16:
  return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]);
  case GGML_TYPE_F32:
@@ -1078,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2,
  } break;
  case GGML_TYPE_F16:
  {
- ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value);
+ ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value);
  } break;
  case GGML_TYPE_BF16:
  {
@@ -3141,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
  __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
  _mm_storel_epi64((__m128i *)(y + i), y_vec);
  }
+ #elif defined(__NNPA__)
+ for (; i + 7 < n; i += 8) {
+ float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
+ float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
+ uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+ vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+ }
+ for (; i + 3 < n; i += 4) {
+ float32x4_t v_x = vec_xl(0, (const float *)(x + i));
+ float32x4_t v_zero = vec_splats(0.0f);
+ uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
+ uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
+ vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+ }
  #endif
  for (; i < n; ++i) {
- y[i] = GGML_FP32_TO_FP16(x[i]);
+ y[i] = GGML_CPU_FP32_TO_FP16(x[i]);
  }
  }

@@ -3167,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
  __m128 y_vec = _mm_cvtph_ps(x_vec);
  _mm_storeu_ps(y + i, y_vec);
  }
+ #elif defined(__NNPA__)
+ for (; i + 7 < n; i += 8) {
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+ float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
+ vec_xst(v_yh, 0, (float *)(y + i + 0));
+ vec_xst(v_yl, 0, (float *)(y + i + 4));
+ }
+ for (; i + 3 < n; i += 4) {
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
+ uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
+ float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
+ vec_xst(v_yh, 0, (float *)(y + i));
+ }
  #endif
+
  for (; i < n; ++i) {
- y[i] = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_CPU_FP16_TO_FP32(x[i]);
  }
  }

@@ -3369,6 +3403,14 @@ int ggml_cpu_has_vxe(void) {
  #endif
  }

+ int ggml_cpu_has_nnpa(void) {
+ #if defined(GGML_NNPA)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int ggml_cpu_has_neon(void) {
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
  return 1;
@@ -3418,7 +3460,7 @@ int ggml_cpu_has_sme(void) {
  }

  void ggml_cpu_init(void) {
- // needed to initialize f16 tables
+ // needed to initialize ggml_time
  {
  struct ggml_init_params params = { 0, NULL, false };
  struct ggml_context * ctx = ggml_init(params);
@@ -3439,9 +3481,10 @@ void ggml_cpu_init(void) {
  uint16_t u16;
  ggml_fp16_t fp16;
  } u = {i};
- float f = GGML_FP16_TO_FP32(u.fp16);
- ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
- ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
+ float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
+ ggml_table_f32_f16[i] = f;
+ ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f));
+ ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f));
  }

  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
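
Note: the table-fill loop above calls GGML_COMPUTE_FP16_TO_FP32 once per 16-bit pattern. For reference, a generic scalar IEEE binary16 to binary32 decode looks roughly like the sketch below; this is illustrative only, not the macro body from ggml, which may instead use hardware conversions (F16C, NEON, NNPA) or the ggml_table_f32_f16 lookup.

#include <stdint.h>
#include <string.h>

/* Generic IEEE binary16 -> binary32 decode: split sign, exponent, mantissa,
 * rebias the exponent, and widen the mantissa. Sketch only. */
static float fp16_decode_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    uint32_t exp  = (h >> 10) & 0x1Fu;
    uint32_t mant = h & 0x3FFu;
    uint32_t bits;

    if (exp == 0x1Fu) {                          /* infinity or NaN */
        bits = sign | 0x7F800000u | (mant << 13);
    } else if (exp != 0) {                       /* normal number: rebias 15 -> 127 */
        bits = sign | ((exp + 112u) << 23) | (mant << 13);
    } else if (mant == 0) {                      /* signed zero */
        bits = sign;
    } else {                                     /* subnormal: renormalize */
        uint32_t e = 113u;                       /* 127 - 15 + 1 */
        while ((mant & 0x400u) == 0) { mant <<= 1; e--; }
        mant &= 0x3FFu;
        bits = sign | (e << 23) | (mant << 13);
    }

    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
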
@@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
  if (ggml_cpu_has_vxe()) {
  features.push_back({ "VXE", "1" });
  }
+ if (ggml_cpu_has_nnpa()) {
+ features.push_back({ "NNPA", "1" });
+ }
  if (ggml_cpu_has_wasm_simd()) {
  features.push_back({ "WASM_SIMD", "1" });
  }
@@ -52,6 +52,7 @@
  #include "ggml-impl.h"
  #include "ggml-cpu-impl.h"
  #include "ggml-quants.h"
+ #include "simd-mappings.h"

  #include <array>
  #include <type_traits>
@@ -73,7 +74,7 @@
  namespace {

  inline float unhalf(ggml_fp16_t d) {
- return GGML_FP16_TO_FP32(d);
+ return GGML_CPU_FP16_TO_FP32(d);
  }

  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) {
  float tmp[4];

  for (int i = 0; i < 4; i++) {
- tmp[i] = GGML_FP16_TO_FP32(p[i]);
+ tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]);
  }

  return vec_xl(0, (const float *)(tmp));