@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c

@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-        __vector float srcv [8];
-        __vector float asrcv[8];
-        __vector float amaxv[8];
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
 
         for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
         y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
         for (int j = 0; j < 8; j++) {
-            const __vector float v = vec_mul(srcv[j], vec_splats(id));
-            const __vector int32_t vi = vec_signed(v);
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
 #if defined(__VXE__) || defined(__VXE2__)
     for (int i = 0; i < nb; i++) {
-        __vector float srcv [8];
-        __vector float asrcv[8];
-        __vector float amaxv[8];
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
 
         for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
         for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
 
         y[i].d = GGML_CPU_FP32_TO_FP16(d);
 
-        __vector int32_t acc = vec_splats(0);
+        int32x4_t acc = vec_splats(0);
 
         for (int j = 0; j < 8; j++) {
-            const __vector float v = vec_mul(srcv[j], vec_splats(id));
-            const __vector int32_t vi = vec_signed(v);
+            const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
+            const int32x4_t vi = vec_signed(v);
 
             y[i].qs[4*j + 0] = vec_extract(vi, 0);
             y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-    __vector float acc = vec_splats(0.0f);
+    float32x4_t acc = vec_splats(0.0f);
 
-    const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
-    const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
+    const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
+    const int8x16_t v_s = vec_splats( (const int8_t)0x08);
 
     for (; ib < nb; ++ib) {
-        const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
-        const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
-        const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
+        const uint8x16_t v_x = vec_xl(0, x[ib].qs);
+        const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
+        const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
 
-        const __vector int8_t v_xls = vec_sub(v_xl, v_s);
-        const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
+        const int8x16_t v_xls = vec_sub(v_xl, v_s);
+        const int8x16_t v_xhs = vec_sub(v_xh, v_s);
 
-        const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
-        const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
+        const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
+        const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
 
-        const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
-        const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
-        const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
-        const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
+        const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
+        const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
+        const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
+        const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
 
-        __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
+        int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
 
-        const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
-        const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
+        const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
 
         acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3];
-
+    sumf = vec_hsum_f32x4(acc);
     *s = sumf;
 #else
     UNUSED(nb);
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
         acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
-
+    sumf = vec_hsum_f32x4(acc) + summs;
     *s = sumf;
 #else
     UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }
 
-    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
 
     #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
 
-       sumf += vec_hsum(v_acc);
+       sumf += vec_hsum_f32x4(v_acc);
     }
 
     *s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
     }
 
-    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+    sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
 
     #pragma GCC unroll 4
     for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
        const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
 
-       sumf += vec_hsum(v_acc) + summs;
+       sumf += vec_hsum_f32x4(v_acc) + summs;
     }
 
     *s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
     float sumf = 0;
 
 #if defined(__VXE__) || defined(__VXE2__)
-    __vector float acc = vec_splats(0.0f);
+    float32x4_t acc = vec_splats(0.0f);
 
     #pragma GCC unroll 8
     for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        acc = vec_madd(v_xy, v_d, acc);
     }
 
-    sumf = acc[0] + acc[1] + acc[2] + acc[3];
+    sumf = vec_hsum_f32x4(acc);
 
     *s = sumf;
 #else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
             isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
 
-            isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
-            isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
-            isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
-            isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
+            isum += vec_hsum_i32x4(isum0) * scale[0];
+            isum += vec_hsum_i32x4(isum1) * scale[1];
+            isum += vec_hsum_i32x4(isum2) * scale[2];
+            isum += vec_hsum_i32x4(isum3) * scale[3];
 
             scale += 4;
 
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
 
             const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-            sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
+            sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
 
             v_y[0] = vec_xl(0 , y0);
             v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
             v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
 
             const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
-            sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
+            sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
         }
 
         sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
        const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
        const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        const uint8_t * scales = (const uint8_t *)utmp;
        const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
            int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
 
-           sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
-           sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
+           sumi += vec_hsum_i32x4(sumi0) * *scales++;
+           sumi += vec_hsum_i32x4(sumi1) * *scales++;
        }
 
        sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
        const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
        const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
 
-       const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
+       const int32_t mins = vec_hsum_i32x4(v_mins);
 
        int32_t isum = 0;
        for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
 
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
            summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
            summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
 
-           isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
-                   (summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
-                   (summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
-                   (summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
+           isum += vec_hsum_i32x4(summs0) * scale[0] +
+                   vec_hsum_i32x4(summs1) * scale[1] +
+                   vec_hsum_i32x4(summs2) * scale[2] +
+                   vec_hsum_i32x4(summs3) * scale[3];
 
            scale += 4;
        }
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
        const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
 
-       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
+       sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
     }
 
     *s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 
            h >>= 4;
 
-           sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
-           sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
+           sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
+           sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
        }
 
        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h

@@ -68,12 +68,6 @@ struct ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__
 
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif
@@ -489,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
 /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
-inline static float vec_hsum(float32x4_t v) {
+inline static float vec_hsum_f32x4(float32x4_t v) {
     float32x4_t v_temp = v + vec_reve(v);
     return v_temp[0] + v_temp[1];
 }
 
+inline static int32_t vec_hsum_i32x4(int32x4_t v) {
+    int32x4_t v_temp = v + vec_reve(v);
+    return v_temp[0] + v_temp[1];
+}
+
 inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
     const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
     return acc + (vec_unpackh(p) + vec_unpackl(p));
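Note: the renamed vec_hsum_f32x4 and new vec_hsum_i32x4 helpers above fold a 4-lane horizontal sum into one vector add plus two scalar adds. vec_reve reverses the lane order, so after v + vec_reve(v) lane 0 holds v[0]+v[3] and lane 1 holds v[1]+v[2]; this is what lets the quants.c hunks above collapse expressions like acc[0] + acc[1] + acc[2] + acc[3] into vec_hsum_f32x4(acc). A minimal scalar model of the same identity in plain C, with no z/Architecture intrinsics (the names here are illustrative, not from the package):

    #include <assert.h>

    /* rev[] plays the role of vec_reve(v) */
    static float hsum_f32x4_model(const float v[4]) {
        const float rev[4] = { v[3], v[2], v[1], v[0] };
        float tmp[4];
        for (int i = 0; i < 4; i++) tmp[i] = v[i] + rev[i]; /* v + vec_reve(v) */
        return tmp[0] + tmp[1]; /* (v0+v3) + (v1+v2) == v0+v1+v2+v3 */
    }

    int main(void) {
        const float v[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        assert(hsum_f32x4_model(v) == 10.0f);
        return 0;
    }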
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .vec_dot_type = GGML_TYPE_Q8_K,
        .nrows = 1,
     },
+    [GGML_TYPE_I32] = {
+        .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
+    },
 };
 
 const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -1876,6 +1879,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_im2col_back_f32(params, tensor);
            } break;
+       case GGML_OP_IM2COL_3D:
+           {
+               ggml_compute_forward_im2col_3d(params, tensor);
+           } break;
        case GGML_OP_CONV_2D:
            {
                ggml_compute_forward_conv_2d(params, tensor);
@@ -2255,6 +2262,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_IM2COL:
        case GGML_OP_IM2COL_BACK:
+       case GGML_OP_IM2COL_3D:
        case GGML_OP_CONV_2D:
        case GGML_OP_CONV_3D:
        case GGML_OP_CONV_2D_DW:
@@ -2691,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
                if (ggml_is_quantized(node->type) ||
                    // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
                    (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
-                   (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
+                   (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
+                   // conversion between F32 and I32
+                   (node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
+                   (node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
                    cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                }
            } break;
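Note: the ggml_graph_plan change above extends the existing rule that copies which must round-trip through F32 get a scratch buffer of one F32 row per worker thread, sized ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks. A worked instance of that formula with illustrative numbers (not taken from the package):

    #include <stdint.h>
    #include <assert.h>

    int main(void) {
        const int64_t ne0     = 4096;          /* node->ne[0]: row width */
        const int64_t n_tasks = 8;             /* worker threads */
        const size_t  f32sz   = sizeof(float); /* ggml_type_size(GGML_TYPE_F32) == 4 */
        /* 4 bytes * 4096 elements * 8 threads = 128 KiB of work buffer */
        assert((int64_t) f32sz * ne0 * n_tasks == 131072);
        return 0;
    }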
@@ -3206,20 +3217,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
        _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
+#elif defined(__riscv_zvfh)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
+        __riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
     }
 #endif
     for (; i < n; ++i) {
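Note: the new RISC-V path above uses the vector extension's strip-mining idiom. Each iteration asks __riscv_vsetvl_e32m2 how many of the remaining n - i elements the hardware will take this pass, converts that many floats to _Float16, and advances by that count, so no separate tail loop is needed (the removed NNPA version carried 8-wide and 4-wide loops plus the shared scalar tail). A scalar sketch of just the control flow, with a hypothetical fixed CHUNK standing in for the hardware-chosen vector length:

    #include <stddef.h>

    enum { CHUNK = 8 }; /* stands in for the value vsetvl would return */

    static void strip_mined_copy(const float *x, float *y, size_t n) {
        size_t i = 0;
        while (i < n) {
            /* "vsetvl": process min(remaining, CHUNK) elements this pass */
            size_t vl = (n - i < (size_t) CHUNK) ? (n - i) : (size_t) CHUNK;
            for (size_t j = 0; j < vl; j++) {
                y[i + j] = x[i + j]; /* placeholder for the f32 -> f16 narrowing */
            }
            i += vl; /* advance by the number of active lanes */
        }
    }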
@@ -3247,21 +3250,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
        __m128 y_vec = _mm_cvtph_ps(x_vec);
        _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif
 
     for (; i < n; ++i) {
@@ -3276,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
     }
 }
 
+void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
+    int64_t i = 0;
+    for (; i < n; ++i) {
+        y[i] = x[i];
+    }
+}
+
 void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
     int64_t i = 0;
 #if defined(__AVX2__)
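Note: ggml_cpu_fp32_to_i32 above is the function registered as the from_float handler for GGML_TYPE_I32 in the type_traits_cpu hunk earlier, and it pairs with the ggml_graph_plan change that reserves an intermediate F32 buffer for F32 <-> I32 copies. The conversion itself is the plain C float-to-int cast, which truncates toward zero. A standalone model of the same loop (the _model suffix is ours, for illustration):

    #include <stdint.h>
    #include <assert.h>

    static void fp32_to_i32_model(const float *x, int32_t *y, int64_t n) {
        for (int64_t i = 0; i < n; ++i) {
            y[i] = (int32_t) x[i]; /* C cast: truncates toward zero */
        }
    }

    int main(void) {
        const float x[4] = { 1.9f, -1.9f, 0.5f, -0.5f };
        int32_t y[4];
        fp32_to_i32_model(x, y, 4);
        assert(y[0] == 1 && y[1] == -1 && y[2] == 0 && y[3] == 0);
        return 0;
    }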
@@ -3465,14 +3460,6 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }
 
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp

@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
+    /* .optimize_graph = */ NULL,
 };
 
 static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -348,8 +349,10 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
     long pages = sysconf(_SC_PHYS_PAGES);
     long page_size = sysconf(_SC_PAGE_SIZE);
     *total = pages * page_size;
+
+    // "free" system memory is ill-defined, for practical purposes assume that all of it is free:
     *free = *total;
-#endif
+#endif // _WIN32
 
     GGML_UNUSED(dev);
 }
@@ -576,9 +579,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_vxe()) {
            features.push_back({ "VXE", "1" });
        }
-       if (ggml_cpu_has_nnpa()) {
-           features.push_back({ "NNPA", "1" });
-       }
        if (ggml_cpu_has_wasm_simd()) {
            features.push_back({ "WASM_SIMD", "1" });
        }
package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp

@@ -154,7 +154,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
                return compute_forward_q4_0(params, dst);
            } else if (dst->src[0]->type == GGML_TYPE_F16) {
-               return compute_forward_kv_cache(params, dst);
+               return compute_forward_fp16(params, dst);
            }
        } else if (dst->op == GGML_OP_GET_ROWS) {
            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
@@ -164,7 +164,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
        return false;
     }
 
-    bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
+    bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
        static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
 
        const ggml_tensor * src0 = dst->src[0];
@@ -515,9 +515,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
            op->src[0]->buffer &&
            (ggml_n_dims(op->src[0]) == 2) &&
            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
-           if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
-               return false;
-           }
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
@@ -534,13 +531,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
        if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
            return (ggml::cpu::tensor_traits *) op->src[0]->extra;
        }
-       else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
-                op->src[0]->op == GGML_OP_VIEW &&
-                (op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
-                op->src[1]->ne[1] > 1) {
-           if ((op->src[0]->nb[0] != 2) ||
-               (op->src[1]->nb[0] != 4) ||
-               (op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
+       else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
+           if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
               (op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
               return nullptr;
           }