@fugood/llama.node 1.1.11 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +111 -1
  28. package/src/llama.cpp/common/chat.h +3 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/log.cpp +53 -2
  31. package/src/llama.cpp/common/log.h +10 -4
  32. package/src/llama.cpp/common/sampling.cpp +23 -2
  33. package/src/llama.cpp/common/sampling.h +3 -1
  34. package/src/llama.cpp/common/speculative.cpp +1 -1
  35. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  36. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  37. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  38. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  39. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
  40. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  41. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  44. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  47. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  48. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  49. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  50. package/src/llama.cpp/include/llama.h +5 -6
  51. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  52. package/src/llama.cpp/src/llama-adapter.h +3 -0
  53. package/src/llama.cpp/src/llama-arch.cpp +27 -4
  54. package/src/llama.cpp/src/llama-arch.h +2 -0
  55. package/src/llama.cpp/src/llama-context.cpp +62 -56
  56. package/src/llama.cpp/src/llama-context.h +1 -1
  57. package/src/llama.cpp/src/llama-graph.cpp +54 -9
  58. package/src/llama.cpp/src/llama-graph.h +8 -0
  59. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  60. package/src/llama.cpp/src/llama-hparams.h +9 -3
  61. package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
  62. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  63. package/src/llama.cpp/src/llama-model.cpp +159 -1
  64. package/src/llama.cpp/src/llama-model.h +0 -1
  65. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  66. package/src/anyascii.c +0 -22223
  67. package/src/anyascii.h +0 -42
  68. package/src/tts_utils.cpp +0 -371
  69. package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -7027,6 +7027,209 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+
+// ggml_compute_forward_im2col_3d_f16
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f16(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_im2col_3d_f32
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f32(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void ggml_compute_forward_im2col_3d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_3d_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_3d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 
 static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                               void * a, void * b, float * c) {
     const ggml_type_traits * traits = ggml_get_type_traits(type);
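The new im2col_3d kernels unpack a 3-D convolution so it can be evaluated as a matrix multiplication: every output position (od, oh, ow) receives a row of IC*KD*KH*KW input samples, and any tap that falls outside the image after applying the strides (s0..s2), paddings (p0..p2), and dilations (d0..d2) is written as zero. The stand-alone sketch below shows the same index mapping for a single image in plain scalar C; the name im2col_3d_ref and the fully contiguous layouts are illustrative assumptions, not ggml API.

    #include <stdint.h>

    // src: [IC, ID, IH, IW] contiguous, dst: [OD, OH, OW, IC*KD*KH*KW] contiguous
    static void im2col_3d_ref(const float * src, float * dst,
                              int64_t IC, int64_t ID, int64_t IH, int64_t IW,
                              int64_t KD, int64_t KH, int64_t KW,
                              int64_t OD, int64_t OH, int64_t OW,
                              int s0, int s1, int s2,     // strides   (W, H, D)
                              int p0, int p1, int p2,     // paddings  (W, H, D)
                              int d0, int d1, int d2) {   // dilations (W, H, D)
        for (int64_t od = 0; od < OD; od++)
        for (int64_t oh = 0; oh < OH; oh++)
        for (int64_t ow = 0; ow < OW; ow++) {
            // one row of the im2col matrix per output voxel
            float * row = dst + ((od*OH + oh)*OW + ow)*IC*KD*KH*KW;
            for (int64_t ic = 0; ic < IC; ic++)
            for (int64_t kd = 0; kd < KD; kd++)
            for (int64_t kh = 0; kh < KH; kh++)
            for (int64_t kw = 0; kw < KW; kw++) {
                const int64_t id = od*s2 + kd*d2 - p2;
                const int64_t ih = oh*s1 + kh*d1 - p1;
                const int64_t iw = ow*s0 + kw*d0 - p0;
                const int64_t col = ((ic*KD + kd)*KH + kh)*KW + kw;
                const int in_bounds = id >= 0 && id < ID && ih >= 0 && ih < IH && iw >= 0 && iw < IW;
                row[col] = in_bounds ? src[((ic*ID + id)*IH + ih)*IW + iw] : 0.0f;
            }
        }
    }

The batched f16/f32 kernels above follow the same mapping, splitting work across threads on the channel (IC) dimension and using the tensor byte strides (nb1x) instead of a fixed contiguous layout.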
@@ -8014,6 +8217,15 @@ static void ggml_compute_forward_pad_f32(
     GGML_TENSOR_UNARY_OP_LOCALS
 
     float * dst_ptr = (float *) dst->data;
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
 
     // TODO: optimize
 
@@ -8022,10 +8234,12 @@ static void ggml_compute_forward_pad_f32(
         for (int64_t i0 = 0; i0 < ne0; ++i0) {
             for (int64_t i3 = 0; i3 < ne3; ++i3) {
                 const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-                const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-
-                if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                    && (i1 >= lp1 && i1 < ne1 - rp1) \
+                    && (i2 >= lp2 && i2 < ne2 - rp2) \
+                    && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                    const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                    const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                     dst_ptr[dst_idx] = *src_ptr;
                 } else {
                     dst_ptr[dst_idx] = 0;
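ggml_compute_forward_pad_f32 previously supported padding only at the high end of each dimension: it copied the source value whenever the destination index was still inside the source extent. It now reads eight op params, a leading (lpX) and trailing (rpX) pad per dimension, and shifts the source index by the leading pad. A minimal 1-D sketch of the rule (illustrative only; pad_row_f32 is not a ggml function):

    #include <stdint.h>

    // dst has lp leading and rp trailing zeros around the ne0-element source row
    static void pad_row_f32(const float * src, int64_t ne0, float * dst, int32_t lp, int32_t rp) {
        const int64_t ne = lp + ne0 + rp;                        // padded length
        for (int64_t i = 0; i < ne; ++i) {
            dst[i] = (i >= lp && i < ne - rp) ? src[i - lp] : 0.0f;
        }
    }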
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h
@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -114,26 +114,6 @@ extern "C" {
     #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
     #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
     #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 
 // precomputed f32 table for f16 (256 KB)
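With the NNPA-specific converters removed, the s390x build falls back to the generic GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16 macros. The fp16-to-fp32 direction is typically served by the precomputed table mentioned in the comment above: every 16-bit pattern is converted once at initialization, so later conversions are a single load. A hedged sketch of that pattern (stand-alone; fp16_table, init_fp16_table, and fp32_from_fp16_slow are illustrative names, not ggml symbols):

    #include <stdint.h>

    typedef uint16_t fp16_bits_t;                 // 16-bit half-precision storage type

    static float fp16_table[1 << 16];             // 65536 * 4 bytes = 256 KB

    // fill the table with any correct scalar converter, once at startup
    static void init_fp16_table(float (*fp32_from_fp16_slow)(fp16_bits_t)) {
        for (uint32_t bits = 0; bits < (1u << 16); ++bits) {
            fp16_table[bits] = fp32_from_fp16_slow((fp16_bits_t) bits);
        }
    }

    static inline float lookup_fp16_to_fp32(fp16_bits_t h) {
        return fp16_table[h];                     // one load instead of bit manipulation
    }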
@@ -215,6 +195,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32_VEC_MUL GGML_F32xt_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
 
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt         svfloat16_t
+#define GGML_F32Cxt_ZERO    svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC                GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{                                                                       \
+    sum1 = svadd_f16_x(pg16, sum1, sum2);                               \
+    sum3 = svadd_f16_x(pg16, sum3, sum4);                               \
+    sum1 = svadd_f16_x(pg16, sum1, sum3);                               \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1);                            \
+    (res) = (ggml_float) sum_f16;                                       \
+}
+#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
 // F16 NEON
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
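The new F16 SVE macros keep the accumulator as the first argument, matching how the other GGML_*_FMA helpers are used in vec.cpp: GGML_F16x_VEC_FMA(acc, x, y) expands to svmad_f16_x(pg, x, y, acc), i.e. acc + x*y per lane, because svmad multiplies its first two vector operands and adds the third. A scalar view of that convention (sketch only; fma_acc_first is an illustrative name):

    // per-lane meaning of GGML_F16x_VEC_FMA(acc, x, y) as defined above;
    // used by the dot-product loops below as: sum = GGML_F16x_VEC_FMA(sum, ax, ay);
    static inline float fma_acc_first(float acc, float x, float y) {
        return acc + x * y;
    }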
@@ -1115,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -1129,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -1152,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4
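With the NNPA fast paths gone, __lzs_f16cx4_load/store always go through a small float buffer and the scalar conversion macros, and fp16 work on this target is mapped onto the 4-wide f32 type (GGML_F16_VEC is GGML_F32x4). The general shape of that convert-compute-convert pattern, as a stand-alone sketch (fp16_to_fp32/fp32_to_fp16 stand in for GGML_CPU_FP16_TO_FP32/GGML_CPU_FP32_TO_FP16; the tail for n not divisible by 4 is omitted for brevity):

    #include <stdint.h>

    static void axpy_f16_via_f32(int n, uint16_t * y, const uint16_t * x, float a,
                                 float (*fp16_to_fp32)(uint16_t),
                                 uint16_t (*fp32_to_fp16)(float)) {
        // process 4 halves at a time: widen to f32, compute in f32, narrow on store
        for (int i = 0; i + 3 < n; i += 4) {
            float xf[4], yf[4];
            for (int j = 0; j < 4; j++) {
                xf[j] = fp16_to_fp32(x[i + j]);
                yf[j] = fp16_to_fp32(y[i + j]);
            }
            for (int j = 0; j < 4; j++) {
                yf[j] += a * xf[j];               // arithmetic stays in f32
            }
            for (int j = 0; j < 4; j++) {
                y[i + j] = fp32_to_fp16(yf[j]);
            }
        }
    }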
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
@@ -85,15 +85,21 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
     // reduce sum1,sum2 to sum1
     GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
 #elif defined(__riscv_v_intrinsic)
-    vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
-    for (int i = 0, avl; i < n; i += avl) {
-        avl = __riscv_vsetvl_e32m8(n - i);
-        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
-        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
-        vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
-        vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
+    int vl = __riscv_vsetvlmax_e32m8();
+    vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    vfloat32m8_t vsum;
+    vfloat32m8_t ax;
+    vfloat32m8_t ay;
+    vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+    for (int i = 0; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m8(n - i);
+        ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+        ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+        vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
     }
-    sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
+    vl = __riscv_vsetvlmax_e32m8();
+    vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
 #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -207,38 +213,125 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
 
     ggml_float sumf = 0.0;
 
-#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
-    const int np = (n & ~(GGML_F16_STEP - 1));
 
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+#if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8; //get vector length
+        const int ggml_f16_epr = sve_register_length / 16; // running when 16
+        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+        const int np= (n & ~(ggml_f16_step - 1));
+        svfloat16_t sum1 = svdup_n_f16(0.0f);
+        svfloat16_t sum2 = svdup_n_f16(0.0f);
+        svfloat16_t sum3 = svdup_n_f16(0.0f);
+        svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+        }
 
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
+        const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
+        }
 
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
 
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            sum1 = svmad_f16_x(pg, hx, hy, sum1);
         }
-    }
+        GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+    #elif defined(__riscv_v_intrinsic)
+        #if defined(__riscv_zvfh)
+            int vl = __riscv_vsetvlmax_e32m2();
+            vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+            vfloat32m2_t vsum;
+            vfloat16m1_t ax;
+            vfloat16m1_t ay;
+            vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+            for (int i = 0; i < n; i += vl) {
+                vl = __riscv_vsetvl_e16m1(n - i);
+                ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
+            }
+            vl = __riscv_vsetvlmax_e32m1();
+            vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+            vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+            sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+        #else
+            for (int i = 0; i < n; ++i) {
+                sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        #endif // __riscv_zvfh
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));
 
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
+        GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
 
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
-    }
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+                sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce sum0..sum3 to sum0
+        GGML_F16_VEC_REDUCE(sumf, sum);
 
-    // if you hit this, you are likely running outside the FP range
-    assert(!isnan(sumf) && !isinf(sumf));
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+        // if you hit this, you are likely running outside the FP range
+        assert(!isnan(sumf) && !isinf(sumf));
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
     }
-#endif
+#endif // GGML_SIMD
 
     *s = sumf;
 }
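Both new fast paths in ggml_vec_dot_f16 share one structure with the reworked RISC-V ggml_vec_dot_f32 above: keep several independent accumulators live inside the main loop so the fused multiply-adds can overlap, handle the remainder with a predicate (SVE svwhilelt) or a shortened vector length with tail-undisturbed ops (RVV _tu intrinsics) instead of a scalar loop, and reduce across lanes exactly once after the loop. A scalar analogue of that shape (illustrative sketch, not ggml code):

    static float dot_unrolled(int n, const float * x, const float * y) {
        float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };    // independent accumulators ~ sum1..sum4
        int i = 0;
        for (; i + 3 < n; i += 4) {                   // main loop: no cross-lane reduction here
            acc[0] += x[i + 0] * y[i + 0];
            acc[1] += x[i + 1] * y[i + 1];
            acc[2] += x[i + 2] * y[i + 2];
            acc[3] += x[i + 3] * y[i + 3];
        }
        for (; i < n; ++i) {                          // tail ~ the predicated/VL-trimmed iteration
            acc[0] += x[i] * y[i];
        }
        return (acc[0] + acc[1]) + (acc[2] + acc[3]); // single reduction at the end
    }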
@@ -257,6 +350,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     for (; i + 3 < n; i += 4) {
         _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
@@ -281,10 +380,24 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
     for (; i + 3 < n; i += 4) {
         _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
     }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
+        vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
 #endif
     for (; i < n; ++i) {
         y[i] = ggml_silu_f32(x[i]) * g[i];
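The added SVE and RISC-V branches vectorize the same element-wise definitions that the scalar tail loops use: silu(x) = x * sigmoid(x) = x / (1 + exp(-x)) and swiglu(x, g) = silu(x) * g. Reference versions for comparison (sketch; silu_ref/swiglu_ref are illustrative names, not ggml symbols):

    #include <math.h>

    static inline float silu_ref(float x)            { return x / (1.0f + expf(-x)); }
    static inline float swiglu_ref(float x, float g) { return silu_ref(x) * g; }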
@@ -328,6 +441,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
 #endif
         sum += (ggml_float)_mm_cvtss_f32(val);
     }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
+                                                      svdup_n_f32_x(pg, max)));
+        svst1_f32(pg, y + i, val);
+        sum += (ggml_float)svaddv_f32(pg, val);
+    }
 #elif defined(__ARM_NEON) && defined(__aarch64__)
     for (; i + 3 < n; i += 4) {
         float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),