@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp
@@ -7027,6 +7027,209 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+
+// ggml_compute_forward_im2col_3d_f16
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_im2col_3d_f32
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void ggml_compute_forward_im2col_3d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_3d_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_3d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                               void * a, void * b, float * c) {
     const ggml_type_traits * traits = ggml_get_type_traits(type);
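
For orientation (this note and the helper below are illustrative additions, not part of the published diff): the new im2col_3d kernels copy one (channel, kernel-tap) value per output position into a flattened row, so each destination offset splits into a row index over output positions and a column index over IC*KD*KH*KW taps. A minimal C sketch of that index math, with hypothetical helper names:

// Illustrative only -- not from the diff. Names mirror the kernel above; the
// helpers themselves are hypothetical.
#include <stdint.h>

// standard output extent of one spatial axis for stride s, padding p, dilation d
static int64_t conv_out_size(int64_t in, int64_t k, int64_t s, int64_t p, int64_t d) {
    return (in + 2*p - d*(k - 1) - 1)/s + 1;
}

// flattened element offset inside dst [N*OD, OH, OW, IC*KD*KH*KW]
static int64_t im2col_3d_dst_index(
        int64_t in, int64_t iod, int64_t ioh, int64_t iow,   // which output position
        int64_t iic, int64_t ikd, int64_t ikh, int64_t ikw,  // which channel / kernel tap
        int64_t OD, int64_t OH, int64_t OW,
        int64_t IC, int64_t KD, int64_t KH, int64_t KW) {
    const int64_t row = ((in*OD + iod)*OH + ioh)*OW + iow;   // = in*OD*OH_OW + iod*OH_OW + ioh*OW + iow
    const int64_t col = ((iic*KD + ikd)*KH + ikh)*KW + ikw;  // = iic*KD_KH_KW + ikd*KH_KW + ikh*KW + ikw
    return row*(IC*KD*KH*KW) + col;
}

A matrix multiply of these rows against a kernel reshaped to [OC, IC*KD*KH*KW] is then how an im2col-based 3D convolution is typically completed.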
@@ -8014,6 +8217,15 @@ static void ggml_compute_forward_pad_f32(
     GGML_TENSOR_UNARY_OP_LOCALS
 
     float * dst_ptr = (float *) dst->data;
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
 
     // TODO: optimize
 
@@ -8022,10 +8234,12 @@ static void ggml_compute_forward_pad_f32(
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
                     const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-
-                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                        && (i1 >= lp1 && i1 < ne1 - rp1) \
+                        && (i2 >= lp2 && i2 < ne2 - rp2) \
+                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                        const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
                         dst_ptr[dst_idx] = 0;
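
The reworked pad path above now reads eight op params -- a left/right pad count per dimension (lp0/rp0 through lp3/rp3) -- instead of assuming all padding sits past the end of the source. Reduced to one dimension, the indexing works like this (illustrative sketch only, not code from the diff):

// Illustrative only -- a 1-D reduction of the padding logic above.
#include <stddef.h>

static void pad_1d_f32(const float * src, size_t ne00,   // unpadded extent
                       float * dst, size_t lp0, size_t rp0) {
    const size_t ne0 = ne00 + lp0 + rp0;                  // padded extent
    for (size_t i0 = 0; i0 < ne0; ++i0) {
        if (i0 >= lp0 && i0 < ne0 - rp0) {
            dst[i0] = src[i0 - lp0];                      // interior: shift back by the left pad
        } else {
            dst[i0] = 0.0f;                               // padded border
        }
    }
}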
@@ -9003,8 +9217,7 @@ static void ggml_compute_forward_ssm_scan_f32(
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
     GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
-    // allows optimizing the modulo since n_group should be a power of 2
-    GGML_ASSERT((ng & -ng) == ng);
+    GGML_ASSERT(nh % ng == 0);
 
     // heads per thread
     const int dh = (nh + nth - 1)/nth;
@@ -9035,6 +9248,7 @@ static void ggml_compute_forward_ssm_scan_f32(
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
             const float dA = expf(dt_soft_plus * A[h]);
+            const int g = h / (nh / ng); // repeat_interleave
 
             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9057,8 +9271,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // TODO: maybe unroll more?
                     for (int j = 0; j < 1; j++) {
                         GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
-                        GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
-                        GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + (h & (ng - 1))*nc);
+                        GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
+                        GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);
 
                         t0 = GGML_F32_VEC_MUL(t0, adA);
                         t1 = GGML_F32_VEC_MUL(t1, axdt);
@@ -9072,6 +9286,9 @@ static void ggml_compute_forward_ssm_scan_f32(
                     }
 
                     sumf = GGML_F32xt_REDUCE_ONE(sum);
+                #elif defined(__riscv_v_intrinsic)
+                    // todo: RVV implementation
+                    const int np = 0;
                 #else
                     const int np = (nc & ~(GGML_F32_STEP - 1));
 
@@ -9087,8 +9304,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                     for (int i = 0; i < np; i += GGML_F32_STEP) {
                         for (int j = 0; j < GGML_F32_ARR; j++) {
                             ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
-                            ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
-                            az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + (h & (ng - 1))*nc);
+                            ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
+                            az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);
 
                             ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
                             ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
@@ -9110,7 +9327,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // d_state
                     for (int i0 = np; i0 < nc; ++i0) {
                         const int i = i0 + ii*nc;
-                        const int ig = i0 + (h & (ng - 1))*nc;
+                        const int ig = i0 + g*nc;
                         // state = prev_state * dA + dB * x
                         const float state = (s0[i] * dA) + (B[ig] * x_dt);
                         // y = rowwise_dotprod(state, C)
@@ -9127,6 +9344,7 @@ static void ggml_compute_forward_ssm_scan_f32(
         for (int h = ih0; h < ih1; ++h) {
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+            const int g = h / (nh / ng); // repeat_interleave
 
             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9141,8 +9359,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // TODO: what happens when (d_state % svcntw()) != 0?
                 for (int64_t k = 0; k < nc; k += svcntw()) {
                     svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
-                    svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + (h & (ng - 1))*nc]);
-                    svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + (h & (ng - 1))*nc]);
+                    svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
+                    svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
                     svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
 
                     svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
@@ -9162,7 +9380,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // d_state
                 for (int i0 = 0; i0 < nc; ++i0) {
                     const int i = i0 + ii*nc;
-                    const int ig = i0 + (h & (ng - 1))*nc;
+                    const int ig = i0 + g*nc;
                     // state = prev_state * dA + dB * x
                     const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
                     // y = rowwise_dotprod(state, C)
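
The ssm_scan hunks above drop the power-of-two assumption on n_group: the old group index `h & (ng - 1)` (h mod ng) is replaced by an assert that nh is divisible by ng plus the block-wise mapping `g = h / (nh / ng)`, i.e. consecutive runs of nh/ng heads share one group, matching a repeat_interleave of groups over heads. A small standalone comparison of the two mappings (illustrative only, not code from the diff):

// Illustrative only: how the group index per head changes.
// Old: g = h & (ng - 1)      (h mod ng, required ng to be a power of two)
// New: g = h / (nh / ng)     (blocks of nh/ng consecutive heads share a group)
#include <stdio.h>

int main(void) {
    const int nh = 8, ng = 2;   // example sizes; nh % ng == 0 is assumed, as the new assert requires
    for (int h = 0; h < nh; ++h) {
        printf("h=%d old=%d new=%d\n", h, h & (ng - 1), h / (nh / ng));
    }
    // old: 0 1 0 1 0 1 0 1    new: 0 0 0 0 1 1 1 1 (repeat_interleave)
    return 0;
}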
@@ -10023,8 +10241,8 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
     int64_t h_stride_2d = head_size * head_size;
 
     #if defined(GGML_SIMD)
-        #if defined(__ARM_FEATURE_SVE)
-            // scalar Route to scalar implementation //TODO: Write SVE code
+        #if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic)
+            // scalar Route to scalar implementation //TODO: Write SVE code and RVV code
             for (int64_t t = 0; t < T; t++) {
                 int64_t t_offset = t * t_stride;
                 int64_t state_offset = head_size * C * (t / (T / n_seqs));
package/src/llama.cpp/ggml/src/ggml-cpu/ops.h
@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h
@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif
 
+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
     }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
-        float f;
-        __asm__(
-            "fmv.h.x %[f], %[h]\n\t"
-            "fcvt.s.h %[f], %[f]"
-            : [f] "=&f" (f)
-            : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(ggml_fp16_t));
+        return hf;
     }
 
     static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         ggml_fp16_t res;
-        __asm__(
-            "fcvt.h.s %[f], %[f]\n\t"
-            "fmv.x.h %[h], %[f]"
-            : [h] "=&r" (res)
-            : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(ggml_fp16_t));
         return res;
     }
 
@@ -119,26 +114,6 @@ extern "C" {
     #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
     #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
     #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 
 // precomputed f32 table for f16 (256 KB)
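
On RISC-V with Zfhmin, the conversion helpers above now go through the compiler's native _Float16 type plus memcpy instead of inline assembly (and the s390x NNPA variants were removed). A standalone round-trip using the same idea, for illustration only; it assumes a toolchain with _Float16 support and is not code from the package:

// Illustrative only -- round-trips a float through 16-bit half precision the
// same way the new riscv_compute_* helpers do.
#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef uint16_t example_fp16_t;           // same 16-bit storage width ggml_fp16_t uses

int main(void) {
    const float x = 3.14159f;

    _Float16 hf = (_Float16) x;            // round to half precision
    example_fp16_t bits;
    memcpy(&bits, &hf, sizeof(bits));      // reinterpret as raw 16-bit storage

    _Float16 back;
    memcpy(&back, &bits, sizeof(back));    // and back again

    printf("%f -> 0x%04x -> %f\n", x, (unsigned) bits, (float) back);
    return 0;
}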
@@ -220,6 +195,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32_VEC_MUL GGML_F32xt_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
 
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt svfloat16_t
+#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{ \
+    sum1 = svadd_f16_x(pg16, sum1, sum2); \
+    sum3 = svadd_f16_x(pg16, sum3, sum4); \
+    sum1 = svadd_f16_x(pg16, sum1, sum3); \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
+    (res) = (ggml_float) sum_f16; \
+}
+#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
 // F16 NEON
 
@@ -1120,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -1134,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -1157,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4
@@ -1170,6 +1169,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
 #define GGML_F16_VEC_MUL GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
 
+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 vfloat32m1_t
+#define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
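
The new __riscv_v_intrinsic branch above gives RVV builds the same GGML_F32_VEC mapping the other backends use, with a fixed EPR of 4 floats and a 16-float step. For context, the kind of generic loop these mappings feed looks roughly like this (a sketch only; it assumes the macros above are in scope and is not code from the package):

// Illustrative only -- shaped like ggml's generic SIMD kernels: GGML_F32_STEP
// elements per outer iteration, GGML_F32_STEP/GGML_F32_EPR partial vectors per
// step (what the real header calls GGML_F32_ARR), and a scalar tail.
static void vec_mad_f32_sketch(const int n, float * y, const float * x, const float v) {
    const int np = (n & ~(GGML_F32_STEP - 1));           // bulk part handled with vectors

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_STEP/GGML_F32_EPR; j++) {
            GGML_F32_VEC ax = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            GGML_F32_VEC ay = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay = GGML_F32_VEC_FMA(ay, ax, vx);           // ay += ax * v
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay);
        }
    }

    for (int i = np; i < n; ++i) {                       // scalar tail
        y[i] += x[i]*v;
    }
}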