@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff compares publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
@@ -7027,6 +7027,209 @@ void ggml_compute_forward_im2col_back_f32(
 }
 }
 
+
+// ggml_compute_forward_im2col_3d_f16
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image  [N*IC, ID, IH, IW]
+// dst:  result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f16(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+    const int64_t OH_OW       = OH*OW;
+    const int64_t KD_KH_KW    = KD*KH*KW;
+    const int64_t KH_KW       = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_im2col_3d_f32
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image  [N*IC, ID, IH, IW]
+// dst:  result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f32(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+
+    const int64_t OH_OW       = OH*OW;
+    const int64_t KD_KH_KW    = KD*KH*KW;
+    const int64_t KH_KW       = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void ggml_compute_forward_im2col_3d(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_3d_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_3d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                               void * a, void * b, float * c) {
     const ggml_type_traits * traits = ggml_get_type_traits(type);
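
The new im2col_3d kernels above flatten each output voxel into one row of IC*KD*KH*KW samples and recover OD, OH and OW from the destination shape. A minimal sketch of the standard convolution output-size arithmetic those shapes imply, using hypothetical example sizes:

    #include <stdint.h>
    #include <stdio.h>

    // Output extent of one convolution axis: input size i, kernel size k,
    // stride s, padding p, dilation d (standard formula; a sketch of the
    // relationship the kernel above relies on, not code from the package).
    static int64_t conv_out_size(int64_t i, int64_t k, int s, int p, int d) {
        return (i + 2*p - d*(k - 1) - 1) / s + 1;
    }

    int main(void) {
        // hypothetical shapes: 5x5x5 volume, 3x3x3 kernel, stride 1, pad 1, dilation 1
        const int64_t ID = 5, IH = 5, IW = 5;
        const int64_t KD = 3, KH = 3, KW = 3;
        const int s = 1, p = 1, d = 1;

        const int64_t OD = conv_out_size(ID, KD, s, p, d);
        const int64_t OH = conv_out_size(IH, KH, s, p, d);
        const int64_t OW = conv_out_size(IW, KW, s, p, d);

        // one im2col row per output voxel; each row holds IC*KD*KH*KW samples,
        // matching the [N*OD, OH, OW, IC*KD*KH*KW] layout in the diff
        printf("OD=%lld OH=%lld OW=%lld\n", (long long)OD, (long long)OH, (long long)OW);
        return 0;
    }
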
@@ -8014,6 +8217,15 @@ static void ggml_compute_forward_pad_f32(
     GGML_TENSOR_UNARY_OP_LOCALS
 
     float * dst_ptr = (float *) dst->data;
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
 
     // TODO: optimize
 
@@ -8022,10 +8234,12 @@ static void ggml_compute_forward_pad_f32(
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
                     const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-
-
-
+                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                        && (i1 >= lp1 && i1 < ne1 - rp1) \
+                        && (i2 >= lp2 && i2 < ne2 - rp2) \
+                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                        const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
                         dst_ptr[dst_idx] = 0;
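
With this change the pad kernel reads four left/right padding pairs (lp0/rp0 through lp3/rp3) from op_params and copies a source element only when the output index falls inside the unpadded span. A minimal one-dimensional sketch of the same copy-or-zero rule, with hypothetical names:

    #include <stdio.h>

    // 1-D version of the rule above: output index i0 maps back to source index
    // (i0 - lp0) only when it lies inside the unpadded region; otherwise write 0.
    static void pad_1d(const float *src, int n_src, float *dst, int lp0, int rp0) {
        const int n_dst = lp0 + n_src + rp0;
        for (int i0 = 0; i0 < n_dst; ++i0) {
            if (i0 >= lp0 && i0 < n_dst - rp0) {
                dst[i0] = src[i0 - lp0];
            } else {
                dst[i0] = 0.0f;
            }
        }
    }

    int main(void) {
        const float src[3] = {1.0f, 2.0f, 3.0f};
        float dst[6];                      // lp0=2, rp0=1 -> 2 + 3 + 1 = 6 outputs
        pad_1d(src, 3, dst, 2, 1);
        for (int i = 0; i < 6; ++i) printf("%.0f ", dst[i]);   // 0 0 1 2 3 0
        printf("\n");
        return 0;
    }
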
@@ -9003,8 +9217,7 @@ static void ggml_compute_forward_ssm_scan_f32(
     GGML_ASSERT(src4->nb[0] == sizeof(float));
     GGML_ASSERT(src5->nb[0] == sizeof(float));
     GGML_ASSERT(src6->nb[0] == sizeof(int32_t));
-
-    GGML_ASSERT((ng & -ng) == ng);
+    GGML_ASSERT(nh % ng == 0);
 
     // heads per thread
     const int dh = (nh + nth - 1)/nth;
@@ -9035,6 +9248,7 @@ static void ggml_compute_forward_ssm_scan_f32(
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
             const float dA = expf(dt_soft_plus * A[h]);
+            const int g = h / (nh / ng); // repeat_interleave
 
             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9057,8 +9271,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // TODO: maybe unroll more?
                     for (int j = 0; j < 1; j++) {
                         GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
-                        GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr +
-                        GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr +
+                        GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
+                        GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);
 
                         t0 = GGML_F32_VEC_MUL(t0, adA);
                         t1 = GGML_F32_VEC_MUL(t1, axdt);
@@ -9072,6 +9286,9 @@ static void ggml_compute_forward_ssm_scan_f32(
                     }
 
                     sumf = GGML_F32xt_REDUCE_ONE(sum);
+#elif defined(__riscv_v_intrinsic)
+                    // todo: RVV implementation
+                    const int np = 0;
 #else
                     const int np = (nc & ~(GGML_F32_STEP - 1));
 
@@ -9087,8 +9304,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                     for (int i = 0; i < np; i += GGML_F32_STEP) {
                         for (int j = 0; j < GGML_F32_ARR; j++) {
                             ax[j] = GGML_F32_VEC_LOAD(s0 + i + j*GGML_F32_EPR + ii*nc);
-                            ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR +
-                            az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR +
+                            ay[j] = GGML_F32_VEC_LOAD(B + i + j*GGML_F32_EPR + g*nc);
+                            az[j] = GGML_F32_VEC_LOAD(C + i + j*GGML_F32_EPR + g*nc);
 
                             ax[j] = GGML_F32_VEC_MUL(ax[j], adA);
                             ay[j] = GGML_F32_VEC_MUL(ay[j], axdt);
@@ -9110,7 +9327,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                     // d_state
                     for (int i0 = np; i0 < nc; ++i0) {
                         const int i = i0 + ii*nc;
-                        const int ig = i0 +
+                        const int ig = i0 + g*nc;
                         // state = prev_state * dA + dB * x
                         const float state = (s0[i] * dA) + (B[ig] * x_dt);
                         // y = rowwise_dotprod(state, C)
@@ -9127,6 +9344,7 @@ static void ggml_compute_forward_ssm_scan_f32(
         for (int h = ih0; h < ih1; ++h) {
             // ref: https://github.com/state-spaces/mamba/blob/62db608da60f6fc790b8ed9f4b3225e95ca15fde/mamba_ssm/ops/triton/softplus.py#L16
             const float dt_soft_plus = dt[h] <= 20.0f ? log1pf(expf(dt[h])) : dt[h];
+            const int g = h / (nh / ng); // repeat_interleave
 
             // dim
             for (int i1 = 0; i1 < nr; ++i1) {
@@ -9141,8 +9359,8 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // TODO: what happens when (d_state % svcntw()) != 0?
                 for (int64_t k = 0; k < nc; k += svcntw()) {
                     svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
-                    svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k +
-                    svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k +
+                    svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
+                    svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
                     svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
 
                     svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
@@ -9162,7 +9380,7 @@ static void ggml_compute_forward_ssm_scan_f32(
                 // d_state
                 for (int i0 = 0; i0 < nc; ++i0) {
                     const int i = i0 + ii*nc;
-                    const int ig = i0 +
+                    const int ig = i0 + g*nc;
                     // state = prev_state * dA + dB * x
                     const float state = (s0[i] * expf(dt_soft_plus * A[i0 + h*nc])) + (B[ig] * x_dt);
                     // y = rowwise_dotprod(state, C)
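
The ssm_scan hunks above relax the power-of-two requirement on the number of groups to nh % ng == 0 and map each head h to its B/C group as g = h / (nh / ng), a repeat_interleave over heads. A tiny sketch of that mapping with made-up sizes:

    #include <stdio.h>

    // Each head h reads the B/C tensors of group g = h / (nh / ng): consecutive
    // blocks of nh/ng heads share one group, i.e. groups are repeat-interleaved.
    int main(void) {
        const int nh = 8;   // number of heads (hypothetical)
        const int ng = 2;   // number of B/C groups; must divide nh
        for (int h = 0; h < nh; ++h) {
            const int g = h / (nh / ng);
            printf("head %d -> group %d\n", h, g);   // heads 0-3 -> 0, heads 4-7 -> 1
        }
        return 0;
    }
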
@@ -10023,8 +10241,8 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
     int64_t h_stride_2d = head_size * head_size;
 
 #if defined(GGML_SIMD)
-    #if defined(__ARM_FEATURE_SVE)
-        // scalar Route to scalar implementation //TODO: Write SVE code
+    #if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic)
+        // scalar Route to scalar implementation //TODO: Write SVE code and RVV code
     for (int64_t t = 0; t < T; t++) {
         int64_t t_offset = t * t_stride;
         int64_t state_offset = head_size * C * (t / (T / n_seqs));
@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -18,6 +18,10 @@
 #include <immintrin.h>
 #endif
 
+#if defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -94,24 +98,15 @@ extern "C" {
 }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
     static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
-
-
-
-        "fcvt.s.h %[f], %[f]"
-        : [f] "=&f" (f)
-        : [h] "r" (h)
-        );
-        return f;
+        _Float16 hf;
+        memcpy(&hf, &h, sizeof(ggml_fp16_t));
+        return hf;
     }
 
     static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
         ggml_fp16_t res;
-
-
-        "fmv.x.h %[h], %[f]"
-        : [h] "=&r" (res)
-        : [f] "f" (f)
-        );
+        _Float16 hf = (_Float16)f;
+        memcpy(&res, &hf, sizeof(ggml_fp16_t));
         return res;
     }
 
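
The RISC-V fp16 helpers now use a _Float16 cast plus memcpy instead of inline assembly. A standalone round-trip sketch of the same approach, assuming a compiler with _Float16 support and using a plain uint16_t as a stand-in for ggml_fp16_t:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint16_t ggml_fp16_t;   // stand-in for the ggml typedef (assumption)

    // Reinterpret the 16-bit pattern as _Float16 and let the compiler emit the
    // scalar convert instructions, as in the diff above.
    static inline float fp16_to_fp32(ggml_fp16_t h) {
        _Float16 hf;
        memcpy(&hf, &h, sizeof(h));
        return (float)hf;
    }

    static inline ggml_fp16_t fp32_to_fp16(float f) {
        ggml_fp16_t res;
        _Float16 hf = (_Float16)f;
        memcpy(&res, &hf, sizeof(res));
        return res;
    }

    int main(void) {
        const float x = 1.5f;                        // exactly representable in fp16
        const float y = fp16_to_fp32(fp32_to_fp16(x));
        printf("%f -> %f\n", x, y);                  // 1.500000 -> 1.500000
        return 0;
    }
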
@@ -119,26 +114,6 @@ extern "C" {
     #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
     #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
     #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 
 // precomputed f32 table for f16 (256 KB)
@@ -220,6 +195,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
     #define GGML_F32_VEC_MUL     GGML_F32xt_MUL
     #define GGML_F32_VEC_REDUCE  GGML_F32xt_REDUCE
 
+    // F16 SVE
+    #define DEFAULT_PG32 svptrue_b32()
+    #define DEFAULT_PG16 svptrue_b16()
+
+    #define GGML_F32Cxt              svfloat16_t
+    #define GGML_F32Cxt_ZERO         svdup_n_f16(0.0f)
+    #define GGML_F32Cxt_SET1(x)      svdup_n_f16(x)
+    #define GGML_F32Cxt_LOAD(p)      svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+    #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+    #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+    #define GGML_F32Cxt_FMA(...)              GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+    #define GGML_F32Cxt_ADD_IMPL(pg, a, b)    svadd_f16_x(pg, a, b)
+    #define GGML_F32Cxt_ADD(...)              GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+    #define GGML_F32Cxt_MUL_IMPL(pg, a, b)    svmul_f16_x(pg, a, b)
+    #define GGML_F32Cxt_MUL(...)              GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+    #define GGML_F32Cxt_REDUCE                GGML_F16xt_REDUCE_MIXED
+
+    #define GGML_F16x_VEC                GGML_F32Cxt
+    #define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
+    #define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
+    #define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
+    #define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+    #define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
+    #define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
+    #define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
+    #define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE
+
+    #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+    #define GGML_F16xt_REDUCE_ONE(...)        GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+    #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+    { \
+        sum1 = svadd_f16_x(pg16, sum1, sum2); \
+        sum3 = svadd_f16_x(pg16, sum3, sum4); \
+        sum1 = svadd_f16_x(pg16, sum1, sum3); \
+        __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
+        (res) = (ggml_float) sum_f16; \
+    }
+    #define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
     // F16 NEON
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
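
The new F16 SVE mappings compose fp16 vector math from svptrue_b16, svld1_f16, svmad_f16_x and svaddv_f16. A sketch of an fp16 dot product in the same spirit, assuming an SVE-enabled toolchain; it is illustrative only and predicates the tail with svwhilelt rather than the fixed full predicate the macros use:

    #include <arm_sve.h>
    #include <stdint.h>
    #include <stdio.h>

    // fp16 dot product: fp16 multiply-accumulate on active lanes, then a
    // horizontal svaddv reduction (the role of GGML_F16xt_REDUCE_ONE above).
    static float dot_f16_sve(int64_t n, const __fp16 * x, const __fp16 * y) {
        svfloat16_t sum = svdup_n_f16(0.0f);                         // GGML_F16x_VEC_ZERO
        for (int64_t i = 0; i < n; i += (int64_t) svcnth()) {
            const svbool_t pg = svwhilelt_b16_u64((uint64_t) i, (uint64_t) n);
            const svfloat16_t vx = svld1_f16(pg, x + i);             // GGML_F16x_VEC_LOAD
            const svfloat16_t vy = svld1_f16(pg, y + i);
            sum = svmla_f16_m(pg, sum, vx, vy);                      // sum += vx*vy (active lanes)
        }
        return (float) svaddv_f16(svptrue_b16(), sum);               // horizontal reduce
    }

    int main(void) {
        __fp16 x[5], y[5];
        for (int i = 0; i < 5; i++) { x[i] = (__fp16) 1.0f; y[i] = (__fp16) 2.0f; }
        printf("%f\n", dot_f16_sve(5, x, y));                        // 10.000000
        return 0;
    }
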
@@ -1120,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR  GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -1134,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -1157,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 
 #define GGML_F16_VEC                GGML_F32x4
@@ -1170,6 +1169,36 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
 #define GGML_F16_VEC_MUL            GGML_F32x4_MUL
 #define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
 
+#elif defined(__riscv_v_intrinsic)
+
+// compatible with vlen >= 128
+
+#define GGML_SIMD
+
+// F32
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vfloat32m1_t
+#define GGML_F32x4_ZERO         __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
+#define GGML_F32x4_SET1(x)      __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_LOAD(x)      __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
+#define GGML_F32x4_STORE(b, v)  __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
+#define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
+#define GGML_F32x4_ADD(a, b)    __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
+#define GGML_F32x4_MUL(a, b)    __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
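
The RISC-V mappings fix the step at 4 lanes (GGML_F32_EPR) over vfloat32m1_t and express FMA with __riscv_vfmacc_vv_f32m1. A small standalone sketch in the same style, assuming n is a multiple of 4:

    #include <riscv_vector.h>
    #include <stdio.h>

    // Strip-mined axpy (y += a*x) using the same fixed-width pattern as the new
    // mappings: 4-lane vfloat32m1_t vectors and vfmacc for the multiply-add.
    static void axpy_rvv(int n, float a, const float *x, float *y) {
        const size_t vl = 4;                                    // GGML_F32_EPR
        vfloat32m1_t va = __riscv_vfmv_v_f_f32m1(a, vl);        // GGML_F32x4_SET1(a)
        for (int i = 0; i < n; i += (int) vl) {
            vfloat32m1_t vx = __riscv_vle32_v_f32m1(x + i, vl); // GGML_F32x4_LOAD
            vfloat32m1_t vy = __riscv_vle32_v_f32m1(y + i, vl);
            vy = __riscv_vfmacc_vv_f32m1(vy, va, vx, vl);       // vy += va * vx
            __riscv_vse32_v_f32m1(y + i, vy, vl);               // GGML_F32x4_STORE
        }
    }

    int main(void) {
        float x[4] = {1, 2, 3, 4};
        float y[4] = {10, 10, 10, 10};
        axpy_rvv(4, 0.5f, x, y);
        for (int i = 0; i < 4; ++i) printf("%.1f ", y[i]);      // 10.5 11.0 11.5 12.0
        printf("\n");
        return 0;
    }
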