@fugood/llama.node 1.1.11 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +111 -1
- package/src/llama.cpp/common/chat.h +3 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +27 -4
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +62 -56
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +54 -9
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +159 -1
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
@@ -7027,6 +7027,209 @@ void ggml_compute_forward_im2col_back_f32(
    }
}

+
+// ggml_compute_forward_im2col_3d_f16
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f16(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+    const int64_t OH_OW       = OH*OW;
+    const int64_t KD_KH_KW    = KD*KH*KW;
+    const int64_t KH_KW       = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_im2col_3d_f32
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f32(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N  = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+
+    const int64_t OH_OW       = OH*OW;
+    const int64_t KD_KH_KW    = KD*KH*KW;
+    const int64_t KH_KW       = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void ggml_compute_forward_im2col_3d(
+        const ggml_compute_params * params,
+              ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_3d_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_3d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
                              void * a, void * b, float * c) {
    const ggml_type_traits * traits = ggml_get_type_traits(type);
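The two kernels above, plus the ggml_compute_forward_im2col_3d dispatcher, unfold a 3-D (depth/height/width) convolution input into rows of length IC*KD*KH*KW, one row per output position, so the convolution can then run as a matrix multiply. A minimal scalar sketch of the same indexing in C, with a single channel and illustrative sizes (none of the values below come from the package):

// Scalar sketch of im2col_3d indexing: each output position (iod, ioh, iow)
// gets one dst row holding the KD*KH*KW input samples its kernel window covers,
// with out-of-range samples zero-filled.
#include <stdio.h>

int main(void) {
    enum { IC = 1, ID = 2, IH = 3, IW = 3, KD = 2, KH = 2, KW = 2 };
    enum { s = 1, p = 0, d = 1 };                       // stride / pad / dilation
    enum { OD = (ID + 2*p - d*(KD - 1) - 1)/s + 1,
           OH = (IH + 2*p - d*(KH - 1) - 1)/s + 1,
           OW = (IW + 2*p - d*(KW - 1) - 1)/s + 1 };

    float src[ID*IH*IW];
    for (int i = 0; i < ID*IH*IW; ++i) src[i] = (float) i;

    float dst[OD*OH*OW * IC*KD*KH*KW];

    for (int iod = 0; iod < OD; ++iod)
    for (int ioh = 0; ioh < OH; ++ioh)
    for (int iow = 0; iow < OW; ++iow)
    for (int ikd = 0; ikd < KD; ++ikd)
    for (int ikh = 0; ikh < KH; ++ikh)
    for (int ikw = 0; ikw < KW; ++ikw) {
        const int iid = iod*s + ikd*d - p;
        const int iih = ioh*s + ikh*d - p;
        const int iiw = iow*s + ikw*d - p;
        const int row = (iod*OH + ioh)*OW + iow;        // output position
        const int col = (ikd*KH + ikh)*KW + ikw;        // kernel offset (IC == 1)
        const int out_of_range = iid < 0 || iid >= ID ||
                                 iih < 0 || iih >= IH ||
                                 iiw < 0 || iiw >= IW;
        dst[row*(IC*KD*KH*KW) + col] =
            out_of_range ? 0.0f : src[(iid*IH + iih)*IW + iiw];
    }

    printf("first dst row: ");
    for (int c = 0; c < IC*KD*KH*KW; ++c) printf("%g ", dst[c]);
    printf("\n");
    return 0;
}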
@@ -8014,6 +8217,15 @@ static void ggml_compute_forward_pad_f32(
    GGML_TENSOR_UNARY_OP_LOCALS

    float * dst_ptr = (float *) dst->data;
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+

    // TODO: optimize

@@ -8022,10 +8234,12 @@ static void ggml_compute_forward_pad_f32(
            for (int64_t i0 = 0; i0 < ne0; ++i0) {
                for (int64_t i3 = 0; i3 < ne3; ++i3) {
                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-
-
-
+                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                        && (i1 >= lp1 && i1 < ne1 - rp1) \
+                        && (i2 >= lp2 && i2 < ne2 - rp2) \
+                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                        const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                        dst_ptr[dst_idx] = *src_ptr;
                    } else {
                        dst_ptr[dst_idx] = 0;
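The reworked pad kernel reads four (left, right) padding pairs from op_params and copies from src0 only when the output coordinate falls inside the unpadded window; everything else is zero-filled. A minimal 1-D sketch of that mapping in C (the sizes are illustrative, not from the package):

// 1-D sketch of the asymmetric-pad mapping: output index i0 maps to source
// index i0 - lp0 when lp0 <= i0 < ne0 - rp0, and to zero padding otherwise.
#include <stdio.h>

int main(void) {
    enum { NE0_SRC = 4, LP0 = 2, RP0 = 1, NE0 = NE0_SRC + LP0 + RP0 };
    const float src[NE0_SRC] = { 1, 2, 3, 4 };
    float dst[NE0];

    for (int i0 = 0; i0 < NE0; ++i0) {
        if (i0 >= LP0 && i0 < NE0 - RP0) {
            dst[i0] = src[i0 - LP0];   // inside the window: copy
        } else {
            dst[i0] = 0.0f;            // left/right padding: zero
        }
    }

    for (int i0 = 0; i0 < NE0; ++i0) printf("%g ", dst[i0]);   // 0 0 1 2 3 4 0
    printf("\n");
    return 0;
}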
@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
@@ -114,26 +114,6 @@ extern "C" {
    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
#endif

// precomputed f32 table for f16 (256 KB)
@@ -215,6 +195,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_F32_VEC_MUL    GGML_F32xt_MUL
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE

+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt              svfloat16_t
+#define GGML_F32Cxt_ZERO         svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x)      svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p)      svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC                GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{                                                                       \
+    sum1 = svadd_f16_x(pg16, sum1, sum2);                               \
+    sum3 = svadd_f16_x(pg16, sum3, sum4);                               \
+    sum1 = svadd_f16_x(pg16, sum1, sum3);                               \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1);                            \
+    (res) = (ggml_float) sum_f16;                                       \
+}
+#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
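The GGML_F16x_VEC_* layer above keeps fp16 data in SVE svfloat16_t registers and only converts to ggml_float at the final reduction. A minimal sketch of the same predicated load / FMA / horizontal-add pattern, guarded for SVE; the guard, function name, and use of the merging svmla_f16_m form are illustrative, not part of the package:

// Sketch of a whole-array f16 dot product with SVE predication: svwhilelt_b16
// masks off lanes past n, svmla_f16_m accumulates only active lanes (inactive
// accumulator lanes are preserved), and svaddv_f16 does the final reduction.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static float dot_f16_sve(const __fp16 * x, const __fp16 * y, int n) {
    svfloat16_t acc = svdup_n_f16(0.0f);
    for (int i = 0; i < n; i += (int) svcnth()) {
        const svbool_t pg = svwhilelt_b16(i, n);          // active lanes only
        const svfloat16_t vx = svld1_f16(pg, x + i);
        const svfloat16_t vy = svld1_f16(pg, y + i);
        acc = svmla_f16_m(pg, acc, vx, vy);               // acc += vx * vy
    }
    return (float) svaddv_f16(svptrue_b16(), acc);        // sum across lanes
}
#endif // __ARM_FEATURE_SVE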
@@ -1115,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_EPR GGML_F32_EPR

static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
    float tmp[4];

    for (int i = 0; i < 4; i++) {
@@ -1129,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
-#endif
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
    float arr[4];

    // note: keep type-cast here to prevent compiler bugs
@@ -1152,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
-#endif
}

#define GGML_F16_VEC GGML_F32x4
@@ -85,15 +85,21 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
    // reduce sum1,sum2 to sum1
    GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#elif defined(__riscv_v_intrinsic)
-
-
-
-
-
-
-
+    int vl = __riscv_vsetvlmax_e32m8();
+    vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+    vfloat32m8_t vsum;
+    vfloat32m8_t ax;
+    vfloat32m8_t ay;
+    vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
+    for (int i = 0; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m8(n - i);
+        ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
+        ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
+        vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
    }
-
+    vl = __riscv_vsetvlmax_e32m8();
+    vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

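The new RVV path strip-mines the f32 dot product: each pass asks vsetvl for the number of elements it may process (vl), the tail-undisturbed (_tu) fused multiply-accumulate leaves untouched accumulator lanes intact, and a single vfredusum reduction runs after the loop. A scalar C sketch of the same control flow (the fixed lane count is illustrative):

// Scalar sketch of the strip-mined loop: each pass handles vl <= VLMAX elements
// (vl plays the role of __riscv_vsetvl_e32m8), partial passes leave the unused
// accumulator lanes untouched, and one final reduction produces the result.
static float dot_strip_mined(const float * x, const float * y, int n) {
    enum { VLMAX = 8 };                     // stand-in for __riscv_vsetvlmax_e32m8()
    float acc[VLMAX] = { 0 };               // stand-in for the vector accumulator vsum
    for (int i = 0; i < n; ) {
        const int vl = (n - i) < VLMAX ? (n - i) : VLMAX;
        for (int j = 0; j < vl; ++j) {
            acc[j] += x[i + j] * y[i + j];  // vfmacc, tail-undisturbed
        }
        i += vl;
    }
    float sum = 0.0f;                       // vfredusum: reduce lanes to a scalar
    for (int j = 0; j < VLMAX; ++j) {
        sum += acc[j];
    }
    return sum;
}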
@@ -207,38 +213,125 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G

    ggml_float sumf = 0.0;

-#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
-    const int np = (n & ~(GGML_F16_STEP - 1));

-
+#if defined(GGML_SIMD)
+    #if defined(__ARM_FEATURE_SVE)
+        const int sve_register_length = svcntb() * 8; //get vector length
+        const int ggml_f16_epr = sve_register_length / 16; // running when 16
+        const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
+
+        const int np= (n & ~(ggml_f16_step - 1));
+        svfloat16_t sum1 = svdup_n_f16(0.0f);
+        svfloat16_t sum2 = svdup_n_f16(0.0f);
+        svfloat16_t sum3 = svdup_n_f16(0.0f);
+        svfloat16_t sum4 = svdup_n_f16(0.0f);
+
+        svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
+        svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
+        for (int i = 0; i < np; i += ggml_f16_step) {
+            ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
+            ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
+
+            ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
+            ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
+
+            ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
+            ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
+
+            ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
+            ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
+
+            ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
+            ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
+            sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
+
+            ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
+            ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
+            sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
+
+            ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
+            ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
+            sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
+
+            ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
+            ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
+            sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
+        }

-
-
+        const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
+        for (int k = np; k < np2; k += ggml_f16_epr) {
+            svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
+            svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
+            sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
+        }

-
-
-
-
+        if (np2 < n) {
+            svbool_t pg = svwhilelt_b16(np2, n);
+            svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
+            svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));

-
+            sum1 = svmad_f16_x(pg, hx, hy, sum1);
        }
-
+        GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
+    #elif defined(__riscv_v_intrinsic)
+        #if defined(__riscv_zvfh)
+            int vl = __riscv_vsetvlmax_e32m2();
+            vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
+            vfloat32m2_t vsum;
+            vfloat16m1_t ax;
+            vfloat16m1_t ay;
+            vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
+            for (int i = 0; i < n; i += vl) {
+                vl = __riscv_vsetvl_e16m1(n - i);
+                ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
+                ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
+                vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
+            }
+            vl = __riscv_vsetvlmax_e32m1();
+            vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
+            vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
+            sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
+        #else
+            for (int i = 0; i < n; ++i) {
+                sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        #endif // __riscv_zvfh
+    #else
+        const int np = (n & ~(GGML_F16_STEP - 1));

-
-    GGML_F16_VEC_REDUCE(sumf, sum);
+        GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };

-
-
-
-
+        GGML_F16_VEC ax[GGML_F16_ARR];
+        GGML_F16_VEC ay[GGML_F16_ARR];
+
+        for (int i = 0; i < np; i += GGML_F16_STEP) {
+            for (int j = 0; j < GGML_F16_ARR; j++) {
+                ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+                ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+                sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+            }
+        }
+
+        // reduce sum0..sum3 to sum0
+        GGML_F16_VEC_REDUCE(sumf, sum);

-
-
+        // leftovers
+        for (int i = np; i < n; ++i) {
+            sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
+        }
+        // if you hit this, you are likely running outside the FP range
+        assert(!isnan(sumf) && !isinf(sumf));
+    #endif
#else
    for (int i = 0; i < n; ++i) {
        sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
    }
-#endif
+#endif // GGML_SIMD

    *s = sumf;
}
@@ -257,6 +350,12 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) {
    for (; i + 3 < n; i += 4) {
        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i)));
+    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
@@ -281,10 +380,24 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
    for (; i + 3 < n; i += 4) {
        _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i)));
    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i)));
+    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
    }
+#elif defined(__riscv_v_intrinsic)
+    for (int vl; i < n; i += vl) {
+        vl = __riscv_vsetvl_e32m2(n - i);
+        vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
+        vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
+        vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
+        __riscv_vse32_v_f32m2(&y[i], vy, vl);
+    }
#endif
    for (; i < n; ++i) {
        y[i] = ggml_silu_f32(x[i]) * g[i];
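All of the branches above (SSE, the new SVE and RVV paths, and NEON) compute the same element-wise fused SwiGLU; only the number of elements handled per step differs. A scalar C reference of that definition, with SiLU written out (the helper name is illustrative):

// Scalar reference for the fused SwiGLU kernel: y[i] = silu(x[i]) * g[i],
// with silu(v) = v / (1 + exp(-v)); this matches the scalar tail loop above.
#include <math.h>

static void swiglu_ref_f32(int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float silu = x[i] / (1.0f + expf(-x[i]));
        y[i] = silu * g[i];
    }
}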
@@ -328,6 +441,15 @@ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float
#endif
        sum += (ggml_float)_mm_cvtss_f32(val);
    }
+#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
+    const int vlen = svcntw();
+    for (; i < n; i += vlen) {
+        const svbool_t pg = svwhilelt_b32_s32(i, n);
+        svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i),
+                                                      svdup_n_f32_x(pg, max)));
+        svst1_f32(pg, y + i, val);
+        sum += (ggml_float)svaddv_f32(pg, val);
+    }
#elif defined(__ARM_NEON) && defined(__aarch64__)
    for (; i + 3 < n; i += 4) {
        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),