@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
                         id += ne00 * (ne01 - ir1);
                     }
                 }
+            } else if (dst->type == GGML_TYPE_I32) {
+                size_t id = 0;
+                int32_t * dst_ptr = (int32_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        id += ne00 * ir0;
+                        for (int i01 = ir0; i01 < ir1; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                        id += ne00 * (ne01 - ir1);
+                    }
+                }
             } else {
                 GGML_ABORT("fatal error"); // TODO: implement
             }

@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
+    } else if (dst->type == GGML_TYPE_I32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(int32_t *) dst_ptr = *(const float *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_i32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    // TODO: not optimal, but works
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = *(const int32_t *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
     } else {
         GGML_ABORT("fatal error"); // TODO: implement
     }

@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
             {
                 ggml_compute_forward_dup_f32(params, dst);
             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_dup_i32(params, dst);
+            } break;
         default:
             {
                 if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {

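The three hunks above extend the CPU dup/copy kernels to GGML_TYPE_I32: F32 sources can now be written into I32 destinations, a new ggml_compute_forward_dup_i32 handles I32 sources, and the GGML_OP_DUP dispatcher routes I32 tensors to it. A minimal sketch of how such a cast can be driven through the public ggml C API follows; it is illustrative only and not part of the package (the helper name, tensor shapes, and memory size are assumptions), and it uses a transposed source so the copy goes through the element-wise path patched above rather than the row-memcpy path:

#include "ggml.h"
#include "ggml-cpu.h"

static void cast_f32_to_i32_example(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // 2x3 F32 source and a 3x2 I32 destination; transposing the source makes
    // nb00 != sizeof(float), which is the layout the new I32 branch handles
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
    struct ggml_tensor * y = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 3, 2);
    for (int i = 0; i < 6; ++i) {
        ((float *) x->data)[i] = (float) i + 0.25f;
    }

    // ggml_cpy lowers to the dup/copy kernels patched above
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, ggml_transpose(ctx, x), y));
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    // y now holds the transposed values truncated to int32 by the C conversion
    ggml_free(ctx);
}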
@@ -7027,6 +7187,209 @@ void ggml_compute_forward_im2col_back_f32(
     }
 }
 
+
+// ggml_compute_forward_im2col_3d_f16
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f16(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+// ggml_compute_forward_im2col_3d_f32
+// src0: kernel [OC*IC, KD, KH, KW]
+// src1: image [N*IC, ID, IH, IW]
+// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
+static void ggml_compute_forward_im2col_3d_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_TENSOR_BINARY_OP_LOCALS;
+
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
+    const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
+    const int32_t IC = ((const int32_t *)(dst->op_params))[9];
+
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t N = ne13 / IC;
+    const int64_t ID = ne12;
+    const int64_t IH = ne11;
+    const int64_t IW = ne10;
+
+    const int64_t OC = ne03 / IC;
+    GGML_UNUSED(OC);
+    const int64_t KD = ne02;
+    const int64_t KH = ne01;
+    const int64_t KW = ne00;
+
+    const int64_t OD = ne3 / N;
+    const int64_t OH = ne2;
+    const int64_t OW = ne1;
+
+    const int64_t OH_OW = OH*OW;
+    const int64_t KD_KH_KW = KD*KH*KW;
+    const int64_t KH_KW = KH*KW;
+    const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
+
+    GGML_ASSERT(nb10 == sizeof(float));
+
+    // im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
+    {
+        float * const wdata = (float *) dst->data;
+
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t iod = 0; iod < OD; iod++) {
+                for (int64_t ioh = 0; ioh < OH; ioh++) {
+                    for (int64_t iow = 0; iow < OW; iow++) {
+                        for (int64_t iic = ith; iic < IC; iic += nth) {
+
+                            // micro kernel
+                            float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
+                            const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
+
+                            for (int64_t ikd = 0; ikd < KD; ikd++) {
+                                for (int64_t ikh = 0; ikh < KH; ikh++) {
+                                    for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                        const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                        const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                        const int64_t iid = iod*s2 + ikd*d2 - p2;
+
+                                        if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
+                                        } else {
+                                            const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
+                                            dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void ggml_compute_forward_im2col_3d(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    switch (dst->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_im2col_3d_f16(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_im2col_3d_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
         void * a, void * b, float * c) {
     const ggml_type_traits * traits = ggml_get_type_traits(type);

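For reference, the new im2col_3d kernels above lay the result out as [N*OD, OH, OW, IC*KD*KH*KW] and map each output coordinate back into the input volume via iiw = iow*s0 + ikw*d0 - p0 (and analogously for H and D), zero-filling positions that fall outside the image. The output extents themselves are taken from the destination tensor; they are presumably produced with the standard convolution size formula, which the small hedged helper below states explicitly (an assumption for illustration, not code from the package):

#include <stdint.h>

// out = (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1, per spatial axis
static inline int64_t conv_out_size(int64_t in, int64_t kernel, int64_t stride, int64_t pad, int64_t dilation) {
    return (in + 2*pad - dilation*(kernel - 1) - 1) / stride + 1;
}

// e.g. a 16x32x32 volume with a 3x3x3 kernel, stride 1, pad 1, dilation 1 keeps its extents:
// conv_out_size(16, 3, 1, 1, 1) == 16, and likewise for the 32s.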
@@ -8014,6 +8377,15 @@ static void ggml_compute_forward_pad_f32(
     GGML_TENSOR_UNARY_OP_LOCALS
 
     float * dst_ptr = (float *) dst->data;
+    const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
+    const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
+    const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
+    const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
+    const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
+    const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
+    const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
+    const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
+
 
     // TODO: optimize
 

@@ -8022,10 +8394,12 @@ static void ggml_compute_forward_pad_f32(
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
                     const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-
-
-
-
+                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                        && (i1 >= lp1 && i1 < ne1 - rp1) \
+                        && (i2 >= lp2 && i2 < ne2 - rp2) \
+                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                        const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
                         dst_ptr[dst_idx] = 0;

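The two pad_f32 hunks above move the kernel to eight op params, a left/right pad pair (lpN/rpN) per dimension: destination cells inside the unpadded window copy the source element shifted by the left pad, everything else is zero-filled. A one-dimensional sketch of the same rule, illustrative only and not code from the package:

#include <stddef.h>
#include <stdint.h>

// dst has n_src + lp + rp elements; indices in [lp, lp + n_src) copy src[i - lp], the rest are 0
static void pad_1d_f32(const float * src, size_t n_src, float * dst, int32_t lp, int32_t rp) {
    const size_t n_dst = n_src + (size_t) lp + (size_t) rp;
    for (size_t i = 0; i < n_dst; ++i) {
        if (i >= (size_t) lp && i < (size_t) lp + n_src) {
            dst[i] = src[i - (size_t) lp];
        } else {
            dst[i] = 0.0f;
        }
    }
}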
@@ -8224,6 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
             embed_data[j + half] = sinf(arg);
         }
         if (dim % 2 != 0 && ith == 0) {
+            embed_data[2 * half] = 0.f;
             embed_data[dim] = 0.f;
         }
     }

@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
 void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

@@ -114,26 +114,6 @@ extern "C" {
     #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
     #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
     #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-    #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-    #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-    #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-    #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-    static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-        uint16x8_t v_h = vec_splats(h);
-        uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-        return vec_extend_to_fp32_hi(v_hd, 0)[0];
-    }
-
-    static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-        float32x4_t v_f = vec_splats(f);
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-        uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-        return vec_extract(v_h, 0);
-    }
 #endif
 
 // precomputed f32 table for f16 (256 KB)

@@ -215,6 +195,47 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_F32_VEC_MUL    GGML_F32xt_MUL
 #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
 
+// F16 SVE
+#define DEFAULT_PG32 svptrue_b32()
+#define DEFAULT_PG16 svptrue_b16()
+
+#define GGML_F32Cxt              svfloat16_t
+#define GGML_F32Cxt_ZERO         svdup_n_f16(0.0f)
+#define GGML_F32Cxt_SET1(x)      svdup_n_f16(x)
+#define GGML_F32Cxt_LOAD(p)      svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
+#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
+
+#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
+#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
+#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
+#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
+#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
+
+#define GGML_F16x_VEC                GGML_F32Cxt
+#define GGML_F16x_VEC_ZERO           GGML_F32Cxt_ZERO
+#define GGML_F16x_VEC_SET1           GGML_F32Cxt_SET1
+#define GGML_F16x_VEC_LOAD(p, i)     GGML_F32Cxt_LOAD(p)
+#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
+#define GGML_F16x_VEC_FMA            GGML_F32Cxt_FMA
+#define GGML_F16x_VEC_ADD            GGML_F32Cxt_ADD
+#define GGML_F16x_VEC_MUL            GGML_F32Cxt_MUL
+#define GGML_F16x_VEC_REDUCE         GGML_F32Cxt_REDUCE
+
+#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
+#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
+#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
+{                                                \
+    sum1 = svadd_f16_x(pg16, sum1, sum2);        \
+    sum3 = svadd_f16_x(pg16, sum3, sum4);        \
+    sum1 = svadd_f16_x(pg16, sum1, sum3);        \
+    __fp16 sum_f16 = svaddv_f16(pg16, sum1);     \
+    (res) = (ggml_float) sum_f16;                \
+}
+#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
+
 // F16 NEON
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

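The block above adds SVE half-precision vector macros (GGML_F16x_VEC_*) built on ACLE intrinsics such as svld1_f16, svmad_f16_x and svaddv_f16. As a rough illustration of what those macros expand to, here is a standalone predicated fp16 AXPY loop for an SVE-capable AArch64 toolchain; it is an assumption-laden sketch for context, not code from the package:

#include <arm_sve.h>
#include <stddef.h>

// y[i] += a * x[i] over __fp16 data, with a whilelt predicate handling the tail
void axpy_f16_sve(size_t n, __fp16 a, const __fp16 * x, __fp16 * y) {
    const svfloat16_t va = svdup_n_f16(a);                  // broadcast the scalar
    for (size_t i = 0; i < n; i += svcnth()) {              // svcnth() = fp16 lanes per vector
        const svbool_t pg = svwhilelt_b16((uint64_t) i, (uint64_t) n);
        const svfloat16_t vx = svld1_f16(pg, x + i);
        const svfloat16_t vy = svld1_f16(pg, y + i);
        // svmad_f16_x(pg, b, c, a) computes b*c + a, mirroring GGML_F32Cxt_FMA above
        svst1_f16(pg, y + i, svmad_f16_x(pg, vx, va, vy));
    }
}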
@@ -1115,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {

@@ -1129,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs

@@ -1152,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
     }
-#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4