@fugood/llama.node 1.2.0-rc.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +16 -15
- package/src/llama.cpp/CMakeLists.txt +7 -0
- package/src/llama.cpp/common/arg.cpp +141 -21
- package/src/llama.cpp/common/chat.cpp +139 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.h +23 -8
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +28 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +12 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +10 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +7 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +0 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +161 -1
- package/src/llama.cpp/src/llama-arch.cpp +44 -10
- package/src/llama.cpp/src/llama-arch.h +9 -0
- package/src/llama.cpp/src/llama-chat.cpp +17 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +13 -11
- package/src/llama.cpp/src/llama-graph.cpp +6 -5
- package/src/llama.cpp/src/llama-hparams.h +14 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +55 -15
- package/src/llama.cpp/src/llama-kv-cache.h +8 -0
- package/src/llama.cpp/src/llama-model.cpp +386 -140
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-quant.cpp +6 -4
- package/src/llama.cpp/src/llama-vocab.cpp +13 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +53 -10
@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
                     id += ne00 * (ne01 - ir1);
                 }
             }
+        } else if (dst->type == GGML_TYPE_I32) {
+            size_t id = 0;
+            int32_t * dst_ptr = (int32_t *) dst->data;
+
+            for (int i03 = 0; i03 < ne03; i03++) {
+                for (int i02 = 0; i02 < ne02; i02++) {
+                    id += ne00 * ir0;
+                    for (int i01 = ir0; i01 < ir1; i01++) {
+                        for (int i00 = 0; i00 < ne00; i00++) {
+                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                            dst_ptr[id] = *src0_ptr;
+                            id++;
+                        }
+                    }
+                    id += ne00 * (ne01 - ir1);
+                }
+            }
         } else {
             GGML_ABORT("fatal error"); // TODO: implement
         }
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
                 }
             }
         }
+    } else if (dst->type == GGML_TYPE_I32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(int32_t *) dst_ptr = *(const float *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
+}
+
+static void ggml_compute_forward_dup_i32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    const int ith = params->ith; // thread index
+    const int nth = params->nth; // number of threads
+
+    // parallelize by rows
+    const int nr = ne01;
+    // number of rows per thread
+    const int dr = (nr + nth - 1) / nth;
+    // row range for this thread
+    const int ir0 = dr * ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    // dst counters
+
+    int64_t i10 = 0;
+    int64_t i11 = 0;
+    int64_t i12 = 0;
+    int64_t i13 = 0;
+
+    // TODO: not optimal, but works
+    if (dst->type == GGML_TYPE_F32) {
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                i10 += ne00 * ir0;
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+                for (int64_t i01 = ir0; i01 < ir1; i01++) {
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                        char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+
+                        *(float *) dst_ptr = *(const int32_t *) src0_ptr;
+
+                        if (++i10 == ne0) {
+                            i10 = 0;
+                            if (++i11 == ne1) {
+                                i11 = 0;
+                                if (++i12 == ne2) {
+                                    i12 = 0;
+                                    if (++i13 == ne3) {
+                                        i13 = 0;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                i10 += ne00 * (ne01 - ir1);
+                while (i10 >= ne0) {
+                    i10 -= ne0;
+                    if (++i11 == ne1) {
+                        i11 = 0;
+                        if (++i12 == ne2) {
+                            i12 = 0;
+                            if (++i13 == ne3) {
+                                i13 = 0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    } else {
+        GGML_ABORT("fatal error"); // TODO: implement
+    }
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
             {
                 ggml_compute_forward_dup_f32(params, dst);
             } break;
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_dup_i32(params, dst);
+            } break;
         default:
             {
                 if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
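The three hunks above add an F32 <-> I32 copy path to the CPU dup kernel: ggml_compute_forward_dup_f32 gains an I32 destination branch, a new ggml_compute_forward_dup_i32 handles I32 sources, and the dispatcher routes GGML_TYPE_I32 to it. A minimal sketch of exercising this path through the public ggml API follows; the setup code is an assumption and not part of this package, only ggml_cpy and the CPU graph-compute helpers are taken from the ggml headers.

#include "ggml.h"
#include "ggml-cpu.h"
#include <cstdint>
#include <cstdio>

int main() {
    ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    ggml_context * ctx = ggml_init(ip);

    // F32 source, I32 destination: GGML_OP_CPY lowers to ggml_compute_forward_dup,
    // which now supports this type combination on the CPU backend
    ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
    ggml_set_f32(src, 3.0f);

    ggml_tensor * cpy = ggml_cpy(ctx, src, dst);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cpy);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    printf("dst[0] = %d\n", ((int32_t *) dst->data)[0]); // expected: 3

    ggml_free(ctx);
    return 0;
}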
@@ -8438,7 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
             embed_data[j + half] = sinf(arg);
         }
         if (dim % 2 != 0 && ith == 0) {
-            embed_data[
+            embed_data[2 * half] = 0.f;
         }
     }
 }
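The one-line change above fixes which element gets zero-padded when the embedding dimension is odd. A small illustration of the indexing (not code from the diff): with half = dim / 2, the cos/sin pairs occupy indices [0, 2*half), so the single leftover slot for an odd dim is index 2*half, i.e. the last element dim - 1.

#include <cassert>

// illustrative check of the padding index used above (hypothetical helper)
void check_timestep_padding_index(int dim) {
    const int half = dim / 2;
    if (dim % 2 != 0) {
        assert(2 * half == dim - 1); // the element set to 0.f in the new code
    }
}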
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
+    { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -137,7 +138,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
+    { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_SWIN_NORM, "%s.swin_norm" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@@ -168,19 +171,25 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

-    { LLM_KV_ROPE_DIMENSION_COUNT,
-    { LLM_KV_ROPE_DIMENSION_SECTIONS,
-    { LLM_KV_ROPE_FREQ_BASE,
-    { LLM_KV_ROPE_SCALE_LINEAR,
-    { LLM_KV_ROPE_SCALING_TYPE,
-    { LLM_KV_ROPE_SCALING_FACTOR,
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
-    { LLM_KV_ROPE_SCALING_FINETUNED,
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },

     { LLM_KV_SPLIT_NO, "split.no" },
     { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -397,12 +406,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
         { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
         { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
         { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
         { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
         { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
         { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
         { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
         { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
         { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
     },
@@ -2135,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2415,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
             return true;
         default:
             return false;
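Together, the table entries and the new switch case above register the llada-moe architecture end to end. A small sketch of how that registration is consumed (internal llama-arch.h symbols, so it only builds inside the llama.cpp tree; llm_arch_from_string is assumed to be the existing lookup helper):

#include "llama-arch.h"
#include <cassert>

void check_llada_moe_registration() {
    const llm_arch arch = llm_arch_from_string("llada-moe"); // name from LLM_ARCH_NAMES
    assert(arch == LLM_ARCH_LLADA_MOE);
    assert(llm_arch_is_diffusion(arch)); // the new case added above
}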
@@ -100,6 +100,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
@@ -141,7 +142,9 @@ enum llm_kv {
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
@@ -172,6 +175,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

@@ -185,6 +190,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+    LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,

     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
@@ -70,6 +70,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
+    { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     } else if (tmpl_contains("<seed:bos>")) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+        return LLM_CHAT_TEMPLATE_GROK_2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -763,6 +766,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<seed:bos>assistant\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
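For reference, a minimal sketch (not part of the diff) of driving the new built-in template through the public llama.h API; "grok-2" is the name registered in LLM_CHAT_TEMPLATES above, and the rendering follows the Role/content/<|separator|> pattern of the new branch.

#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    const llama_chat_message chat[] = {
        { "system",    "You are a helpful assistant." },
        { "user",      "Hello!"                       },
        { "assistant", "Hi, how can I help?"          },
        { "user",      "What is 2 + 2?"               },
    };

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template("grok-2", chat, sizeof(chat)/sizeof(chat[0]),
                                                /*add_ass =*/ true, buf.data(), (int32_t) buf.size());
    if (n > 0) {
        // each turn renders as "System: ..." / "Human: ..." / "Assistant: ..." followed by
        // "<|separator|>\n\n"; add_ass appends a trailing "Assistant:" generation prompt
        printf("%.*s\n", n, buf.data());
    }
    return 0;
}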
@@ -35,10 +35,10 @@ llama_context::llama_context(

     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
-    cparams.yarn_ext_factor = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast = params.yarn_beta_fast;
-    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.no_perf = params.no_perf;
@@ -181,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if (
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }

@@ -285,8 +285,8 @@ llama_context::llama_context(
         const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
         const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

-        // avoid reserving graphs with zero outputs
-        n_outputs =
+        // avoid reserving graphs with zero outputs - assume one output per sequence
+        n_outputs = n_seqs;

         LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);

@@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
     if (backend_cpu != nullptr) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
         auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
-        set_threadpool_fn
+        if (set_threadpool_fn) {
+            set_threadpool_fn(backend_cpu, tp);
+        }
     }

     // set the number of threads for all the backends
@@ -2261,9 +2263,9 @@ llama_context_params llama_context_default_params() {
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
-        /*.yarn_attn_factor =*/ 1.0f,
-        /*.yarn_beta_fast =*/
-        /*.yarn_beta_slow =*/ 1.0f,
+        /*.yarn_attn_factor =*/ -1.0f,
+        /*.yarn_beta_fast =*/ -1.0f,
+        /*.yarn_beta_slow =*/ -1.0f,
         /*.yarn_orig_ctx =*/ 0,
         /*.defrag_thold =*/ -1.0f,
         /*.cb_eval =*/ nullptr,
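The two llama-context.cpp hunks above turn the YaRN defaults into -1.0f sentinels: any field the caller leaves unset now falls back to the values stored in the model's hparams (introduced by the llama-hparams.h hunks further down), while explicit non-negative values still win. A sketch of the resulting caller-side behaviour (assumed usage, not code from the diff):

#include "llama.h"

llama_context_params make_ctx_params(bool override_yarn) {
    llama_context_params cparams = llama_context_default_params();
    // all four YaRN fields now default to -1.0f, meaning
    // "inherit yarn_ext_factor / attn_factor / beta_fast / beta_slow from the model"
    if (override_yarn) {
        cparams.yarn_beta_fast = 32.0f; // explicit values (>= 0.0f) still take precedence
        cparams.yarn_beta_slow =  1.0f;
    }
    return cparams;
}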
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // split the batch into streams if needed
     const auto n_stream = k->ne[3];

-    q =
+    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

     q = ggml_permute(ctx0, q, 0, 2, 1, 3);
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(

         if (arch == LLM_ARCH_GROK) {
             // need to do the following:
-            // multiply by
+            // multiply by attn_output_multiplier
             // and then :
             // kq = 30 * tanh(kq / 30)
             // before the softmax below

-            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq,
+            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
             cb(kq, "kq_tanh", il);
-            kq = ggml_scale(ctx0, kq,
+            kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
             cb(kq, "kq_scaled", il);
         }

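Numerically, the two rewritten lines above soft-cap the attention logits: kq is scaled by f_attn_out_scale / f_attn_logit_softcapping, passed through tanh, then scaled back up, i.e. kq' = softcap * tanh(kq * attn_out_scale / softcap). A scalar sketch of the same expression (the function is only an illustration; the field names come from the new hparams members):

#include <cmath>

// scalar form of the tensor ops above: softcap * tanh(kq * attn_out_scale / softcap)
float grok_attn_softcap(float kq, float attn_out_scale, float softcap) {
    return softcap * std::tanh(kq * attn_out_scale / softcap);
}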
@@ -1431,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(

     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-
+    // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+    //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
@@ -82,8 +82,9 @@ struct llama_hparams {
     float f_norm_rms_eps;
     float f_norm_group_eps;

-    float f_attn_logit_softcapping
-    float
+    float f_attn_logit_softcapping = 50.0f;
+    float f_router_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping = 30.0f;

     // for RWKV
     uint32_t rescale_every_n_layers = 0;
@@ -104,6 +105,11 @@ struct llama_hparams {
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;

+    float yarn_ext_factor = -1.0f;
+    float yarn_attn_factor = 1.0f;
+    float yarn_beta_fast = 32.0f;
+    float yarn_beta_slow = 1.0f;
+
     std::array<int, 4> rope_sections;

     // Sliding Window Attention (SWA)
@@ -136,10 +142,14 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;

+    // grok-2
+    float f_attn_out_scale = 0.0f;
+    uint32_t attn_temp_length = 0;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm =
+    bool use_kq_norm = false;

     // for Classifiers
     uint32_t n_cls_out = 1;
@@ -159,6 +169,7 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm

     const int32_t ikv = map_layer_ids.at(il);

-
+    ggml_tensor * k = layers[ikv].k;
+
+    const int64_t n_embd_head = k_cur->ne[0];
+    const int64_t n_head = k_cur->ne[1];
+    const int64_t n_tokens = k_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;

-
+    // we can merge dims 0 and 1
+    // TODO: add ggml helper function for this?
+    GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);

-    k_cur =
+    k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);

-
-
+    const int64_t n_stream = k->ne[2];
+
+    if (n_stream > 1) {
+        const int64_t kv_size = get_size();
+
+        assert(n_embd_gqa == k->ne[0]);
+        assert(kv_size == k->ne[1]);
+
+        // merge the buffer across all streams because the idxs are global
+        k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
     }

+    // store the current K values into the cache
     return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }

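The rewritten cpy_k above merges the head dimensions of k_cur before writing into the cache: a [n_embd_head, n_head, n_tokens] tensor whose heads are contiguous is viewed as [n_embd_head*n_head, n_tokens] without a copy. A standalone sketch of that dim-merge (assumptions: a plain ggml context; the helper name is hypothetical):

#include "ggml.h"

// view dims 0 and 1 of a [n_embd_head, n_head, n_tokens] tensor as one row per token
static ggml_tensor * merge_heads_2d(ggml_context * ctx, ggml_tensor * cur) {
    const int64_t n_embd_head = cur->ne[0];
    const int64_t n_head      = cur->ne[1];
    const int64_t n_tokens    = cur->ne[2];

    // same precondition as the GGML_ASSERT in cpy_k/cpy_v: heads are laid out contiguously
    GGML_ASSERT(ggml_row_size(cur->type, n_embd_head) == cur->nb[1]);

    // keep nb[2] as the row stride so the token dimension is unchanged
    return ggml_view_2d(ctx, cur, n_embd_head*n_head, n_tokens, cur->nb[2], 0);
}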
@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm

     auto * v = layers[ikv].v;

-    const int64_t
-    const int64_t
+    const int64_t n_embd_head = v_cur->ne[0];
+    const int64_t n_head = v_cur->ne[1];
+    const int64_t n_tokens = v_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;

-
+    // we can merge dims 0 and 1
+    GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);

+    const int64_t n_stream = v->ne[2];
+
+    // take this branch when FA is enabled (the V cache is not transposed)
     if (!v_trans) {
-
-
+        v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
+
+        if (n_stream > 1) {
+            const int64_t kv_size = get_size();
+
+            assert(n_embd_gqa == v->ne[0]);
+            assert(kv_size == v->ne[1]);
+
+            // merge the buffer across all streams because the idxs are global
+            v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
         }

         return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }

+    if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+        // we can merge dims 0, 1 and 2
+        v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
+    } else {
+        // otherwise -> make a copy to get contiguous data
+        v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
+    }
+
     // [TAG_V_CACHE_VARIABLE]
-    if (
-    v_cur = ggml_pad(ctx, v_cur, v->ne[0] -
+    if (n_embd_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
     }

-    // the row
-    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v
+    // in this branch the v_idxs are constructed in such a way that each row is a single head element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));

-    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));

     return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
@@ -317,9 +317,17 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

     // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    // - k_cur  [n_embd_head_k, n_head_k, n_tokens]
+    // - k_idxs [n_tokens]
+    // - v_cur  [n_embd_head_v, n_head_v, n_tokens]
+    // - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;

+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    // helps understand the implementation logic of cpy_k and cpy_v
     ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
     ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;

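The new header comments spell out the expected tensor shapes for cpy_k/cpy_v and their index inputs. A tiny sketch of the one shape that depends on configuration (hypothetical helper, derived directly from the comment above): the V index tensor has one entry per token when the V cache is not transposed, and one entry per head element otherwise.

#include <cstdint>

// number of rows expected in v_idxs, per the comment above
int64_t n_v_idxs_rows(int64_t n_tokens, int64_t n_embd_v_gqa, bool v_trans) {
    return v_trans ? n_tokens*n_embd_v_gqa : n_tokens;
}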