cui-llama.rn 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +3 -4
- package/cpp/common.cpp +183 -1990
- package/cpp/common.h +101 -130
- package/cpp/ggml-impl.h +32 -0
- package/cpp/ggml-metal.m +38 -28
- package/cpp/ggml-quants.c +275 -84
- package/cpp/ggml.c +89 -35
- package/cpp/ggml.h +30 -67
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +218 -102
- package/cpp/llama.cpp +599 -120
- package/cpp/llama.h +33 -25
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +9 -11
- package/cpp/sampling.cpp +12 -9
- package/cpp/sampling.h +4 -56
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -204,6 +204,7 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
@@ -212,6 +213,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
@@ -252,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ORION, "orion" },
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_MINICPM3, "minicpm3" },
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_GEMMA2, "gemma2" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
@@ -260,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_OLMOE, "olmoe" },
     { LLM_ARCH_OPENELM, "openelm" },
     { LLM_ARCH_ARCTIC, "arctic" },
     { LLM_ARCH_DEEPSEEK2, "deepseek2" },
@@ -1045,6 +1049,29 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
        },
    },
+    {
+        LLM_ARCH_MINICPM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
    {
        LLM_ARCH_GEMMA,
        {
@@ -1179,6 +1206,26 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_OLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
    {
        LLM_ARCH_OPENELM,
        {
@@ -2167,6 +2214,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
    if (host_buffer) {
        buft = lm_ggml_backend_sycl_host_buffer_type();
    }
+#elif defined(LM_GGML_USE_CANN)
+    if (host_buffer) {
+        buft = lm_ggml_backend_cann_host_buffer_type();
+    }
 #elif defined(LM_GGML_USE_CPU_HBM)
    buft = lm_ggml_backend_cpu_hbm_buffer_type();
 #elif defined(LM_GGML_USE_VULKAN)
@@ -2259,6 +2310,7 @@ enum e_model {
    MODEL_MEDIUM,
    MODEL_LARGE,
    MODEL_XL,
+    MODEL_A1_7B,
    MODEL_A2_7B,
    MODEL_8x7B,
    MODEL_8x22B,
@@ -2493,6 +2545,7 @@ struct llama_cparams {
    bool causal_attn;
    bool offload_kqv;
    bool flash_attn;
+    bool no_perf;

    enum llama_pooling_type pooling_type;

@@ -5222,6 +5275,7 @@ static const char * llama_model_type_name(e_model type) {
        case MODEL_MEDIUM: return "0.4B";
        case MODEL_LARGE: return "0.8B";
        case MODEL_XL: return "1.5B";
+        case MODEL_A1_7B: return "A1.7B";
        case MODEL_A2_7B: return "A2.7B";
        case MODEL_8x7B: return "8x7B";
        case MODEL_8x22B: return "8x22B";
@@ -5396,6 +5450,17 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+
+                switch (hparams.n_layer) {
+                    case 62: model.type = e_model::MODEL_4B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_GROK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5761,6 +5826,14 @@ static void llm_load_hparams(
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 16: model.type = e_model::MODEL_A1_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
        case LLM_ARCH_OPENELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6668,8 +6741,6 @@ static bool llm_load_tensors(
    bool use_mlock,
    llama_progress_callback progress_callback,
    void * progress_callback_user_data) {
-    model.t_start_us = lm_ggml_time_us();
-
    auto & hparams = model.hparams;

    model.split_mode = split_mode;
@@ -6905,6 +6976,54 @@ static bool llm_load_tensors(
                    }
                }
            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+                const int64_t q_lora_rank  = hparams.n_lora_q;
+                const int64_t kv_lora_rank = hparams.n_lora_kv;
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+
+                    layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+                    layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+                    layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+
+                    layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+                    layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                    layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                }
+            } break;
        case LLM_ARCH_GROK:
            {
                if (n_expert == 0) {
@@ -7942,6 +8061,44 @@ static bool llm_load_tensors(
                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                }
            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                    LM_GGML_ASSERT(n_expert > 0);
+                    LM_GGML_ASSERT(n_expert_used > 0);
+
+                    // MoE branch
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
        case LLM_ARCH_OPENELM:
            {
                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -8600,14 +8757,13 @@ static bool llm_load_tensors(
        }
    }

-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
    return true;
 }

 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = lm_ggml_time_us();
+
    try {
        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);

@@ -8669,6 +8825,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
        return -1;
    }

+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = lm_ggml_time_us() - model.t_start_us;
+
    return 0;
 }

@@ -9269,7 +9429,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
    // FIXME: zero-out NANs?
    states = lm_ggml_mul(ctx, states, state_mask);

-    // copy states which won't be changed further (between n_seqs and
+    // copy states which won't be changed further (between n_seqs and n_kv)
    lm_ggml_build_forward_expand(graph,
        lm_ggml_cpy(ctx,
            lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9422,7 +9582,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
        struct lm_ggml_tensor * cur,
        struct lm_ggml_tensor * x_prev,
        struct lm_ggml_tensor ** wkv_state) {
-    size_t n_embed = cur->ne[0];
+    size_t n_embd = cur->ne[0];
    size_t n_seq_tokens = cur->ne[1];
    size_t n_seqs = cur->ne[2];

@@ -9433,8 +9593,8 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(

    struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);

-    sx  = lm_ggml_reshape_2d(ctx, sx,  n_embed, n_tokens);
-    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    sx  = lm_ggml_reshape_2d(ctx, sx,  n_embd, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);

    struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);

@@ -9459,11 +9619,11 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
        xxx
    );

-    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
-    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
-    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
-    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
-    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));

    struct lm_ggml_tensor * xw = lm_ggml_add(
        ctx,
@@ -9532,7 +9692,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
        )
    );

-    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
    w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
    w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);

@@ -9541,21 +9701,21 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
    r = lm_ggml_transpose(ctx, r);

    struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
-    cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
-    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+    cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
+    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

    // group norm with head_count groups
-    cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+    cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
    cur = lm_ggml_norm(ctx, cur, 64e-5f);

    // Convert back to regular vectors.
-    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
    cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);

    cur = lm_ggml_mul(ctx, cur, g);
    cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);

-    return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+    return lm_ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
 }

 static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -9888,8 +10048,8 @@ struct llm_build_context {
    struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
        // find result_norm tensor for input
        struct lm_ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = lm_ggml_graph_node(gf, i);
            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                break;
            } else {
@@ -12848,6 +13008,215 @@ struct llm_build_context {
        return gf;
    }

+    struct lm_ggml_cgraph * build_minicpm3() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        //TODO: if the model varies, these parameters need to be read from the model
+        const int64_t n_embd_base = 256;
+        const float scale_embd  = 12.0f;
+        const float scale_depth = 1.4f;
+        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // scale the input embeddings
+        inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                struct lm_ggml_tensor * q = NULL;
+                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                cb(q, "q", il);
+
+                q = llm_build_norm(ctx0, q, hparams,
+                        model.layers[il].attn_q_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(q, "q", il);
+
+                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                cb(q, "q", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        lm_ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * kv_pe_compresseed = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                struct lm_ggml_tensor * kv_compressed = lm_ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                struct lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                struct lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = lm_ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
+                cb(v_states, "v_states", il);
+
+                q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                q_pe = lm_ggml_rope_ext(
+                    ctx0, q_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                k_pe = lm_ggml_rope_ext(
+                    ctx0, k_pe, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                struct lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // scale_res - scale the hidden states for residual connection
+            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            cur = lm_ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled", il);
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // scale the hidden states for residual connection
+            cur = lm_ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled_ffn", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head scaling
+        const float scale_lmhead = float(n_embd_base)/float(n_embd);
+        cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
+        cb(cur, "lmhead_scaling", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    struct lm_ggml_cgraph * build_gemma() {
        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

@@ -13544,6 +13913,134 @@ struct llm_build_context {
        return gf;
    }

+    // based on the build_qwen2moe() function, changes:
+    // * removed shared experts
+    // * removed bias
+    // * added q, k norm
+    struct lm_ggml_cgraph * build_olmoe() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
    struct lm_ggml_cgraph * build_openelm() {
        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

@@ -15388,6 +15885,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
            {
                result = llm.build_minicpm();
            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                result = llm.build_minicpm3();
+            } break;
        case LLM_ARCH_GEMMA:
            {
                result = llm.build_gemma();
@@ -15420,6 +15921,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
            {
                result = llm.build_olmo();
            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                result = llm.build_olmoe();
+            } break;
        case LLM_ARCH_OPENELM:
            {
                result = llm.build_openelm();
@@ -15831,7 +16336,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {

        // clear unused states
        for (int i = 0; i < n_kv; ++i) {
-            uint32_t        cell_id = i + kv_self.head;
+            const uint32_t  cell_id = i + kv_self.head;
            llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

            data[i] = (float) (kv_cell.src >= 0);
@@ -16087,19 +16592,21 @@ static int llama_decode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

+    if (batch_all.token) {
+        for (uint32_t i = 0; i < n_tokens_all; ++i) {
+            if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
+                return -1;
+            }
+        }
+    }
+
    LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);

    LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16216,8 +16723,8 @@ static int llama_decode_internal(
        lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);

        // the output is always the last tensor in the graph
-        struct lm_ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct lm_ggml_tensor * res  = lm_ggml_graph_node(gf, -1);
+        struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);

        if (lctx.n_outputs == 0) {
            // no output
@@ -16226,9 +16733,9 @@ static int llama_decode_internal(
        } else if (cparams.embeddings) {
            res  = nullptr; // do not extract logits for embedding case
            embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = lm_ggml_graph_node(gf, i);
                    break;
                }
            }
@@ -16386,19 +16893,21 @@ static int llama_encode_internal(
        return -1;
    }

-    for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
-            LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
-            return -1;
-        }
-    }
-
    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

+    if (batch.token) {
+        for (uint32_t i = 0; i < n_tokens; ++i) {
+            if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
+                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
+                return -1;
+            }
+        }
+    }
+
    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
    LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

@@ -16443,15 +16952,15 @@ static int llama_encode_internal(
        // there are two cases here
        if (llama_model_has_decoder(&lctx.model)) {
            // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = lm_ggml_graph_node(gf, -1);
            LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
        } else {
            // second case is an encoder-only T5 model
            if (cparams.embeddings) {
                // only output embeddings if required
-                embd = gf->nodes[gf->n_nodes - 1];
+                embd = lm_ggml_graph_node(gf, -1);
                if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                    embd = gf->nodes[gf->n_nodes - 2];
+                    embd = lm_ggml_graph_node(gf, -2);
                }
                LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
            }
@@ -17541,6 +18050,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17950,6 +18461,7 @@ struct llama_context_params llama_context_default_params() {
        /*.embeddings          =*/ false,
        /*.offload_kqv         =*/ true,
        /*.flash_attn          =*/ false,
+        /*.no_perf             =*/ true,
        /*.abort_callback      =*/ nullptr,
        /*.abort_callback_data =*/ nullptr,
    };
@@ -18072,9 +18584,9 @@ struct llama_model * llama_load_model_from_file(
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
-                LLAMA_LOG_INFO(".");
+                LLAMA_LOG(".");
                if (percentage >= 100) {
-                    LLAMA_LOG_INFO("\n");
+                    LLAMA_LOG("\n");
                }
            }
            return true;
@@ -18160,6 +18672,7 @@ struct llama_context * llama_new_context_with_model(
    cparams.embeddings   = params.embeddings;
    cparams.offload_kqv  = params.offload_kqv;
    cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
    cparams.pooling_type = params.pooling_type;

    cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -18497,7 +19010,7 @@ struct llama_context * llama_new_context_with_model(

            // note: the number of splits during measure is higher than during inference due to the kv shift
            int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
            LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
        }
    }
@@ -18596,6 +19109,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
@@ -18606,6 +19120,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
            return LLAMA_ROPE_TYPE_NEOX;

        // all model arches should be listed explicitly here
@@ -20078,10 +20593,14 @@ void llama_synchronize(struct llama_context * ctx) {

    // add the evaluation to the stats
    if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
        ctx->n_eval++;
    } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
+        }
        ctx->n_p_eval += ctx->n_queued_tokens;
    }

@@ -20677,6 +21196,7 @@ const char * llama_print_system_info(void) {
    s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
+    s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
    s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20688,65 +21208,40 @@ const char * llama_print_system_info(void) {
    return s.c_str();
 }

-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                const auto * p = (const struct llama_context *) ctx;
-
-                const double t_start_ms = 1e-3 * p->t_start_us;
-                const double t_end_ms = 1.00 * lm_ggml_time_ms();
-                const double t_load_ms = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms = 1e-3 * p->t_eval_us;
-
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval = std::max(1, p->n_eval);
-
-                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
-                LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
-                LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
+    struct llama_perf_context_data data = {};

-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
+    if (ctx == nullptr) {
+        return data;
+    }

-                const int32_t n_sampler = std::max(0, p->n_sample);
+    data.t_start_ms  = 1e-3 * ctx->t_start_us;
+    data.t_load_ms   = 1e-3 * ctx->t_load_us;
+    data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
+    data.t_eval_ms   = 1e-3 * ctx->t_eval_us;
+    data.n_p_eval    = std::max(1, ctx->n_p_eval);
+    data.n_eval      = std::max(1, ctx->n_eval);

-                LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    return data;
 }

-void llama_perf_reset(void * ctx, enum llama_perf_type type) {
-    switch (type) {
-        case LLAMA_PERF_TYPE_CONTEXT:
-            {
-                auto * p = (struct llama_context *) ctx;
+void llama_perf_context_print(const struct llama_context * ctx) {
+    const auto data = llama_perf_context(ctx);

-                p->t_start_us = lm_ggml_time_us();
-                p->t_eval_us = p->n_eval = 0;
-                p->t_p_eval_us = p->n_p_eval = 0;
-            } break;
-        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
-            {
-                auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+    const double t_end_ms = 1e-3 * lm_ggml_time_us();

-                p->t_sample_us = p->n_sample = 0;
-            } break;
-        default:
-            LM_GGML_ABORT("invalid perf type");
-    }
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+}
+
+void llama_perf_context_reset(struct llama_context * ctx) {
+    ctx->t_start_us = lm_ggml_time_us();
+    ctx->t_eval_us = ctx->n_eval = 0;
+    ctx->t_p_eval_us = ctx->n_p_eval = 0;
 }

 void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
@@ -20798,8 +21293,8 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) {
    if (len < 128) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
-        char* buffer2 = new char[len+1];
-        vsnprintf(buffer2, len+1, format, args_copy);
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
        buffer2[len] = 0;
        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
        delete[] buffer2;
@@ -20821,19 +21316,3 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data) {
    fflush(stderr);
 }

-struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
-    const auto * ctx = (llama_context *) v_ctx;
-    struct llama_token_timings result = {
-        /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
-        /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval =*/ std::max(1, ctx->n_eval),
-    };
-
-    return result;
-}
-
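The final hunks replace the old `LLAMA_PERF_TYPE_*` timing calls and `llama_get_token_timings()` with the context-scoped `llama_perf_context`, `llama_perf_context_print`, and `llama_perf_context_reset` functions, gated by the new `no_perf` context parameter (which defaults to `true` in `llama_context_default_params()`, so the eval timers in `llama_synchronize()` are only accumulated when it is disabled). A minimal sketch of how a caller might use the new API is shown below; it assumes the matching declarations in this version's `llama.h`, and the helper name is illustrative only.

```cpp
// Sketch only (not part of the diff): reading and resetting the new
// llama_perf_context* counters, assuming this version's llama.h declarations.
#include "llama.h"

static void report_and_reset_perf(struct llama_context * lctx) {
    // Raw counters: times in ms plus prompt/eval token counts.
    struct llama_perf_context_data perf = llama_perf_context(lctx);

    // e.g. average decode speed, mirroring what llama_perf_context_print() logs.
    const double ms_per_token = perf.t_eval_ms / perf.n_eval;
    (void) ms_per_token;

    // Or let llama.cpp log the formatted summary itself.
    llama_perf_context_print(lctx);

    // Clear the counters before the next measured run.
    llama_perf_context_reset(lctx);
}
```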