cui-llama.rn 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +1 -2
- package/cpp/common.cpp +157 -53
- package/cpp/common.h +11 -3
- package/cpp/ggml-metal.m +33 -22
- package/cpp/ggml-quants.c +33 -36
- package/cpp/ggml.h +5 -4
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +0 -8
- package/cpp/llama.cpp +519 -34
- package/cpp/llama.h +0 -17
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +7 -10
- package/cpp/sampling.cpp +1 -5
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/llama.cpp
CHANGED
@@ -204,6 +204,7 @@ enum llm_arch {
     LLM_ARCH_ORION,
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
+    LLM_ARCH_MINICPM3,
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
@@ -212,6 +213,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
     LLM_ARCH_DEEPSEEK2,
@@ -252,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ORION,           "orion"        },
     { LLM_ARCH_INTERNLM2,       "internlm2"    },
     { LLM_ARCH_MINICPM,         "minicpm"      },
+    { LLM_ARCH_MINICPM3,        "minicpm3"     },
     { LLM_ARCH_GEMMA,           "gemma"        },
     { LLM_ARCH_GEMMA2,          "gemma2"       },
     { LLM_ARCH_STARCODER2,      "starcoder2"   },
@@ -260,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R,       "command-r"    },
     { LLM_ARCH_DBRX,            "dbrx"         },
     { LLM_ARCH_OLMO,            "olmo"         },
+    { LLM_ARCH_OLMOE,           "olmoe"        },
     { LLM_ARCH_OPENELM,         "openelm"      },
     { LLM_ARCH_ARCTIC,          "arctic"       },
     { LLM_ARCH_DEEPSEEK2,       "deepseek2"    },
@@ -1045,6 +1049,29 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXP,        "blk.%d.ffn_up.%d" },
         },
     },
+    {
+        LLM_ARCH_MINICPM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM,      "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,     "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A,           "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B,           "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,      "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B,          "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_GEMMA,
         {
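The `%d` patterns registered above are expanded with the block index when a tensor is looked up. A minimal sketch of that expansion, using plain `snprintf` as a stand-in for the loader's `tn()` helper (the `.weight` suffix handling here is an illustrative assumption):

    #include <cstdio>

    int main() {
        // pattern taken from the MINICPM3 table above
        const char * pattern = "blk.%d.attn_kv_a_mqa";
        for (int il = 0; il < 2; ++il) {
            char name[64];
            snprintf(name, sizeof(name), pattern, il);
            printf("%s.weight\n", name); // blk.0.attn_kv_a_mqa.weight, blk.1.attn_kv_a_mqa.weight
        }
        return 0;
    }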
@@ -1179,6 +1206,26 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OLMOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_OPENELM,
         {
@@ -2263,6 +2310,7 @@ enum e_model {
     MODEL_MEDIUM,
     MODEL_LARGE,
     MODEL_XL,
+    MODEL_A1_7B,
     MODEL_A2_7B,
     MODEL_8x7B,
     MODEL_8x22B,
@@ -5227,6 +5275,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_MEDIUM:  return "0.4B";
         case MODEL_LARGE:   return "0.8B";
         case MODEL_XL:      return "1.5B";
+        case MODEL_A1_7B:   return "A1.7B";
         case MODEL_A2_7B:   return "A2.7B";
         case MODEL_8x7B:    return "8x7B";
        case MODEL_8x22B:   return "8x22B";
@@ -5401,6 +5450,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+
+                switch (hparams.n_layer) {
+                    case 62: model.type = e_model::MODEL_4B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_GROK:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
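The two new keys carry the ranks of MiniCPM3's low-rank ("multi-head latent") attention factorization: the Q projection is routed through `wq_a`/`wq_b` with rank `n_lora_q`, and KV through a shared down-projection with rank `n_lora_kv`. A rough sketch of the parameter-count trade-off, with illustrative numbers that are not read from any real model:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_embd      = 2560; // assumed model width
        const int64_t n_head      = 40;   // assumed head count
        const int64_t n_embd_head = 96;   // assumed Q/K head size
        const int64_t n_lora_q    = 768;  // LLM_KV_ATTENTION_Q_LORA_RANK

        // one dense Q projection vs. the factored wq_a * wq_b pair
        const int64_t dense    = n_embd * (n_head * n_embd_head);
        const int64_t factored = n_embd * n_lora_q + n_lora_q * (n_head * n_embd_head);
        printf("dense Q: %lld params, factored Q: %lld params\n",
               (long long) dense, (long long) factored);
        return 0;
    }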
@@ -5766,6 +5826,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OLMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 16: model.type = e_model::MODEL_A1_7B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OPENELM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6908,6 +6976,54 @@ static bool llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+                const int64_t q_lora_rank  = hparams.n_lora_q;
+                const int64_t kv_lora_rank = hparams.n_lora_kv;
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm     = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+
+                    layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+                    layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+                    layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+
+                    layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+                    layer.wkv_b     = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+                    layer.wo        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * (n_embd_head_v), n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+                    layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), {n_embd_head_qk_rope/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), {n_embd_head_qk_rope/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                }
+            } break;
         case LLM_ARCH_GROK:
             {
                 if (n_expert == 0) {
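Each Q/K head is split into a rotated slice of width `n_rot` and a non-rotated ("nope") slice of width `n_embd_head_k - n_rot`; that is why `wkv_a_mqa` emits `kv_lora_rank + n_embd_head_qk_rope` values per token while `wkv_b` re-expands only the `kv_lora_rank` part. A small sanity check of those widths under assumed stand-in hparams (not loaded from a model):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_rot         = 32;   // rotated slice per head (assumed)
        const int64_t n_embd_head_k = 96;   // full Q/K head size (assumed)
        const int64_t n_embd_head_v = 64;   // V head size (assumed)
        const int64_t n_head        = 40;
        const int64_t kv_lora_rank  = 256;

        const int64_t qk_rope = n_rot;                 // RoPE'd slice
        const int64_t qk_nope = n_embd_head_k - n_rot; // non-rotated slice

        // rows produced by wkv_a_mqa and wkv_b, matching the shapes above
        const int64_t wkv_a_rows = kv_lora_rank + qk_rope;
        const int64_t wkv_b_rows = n_head * (qk_nope + n_embd_head_v);
        assert(qk_rope + qk_nope == n_embd_head_k);
        printf("wkv_a_mqa out: %lld, wkv_b out: %lld\n",
               (long long) wkv_a_rows, (long long) wkv_b_rows);
        return 0;
    }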
@@ -7945,6 +8061,44 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_OLMOE:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    lm_ggml_context * ctx_layer = ctx_for_layer(i);
+                    lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+                    LM_GGML_ASSERT(n_expert > 0);
+                    LM_GGML_ASSERT(n_expert_used > 0);
+
+                    // MoE branch
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
         case LLM_ARCH_OPENELM:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
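OLMoE keeps all experts of a layer in single 3-D tensors ({n_embd, n_ff, n_expert} and friends), so one expert is just an offset into the slowest-varying dimension. A sketch of that offset arithmetic for a contiguous float tensor (real checkpoints are usually quantized; the float layout is an assumption for illustration):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // byte offset of expert `e` inside a contiguous {n_embd, n_ff, n_expert}
    // float tensor, with the expert index varying slowest
    size_t expert_offset(int64_t n_embd, int64_t n_ff, int64_t e) {
        return (size_t) (e * n_embd * n_ff) * sizeof(float);
    }

    int main() {
        const int64_t n_embd = 64, n_ff = 32, n_expert = 8; // toy sizes
        std::vector<float> gate_exps((size_t) (n_embd * n_ff * n_expert));
        // a 2-D {n_embd, n_ff} slice for expert 3
        float * expert3 = (float *) ((char *) gate_exps.data() + expert_offset(n_embd, n_ff, 3));
        (void) expert3;
        return 0;
    }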
@@ -9428,7 +9582,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
         struct lm_ggml_tensor * cur,
         struct lm_ggml_tensor * x_prev,
         struct lm_ggml_tensor ** wkv_state) {
-    size_t n_embed      = cur->ne[0];
+    size_t n_embd       = cur->ne[0];
     size_t n_seq_tokens = cur->ne[1];
     size_t n_seqs       = cur->ne[2];

@@ -9439,8 +9593,8 @@

     struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);

-    sx  = lm_ggml_reshape_2d(ctx, sx,  n_embed, n_tokens);
-    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    sx  = lm_ggml_reshape_2d(ctx, sx,  n_embd, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);

     struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);

@@ -9465,11 +9619,11 @@
         xxx
     );

-    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
-    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
-    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
-    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
-    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+    struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+    struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+    struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+    struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+    struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));

     struct lm_ggml_tensor * xw = lm_ggml_add(
         ctx,
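The restored arguments show what the five views do: they carve five {n_embd, n_tokens} matrices out of the fused `xxx` projection purely by byte offset, each chunk `n_embd * n_tokens` floats apart. The same slicing on a plain buffer, with raw pointers standing in for `lm_ggml_view_2d`:

    #include <cstddef>
    #include <vector>

    int main() {
        const size_t n_embd = 8, n_tokens = 4;
        // fused buffer holding the w, k, v, r, g mix coefficients back to back
        std::vector<float> xxx(5 * n_embd * n_tokens);

        const size_t chunk = n_embd * n_tokens; // elements per view
        float * mw = xxx.data() + 0 * chunk;
        float * mk = xxx.data() + 1 * chunk;
        float * mv = xxx.data() + 2 * chunk;
        float * mr = xxx.data() + 3 * chunk;
        float * mg = xxx.data() + 4 * chunk;
        (void) mw; (void) mk; (void) mv; (void) mr; (void) mg;
        return 0;
    }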
@@ -9538,7 +9692,7 @@
             )
         );

-    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+    w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
     w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
     w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);

@@ -9547,21 +9701,21 @@
     r = lm_ggml_transpose(ctx, r);

     struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
-    cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
-    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+    cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
+    *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

     // group norm with head_count groups
-    cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+    cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
     cur = lm_ggml_norm(ctx, cur, 64e-5f);

     // Convert back to regular vectors.
-    cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+    cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
     cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);

     cur = lm_ggml_mul(ctx, cur, g);
     cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);

-    return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+    return lm_ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
 }

 static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -12854,6 +13008,215 @@ struct llm_build_context {
         return gf;
     }

+    struct lm_ggml_cgraph * build_minicpm3() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        //TODO: if the model varies, these parameters need to be read from the model
+        const int64_t n_embd_base = 256;
+        const float scale_embd  = 12.0f;
+        const float scale_depth = 1.4f;
+        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // scale the input embeddings
+        inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                struct lm_ggml_tensor * q = NULL;
+                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                cb(q, "q", il);
+
+                q = llm_build_norm(ctx0, q, hparams,
+                        model.layers[il].attn_q_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(q, "q", il);
+
+                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                cb(q, "q", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
+
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+                        lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        lm_ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
+
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * kv_pe_compresseed = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+                // split into {kv_lora_rank, n_tokens}
+                struct lm_ggml_tensor * kv_compressed = lm_ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
+
+                kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(kv_compressed, "kv_compressed", il);
+
+                // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+                struct lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+                cb(kv, "kv", il);
+
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
+                cb(k_nope, "k_nope", il);
+
+                // and {n_head * n_embd_head_v, n_tokens}
+                struct lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+                cb(v_states, "v_states", il);
+
+                v_states = lm_ggml_cont(ctx0, v_states);
+                cb(v_states, "v_states", il);
+
+                v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        lm_ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
+                cb(v_states, "v_states", il);
+
+                q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                q_pe = lm_ggml_rope_ext(
+                        ctx0, q_pe, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(q_pe, "q_pe", il);
+
+                // shared RoPE key
+                k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                k_pe = lm_ggml_rope_ext(
+                        ctx0, k_pe, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(k_pe, "k_pe", il);
+
+                struct lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0);
+                cb(q_states, "q_states", il);
+
+                struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
+                cb(k_states, "k_states", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur   = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            // scale_res - scale the hidden states for residual connection
+            const float scale_res = scale_depth/sqrtf(float(n_layer));
+            cur = lm_ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled", il);
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_up,   NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // scale the hidden states for residual connection
+            cur = lm_ggml_scale(ctx0, cur, scale_res);
+            cb(cur, "hidden_scaled_ffn", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head scaling
+        const float scale_lmhead = float(n_embd_base)/float(n_embd);
+        cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
+        cb(cur, "lmhead_scaling", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct lm_ggml_cgraph * build_gemma() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

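build_minicpm3() uses MiniCPM's depth-scaled residuals: each attention and FFN output is multiplied by `scale_depth / sqrt(n_layer)` before being added back to the stream. With the 62-layer configuration registered in llm_load_hparams above, that factor comes out near 0.178:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float scale_depth = 1.4f;
        const int   n_layer     = 62; // the MiniCPM3 case in llm_load_hparams
        const float scale_res   = scale_depth / sqrtf((float) n_layer);
        printf("scale_res = %.4f\n", scale_res); // ~0.1778
        return 0;
    }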
@@ -13550,6 +13913,134 @@ struct llm_build_context {
         return gf;
     }

+    // based on the build_qwen2moe() function, changes:
+    // * removed shared experts
+    // * removed bias
+    // * added q, k norm
+    struct lm_ggml_cgraph * build_olmoe() {
+        struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct lm_ggml_tensor * cur;
+        struct lm_ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct lm_ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = lm_ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = lm_ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // MoE branch
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_moe_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, false,
+                    false, 0.0,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = lm_ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        lm_ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct lm_ggml_cgraph * build_openelm() {
         struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

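A detail worth noting in build_olmoe(): the full Q and K projections are RMS-normalized before being reshaped into heads and rotated. A minimal single-row RMS norm matching what llm_build_norm computes in LLM_NORM_RMS mode (the epsilon is an assumed value):

    #include <cmath>
    #include <vector>

    // RMS norm of one row: x_i <- x_i / sqrt(mean(x^2) + eps), then * weight_i
    void rms_norm(std::vector<float> & x, const std::vector<float> & w, float eps = 1e-5f) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float scale = 1.0f / std::sqrt(ss / (float) x.size() + eps);
        for (size_t i = 0; i < x.size(); ++i) x[i] = x[i] * scale * w[i];
    }

    int main() {
        std::vector<float> q = {0.5f, -1.0f, 2.0f, 0.25f};
        std::vector<float> w(q.size(), 1.0f); // attn_q_norm weights, all-ones here
        rms_norm(q, w);
        return 0;
    }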
@@ -15394,6 +15885,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_minicpm();
             } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                result = llm.build_minicpm3();
+            } break;
         case LLM_ARCH_GEMMA:
             {
                 result = llm.build_gemma();
@@ -15426,6 +15921,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_OLMOE:
+            {
+                result = llm.build_olmoe();
+            } break;
         case LLM_ARCH_OPENELM:
             {
                 result = llm.build_openelm();
@@ -18085,9 +18584,9 @@ struct llama_model * llama_load_model_from_file(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 *cur_percentage_p = percentage;
-                LLAMA_LOG_INFO(".");
+                LLAMA_LOG(".");
                 if (percentage >= 100) {
-                    LLAMA_LOG_INFO("\n");
+                    LLAMA_LOG("\n");
                 }
             }
             return true;
@@ -18610,6 +19109,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
@@ -18620,6 +19120,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_CODESHELL:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
@@ -20792,8 +21293,8 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) {
     if (len < 128) {
         g_state.log_callback(level, buffer, g_state.log_callback_user_data);
     } else {
-        char* buffer2 = new char[len+1];
-        vsnprintf(buffer2, len+1, format, args_copy);
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
         buffer2[len] = 0;
         g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
         delete[] buffer2;
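The reformatted branch is the second pass of the classic two-pass vsnprintf pattern: format into a fixed stack buffer first, and heap-allocate only when the measured length does not fit. A standalone sketch of the pattern (function names here are illustrative, not the library's API):

    #include <cstdarg>
    #include <cstdio>

    // format into a 128-byte stack buffer, falling back to the heap for long messages
    static void log_v(const char * format, va_list args) {
        va_list args_copy;
        va_copy(args_copy, args);
        char buffer[128];
        int len = vsnprintf(buffer, 128, format, args);
        if (len < 128) {
            fputs(buffer, stderr);
        } else {
            char * buffer2 = new char[len + 1];
            vsnprintf(buffer2, len + 1, format, args_copy); // second pass, exact size
            fputs(buffer2, stderr);
            delete[] buffer2;
        }
        va_end(args_copy);
    }

    static void log_msg(const char * format, ...) {
        va_list args;
        va_start(args, format);
        log_v(format, args);
        va_end(args);
    }

    int main() {
        log_msg("loaded %d tensors\n", 291);
        return 0;
    }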
@@ -20815,19 +21316,3 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data) {
     fflush(stderr);
 }

-struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
-    const auto * ctx = (llama_context *) v_ctx;
-    struct llama_token_timings result = {
-        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
-        /*.t_end_ms    =*/ 1.00 * lm_ggml_time_ms(),
-        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
-        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
-        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,
-
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
-        /*.n_eval   =*/ std::max(1, ctx->n_eval),
-    };
-
-    return result;
-}
-