cui-llama.rn 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -204,6 +204,7 @@ enum llm_arch {
  LLM_ARCH_ORION,
  LLM_ARCH_INTERNLM2,
  LLM_ARCH_MINICPM,
+ LLM_ARCH_MINICPM3,
  LLM_ARCH_GEMMA,
  LLM_ARCH_GEMMA2,
  LLM_ARCH_STARCODER2,
@@ -212,6 +213,7 @@ enum llm_arch {
  LLM_ARCH_COMMAND_R,
  LLM_ARCH_DBRX,
  LLM_ARCH_OLMO,
+ LLM_ARCH_OLMOE,
  LLM_ARCH_OPENELM,
  LLM_ARCH_ARCTIC,
  LLM_ARCH_DEEPSEEK2,
@@ -252,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_ORION, "orion" },
  { LLM_ARCH_INTERNLM2, "internlm2" },
  { LLM_ARCH_MINICPM, "minicpm" },
+ { LLM_ARCH_MINICPM3, "minicpm3" },
  { LLM_ARCH_GEMMA, "gemma" },
  { LLM_ARCH_GEMMA2, "gemma2" },
  { LLM_ARCH_STARCODER2, "starcoder2" },
@@ -260,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
  { LLM_ARCH_COMMAND_R, "command-r" },
  { LLM_ARCH_DBRX, "dbrx" },
  { LLM_ARCH_OLMO, "olmo" },
+ { LLM_ARCH_OLMOE, "olmoe" },
  { LLM_ARCH_OPENELM, "openelm" },
  { LLM_ARCH_ARCTIC, "arctic" },
  { LLM_ARCH_DEEPSEEK2, "deepseek2" },
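Note: the two tables above follow the usual llama.cpp registration pattern: a new enum value plus the string that GGUF metadata uses to name the architecture. The standalone sketch below only illustrates that mapping; the trimmed enum and the arch_from_string() helper are assumptions for the example and are not part of the package.

#include <cstdio>
#include <map>
#include <string>

// Trimmed-down versions of the tables touched by this diff (illustrative only).
enum llm_arch { LLM_ARCH_MINICPM, LLM_ARCH_MINICPM3, LLM_ARCH_OLMO, LLM_ARCH_OLMOE, LLM_ARCH_UNKNOWN };

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_MINICPM,  "minicpm"  },
    { LLM_ARCH_MINICPM3, "minicpm3" },
    { LLM_ARCH_OLMO,     "olmo"     },
    { LLM_ARCH_OLMOE,    "olmoe"    },
};

// Hypothetical reverse lookup: architecture string (as stored in GGUF metadata) -> enum.
static llm_arch arch_from_string(const std::string & name) {
    for (const auto & kv : LLM_ARCH_NAMES) {
        if (kv.second == name) {
            return kv.first;
        }
    }
    return LLM_ARCH_UNKNOWN;
}

int main() {
    printf("olmoe    -> %d\n", arch_from_string("olmoe"));    // resolves to LLM_ARCH_OLMOE
    printf("minicpm3 -> %d\n", arch_from_string("minicpm3")); // resolves to LLM_ARCH_MINICPM3
}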
@@ -1045,6 +1049,29 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  },
  },
+ {
+ LLM_ARCH_MINICPM3,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ },
+ },
  {
  LLM_ARCH_GEMMA,
  {
@@ -1179,6 +1206,26 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_OLMOE,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+ },
+ },
  {
  LLM_ARCH_OPENELM,
  {
@@ -2263,6 +2310,7 @@ enum e_model {
  MODEL_MEDIUM,
  MODEL_LARGE,
  MODEL_XL,
+ MODEL_A1_7B,
  MODEL_A2_7B,
  MODEL_8x7B,
  MODEL_8x22B,
@@ -5227,6 +5275,7 @@ static const char * llama_model_type_name(e_model type) {
  case MODEL_MEDIUM: return "0.4B";
  case MODEL_LARGE: return "0.8B";
  case MODEL_XL: return "1.5B";
+ case MODEL_A1_7B: return "A1.7B";
  case MODEL_A2_7B: return "A2.7B";
  case MODEL_8x7B: return "8x7B";
  case MODEL_8x22B: return "8x22B";
@@ -5401,6 +5450,17 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+
+ switch (hparams.n_layer) {
+ case 62: model.type = e_model::MODEL_4B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_GROK:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5766,6 +5826,14 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_OLMOE:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ switch (hparams.n_layer) {
+ case 16: model.type = e_model::MODEL_A1_7B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_OPENELM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6908,6 +6976,54 @@ static bool llm_load_tensors(
  }
  }
  } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+ const int64_t q_lora_rank = hparams.n_lora_q;
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+ // if output is NULL, init from the input tok embed
+ if (model.output == NULL) {
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+ }
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
+
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
+
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
+
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+ }
+ } break;
  case LLM_ARCH_GROK:
  {
  if (n_expert == 0) {
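Note: as a reading aid for the MiniCPM3 branch above, the standalone sketch below repeats the shape arithmetic of its create_tensor calls. The concrete hyperparameter values are assumptions chosen only so the program prints numbers; a real model supplies them through GGUF metadata.

#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values (not read from a real model).
    const int64_t n_embd        = 2560;
    const int64_t n_head        = 40;
    const int64_t n_embd_head_k = 96;   // per-head Q/K width
    const int64_t n_embd_head_v = 64;   // per-head V width
    const int64_t n_rot         = 32;   // rotary sub-dimension (hparams.n_rot)
    const int64_t q_lora_rank   = 768;  // hparams.n_lora_q
    const int64_t kv_lora_rank  = 256;  // hparams.n_lora_kv

    const int64_t n_embd_head_qk_rope = n_rot;                 // RoPE'd part of each head
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_rot; // non-RoPE'd part

    // Same shapes as the create_tensor calls in the MiniCPM3 branch above.
    printf("wq_a           : {%lld, %lld}\n", (long long) n_embd,       (long long) q_lora_rank);
    printf("wq_b           : {%lld, %lld}\n", (long long) q_lora_rank,  (long long) (n_head * n_embd_head_k));
    printf("wkv_a_mqa      : {%lld, %lld}\n", (long long) n_embd,       (long long) (kv_lora_rank + n_embd_head_qk_rope));
    printf("wkv_b          : {%lld, %lld}\n", (long long) kv_lora_rank, (long long) (n_head * (n_embd_head_qk_nope + n_embd_head_v)));
    printf("wo             : {%lld, %lld}\n", (long long) (n_head * n_embd_head_v), (long long) n_embd);
    printf("rope_long/short: {%lld}\n",       (long long) (n_embd_head_qk_rope / 2));
}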
@@ -7945,6 +8061,44 @@ static bool llm_load_tensors(
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  }
  } break;
+ case LLM_ARCH_OLMOE:
+ {
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
+
+ for (int i = 0; i < n_layer; ++i) {
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
+
+ auto & layer = model.layers[i];
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+
+ LM_GGML_ASSERT(n_expert > 0);
+ LM_GGML_ASSERT(n_expert_used > 0);
+
+ // MoE branch
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+ }
+ } break;
  case LLM_ARCH_OPENELM:
  {
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -9428,7 +9582,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  struct lm_ggml_tensor * cur,
  struct lm_ggml_tensor * x_prev,
  struct lm_ggml_tensor ** wkv_state) {
- size_t n_embed = cur->ne[0];
+ size_t n_embd = cur->ne[0];
  size_t n_seq_tokens = cur->ne[1];
  size_t n_seqs = cur->ne[2];

@@ -9439,8 +9593,8 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(

  struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);

- sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
- cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+ sx = lm_ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);

  struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);

@@ -9465,11 +9619,11 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  xxx
  );

- struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
- struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
- struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
- struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
- struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
+ struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+ struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+ struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+ struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+ struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));

  struct lm_ggml_tensor * xw = lm_ggml_add(
  ctx,
@@ -9538,7 +9692,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  )
  );

- w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
+ w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
  w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
  w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);

@@ -9547,21 +9701,21 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
  r = lm_ggml_transpose(ctx, r);

  struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
- cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
- *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
+ cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
+ *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));

  // group norm with head_count groups
- cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
  cur = lm_ggml_norm(ctx, cur, 64e-5f);

  // Convert back to regular vectors.
- cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
  cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);

  cur = lm_ggml_mul(ctx, cur, g);
  cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);

- return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
+ return lm_ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
  }

  static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -12854,6 +13008,215 @@ struct llm_build_context {
  return gf;
  }

+ struct lm_ggml_cgraph * build_minicpm3() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ //TODO: if the model varies, these parameters need to be read from the model
+ const int64_t n_embd_base = 256;
+ const float scale_embd = 12.0f;
+ const float scale_depth = 1.4f;
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // scale the input embeddings
+ inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
+ cb(inpL, "inp_scaled", -1);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ struct lm_ggml_tensor * q = NULL;
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+ q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+ cb(q, "q", il);
+
+ q = llm_build_norm(ctx0, q, hparams,
+ model.layers[il].attn_q_a_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(q, "q", il);
+
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+ q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+ cb(q, "q", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ struct lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ struct lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k),
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ lm_ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+ struct lm_ggml_tensor * kv_pe_compresseed = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ struct lm_ggml_tensor * kv_compressed = lm_ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+ kv_pe_compresseed->nb[1],
+ 0);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // and {n_embd_head_qk_rope, n_tokens}
+ struct lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+ kv_pe_compresseed->nb[1],
+ kv_pe_compresseed->nb[1],
+ lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ struct lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ struct lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ struct lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = lm_ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ lm_ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ 0);
+ cb(v_states, "v_states", il);
+
+ q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ q_pe = lm_ggml_rope_ext(
+ ctx0, q_pe, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+ k_pe = lm_ggml_rope_ext(
+ ctx0, k_pe, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ struct lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, NULL,
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // scale_res - scale the hidden states for residual connection
+ const float scale_res = scale_depth/sqrtf(float(n_layer));
+ cur = lm_ggml_scale(ctx0, cur, scale_res);
+ cb(cur, "hidden_scaled", il);
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // feed-forward network
+ {
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+ cb(cur, "ffn_out", il);
+ }
+
+ // scale the hidden states for residual connection
+ cur = lm_ggml_scale(ctx0, cur, scale_res);
+ cb(cur, "hidden_scaled_ffn", il);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head scaling
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
+ cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
+ cb(cur, "lmhead_scaling", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct lm_ggml_cgraph * build_gemma() {
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

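Note: the build_minicpm3() graph above applies three fixed scalings: input embeddings by scale_embd, each residual branch by scale_depth/sqrt(n_layer), and the final hidden state by n_embd_base/n_embd before the lm_head. The small arithmetic sketch below just evaluates those factors; n_layer = 62 matches the MiniCPM3 case in llm_load_hparams() above, while n_embd = 2560 is only an assumed example value.

#include <cmath>
#include <cstdio>

int main() {
    // Constants from build_minicpm3(); n_embd is an assumed example value.
    const float scale_embd  = 12.0f;
    const float scale_depth = 1.4f;
    const int   n_embd_base = 256;
    const int   n_layer     = 62;
    const int   n_embd      = 2560;

    const float scale_res    = scale_depth / std::sqrt((float) n_layer); // per-layer residual scale
    const float scale_lmhead = (float) n_embd_base / (float) n_embd;     // scale before the lm_head

    printf("input embeddings scaled by %.1f\n", scale_embd);
    printf("residual branches scaled by %.4f\n", scale_res);                 // ~0.1778 for 62 layers
    printf("hidden state scaled by %.4f before lm_head\n", scale_lmhead);    // 0.1 with the assumed n_embd
}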
@@ -13550,6 +13913,134 @@ struct llm_build_context {
  return gf;
  }

+ // based on the build_qwen2moe() function, changes:
+ // * removed shared experts
+ // * removed bias
+ // * added q, k norm
+ struct lm_ggml_cgraph * build_olmoe() {
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
+ int32_t n_tokens = this->n_tokens;
+
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+ struct lm_ggml_tensor * cur;
+ struct lm_ggml_tensor * inpL;
+
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+ // inp_pos - contains the positions
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
+
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct lm_ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = llm_build_norm(ctx0, inpL, hparams,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ // compute Q and K and RoPE them
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = lm_ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Qcur, "Qcur_rope", il);
+
+ Kcur = lm_ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(Kcur, "Kcur_rope", il);
+
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+ model.layers[il].wo, NULL,
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
+ n_tokens = n_outputs;
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // MoE branch
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, cb, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, false,
+ false, 0.0,
+ cb, il);
+ cb(cur, "ffn_moe_out", il);
+
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = llm_build_norm(ctx0, cur, hparams,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, cb, -1);
+ cb(cur, "result_norm", -1);
+
+ // lm_head
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+ cb(cur, "result_output", -1);
+
+ lm_ggml_build_forward_expand(gf, cur);
+
+ return gf;
+ }
+
  struct lm_ggml_cgraph * build_openelm() {
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

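Note: build_olmoe() above delegates expert routing to llm_build_moe_ffn(), which selects n_expert_used of n_expert experts per token from the ffn_gate_inp router output. The standalone sketch below only illustrates the general top-k softmax routing idea on plain floats; it is not the ggml implementation, and the expert counts and logits are made-up example values.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int n_expert      = 8; // assumed example values, not read from a model
    const int n_expert_used = 2;

    // Router logits for one token (x . ffn_gate_inp in the real graph); made-up numbers.
    std::vector<float> logits = { 0.1f, 2.3f, -0.5f, 1.7f, 0.0f, -1.2f, 0.9f, 0.4f };

    // softmax over the experts
    float max_l = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(n_expert);
    float sum = 0.0f;
    for (int e = 0; e < n_expert; ++e) { probs[e] = std::exp(logits[e] - max_l); sum += probs[e]; }
    for (int e = 0; e < n_expert; ++e) { probs[e] /= sum; }

    // pick the n_expert_used most probable experts; their FFN outputs are combined
    // weighted by these probabilities
    std::vector<int> idx(n_expert);
    for (int e = 0; e < n_expert; ++e) idx[e] = e;
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] > probs[b]; });

    for (int k = 0; k < n_expert_used; ++k) {
        printf("token routed to expert %d with weight %.3f\n", idx[k], probs[idx[k]]);
    }
}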
@@ -15394,6 +15885,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_minicpm();
  } break;
+ case LLM_ARCH_MINICPM3:
+ {
+ result = llm.build_minicpm3();
+ } break;
  case LLM_ARCH_GEMMA:
  {
  result = llm.build_gemma();
@@ -15426,6 +15921,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
  {
  result = llm.build_olmo();
  } break;
+ case LLM_ARCH_OLMOE:
+ {
+ result = llm.build_olmoe();
+ } break;
  case LLM_ARCH_OPENELM:
  {
  result = llm.build_openelm();
@@ -18085,9 +18584,9 @@ struct llama_model * llama_load_model_from_file(
  unsigned percentage = (unsigned) (100 * progress);
  while (percentage > *cur_percentage_p) {
  *cur_percentage_p = percentage;
- LLAMA_LOG_INFO(".");
+ LLAMA_LOG(".");
  if (percentage >= 100) {
- LLAMA_LOG_INFO("\n");
+ LLAMA_LOG("\n");
  }
  }
  return true;
@@ -18610,6 +19109,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN:
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2:
  case LLM_ARCH_PHI3:
  case LLM_ARCH_GEMMA:
@@ -18620,6 +19120,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_CODESHELL:
  case LLM_ARCH_NEMOTRON:
  case LLM_ARCH_EXAONE:
+ case LLM_ARCH_MINICPM3:
  return LLAMA_ROPE_TYPE_NEOX;

  // all model arches should be listed explicitly here
@@ -20792,8 +21293,8 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, v
  if (len < 128) {
  g_state.log_callback(level, buffer, g_state.log_callback_user_data);
  } else {
- char* buffer2 = new char[len+1];
- vsnprintf(buffer2, len+1, format, args_copy);
+ char * buffer2 = new char[len + 1];
+ vsnprintf(buffer2, len + 1, format, args_copy);
  buffer2[len] = 0;
  g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
  delete[] buffer2;
@@ -20815,19 +21316,3 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
  fflush(stderr);
  }

- struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
- const auto * ctx = (llama_context *) v_ctx;
- struct llama_token_timings result = {
- /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
- /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
- /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
- /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
- /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
-
- /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
- /*.n_eval =*/ std::max(1, ctx->n_eval),
- };
-
- return result;
- }
-