cui-llama.rn 1.1.4 → 1.1.6

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/cpp/llama.cpp CHANGED
@@ -204,6 +204,7 @@ enum llm_arch {
204
204
  LLM_ARCH_ORION,
205
205
  LLM_ARCH_INTERNLM2,
206
206
  LLM_ARCH_MINICPM,
207
+ LLM_ARCH_MINICPM3,
207
208
  LLM_ARCH_GEMMA,
208
209
  LLM_ARCH_GEMMA2,
209
210
  LLM_ARCH_STARCODER2,
@@ -212,6 +213,7 @@ enum llm_arch {
212
213
  LLM_ARCH_COMMAND_R,
213
214
  LLM_ARCH_DBRX,
214
215
  LLM_ARCH_OLMO,
216
+ LLM_ARCH_OLMOE,
215
217
  LLM_ARCH_OPENELM,
216
218
  LLM_ARCH_ARCTIC,
217
219
  LLM_ARCH_DEEPSEEK2,
@@ -252,6 +254,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
252
254
  { LLM_ARCH_ORION, "orion" },
253
255
  { LLM_ARCH_INTERNLM2, "internlm2" },
254
256
  { LLM_ARCH_MINICPM, "minicpm" },
257
+ { LLM_ARCH_MINICPM3, "minicpm3" },
255
258
  { LLM_ARCH_GEMMA, "gemma" },
256
259
  { LLM_ARCH_GEMMA2, "gemma2" },
257
260
  { LLM_ARCH_STARCODER2, "starcoder2" },
@@ -260,6 +263,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
260
263
  { LLM_ARCH_COMMAND_R, "command-r" },
261
264
  { LLM_ARCH_DBRX, "dbrx" },
262
265
  { LLM_ARCH_OLMO, "olmo" },
266
+ { LLM_ARCH_OLMOE, "olmoe" },
263
267
  { LLM_ARCH_OPENELM, "openelm" },
264
268
  { LLM_ARCH_ARCTIC, "arctic" },
265
269
  { LLM_ARCH_DEEPSEEK2, "deepseek2" },
@@ -1045,6 +1049,29 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1045
1049
  { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
1046
1050
  },
1047
1051
  },
1052
+ {
1053
+ LLM_ARCH_MINICPM3,
1054
+ {
1055
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1056
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1057
+ { LLM_TENSOR_OUTPUT, "output" },
1058
+ { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
1059
+ { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
1060
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1061
+ { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
1062
+ { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
1063
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1064
+ { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
1065
+ { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
1066
+ { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
1067
+ { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
1068
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1069
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1070
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
1071
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1072
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
1073
+ },
1074
+ },
1048
1075
  {
1049
1076
  LLM_ARCH_GEMMA,
1050
1077
  {
@@ -1179,6 +1206,26 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
1179
1206
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
1180
1207
  },
1181
1208
  },
1209
+ {
1210
+ LLM_ARCH_OLMOE,
1211
+ {
1212
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
1213
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
1214
+ { LLM_TENSOR_OUTPUT, "output" },
1215
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
1216
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
1217
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
1218
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
1219
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
1220
+ { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
1221
+ { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
1222
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
1223
+ { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
1224
+ { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
1225
+ { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
1226
+ { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
1227
+ },
1228
+ },
1182
1229
  {
1183
1230
  LLM_ARCH_OPENELM,
1184
1231
  {
@@ -2167,6 +2214,10 @@ static lm_ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buf
2167
2214
  if (host_buffer) {
2168
2215
  buft = lm_ggml_backend_sycl_host_buffer_type();
2169
2216
  }
2217
+ #elif defined(LM_GGML_USE_CANN)
2218
+ if (host_buffer) {
2219
+ buft = lm_ggml_backend_cann_host_buffer_type();
2220
+ }
2170
2221
  #elif defined(LM_GGML_USE_CPU_HBM)
2171
2222
  buft = lm_ggml_backend_cpu_hbm_buffer_type();
2172
2223
  #elif defined(LM_GGML_USE_VULKAN)
@@ -2259,6 +2310,7 @@ enum e_model {
2259
2310
  MODEL_MEDIUM,
2260
2311
  MODEL_LARGE,
2261
2312
  MODEL_XL,
2313
+ MODEL_A1_7B,
2262
2314
  MODEL_A2_7B,
2263
2315
  MODEL_8x7B,
2264
2316
  MODEL_8x22B,
@@ -2493,6 +2545,7 @@ struct llama_cparams {
2493
2545
  bool causal_attn;
2494
2546
  bool offload_kqv;
2495
2547
  bool flash_attn;
2548
+ bool no_perf;
2496
2549
 
2497
2550
  enum llama_pooling_type pooling_type;
2498
2551
 
@@ -5222,6 +5275,7 @@ static const char * llama_model_type_name(e_model type) {
5222
5275
  case MODEL_MEDIUM: return "0.4B";
5223
5276
  case MODEL_LARGE: return "0.8B";
5224
5277
  case MODEL_XL: return "1.5B";
5278
+ case MODEL_A1_7B: return "A1.7B";
5225
5279
  case MODEL_A2_7B: return "A2.7B";
5226
5280
  case MODEL_8x7B: return "8x7B";
5227
5281
  case MODEL_8x22B: return "8x22B";
@@ -5396,6 +5450,17 @@ static void llm_load_hparams(
5396
5450
  default: model.type = e_model::MODEL_UNKNOWN;
5397
5451
  }
5398
5452
  } break;
5453
+ case LLM_ARCH_MINICPM3:
5454
+ {
5455
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5456
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
5457
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
5458
+
5459
+ switch (hparams.n_layer) {
5460
+ case 62: model.type = e_model::MODEL_4B; break;
5461
+ default: model.type = e_model::MODEL_UNKNOWN;
5462
+ }
5463
+ } break;
5399
5464
  case LLM_ARCH_GROK:
5400
5465
  {
5401
5466
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5761,6 +5826,14 @@ static void llm_load_hparams(
5761
5826
  default: model.type = e_model::MODEL_UNKNOWN;
5762
5827
  }
5763
5828
  } break;
5829
+ case LLM_ARCH_OLMOE:
5830
+ {
5831
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
5832
+ switch (hparams.n_layer) {
5833
+ case 16: model.type = e_model::MODEL_A1_7B; break;
5834
+ default: model.type = e_model::MODEL_UNKNOWN;
5835
+ }
5836
+ } break;
5764
5837
  case LLM_ARCH_OPENELM:
5765
5838
  {
5766
5839
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -6668,8 +6741,6 @@ static bool llm_load_tensors(
6668
6741
  bool use_mlock,
6669
6742
  llama_progress_callback progress_callback,
6670
6743
  void * progress_callback_user_data) {
6671
- model.t_start_us = lm_ggml_time_us();
6672
-
6673
6744
  auto & hparams = model.hparams;
6674
6745
 
6675
6746
  model.split_mode = split_mode;
@@ -6905,6 +6976,54 @@ static bool llm_load_tensors(
6905
6976
  }
6906
6977
  }
6907
6978
  } break;
6979
+ case LLM_ARCH_MINICPM3:
6980
+ {
6981
+ const int64_t n_embd_head_qk_rope = hparams.n_rot;
6982
+ const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
6983
+
6984
+ const int64_t q_lora_rank = hparams.n_lora_q;
6985
+ const int64_t kv_lora_rank = hparams.n_lora_kv;
6986
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
6987
+
6988
+ // output
6989
+ {
6990
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
6991
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
6992
+
6993
+ // if output is NULL, init from the input tok embed
6994
+ if (model.output == NULL) {
6995
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
6996
+ }
6997
+ }
6998
+
6999
+ for (int i = 0; i < n_layer; ++i) {
7000
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
7001
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
7002
+
7003
+ auto & layer = model.layers[i];
7004
+
7005
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
7006
+ layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
7007
+
7008
+ layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});
7009
+
7010
+ layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
7011
+ layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
7012
+
7013
+ layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
7014
+ layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
7015
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});
7016
+
7017
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
7018
+
7019
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
7020
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
7021
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
7022
+
7023
+ layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7024
+ layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
7025
+ }
7026
+ } break;
6908
7027
  case LLM_ARCH_GROK:
6909
7028
  {
6910
7029
  if (n_expert == 0) {
@@ -7942,6 +8061,44 @@ static bool llm_load_tensors(
7942
8061
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
7943
8062
  }
7944
8063
  } break;
8064
+ case LLM_ARCH_OLMOE:
8065
+ {
8066
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
8067
+
8068
+ // output
8069
+ {
8070
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
8071
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
8072
+ }
8073
+
8074
+ for (int i = 0; i < n_layer; ++i) {
8075
+ lm_ggml_context * ctx_layer = ctx_for_layer(i);
8076
+ lm_ggml_context * ctx_split = ctx_for_layer_split(i);
8077
+
8078
+ auto & layer = model.layers[i];
8079
+
8080
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
8081
+
8082
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
8083
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
8084
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
8085
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
8086
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd});
8087
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd});
8088
+
8089
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
8090
+
8091
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
8092
+
8093
+ LM_GGML_ASSERT(n_expert > 0);
8094
+ LM_GGML_ASSERT(n_expert_used > 0);
8095
+
8096
+ // MoE branch
8097
+ layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
8098
+ layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
8099
+ layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
8100
+ }
8101
+ } break;
7945
8102
  case LLM_ARCH_OPENELM:
7946
8103
  {
7947
8104
  model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -8600,14 +8757,13 @@ static bool llm_load_tensors(
8600
8757
  }
8601
8758
  }
8602
8759
 
8603
- // loading time will be recalculate after the first eval, so
8604
- // we take page faults deferred by mmap() into consideration
8605
- model.t_load_us = lm_ggml_time_us() - model.t_start_us;
8606
8760
  return true;
8607
8761
  }
8608
8762
 
8609
8763
  // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
8610
8764
  static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
8765
+ model.t_start_us = lm_ggml_time_us();
8766
+
8611
8767
  try {
8612
8768
  llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
8613
8769
 
@@ -8669,6 +8825,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
8669
8825
  return -1;
8670
8826
  }
8671
8827
 
8828
+ // loading time will be recalculate after the first eval, so
8829
+ // we take page faults deferred by mmap() into consideration
8830
+ model.t_load_us = lm_ggml_time_us() - model.t_start_us;
8831
+
8672
8832
  return 0;
8673
8833
  }
8674
8834
 
@@ -9269,7 +9429,7 @@ static struct lm_ggml_tensor * llm_build_copy_mask_state(
9269
9429
  // FIXME: zero-out NANs?
9270
9430
  states = lm_ggml_mul(ctx, states, state_mask);
9271
9431
 
9272
- // copy states which won't be changed further (between n_seqs and n_rs)
9432
+ // copy states which won't be changed further (between n_seqs and n_kv)
9273
9433
  lm_ggml_build_forward_expand(graph,
9274
9434
  lm_ggml_cpy(ctx,
9275
9435
  lm_ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*lm_ggml_element_size(states)),
@@ -9422,7 +9582,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9422
9582
  struct lm_ggml_tensor * cur,
9423
9583
  struct lm_ggml_tensor * x_prev,
9424
9584
  struct lm_ggml_tensor ** wkv_state) {
9425
- size_t n_embed = cur->ne[0];
9585
+ size_t n_embd = cur->ne[0];
9426
9586
  size_t n_seq_tokens = cur->ne[1];
9427
9587
  size_t n_seqs = cur->ne[2];
9428
9588
 
@@ -9433,8 +9593,8 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9433
9593
 
9434
9594
  struct lm_ggml_tensor * sx = lm_ggml_sub(ctx, x_prev, cur);
9435
9595
 
9436
- sx = lm_ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
9437
- cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
9596
+ sx = lm_ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
9597
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
9438
9598
 
9439
9599
  struct lm_ggml_tensor * xxx = lm_ggml_add(ctx, lm_ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
9440
9600
 
@@ -9459,11 +9619,11 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9459
9619
  xxx
9460
9620
  );
9461
9621
 
9462
- struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
9463
- struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
9464
- struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
9465
- struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
9466
- struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
9622
+ struct lm_ggml_tensor *mw = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
9623
+ struct lm_ggml_tensor *mk = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
9624
+ struct lm_ggml_tensor *mv = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
9625
+ struct lm_ggml_tensor *mr = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
9626
+ struct lm_ggml_tensor *mg = lm_ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
9467
9627
 
9468
9628
  struct lm_ggml_tensor * xw = lm_ggml_add(
9469
9629
  ctx,
@@ -9532,7 +9692,7 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9532
9692
  )
9533
9693
  );
9534
9694
 
9535
- w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
9695
+ w = lm_ggml_add(ctx, w, lm_ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
9536
9696
  w = lm_ggml_exp(ctx, lm_ggml_neg(ctx, lm_ggml_exp(ctx, w)));
9537
9697
  w = lm_ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
9538
9698
 
@@ -9541,21 +9701,21 @@ static struct lm_ggml_tensor * llm_build_rwkv6_time_mix(
9541
9701
  r = lm_ggml_transpose(ctx, r);
9542
9702
 
9543
9703
  struct lm_ggml_tensor * wkv_output = lm_ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
9544
- cur = lm_ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
9545
- *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
9704
+ cur = lm_ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
9705
+ *wkv_state = lm_ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
9546
9706
 
9547
9707
  // group norm with head_count groups
9548
- cur = lm_ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
9708
+ cur = lm_ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
9549
9709
  cur = lm_ggml_norm(ctx, cur, 64e-5f);
9550
9710
 
9551
9711
  // Convert back to regular vectors.
9552
- cur = lm_ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
9712
+ cur = lm_ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
9553
9713
  cur = lm_ggml_add(ctx, lm_ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
9554
9714
 
9555
9715
  cur = lm_ggml_mul(ctx, cur, g);
9556
9716
  cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
9557
9717
 
9558
- return lm_ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
9718
+ return lm_ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
9559
9719
  }
9560
9720
 
9561
9721
  static struct lm_ggml_tensor * llm_build_rwkv6_channel_mix(
@@ -9888,8 +10048,8 @@ struct llm_build_context {
9888
10048
  struct lm_ggml_cgraph * append_pooling(struct lm_ggml_cgraph * gf) {
9889
10049
  // find result_norm tensor for input
9890
10050
  struct lm_ggml_tensor * inp = nullptr;
9891
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
9892
- inp = gf->nodes[i];
10051
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
10052
+ inp = lm_ggml_graph_node(gf, i);
9893
10053
  if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
9894
10054
  break;
9895
10055
  } else {
@@ -12848,6 +13008,215 @@ struct llm_build_context {
12848
13008
  return gf;
12849
13009
  }
12850
13010
 
13011
+ struct lm_ggml_cgraph * build_minicpm3() {
13012
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13013
+
13014
+ //TODO: if the model varies, these parameters need to be read from the model
13015
+ const int64_t n_embd_base = 256;
13016
+ const float scale_embd = 12.0f;
13017
+ const float scale_depth = 1.4f;
13018
+ const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
13019
+
13020
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
13021
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
13022
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;
13023
+
13024
+ struct lm_ggml_tensor * cur;
13025
+ struct lm_ggml_tensor * inpL;
13026
+
13027
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
13028
+
13029
+ // scale the input embeddings
13030
+ inpL = lm_ggml_scale(ctx0, inpL, scale_embd);
13031
+ cb(inpL, "inp_scaled", -1);
13032
+
13033
+ // inp_pos - contains the positions
13034
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
13035
+
13036
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13037
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13038
+
13039
+ for (int il = 0; il < n_layer; ++il) {
13040
+ struct lm_ggml_tensor * inpSA = inpL;
13041
+
13042
+ struct lm_ggml_tensor * rope_factors = build_rope_factors(il);
13043
+ // norm
13044
+ cur = llm_build_norm(ctx0, inpL, hparams,
13045
+ model.layers[il].attn_norm, NULL,
13046
+ LLM_NORM_RMS, cb, il);
13047
+ cb(cur, "attn_norm", il);
13048
+
13049
+ // self_attention
13050
+ {
13051
+ struct lm_ggml_tensor * q = NULL;
13052
+ // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
13053
+ q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
13054
+ cb(q, "q", il);
13055
+
13056
+ q = llm_build_norm(ctx0, q, hparams,
13057
+ model.layers[il].attn_q_a_norm, NULL,
13058
+ LLM_NORM_RMS, cb, il);
13059
+ cb(q, "q", il);
13060
+
13061
+ // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
13062
+ q = lm_ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
13063
+ cb(q, "q", il);
13064
+
13065
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
13066
+ struct lm_ggml_tensor * q_nope = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
13067
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k),
13068
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13069
+ 0);
13070
+ cb(q_nope, "q_nope", il);
13071
+
13072
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
13073
+ struct lm_ggml_tensor * q_pe = lm_ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
13074
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k),
13075
+ lm_ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
13076
+ lm_ggml_row_size(q->type, n_embd_head_qk_nope));
13077
+ cb(q_pe, "q_pe", il);
13078
+
13079
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
13080
+ struct lm_ggml_tensor * kv_pe_compresseed = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
13081
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
13082
+
13083
+ // split into {kv_lora_rank, n_tokens}
13084
+ struct lm_ggml_tensor * kv_compressed = lm_ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
13085
+ kv_pe_compresseed->nb[1],
13086
+ 0);
13087
+ cb(kv_compressed, "kv_compressed", il);
13088
+
13089
+ // and {n_embd_head_qk_rope, n_tokens}
13090
+ struct lm_ggml_tensor * k_pe = lm_ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
13091
+ kv_pe_compresseed->nb[1],
13092
+ kv_pe_compresseed->nb[1],
13093
+ lm_ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
13094
+ cb(k_pe, "k_pe", il);
13095
+
13096
+ kv_compressed = lm_ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
13097
+ kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
13098
+ model.layers[il].attn_kv_a_norm, NULL,
13099
+ LLM_NORM_RMS, cb, il);
13100
+ cb(kv_compressed, "kv_compressed", il);
13101
+
13102
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
13103
+ struct lm_ggml_tensor * kv = lm_ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
13104
+ cb(kv, "kv", il);
13105
+
13106
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
13107
+ struct lm_ggml_tensor * k_nope = lm_ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
13108
+ lm_ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
13109
+ lm_ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
13110
+ 0);
13111
+ cb(k_nope, "k_nope", il);
13112
+
13113
+ // and {n_head * n_embd_head_v, n_tokens}
13114
+ struct lm_ggml_tensor * v_states = lm_ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
13115
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
13116
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
13117
+ lm_ggml_row_size(kv->type, (n_embd_head_qk_nope)));
13118
+ cb(v_states, "v_states", il);
13119
+
13120
+ v_states = lm_ggml_cont(ctx0, v_states);
13121
+ cb(v_states, "v_states", il);
13122
+
13123
+ v_states = lm_ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
13124
+ lm_ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
13125
+ 0);
13126
+ cb(v_states, "v_states", il);
13127
+
13128
+ q_pe = lm_ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
13129
+ q_pe = lm_ggml_rope_ext(
13130
+ ctx0, q_pe, inp_pos, rope_factors,
13131
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13132
+ ext_factor, attn_factor, beta_fast, beta_slow
13133
+ );
13134
+ cb(q_pe, "q_pe", il);
13135
+
13136
+ // shared RoPE key
13137
+ k_pe = lm_ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
13138
+ k_pe = lm_ggml_rope_ext(
13139
+ ctx0, k_pe, inp_pos, rope_factors,
13140
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13141
+ ext_factor, attn_factor, beta_fast, beta_slow
13142
+ );
13143
+ cb(k_pe, "k_pe", il);
13144
+
13145
+ struct lm_ggml_tensor * q_states = lm_ggml_concat(ctx0, q_nope, q_pe, 0);
13146
+ cb(q_states, "q_states", il);
13147
+
13148
+ struct lm_ggml_tensor * k_states = lm_ggml_concat(ctx0, k_nope, lm_ggml_repeat(ctx0, k_pe, q_pe), 0);
13149
+ cb(k_states, "k_states", il);
13150
+
13151
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13152
+ model.layers[il].wo, NULL,
13153
+ k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
13154
+ }
13155
+
13156
+ if (il == n_layer - 1) {
13157
+ // skip computing output for unused tokens
13158
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13159
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13160
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13161
+ }
13162
+
13163
+ // scale_res - scale the hidden states for residual connection
13164
+ const float scale_res = scale_depth/sqrtf(float(n_layer));
13165
+ cur = lm_ggml_scale(ctx0, cur, scale_res);
13166
+ cb(cur, "hidden_scaled", il);
13167
+
13168
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
13169
+ cb(ffn_inp, "ffn_inp", il);
13170
+
13171
+ // feed-forward network
13172
+ {
13173
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
13174
+ model.layers[il].ffn_norm, NULL,
13175
+ LLM_NORM_RMS, cb, il);
13176
+ cb(cur, "ffn_norm", il);
13177
+
13178
+ cur = llm_build_ffn(ctx0, lctx, cur,
13179
+ model.layers[il].ffn_up, NULL, NULL,
13180
+ model.layers[il].ffn_gate, NULL, NULL,
13181
+ model.layers[il].ffn_down, NULL, NULL,
13182
+ NULL,
13183
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
13184
+ cb(cur, "ffn_out", il);
13185
+ }
13186
+
13187
+ // scale the hidden states for residual connection
13188
+ cur = lm_ggml_scale(ctx0, cur, scale_res);
13189
+ cb(cur, "hidden_scaled_ffn", il);
13190
+
13191
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
13192
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
13193
+ cb(cur, "l_out", il);
13194
+
13195
+ // input for next layer
13196
+ inpL = cur;
13197
+ }
13198
+
13199
+ cur = inpL;
13200
+
13201
+ cur = llm_build_norm(ctx0, cur, hparams,
13202
+ model.output_norm, NULL,
13203
+ LLM_NORM_RMS, cb, -1);
13204
+ cb(cur, "result_norm", -1);
13205
+
13206
+ // lm_head scaling
13207
+ const float scale_lmhead = float(n_embd_base)/float(n_embd);
13208
+ cur = lm_ggml_scale(ctx0, cur, scale_lmhead);
13209
+ cb(cur, "lmhead_scaling", -1);
13210
+
13211
+ // lm_head
13212
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
13213
+ cb(cur, "result_output", -1);
13214
+
13215
+ lm_ggml_build_forward_expand(gf, cur);
13216
+
13217
+ return gf;
13218
+ }
13219
+
12851
13220
  struct lm_ggml_cgraph * build_gemma() {
12852
13221
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12853
13222
 
@@ -13544,6 +13913,134 @@ struct llm_build_context {
13544
13913
  return gf;
13545
13914
  }
13546
13915
 
13916
+ // based on the build_qwen2moe() function, changes:
13917
+ // * removed shared experts
13918
+ // * removed bias
13919
+ // * added q, k norm
13920
+ struct lm_ggml_cgraph * build_olmoe() {
13921
+ struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13922
+
13923
+ // mutable variable, needed during the last layer of the computation to skip unused tokens
13924
+ int32_t n_tokens = this->n_tokens;
13925
+
13926
+ const int64_t n_embd_head = hparams.n_embd_head_v;
13927
+ LM_GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
13928
+ LM_GGML_ASSERT(n_embd_head == hparams.n_rot);
13929
+
13930
+ struct lm_ggml_tensor * cur;
13931
+ struct lm_ggml_tensor * inpL;
13932
+
13933
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
13934
+
13935
+ // inp_pos - contains the positions
13936
+ struct lm_ggml_tensor * inp_pos = build_inp_pos();
13937
+
13938
+ // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
13939
+ struct lm_ggml_tensor * KQ_mask = build_inp_KQ_mask();
13940
+
13941
+ for (int il = 0; il < n_layer; ++il) {
13942
+ struct lm_ggml_tensor * inpSA = inpL;
13943
+
13944
+ // norm
13945
+ cur = llm_build_norm(ctx0, inpL, hparams,
13946
+ model.layers[il].attn_norm, NULL,
13947
+ LLM_NORM_RMS, cb, il);
13948
+ cb(cur, "attn_norm", il);
13949
+
13950
+ // self_attention
13951
+ {
13952
+ // compute Q and K and RoPE them
13953
+ struct lm_ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
13954
+ cb(Qcur, "Qcur", il);
13955
+
13956
+ struct lm_ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
13957
+ cb(Kcur, "Kcur", il);
13958
+
13959
+ struct lm_ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
13960
+ cb(Vcur, "Vcur", il);
13961
+
13962
+ Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
13963
+ LLM_NORM_RMS, cb, il);
13964
+ cb(Qcur, "Qcur_normed", il);
13965
+
13966
+ Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
13967
+ LLM_NORM_RMS, cb, il);
13968
+ cb(Kcur, "Kcur_normed", il);
13969
+
13970
+ Qcur = lm_ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
13971
+ Kcur = lm_ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
13972
+
13973
+ Qcur = lm_ggml_rope_ext(
13974
+ ctx0, Qcur, inp_pos, nullptr,
13975
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13976
+ ext_factor, attn_factor, beta_fast, beta_slow
13977
+ );
13978
+ cb(Qcur, "Qcur_rope", il);
13979
+
13980
+ Kcur = lm_ggml_rope_ext(
13981
+ ctx0, Kcur, inp_pos, nullptr,
13982
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
13983
+ ext_factor, attn_factor, beta_fast, beta_slow
13984
+ );
13985
+ cb(Kcur, "Kcur_rope", il);
13986
+
13987
+ cur = llm_build_kv(ctx0, lctx, kv_self, gf,
13988
+ model.layers[il].wo, NULL,
13989
+ Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
13990
+ }
13991
+
13992
+ if (il == n_layer - 1) {
13993
+ // skip computing output for unused tokens
13994
+ struct lm_ggml_tensor * inp_out_ids = build_inp_out_ids();
13995
+ n_tokens = n_outputs;
13996
+ cur = lm_ggml_get_rows(ctx0, cur, inp_out_ids);
13997
+ inpSA = lm_ggml_get_rows(ctx0, inpSA, inp_out_ids);
13998
+ }
13999
+
14000
+ struct lm_ggml_tensor * ffn_inp = lm_ggml_add(ctx0, cur, inpSA);
14001
+ cb(ffn_inp, "ffn_inp", il);
14002
+
14003
+ // MoE branch
14004
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
14005
+ model.layers[il].ffn_norm, NULL,
14006
+ LLM_NORM_RMS, cb, il);
14007
+ cb(cur, "ffn_norm", il);
14008
+
14009
+ cur = llm_build_moe_ffn(ctx0, lctx, cur,
14010
+ model.layers[il].ffn_gate_inp,
14011
+ model.layers[il].ffn_up_exps,
14012
+ model.layers[il].ffn_gate_exps,
14013
+ model.layers[il].ffn_down_exps,
14014
+ n_expert, n_expert_used,
14015
+ LLM_FFN_SILU, false,
14016
+ false, 0.0,
14017
+ cb, il);
14018
+ cb(cur, "ffn_moe_out", il);
14019
+
14020
+ cur = lm_ggml_add(ctx0, cur, ffn_inp);
14021
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
14022
+ cb(cur, "l_out", il);
14023
+
14024
+ // input for next layer
14025
+ inpL = cur;
14026
+ }
14027
+
14028
+ cur = inpL;
14029
+
14030
+ cur = llm_build_norm(ctx0, cur, hparams,
14031
+ model.output_norm, NULL,
14032
+ LLM_NORM_RMS, cb, -1);
14033
+ cb(cur, "result_norm", -1);
14034
+
14035
+ // lm_head
14036
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
14037
+ cb(cur, "result_output", -1);
14038
+
14039
+ lm_ggml_build_forward_expand(gf, cur);
14040
+
14041
+ return gf;
14042
+ }
14043
+
13547
14044
  struct lm_ggml_cgraph * build_openelm() {
13548
14045
  struct lm_ggml_cgraph * gf = lm_ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
13549
14046
 
@@ -15388,6 +15885,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
15388
15885
  {
15389
15886
  result = llm.build_minicpm();
15390
15887
  } break;
15888
+ case LLM_ARCH_MINICPM3:
15889
+ {
15890
+ result = llm.build_minicpm3();
15891
+ } break;
15391
15892
  case LLM_ARCH_GEMMA:
15392
15893
  {
15393
15894
  result = llm.build_gemma();
@@ -15420,6 +15921,10 @@ static struct lm_ggml_cgraph * llama_build_graph(
15420
15921
  {
15421
15922
  result = llm.build_olmo();
15422
15923
  } break;
15924
+ case LLM_ARCH_OLMOE:
15925
+ {
15926
+ result = llm.build_olmoe();
15927
+ } break;
15423
15928
  case LLM_ARCH_OPENELM:
15424
15929
  {
15425
15930
  result = llm.build_openelm();
@@ -15831,7 +16336,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
15831
16336
 
15832
16337
  // clear unused states
15833
16338
  for (int i = 0; i < n_kv; ++i) {
15834
- uint32_t cell_id = i + kv_self.head;
16339
+ const uint32_t cell_id = i + kv_self.head;
15835
16340
  llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
15836
16341
 
15837
16342
  data[i] = (float) (kv_cell.src >= 0);
@@ -16087,19 +16592,21 @@ static int llama_decode_internal(
16087
16592
  return -1;
16088
16593
  }
16089
16594
 
16090
- for (uint32_t i = 0; i < n_tokens_all; ++i) {
16091
- if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
16092
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
16093
- return -1;
16094
- }
16095
- }
16096
-
16097
16595
  const auto & model = lctx.model;
16098
16596
  const auto & hparams = model.hparams;
16099
16597
  const auto & cparams = lctx.cparams;
16100
16598
 
16101
16599
  LM_GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
16102
16600
 
16601
+ if (batch_all.token) {
16602
+ for (uint32_t i = 0; i < n_tokens_all; ++i) {
16603
+ if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) {
16604
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
16605
+ return -1;
16606
+ }
16607
+ }
16608
+ }
16609
+
16103
16610
  LM_GGML_ASSERT(n_tokens_all <= cparams.n_batch);
16104
16611
 
16105
16612
  LM_GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
@@ -16216,8 +16723,8 @@ static int llama_decode_internal(
16216
16723
  lm_ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
16217
16724
 
16218
16725
  // the output is always the last tensor in the graph
16219
- struct lm_ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
16220
- struct lm_ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
16726
+ struct lm_ggml_tensor * res = lm_ggml_graph_node(gf, -1);
16727
+ struct lm_ggml_tensor * embd = lm_ggml_graph_node(gf, -2);
16221
16728
 
16222
16729
  if (lctx.n_outputs == 0) {
16223
16730
  // no output
@@ -16226,9 +16733,9 @@ static int llama_decode_internal(
16226
16733
  } else if (cparams.embeddings) {
16227
16734
  res = nullptr; // do not extract logits for embedding case
16228
16735
  embd = nullptr;
16229
- for (int i = gf->n_nodes - 1; i >= 0; --i) {
16230
- if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
16231
- embd = gf->nodes[i];
16736
+ for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
16737
+ if (strcmp(lm_ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
16738
+ embd = lm_ggml_graph_node(gf, i);
16232
16739
  break;
16233
16740
  }
16234
16741
  }
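
Several hunks in this release stop reaching into lm_ggml_cgraph internals (gf->nodes[i], gf->n_nodes) and go through the graph accessor functions instead. A minimal sketch of the resulting pattern, assuming only the lm_ggml_graph_n_nodes()/lm_ggml_graph_node() signatures visible in this diff and an already-built graph gf:

    // walk the graph back-to-front via the accessors instead of gf->nodes[i]
    for (int i = lm_ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct lm_ggml_tensor * node = lm_ggml_graph_node(gf, i);
        if (strcmp(node->name, "result_embd_pooled") == 0) {
            // found the pooled-embeddings output
            break;
        }
    }
    // negative indices count from the end: lm_ggml_graph_node(gf, -1) is the last node
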
@@ -16386,19 +16893,21 @@ static int llama_encode_internal(
16386
16893
  return -1;
16387
16894
  }
16388
16895
 
16389
- for (uint32_t i = 0; i < n_tokens; ++i) {
16390
- if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
16391
- LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16392
- return -1;
16393
- }
16394
- }
16395
-
16396
16896
  const auto & model = lctx.model;
16397
16897
  const auto & hparams = model.hparams;
16398
16898
  const auto & cparams = lctx.cparams;
16399
16899
 
16400
16900
  LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
16401
16901
 
16902
+ if (batch.token) {
16903
+ for (uint32_t i = 0; i < n_tokens; ++i) {
16904
+ if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
16905
+ LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
16906
+ return -1;
16907
+ }
16908
+ }
16909
+ }
16910
+
16402
16911
  // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
16403
16912
  LM_GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
16404
16913
 
@@ -16443,15 +16952,15 @@ static int llama_encode_internal(
16443
16952
  // there are two cases here
16444
16953
  if (llama_model_has_decoder(&lctx.model)) {
16445
16954
  // first case is an encoder-decoder T5 model where embeddings are passed to decoder
16446
- embd = gf->nodes[gf->n_nodes - 1];
16955
+ embd = lm_ggml_graph_node(gf, -1);
16447
16956
  LM_GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
16448
16957
  } else {
16449
16958
  // second case is an encoder-only T5 model
16450
16959
  if (cparams.embeddings) {
16451
16960
  // only output embeddings if required
16452
- embd = gf->nodes[gf->n_nodes - 1];
16961
+ embd = lm_ggml_graph_node(gf, -1);
16453
16962
  if (strcmp(embd->name, "result_embd_pooled") != 0) {
16454
- embd = gf->nodes[gf->n_nodes - 2];
16963
+ embd = lm_ggml_graph_node(gf, -2);
16455
16964
  }
16456
16965
  LM_GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
16457
16966
  }
@@ -17541,6 +18050,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
17541
18050
  quantize &= name.find("time_mix_first.weight") == std::string::npos;
17542
18051
  quantize &= name.find("time_mix_w1.weight") == std::string::npos;
17543
18052
  quantize &= name.find("time_mix_w2.weight") == std::string::npos;
18053
+ quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
18054
+ quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
17544
18055
 
17545
18056
  // do not quantize relative position bias (T5)
17546
18057
  quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -17950,6 +18461,7 @@ struct llama_context_params llama_context_default_params() {
17950
18461
  /*.embeddings =*/ false,
17951
18462
  /*.offload_kqv =*/ true,
17952
18463
  /*.flash_attn =*/ false,
18464
+ /*.no_perf =*/ true,
17953
18465
  /*.abort_callback =*/ nullptr,
17954
18466
  /*.abort_callback_data =*/ nullptr,
17955
18467
  };
@@ -18072,9 +18584,9 @@ struct llama_model * llama_load_model_from_file(
18072
18584
  unsigned percentage = (unsigned) (100 * progress);
18073
18585
  while (percentage > *cur_percentage_p) {
18074
18586
  *cur_percentage_p = percentage;
18075
- LLAMA_LOG_INFO(".");
18587
+ LLAMA_LOG(".");
18076
18588
  if (percentage >= 100) {
18077
- LLAMA_LOG_INFO("\n");
18589
+ LLAMA_LOG("\n");
18078
18590
  }
18079
18591
  }
18080
18592
  return true;
@@ -18160,6 +18672,7 @@ struct llama_context * llama_new_context_with_model(
18160
18672
  cparams.embeddings = params.embeddings;
18161
18673
  cparams.offload_kqv = params.offload_kqv;
18162
18674
  cparams.flash_attn = params.flash_attn;
18675
+ cparams.no_perf = params.no_perf;
18163
18676
  cparams.pooling_type = params.pooling_type;
18164
18677
 
18165
18678
  cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
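
The new no_perf flag defaults to true in llama_context_default_params() (see the hunk above) and is copied into cparams here; llama_synchronize() then skips accumulating eval timings while it is set. A hedged sketch of opting back into timing collection, assuming model is an already-loaded llama_model * and that the public llama_context_params struct exposes the same field:

    struct llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false;  // re-enable t_eval_us / t_p_eval_us accumulation (skipped by default now)
    struct llama_context * lctx = llama_new_context_with_model(model, cparams);
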
@@ -18497,7 +19010,7 @@ struct llama_context * llama_new_context_with_model(
18497
19010
 
18498
19011
  // note: the number of splits during measure is higher than during inference due to the kv shift
18499
19012
  int n_splits = lm_ggml_backend_sched_get_n_splits(ctx->sched);
18500
- LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
19013
+ LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, lm_ggml_graph_n_nodes(gf));
18501
19014
  LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
18502
19015
  }
18503
19016
  }
@@ -18596,6 +19109,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
18596
19109
  case LLM_ARCH_QWEN:
18597
19110
  case LLM_ARCH_QWEN2:
18598
19111
  case LLM_ARCH_QWEN2MOE:
19112
+ case LLM_ARCH_OLMOE:
18599
19113
  case LLM_ARCH_PHI2:
18600
19114
  case LLM_ARCH_PHI3:
18601
19115
  case LLM_ARCH_GEMMA:
@@ -18606,6 +19120,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
18606
19120
  case LLM_ARCH_CODESHELL:
18607
19121
  case LLM_ARCH_NEMOTRON:
18608
19122
  case LLM_ARCH_EXAONE:
19123
+ case LLM_ARCH_MINICPM3:
18609
19124
  return LLAMA_ROPE_TYPE_NEOX;
18610
19125
 
18611
19126
  // all model arches should be listed explicitly here
@@ -20078,10 +20593,14 @@ void llama_synchronize(struct llama_context * ctx) {
20078
20593
 
20079
20594
  // add the evaluation to the stats
20080
20595
  if (ctx->n_queued_tokens == 1) {
20081
- ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20596
+ if (!ctx->cparams.no_perf) {
20597
+ ctx->t_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20598
+ }
20082
20599
  ctx->n_eval++;
20083
20600
  } else if (ctx->n_queued_tokens > 1) {
20084
- ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20601
+ if (!ctx->cparams.no_perf) {
20602
+ ctx->t_p_eval_us += lm_ggml_time_us() - ctx->t_compute_start_us;
20603
+ }
20085
20604
  ctx->n_p_eval += ctx->n_queued_tokens;
20086
20605
  }
20087
20606
 
@@ -20677,6 +21196,7 @@ const char * llama_print_system_info(void) {
20677
21196
  s += "ARM_FMA = " + std::to_string(lm_ggml_cpu_has_arm_fma()) + " | ";
20678
21197
  s += "F16C = " + std::to_string(lm_ggml_cpu_has_f16c()) + " | ";
20679
21198
  s += "FP16_VA = " + std::to_string(lm_ggml_cpu_has_fp16_va()) + " | ";
21199
+ s += "RISCV_VECT = " + std::to_string(lm_ggml_cpu_has_riscv_v()) + " | ";
20680
21200
  s += "WASM_SIMD = " + std::to_string(lm_ggml_cpu_has_wasm_simd()) + " | ";
20681
21201
  s += "BLAS = " + std::to_string(lm_ggml_cpu_has_blas()) + " | ";
20682
21202
  s += "SSE3 = " + std::to_string(lm_ggml_cpu_has_sse3()) + " | ";
@@ -20688,65 +21208,40 @@ const char * llama_print_system_info(void) {
20688
21208
  return s.c_str();
20689
21209
  }
20690
21210
 
20691
- void llama_perf_print(const void * ctx, enum llama_perf_type type) {
20692
- switch (type) {
20693
- case LLAMA_PERF_TYPE_CONTEXT:
20694
- {
20695
- const auto * p = (const struct llama_context *) ctx;
20696
-
20697
- const double t_start_ms = 1e-3 * p->t_start_us;
20698
- const double t_end_ms = 1.00 * lm_ggml_time_ms();
20699
- const double t_load_ms = 1e-3 * p->t_load_us;
20700
- const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
20701
- const double t_eval_ms = 1e-3 * p->t_eval_us;
20702
-
20703
- const int32_t n_p_eval = std::max(0, p->n_p_eval);
20704
- const int32_t n_eval = std::max(1, p->n_eval);
20705
-
20706
- LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
20707
- LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
20708
- __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
20709
- LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20710
- __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
20711
- LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
20712
- } break;
20713
- case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
20714
- {
20715
- const auto * smpl = (const struct llama_sampler *) ctx;
20716
- const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
21211
+ struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
21212
+ struct llama_perf_context_data data = {};
20717
21213
 
20718
- const double t_sampler_ms = 1e-3 * p->t_sample_us;
21214
+ if (ctx == nullptr) {
21215
+ return data;
21216
+ }
20719
21217
 
20720
- const int32_t n_sampler = std::max(0, p->n_sample);
21218
+ data.t_start_ms = 1e-3 * ctx->t_start_us;
21219
+ data.t_load_ms = 1e-3 * ctx->t_load_us;
21220
+ data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
21221
+ data.t_eval_ms = 1e-3 * ctx->t_eval_us;
21222
+ data.n_p_eval = std::max(1, ctx->n_p_eval);
21223
+ data.n_eval = std::max(1, ctx->n_eval);
20721
21224
 
20722
- LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
20723
- __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
20724
- } break;
20725
- default:
20726
- LM_GGML_ABORT("invalid perf type");
20727
- }
21225
+ return data;
20728
21226
  }
20729
21227
 
20730
- void llama_perf_reset(void * ctx, enum llama_perf_type type) {
20731
- switch (type) {
20732
- case LLAMA_PERF_TYPE_CONTEXT:
20733
- {
20734
- auto * p = (struct llama_context *) ctx;
21228
+ void llama_perf_context_print(const struct llama_context * ctx) {
21229
+ const auto data = llama_perf_context(ctx);
20735
21230
 
20736
- p->t_start_us = lm_ggml_time_us();
20737
- p->t_eval_us = p->n_eval = 0;
20738
- p->t_p_eval_us = p->n_p_eval = 0;
20739
- } break;
20740
- case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
20741
- {
20742
- auto * smpl = (struct llama_sampler *) ctx;
20743
- auto * p = (struct llama_sampler_chain *) smpl->ctx;
21231
+ const double t_end_ms = 1e-3 * lm_ggml_time_us();
20744
21232
 
20745
- p->t_sample_us = p->n_sample = 0;
20746
- } break;
20747
- default:
20748
- LM_GGML_ABORT("invalid perf type");
20749
- }
21233
+ LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
21234
+ LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
21235
+ __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
21236
+ LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
21237
+ __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
21238
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
21239
+ }
21240
+
21241
+ void llama_perf_context_reset(struct llama_context * ctx) {
21242
+ ctx->t_start_us = lm_ggml_time_us();
21243
+ ctx->t_eval_us = ctx->n_eval = 0;
21244
+ ctx->t_p_eval_us = ctx->n_p_eval = 0;
20750
21245
  }
20751
21246
 
20752
21247
  void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
@@ -20798,8 +21293,8 @@ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, v
20798
21293
  if (len < 128) {
20799
21294
  g_state.log_callback(level, buffer, g_state.log_callback_user_data);
20800
21295
  } else {
20801
- char* buffer2 = new char[len+1];
20802
- vsnprintf(buffer2, len+1, format, args_copy);
21296
+ char * buffer2 = new char[len + 1];
21297
+ vsnprintf(buffer2, len + 1, format, args_copy);
20803
21298
  buffer2[len] = 0;
20804
21299
  g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
20805
21300
  delete[] buffer2;
@@ -20821,19 +21316,3 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
20821
21316
  fflush(stderr);
20822
21317
  }
20823
21318
 
20824
- struct llama_token_timings llama_get_token_timings(const void * v_ctx) {
20825
- const auto * ctx = (llama_context *) v_ctx;
20826
- struct llama_token_timings result = {
20827
- /*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
20828
- /*.t_end_ms =*/ 1.00 * lm_ggml_time_ms(),
20829
- /*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
20830
- /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
20831
- /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
20832
-
20833
- /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
20834
- /*.n_eval =*/ std::max(1, ctx->n_eval),
20835
- };
20836
-
20837
- return result;
20838
- }
20839
-
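
The llama_perf_print()/llama_perf_reset() switch and the llama_token_timings accessor removed above are superseded by the context-scoped helpers added earlier in this diff. A brief usage sketch, based only on the signatures visible here and assuming lctx is a valid llama_context * with no_perf disabled:

    llama_perf_context_print(lctx);                                   // log load / prompt-eval / eval timings

    struct llama_perf_context_data perf = llama_perf_context(lctx);   // or read the raw numbers
    const double prompt_tps = 1e3 / perf.t_p_eval_ms * perf.n_p_eval; // prompt tokens per second

    llama_perf_context_reset(lctx);                                   // zero the counters for the next run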