@fugood/llama.node 1.3.0-rc.5 → 1.3.0

This diff shows the changes between two publicly released versions of the package, as published to their respective registries, and is provided for informational purposes only.
@@ -15,7 +15,6 @@

  #include <algorithm>
  #include <cassert>
- #include <cmath>
  #include <cfloat>
  #include <cstring>
  #include <cmath>
@@ -114,9 +113,12 @@ const char * llm_type_name(llm_type type) {
  case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
  case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
  case LLM_TYPE_A13B: return "A13B";
+ case LLM_TYPE_7B_A1B: return "7B.A1B";
  case LLM_TYPE_8B_A1B: return "8B.A1B";
+ case LLM_TYPE_16B_A1B: return "16B.A1B";
  case LLM_TYPE_21B_A3B: return "21B.A3B";
  case LLM_TYPE_30B_A3B: return "30B.A3B";
+ case LLM_TYPE_100B_A6B: return "100B.A6B";
  case LLM_TYPE_106B_A12B: return "106B.A12B";
  case LLM_TYPE_235B_A22B: return "235B.A22B";
  case LLM_TYPE_300B_A47B: return "300B.A47B";
@@ -401,6 +403,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
  // add the device default buffer type
  buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

+ // add the device extra buffer type (if any)
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+ ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
+
+ if (ggml_backend_dev_get_extra_bufts_fn) {
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
+ while (extra_bufts && *extra_bufts) {
+ buft_list.emplace_back(dev, *extra_bufts);
+ ++extra_bufts;
+ }
+ }
+
  return buft_list;
  }

@@ -422,7 +437,7 @@ struct llama_model::impl {
  llama_mlocks mlock_mmaps;

  // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
- std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+ std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;

  buft_list_t cpu_buft_list;
  std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -480,11 +495,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  return;
  }

- ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
- ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
- ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
- ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
- ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+ ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+ ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+ ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false);
+ ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);

  if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
  ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -500,8 +517,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  if (hparams.n_expert > 0) {
  GGML_ASSERT(hparams.n_expert_used > 0);
+ GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
+ if (hparams.n_expert_groups > 1) {
+ GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
+ GGML_ASSERT(hparams.n_group_used > 0);
+ GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
+ }
  } else {
  GGML_ASSERT(hparams.n_expert_used == 0);
+ GGML_ASSERT(hparams.n_expert_groups == 0);
  }

  std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
@@ -1843,8 +1867,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

- switch (hparams.n_layer) {
- // TODO: Add llm type label (not sure this is useful)
+ switch (hparams.n_embd) {
+ case 1536: type = LLM_TYPE_7B_A1B; break;
+ case 2048: case 2560: type = LLM_TYPE_3B; break;
+ case 4096: type = LLM_TYPE_32B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }

@@ -1885,6 +1911,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ // TODO: when MTP is implemented, this should probably be updated if needed
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+ switch (hparams.n_layer) {
+ case 20: type = LLM_TYPE_16B_A1B; break;
+ case 21: type = LLM_TYPE_16B_A1B; break;
+ case 32: type = LLM_TYPE_100B_A6B; break;
+ case 33: type = LLM_TYPE_100B_A6B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_DOTS1:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2182,7 +2231,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
  struct ggml_backend_buft_comparator {
  bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
- return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+ return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
  }
  };
  std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
@@ -5495,6 +5544,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  }
  } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ const int64_t n_ff_exp = hparams.n_ff_exp;
+ const int64_t n_expert_shared = hparams.n_expert_shared;
+
+ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+ // output
+ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+ GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
+ GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+ for (int i = 0; i < n_layer; ++i) {
+ int flags = 0;
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ // skip all tensors in the NextN layers
+ flags |= TENSOR_SKIP;
+ }
+
+ auto & layer = layers[i];
+
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+ layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+ layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+ layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+ if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+ const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+ layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+ layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+ layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+ layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+ layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+
+ layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+ layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+ } else { // Dense layers
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+ }
+
+ // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+ if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+ layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+ layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+ layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+ layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+ layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+ layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
+ layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+ }
+ }
+ } break;
  case LLM_ARCH_DOTS1:
  {
  const int64_t n_ff_exp = hparams.n_ff_exp;
@@ -6072,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

- ggml_backend_buffer_t buf = nullptr;
+ std::vector<ggml_backend_buffer_ptr> bufs;
  if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6085,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  continue;
  }
  const size_t max_size = ggml_get_max_tensor_size(ctx);
- buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+ ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
+ bufs.emplace_back(buf);
  buf_map.emplace(idx, buf);
  }
  }
  else {
- buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
@@ -6103,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  mlock_buf->init (ggml_backend_buffer_get_base(buf));
  mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  }
+ bufs.emplace_back(buf);
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  buf_map.emplace(idx, buf);
  }
  }
- pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+ pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));

  for (auto & buf : buf_map) {
  // indicate that this buffer contains weights
@@ -6133,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  // print memory requirements per buffer type
- for (auto & [_, buf] : pimpl->ctxs_bufs) {
- LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+ for (auto & buf: bufs) {
+ LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+ __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+ }
  }

  // populate tensors_by_name
@@ -6186,8 +6304,10 @@ size_t llama_model::n_devices() const {

  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
- for (const auto & [_, buf] : pimpl->ctxs_bufs) {
- ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+ for (const auto & buf : bufs) {
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+ }
  }
  return ret;
  }
@@ -6255,6 +6375,8 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+ LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+ LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
  LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
@@ -6350,6 +6472,17 @@ void llama_model::print_info() const {
  LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  }

+ if (arch == LLM_ARCH_BAILINGMOE2) {
+ LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+ LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+ LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+ LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+ LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+ LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+ }
+
  if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
  LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
@@ -17039,6 +17172,150 @@ struct llm_build_bailingmoe : public llm_graph_context {
  }
  };

+ struct llm_build_bailingmoe2 : public llm_graph_context {
+ llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv();
+
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+ const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+ for (int il = 0; il < n_transformer_layers; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+
+ ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+ ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+ Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+ cb(Qcur, "Qcur_normed", il);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+ cb(Kcur, "Kcur_normed", il);
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_transformer_layers - 1 && inp_out_ids) {
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+ cb(sa_out, "sa_out", il);
+
+ // MoE branch
+ cur = build_norm(sa_out,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ model.layers[il].ffn_gate, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(cur, "ffn_out", il);
+ } else {
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ model.layers[il].ffn_exp_probs_b,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ true, hparams.expert_weights_scale,
+ (llama_expert_gating_func_type) hparams.expert_gating_func,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+ }
+
+ cur = ggml_add(ctx0, cur, sa_out);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_dots1 : public llm_graph_context {
  llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17694,6 +17971,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
  cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  cb(cur, "result_norm", -1);

+ res->t_embd = cur;
+
  // lm_head
  cur = build_lora_mm(model.output, cur);
  cb(cur, "result_output", -1);
@@ -19066,6 +19345,7 @@ struct llm_build_smallthinker : public llm_graph_context{

  cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
  cb(cur, "result_norm", -1);
+ res->t_embd = cur;

  // lm_head
  cur = build_lora_mm(model.output, cur);
@@ -19361,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
  }
  };

- llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
  llama_memory_i * res;

  switch (arch) {
@@ -19412,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  };
  }

- const auto padding = llama_kv_cache::get_padding(cparams);
-
- cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
  res = new llama_memory_hybrid(
  /* model */ *this,
  /* attn_type_k */ params.type_k,
  /* attn_type_v */ params.type_v,
  /* attn_v_trans */ !cparams.flash_attn,
  /* attn_kv_size */ cparams.n_ctx,
- /* attn_n_pad */ padding,
+ /* attn_n_pad */ 1,
  /* attn_n_swa */ hparams.n_swa,
  /* attn_swa_type */ hparams.swa_type,
  /* recurrent_type_k */ GGML_TYPE_F32,
@@ -19434,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  /* filter_attn */ std::move(filter_attn),
  /* filter_recr */ std::move(filter_recr));
  } else {
- const auto padding = llama_kv_cache::get_padding(cparams);
-
  uint32_t n_ctx_per_stream = cparams.n_ctx;

  if (!cparams.kv_unified) {
  n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
- n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
- cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
- } else {
- n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
- cparams.n_ctx = n_ctx_per_stream;
  }

- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
  llama_memory_i::layer_reuse_cb reuse = nullptr;

  if (arch == LLM_ARCH_GEMMA3N) {
@@ -19477,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  n_ctx_per_stream,
  cparams.n_seq_max,
  cparams.n_ubatch,
- padding,
+ 1,
  nullptr,
  reuse);
  } else {
@@ -19492,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  cparams.kv_unified,
  n_ctx_per_stream,
  cparams.n_seq_max,
- padding,
+ 1,
  hparams.n_swa,
  hparams.swa_type,
  nullptr,
@@ -19835,6 +20100,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  {
  llm = std::make_unique<llm_build_bailingmoe>(*this, params);
  } break;
+ case LLM_ARCH_BAILINGMOE2:
+ {
+ llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
+ } break;
  case LLM_ARCH_SEED_OSS:
  {
  llm = std::make_unique<llm_build_seed_oss>(*this, params);
@@ -20101,6 +20370,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_EXAONE:
  case LLM_ARCH_EXAONE4:
  case LLM_ARCH_MINICPM3:
+ case LLM_ARCH_BAILINGMOE2:
  case LLM_ARCH_DOTS1:
  case LLM_ARCH_HUNYUAN_MOE:
  case LLM_ARCH_OPENAI_MOE:
@@ -107,9 +107,12 @@ enum llm_type {
  LLM_TYPE_17B_16E, // llama4 Scout
  LLM_TYPE_17B_128E, // llama4 Maverick
  LLM_TYPE_A13B,
+ LLM_TYPE_7B_A1B,
  LLM_TYPE_8B_A1B, // lfm2moe
+ LLM_TYPE_16B_A1B,
  LLM_TYPE_21B_A3B, // Ernie MoE small
  LLM_TYPE_30B_A3B,
+ LLM_TYPE_100B_A6B,
  LLM_TYPE_106B_A12B, // GLM-4.5-Air
  LLM_TYPE_235B_A22B,
  LLM_TYPE_300B_A47B, // Ernie MoE big
@@ -497,9 +500,8 @@ struct llama_model {

  ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

- // note: can mutate `cparams`
  // TODO: move this to new llm_arch_model_i interface
- llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+ llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

  // TODO: move this to new llm_arch_model_i interface
  ggml_cgraph * build_graph(const llm_graph_params & params) const;
@@ -1968,6 +1968,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  clean_spaces = false;
  } else if (
  tokenizer_pre == "bailingmoe" ||
+ tokenizer_pre == "bailingmoe2" ||
  tokenizer_pre == "llada-moe") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
  clean_spaces = false;