@fugood/llama.node 1.3.0-rc.5 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +12 -2
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -13
- package/src/llama.cpp/common/arg.cpp +2 -2
- package/src/llama.cpp/common/chat.cpp +199 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +19 -3
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-batch.h +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +35 -2
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +10 -4
- package/src/llama.cpp/src/llama-graph.cpp +35 -0
- package/src/llama.cpp/src/llama-hparams.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +23 -20
- package/src/llama.cpp/src/llama-kv-cache.h +2 -4
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +18 -14
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +307 -37
- package/src/llama.cpp/src/llama-model.h +4 -2
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -15,7 +15,6 @@
 
 #include <algorithm>
 #include <cassert>
-#include <cmath>
 #include <cfloat>
 #include <cstring>
 #include <cmath>

@@ -114,9 +113,12 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
         case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
         case LLM_TYPE_A13B: return "A13B";
+        case LLM_TYPE_7B_A1B: return "7B.A1B";
         case LLM_TYPE_8B_A1B: return "8B.A1B";
+        case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
+        case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";

@@ -401,6 +403,19 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
     // add the device default buffer type
     buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
 
+    // add the device extra buffer type (if any)
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
+
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
     return buft_list;
 }
 

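Note: the added block above is the generic way ggml exposes optional, backend-specific buffer types: look up a proc address by name on the device's backend registry and, if present, walk the NULL-terminated array it returns. The sketch below restates that pattern as a standalone helper; collect_extra_bufts and the local function-pointer typedef are illustrative names (upstream casts to the ggml_backend_dev_get_extra_bufts_t typedef from the ggml headers), everything else is the ggml-backend API already shown in the hunk.

// Illustrative sketch, not part of the diff.
#include <vector>

#include "ggml-backend.h"

// local stand-in for the upstream ggml_backend_dev_get_extra_bufts_t typedef
typedef ggml_backend_buffer_type_t * (*get_extra_bufts_fn)(ggml_backend_dev_t dev);

static std::vector<ggml_backend_buffer_type_t> collect_extra_bufts(ggml_backend_dev_t dev) {
    std::vector<ggml_backend_buffer_type_t> out;

    // the hook is optional: backends that do not implement it return NULL from the lookup
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
    auto fn = (get_extra_bufts_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");

    if (fn) {
        // the hook returns a NULL-terminated array of buffer types
        for (ggml_backend_buffer_type_t * p = fn(dev); p && *p; ++p) {
            out.push_back(*p);
        }
    }
    return out;
}
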
@@ -422,7 +437,7 @@ struct llama_model::impl {
     llama_mlocks mlock_mmaps;
 
     // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
-    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
+    std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

@@ -480,11 +495,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         return;
     }
 
-    ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
-    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
-    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
-    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+    ml.get_key(LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
+    ml.get_key(LLM_KV_BLOCK_COUNT,             hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT,            hparams.n_expert,        false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT,       hparams.n_expert_used,   false);
+    ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,      hparams.n_expert_groups, false);
+    ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used,    false);
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
         ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

@@ -500,8 +517,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
         GGML_ASSERT(hparams.n_expert_used > 0);
+        GGML_ASSERT(hparams.n_expert_groups < hparams.n_expert);
+        if (hparams.n_expert_groups > 1) {
+            GGML_ASSERT(hparams.n_expert % hparams.n_expert_groups == 0);
+            GGML_ASSERT(hparams.n_group_used > 0);
+            GGML_ASSERT(hparams.n_group_used < hparams.n_expert_groups);
+        }
     } else {
         GGML_ASSERT(hparams.n_expert_used == 0);
+        GGML_ASSERT(hparams.n_expert_groups == 0);
     }
 
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);

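Note: the new asserts pin down how expert groups relate to experts: the group count must divide n_expert evenly, and n_group_used must be positive and strictly smaller than n_expert_groups. As a rough illustration of the kind of group-limited routing these invariants support (a simplified stand-in, not the actual build_moe_ffn implementation; all names below are made up):

// Pick n_expert_used experts, but only from the n_group_used best-scoring groups.
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<int> pick_experts(const std::vector<float> & scores, // one router score per expert
                              uint32_t n_expert_groups, uint32_t n_group_used, uint32_t n_expert_used) {
    const uint32_t n_expert  = (uint32_t) scores.size();
    const uint32_t per_group = n_expert / n_expert_groups; // why the divisibility assert matters

    // rank groups by their best expert score and keep the top n_group_used groups
    std::vector<int> groups(n_expert_groups);
    for (uint32_t g = 0; g < n_expert_groups; ++g) groups[g] = g;
    std::partial_sort(groups.begin(), groups.begin() + n_group_used, groups.end(),
        [&](int a, int b) {
            float ma = *std::max_element(scores.begin() + a*per_group, scores.begin() + (a + 1)*per_group);
            float mb = *std::max_element(scores.begin() + b*per_group, scores.begin() + (b + 1)*per_group);
            return ma > mb;
        });

    // then take the top-k experts among the surviving groups only
    std::vector<int> cand;
    for (uint32_t i = 0; i < n_group_used; ++i) {
        for (uint32_t e = 0; e < per_group; ++e) cand.push_back(groups[i]*per_group + e);
    }
    std::partial_sort(cand.begin(), cand.begin() + n_expert_used, cand.end(),
        [&](int a, int b) { return scores[a] > scores[b]; });
    cand.resize(n_expert_used);
    return cand;
}
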
@@ -1843,8 +1867,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-                switch (hparams.
-
+                switch (hparams.n_embd) {
+                    case 1536: type = LLM_TYPE_7B_A1B; break;
+                    case 2048: case 2560: type = LLM_TYPE_3B; break;
+                    case 4096: type = LLM_TYPE_32B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
 

@@ -1885,6 +1911,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAILINGMOE2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+                switch (hparams.n_layer) {
+                    case 20: type = LLM_TYPE_16B_A1B; break;
+                    case 21: type = LLM_TYPE_16B_A1B; break;
+                    case 32: type = LLM_TYPE_100B_A6B; break;
+                    case 33: type = LLM_TYPE_100B_A6B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DOTS1:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -2182,7 +2231,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
     struct ggml_backend_buft_comparator {
         bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
-            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
         }
     };
     std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;

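Note: the removed line appears to have compared the const char * results of ggml_backend_buft_name() directly, i.e. by pointer value, which gives an unspecified map ordering; strcmp orders by the name contents and is a proper strict weak ordering. A minimal standalone illustration (names here are ours, not part of the diff):

#include <cstring>
#include <map>

struct name_less {
    bool operator()(const char * lhs, const char * rhs) const {
        return strcmp(lhs, rhs) < 0; // compare string contents, not addresses
    }
};

int main() {
    std::map<const char *, int, name_less> m; // without name_less the keys would sort by address
    m["CUDA0"] = 0;
    m["CPU"]   = 1;
    m["Metal"] = 2;
    // iteration order is now CPU, CUDA0, Metal regardless of where the literals live in memory
    return 0;
}
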
@@ -5495,6 +5544,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                    }
                } break;
+            case LLM_ARCH_BAILINGMOE2:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+                        layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+                            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                        } else { // Dense layers
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+                            layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags);
+                            layer.layer_out_norm         = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+                        }
+                    }
+                } break;
            case LLM_ARCH_DOTS1:
                {
                    const int64_t n_ff_exp = hparams.n_ff_exp;

@@ -6072,7 +6185,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
-        ggml_backend_buffer_t buf;
+        std::vector<ggml_backend_buffer_ptr> bufs;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer

@@ -6085,15 +6198,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    continue;
                }
                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                }
+                bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
        }
        else {
-            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }

@@ -6103,11 +6217,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
            }
+            bufs.emplace_back(buf);
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                buf_map.emplace(idx, buf);
            }
        }
-        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), std::move(bufs));
 
        for (auto & buf : buf_map) {
            // indicate that this buffer contains weights

@@ -6133,8 +6248,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // print memory requirements per buffer type
-    for (auto & [_, buf] : pimpl->ctxs_bufs) {
-        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+    for (auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (auto & buf: bufs) {
+            LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n",
+                __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+        }
    }
 
    // populate tensors_by_name

@@ -6186,8 +6304,10 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
-        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    for (const auto & [_, bufs] : pimpl->ctxs_bufs) {
+        for (const auto & buf : bufs) {
+            ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+        }
    }
    return ret;
 }

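Note: ctxs_bufs now pairs each ggml context with a vector of buffers (for example one buffer per mmapped model file) instead of a single buffer, so both the buffer-size logging and memory_breakdown() above gained an inner loop. A toy sketch of the same accumulation with hypothetical stand-in types (fake_buf is not a real ggml type):

#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct fake_buf {
    std::string type_name; // stand-in for ggml_backend_buffer_get_type()
    std::size_t size;      // stand-in for ggml_backend_buffer_get_size()
};

std::map<std::string, std::size_t> breakdown(const std::vector<std::vector<fake_buf>> & ctxs_bufs) {
    std::map<std::string, std::size_t> ret;
    for (const auto & bufs : ctxs_bufs) {   // one entry per model context
        for (const auto & buf : bufs) {     // now possibly several buffers per context
            ret[buf.type_name] += buf.size;
        }
    }
    return ret;
}
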
@@ -6255,6 +6375,8 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+        LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);

@@ -6350,6 +6472,17 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }
 
+    if (arch == LLM_ARCH_BAILINGMOE2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
+    }
+
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));

@@ -17039,6 +17172,150 @@ struct llm_build_bailingmoe : public llm_graph_context {
    }
 };
 
+struct llm_build_bailingmoe2 : public llm_graph_context {
+    llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        auto * inp_attn = build_attn_inp_kv();
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+        for (int il = 0; il < n_transformer_layers; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                cur = build_lora_mm(model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+                ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+                ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_transformer_layers - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+            cb(sa_out, "sa_out", il);
+
+            // MoE branch
+            cur = build_norm(sa_out,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+                cur = build_ffn(cur,
+                        model.layers[il].ffn_up, NULL, NULL,
+                        model.layers[il].ffn_gate, NULL, NULL,
+                        model.layers[il].ffn_down, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
+            } else {
+                ggml_tensor * moe_out =
+                    build_moe_ffn(cur,
+                            model.layers[il].ffn_gate_inp,
+                            model.layers[il].ffn_up_exps,
+                            model.layers[il].ffn_gate_exps,
+                            model.layers[il].ffn_down_exps,
+                            model.layers[il].ffn_exp_probs_b,
+                            n_expert, n_expert_used,
+                            LLM_FFN_SILU, hparams.expert_weights_norm,
+                            true, hparams.expert_weights_scale,
+                            (llama_expert_gating_func_type) hparams.expert_gating_func,
+                            il);
+                cb(moe_out, "ffn_moe_out", il);
+
+                {
+                    ggml_tensor * ffn_shexp = build_ffn(cur,
+                            model.layers[il].ffn_up_shexp, NULL, NULL,
+                            model.layers[il].ffn_gate_shexp, NULL, NULL,
+                            model.layers[il].ffn_down_shexp, NULL, NULL,
+                            NULL,
+                            LLM_FFN_SILU, LLM_FFN_PAR, il);
+                    cb(ffn_shexp, "ffn_shexp", il);
+
+                    cur = ggml_add(ctx0, moe_out, ffn_shexp);
+                    cb(cur, "ffn_out", il);
+                }
+            }
+
+            cur = ggml_add(ctx0, cur, sa_out);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 struct llm_build_dots1 : public llm_graph_context {
    llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
        const int64_t n_embd_head = hparams.n_embd_head_v;

@@ -17694,6 +17971,8 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {
        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
 
+        res->t_embd = cur;
+
        // lm_head
        cur = build_lora_mm(model.output, cur);
        cb(cur, "result_output", -1);

@@ -19066,6 +19345,7 @@ struct llm_build_smallthinker : public llm_graph_context{
 
        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
 
        // lm_head
        cur = build_lora_mm(model.output, cur);

@@ -19361,7 +19641,7 @@ struct llm_build_apertus : public llm_graph_context {
    }
 };
 
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, const llama_cparams & cparams) const {
    llama_memory_i * res;
 
    switch (arch) {

@@ -19412,17 +19692,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                };
            }
 
-            const auto padding = llama_kv_cache::get_padding(cparams);
-
-            cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
-
            res = new llama_memory_hybrid(
                /* model */ *this,
                /* attn_type_k */ params.type_k,
                /* attn_type_v */ params.type_v,
                /* attn_v_trans */ !cparams.flash_attn,
                /* attn_kv_size */ cparams.n_ctx,
-                /* attn_n_pad */ padding,
+                /* attn_n_pad */ 1,
                /* attn_n_swa */ hparams.n_swa,
                /* attn_swa_type */ hparams.swa_type,
                /* recurrent_type_k */ GGML_TYPE_F32,

@@ -19434,23 +19710,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                /* filter_attn */ std::move(filter_attn),
                /* filter_recr */ std::move(filter_recr));
        } else {
-            const auto padding = llama_kv_cache::get_padding(cparams);
-
            uint32_t n_ctx_per_stream = cparams.n_ctx;
 
            if (!cparams.kv_unified) {
                n_ctx_per_stream = (cparams.n_ctx + cparams.n_seq_max - 1)/cparams.n_seq_max;
-                n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                cparams.n_ctx = n_ctx_per_stream*cparams.n_seq_max;
-            } else {
-                n_ctx_per_stream = GGML_PAD(n_ctx_per_stream, padding);
-
-                cparams.n_ctx = n_ctx_per_stream;
            }
 
-            LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
            llama_memory_i::layer_reuse_cb reuse = nullptr;
 
            if (arch == LLM_ARCH_GEMMA3N) {

@@ -19477,7 +19742,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    n_ctx_per_stream,
                    cparams.n_seq_max,
                    cparams.n_ubatch,
-                    padding,
+                    1,
                    nullptr,
                    reuse);
            } else {

@@ -19492,7 +19757,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                    cparams.kv_unified,
                    n_ctx_per_stream,
                    cparams.n_seq_max,
-                    padding,
+                    1,
                    hparams.n_swa,
                    hparams.swa_type,
                    nullptr,

@@ -19835,6 +20100,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_bailingmoe>(*this, params);
            } break;
+        case LLM_ARCH_BAILINGMOE2:
+            {
+                llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
+            } break;
        case LLM_ARCH_SEED_OSS:
            {
                llm = std::make_unique<llm_build_seed_oss>(*this, params);

@@ -20101,6 +20370,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
        case LLM_ARCH_MINICPM3:
+        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:

package/src/llama.cpp/src/llama-model.h

@@ -107,9 +107,12 @@ enum llm_type {
    LLM_TYPE_17B_16E, // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_A13B,
+    LLM_TYPE_7B_A1B,
    LLM_TYPE_8B_A1B, // lfm2moe
+    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
+    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big

@@ -497,9 +500,8 @@ struct llama_model {
 
    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
 
-    // note: can mutate `cparams`
    // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
+    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;
 
    // TODO: move this to new llm_arch_model_i interface
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

package/src/llama.cpp/src/llama-vocab.cpp

@@ -1968,6 +1968,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            clean_spaces = false;
        } else if (
            tokenizer_pre == "bailingmoe" ||
+            tokenizer_pre == "bailingmoe2" ||
            tokenizer_pre == "llada-moe") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
            clean_spaces = false;