@fugood/llama.node 1.3.7 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.js +18 -1
- package/lib/binding.ts +19 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +1 -1
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +7 -7
- package/src/LlamaCompletionWorker.cpp +2 -2
- package/src/llama.cpp/common/arg.cpp +27 -2
- package/src/llama.cpp/common/chat-parser.cpp +968 -0
- package/src/llama.cpp/common/chat.cpp +0 -952
- package/src/llama.cpp/common/common.cpp +55 -0
- package/src/llama.cpp/common/common.h +18 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +12 -4
- package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
- package/src/llama.cpp/include/llama.h +18 -0
- package/src/llama.cpp/src/CMakeLists.txt +2 -0
- package/src/llama.cpp/src/llama-arch.cpp +95 -16
- package/src/llama.cpp/src/llama-arch.h +15 -0
- package/src/llama.cpp/src/llama-context.cpp +7 -3
- package/src/llama.cpp/src/llama-graph.cpp +3 -3
- package/src/llama.cpp/src/llama-hparams.h +1 -1
- package/src/llama.cpp/src/llama-model.cpp +141 -6
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +13 -5
- package/src/llama.cpp/src/models/lfm2.cpp +5 -3
- package/src/llama.cpp/src/models/models.h +55 -1
- package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
- package/src/llama.cpp/src/models/rnd1.cpp +126 -0
package/src/llama.cpp/src/llama-model.cpp

@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
@@ -1036,6 +1035,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_RND1:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 48: type = LLM_TYPE_30B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+                // Set non-causal attention for diffusion models
+                hparams.causal_attn = false;
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
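For context, `LLM_ARCH_RND1` is wired up as a diffusion-style model, which is why `hparams.causal_attn` is cleared here. A minimal sketch (illustrative only, not code from this diff) of how a causal flag typically translates into an additive attention mask:

```cpp
#include <cmath>

// With causal_attn == false every position may attend to every other
// position, which diffusion-style decoders need for iterative denoising.
void fill_mask(float * mask, int n_tokens, bool causal_attn) {
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_tokens; ++j) {
            // causal: block attention to future positions (j > i)
            mask[i*n_tokens + j] = (causal_attn && j > i) ? -INFINITY : 0.0f;
        }
    }
}
```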
@@ -2213,6 +2224,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
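The `(i + 1) % 4 != 0` rule above interleaves three linear-attention (gated delta net) layers with one full-attention layer. A quick sketch of what that yields for the 80-layer configuration recognized above:

```cpp
#include <cstdio>

int main() {
    const int n_layer = 80; // the layer count mapped to LLM_TYPE_80B_A3B above
    int n_linear = 0, n_full = 0;
    for (int i = 0; i < n_layer; ++i) {
        // same rule as recurrent_layer_arr: every 4th layer is full attention
        if ((i + 1) % 4 != 0) ++n_linear; else ++n_full;
    }
    printf("linear: %d, full attention: %d\n", n_linear, n_full); // linear: 60, full attention: 20
    return 0;
}
```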
@@ -3402,6 +3436,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_QWEN3MOE:
             case LLM_ARCH_QWEN3VLMOE:
+            case LLM_ARCH_RND1:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6120,9 +6155,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_LFM2:
             case LLM_ARCH_LFM2MOE:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
-
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
@@ -6401,6 +6437,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_QWEN3NEXT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads = hparams.ssm_n_group;
+                    const int64_t n_v_heads = hparams.ssm_dt_rank;
+                    const int64_t key_dim = head_k_dim * n_k_heads;
+                    const int64_t value_dim = head_v_dim * n_v_heads;
+                    const int64_t conv_dim = key_dim * 2 + value_dim;
+
+                    // Calculate projection sizes
+                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                    const int64_t ba_dim = n_v_heads * 2;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                        // Shared experts
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
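The dimension arithmetic above determines the shapes of the gated delta net projections. A standalone sketch with example head counts and dims (assumed values for illustration; the real numbers come from the GGUF keys read in load_hparams):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Example values only - not read from this diff.
    const int64_t head_k_dim = 128; // hparams.ssm_d_state
    const int64_t head_v_dim = 128; // hparams.ssm_d_state
    const int64_t n_k_heads  = 16;  // hparams.ssm_n_group
    const int64_t n_v_heads  = 32;  // hparams.ssm_dt_rank

    const int64_t key_dim   = head_k_dim * n_k_heads;      // 2048
    const int64_t value_dim = head_v_dim * n_v_heads;      // 4096
    const int64_t conv_dim  = key_dim * 2 + value_dim;     // 8192: q, k and v pass through ssm_conv1d
    const int64_t qkvz_dim  = key_dim * 2 + value_dim * 2; // 12288: ssm_in emits q, k, v and the gate z
    const int64_t ba_dim    = n_v_heads * 2;               // 64: per-head beta and alpha scalars

    printf("conv_dim=%lld qkvz_dim=%lld ba_dim=%lld\n",
           (long long) conv_dim, (long long) qkvz_dim, (long long) ba_dim);
    return 0;
}
```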
@@ -6671,6 +6775,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
@@ -6720,7 +6825,7 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE) {
+    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
     }
 
@@ -6882,6 +6987,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
             {
                 res = nullptr;
             } break;
@@ -7075,6 +7181,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                 llm = std::make_unique<llm_build_llada_moe>(*this, params);
             }
             break;
+        case LLM_ARCH_RND1:
+            {
+                llm = std::make_unique<llm_build_rnd1>(*this, params);
+            }
+            break;
         case LLM_ARCH_QWEN2VL:
             {
                 llm = std::make_unique<llm_build_qwen2vl>(*this, params);
@@ -7406,7 +7517,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         case LLM_ARCH_PANGU_EMBED:
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            }break;
+            } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7595,6 +7710,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
         case LLM_ARCH_LLADA_MOE:
+        case LLM_ARCH_RND1:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
@@ -7632,6 +7748,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -7667,6 +7784,24 @@ int32_t llama_model_meta_count(const llama_model * model) {
     return (int)model->gguf_kv.size();
 }
 
+const char * llama_model_meta_key_str(llama_model_meta_key key) {
+    switch (key) {
+        case LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE: return "general.sampling.sequence";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_K: return "general.sampling.top_k";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TOP_P: return "general.sampling.top_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIN_P: return "general.sampling.min_p";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY: return "general.sampling.xtc_probability";
+        case LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD: return "general.sampling.xtc_threshold";
+        case LLAMA_MODEL_META_KEY_SAMPLING_TEMP: return "general.sampling.temp";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N: return "general.sampling.penalty_last_n";
+        case LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT: return "general.sampling.penalty_repeat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT: return "general.sampling.mirostat";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU: return "general.sampling.mirostat_tau";
+        case LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA: return "general.sampling.mirostat_eta";
+        default: return nullptr;
+    }
+}
+
 int32_t llama_model_meta_key_by_index(const llama_model * model, int i, char * buf, size_t buf_size) {
     if (i < 0 || i >= (int)model->gguf_kv.size()) {
         if (buf_size > 0) {
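The new `llama_model_meta_key_str` maps the `llama_model_meta_key` enum (declared in the `llama.h` part of this diff) to its GGUF key string, so it can be combined with the existing `llama_model_meta_val_str` lookup. A minimal usage sketch (assumed usage, not part of the diff):

```cpp
#include <cstdlib>
#include "llama.h"

// Sketch: read a sampling default the model author embedded in the GGUF.
// Assumes `model` was loaded, e.g. via llama_model_load_from_file().
static float model_default_temp(const llama_model * model, float fallback) {
    char buf[64];
    const char * key = llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP);
    // llama_model_meta_val_str returns the value length, or -1 if the key is absent
    if (key != nullptr && llama_model_meta_val_str(model, key, buf, sizeof(buf)) >= 0) {
        return strtof(buf, nullptr); // e.g. "0.7" -> 0.7f
    }
    return fallback;
}
```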
package/src/llama.cpp/src/llama-model.h

@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1 = nullptr;
     struct ggml_tensor * time_mix_w2 = nullptr;
package/src/llama.cpp/src/llama-quant.cpp

@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        } else if (remapped_name != it.first) {
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
-        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            // now n_attn_layer is the number of attention layers in the encoder
+            // now n_layer_attn is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-            n_attn_layer += 2 * model.hparams.dec_n_layer;
+            n_layer_attn += 2 * model.hparams.dec_n_layer;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
+
+        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
+        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
+
+        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+
+        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
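The updated assertion accounts for recurrent layers, which contribute no `wv` attention tensor to quantize. A worked check with illustrative numbers (the 80-layer, 60-recurrent split sketched earlier; not values from this diff):

```cpp
#include <cstdio>

int main() {
    // Illustrative: an 80-layer hybrid, 60 recurrent (linear attention)
    // layers, nothing pruned.
    const int n_layer_attn       = 80; // layers reporting kv heads in hparams
    const int n_layer_recr       = 60; // recurrent_layer_arr entries set to true
    const int pruned_attention_w = 0;

    // only the full-attention layers carry a wv tensor
    printf("expected n_attention_wv = %d\n",
           n_layer_attn - pruned_attention_w - n_layer_recr); // 20
    return 0;
}
```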
package/src/llama.cpp/src/models/lfm2.cpp

@@ -9,6 +9,8 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
     ggml_tensor * cur = build_inp_embd(model.tok_embd);
     cb(cur, "model.embed_tokens", -1);
 
+    ggml_build_forward_expand(gf, cur);
+
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_hybrid = build_inp_mem_hybrid();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -40,12 +42,12 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
         cur = ggml_add(ctx0, cur, ffn_out);
     }
 
-    cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1);
-    cb(cur, "model.embedding_norm", -1);
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
     cur = build_lora_mm(model.output, cur);
-    cb(cur, "lm_head", -1);
+    cb(cur, "result_output", -1);
 
     res->t_logits = cur;
 
package/src/llama.cpp/src/models/models.h

@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
 
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
@@ -421,7 +422,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_layer_attn(
+            llm_graph_input_attn_kv * inp_attn,
+            ggml_tensor * cur,
+            ggml_tensor * inp_pos,
+            int il);
+
+    ggml_tensor * build_layer_attn_linear(
+            llm_graph_input_rs * inp,
+            ggml_tensor * cur,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
 
+    ggml_tensor * build_layer_ffn(
+            ggml_tensor * cur,
+            int il);
+
+    ggml_tensor * build_delta_net_recurrent(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_delta_net_chunking(
+            ggml_tensor * q,
+            ggml_tensor * k,
+            ggml_tensor * v,
+            ggml_tensor * g,
+            ggml_tensor * beta,
+            ggml_tensor * state,
+            ggml_tensor * causal_mask,
+            ggml_tensor * identity,
+            int il);
+
+    ggml_tensor * build_norm_gated(
+            ggml_tensor * input,
+            ggml_tensor * weights,
+            ggml_tensor * gate,
+            int layer);
+
+    const llama_model & model;
+};
 
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);
@@ -431,6 +481,10 @@ struct llm_build_refact : public llm_graph_context {
     llm_build_refact(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_rnd1 : public llm_graph_context {
+    llm_build_rnd1(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_rwkv6 : public llm_build_rwkv6_base {
     llm_build_rwkv6(const llama_model & model, const llm_graph_params & params);
 };