@fugood/llama.node 1.3.8 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.js +25 -18
- package/lib/binding.ts +19 -1
- package/lib/index.js +3 -3
- package/lib/index.ts +1 -1
- package/package.json +17 -17
- package/scripts/llama.cpp.patch +53 -4
- package/src/LlamaCompletionWorker.cpp +2 -2
- package/src/LlamaContext.cpp +6 -1
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/common/chat-parser.cpp +968 -0
- package/src/llama.cpp/common/chat.cpp +0 -952
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +336 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +11 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +234 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +48 -3
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +6 -2
- package/src/llama.cpp/src/llama-hparams.h +1 -1
- package/src/llama.cpp/src/llama-model.cpp +102 -5
- package/src/llama.cpp/src/llama-model.h +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +13 -5
- package/src/llama.cpp/src/models/lfm2.cpp +5 -3
- package/src/llama.cpp/src/models/models.h +51 -1
- package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
(Some removed lines in the hunks below are truncated by the registry's diff view.)

package/src/llama.cpp/src/llama-model.cpp

```diff
@@ -2,7 +2,6 @@
 
 #include "llama-impl.h"
 #include "llama-mmap.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-model-loader.h"
 
```
```diff
@@ -2225,6 +2224,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                // Load linear attention (gated delta net) parameters
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+                // Mark recurrent layers (linear attention layers)
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = ((i + 1) % 4 != 0); // TODO: extract the magic 4 from "full_attention_interval"
+                }
+
+                switch (hparams.n_layer) {
+                    case 80: type = LLM_TYPE_80B_A3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
```
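The `recurrent_layer_arr` loop above is what makes Qwen3 Next a hybrid architecture in llama.cpp: every fourth layer keeps full attention, the rest take the linear gated-delta-net path. A minimal standalone sketch (not part of the package; the 80-layer count is taken from the hunk above, everything else is illustrative) that reproduces the schedule:

```cpp
// Reproduces the layer schedule derived in load_hparams for LLM_ARCH_QWEN3NEXT:
// layer i is recurrent (linear attention) unless (i + 1) % 4 == 0. The value 4
// mirrors the hard-coded interval above; the "full_attention_interval" key is
// not read yet (see the TODO in the diff).
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_layer = 80; // the LLM_TYPE_80B_A3B config from the hunk above
    uint32_t n_recurrent = 0;
    for (uint32_t i = 0; i < n_layer; ++i) {
        n_recurrent += ((i + 1) % 4 != 0);
    }
    // prints: recurrent (linear attention) layers: 60, full attention layers: 20
    std::printf("recurrent (linear attention) layers: %u, full attention layers: %u\n",
                n_recurrent, n_layer - n_recurrent);
    return 0;
}
```

For the 80-layer model this yields 60 linear-attention layers and 20 full-attention layers.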
```diff
@@ -6133,9 +6155,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_LFM2:
             case LLM_ARCH_LFM2MOE:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,
-
-
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
 
                     if (output == NULL) {
                         output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
```
```diff
@@ -6414,6 +6437,74 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_QWEN3NEXT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads = hparams.ssm_n_group;
+                    const int64_t n_v_heads = hparams.ssm_dt_rank;
+                    const int64_t key_dim = head_k_dim * n_k_heads;
+                    const int64_t value_dim = head_v_dim * n_v_heads;
+                    const int64_t conv_dim = key_dim * 2 + value_dim;
+
+                    // Calculate projection sizes
+                    const int64_t qkvz_dim = key_dim * 2 + value_dim * 2;
+                    const int64_t ba_dim = n_v_heads * 2;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+
+                        // Shared experts
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
```
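The derived sizes above fix the shapes of the gated-delta-net tensors (`ssm_in`, `ssm_conv1d`, `ssm_beta_alpha`, ...). A small sketch of that arithmetic with illustrative placeholder values; the real numbers come from the model's `ssm_*` GGUF metadata, not from this sketch:

```cpp
// Illustrative sketch of the dimension bookkeeping in the hunk above. The
// hparam values are placeholders chosen for illustration; only the formulas
// mirror the diff.
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical stand-ins for hparams.ssm_* read in load_hparams
    const int64_t ssm_d_state = 128; // head_k_dim == head_v_dim
    const int64_t ssm_n_group = 16;  // number of K heads
    const int64_t ssm_dt_rank = 32;  // number of V heads
    const int64_t ssm_d_conv  = 4;   // short-conv kernel size

    const int64_t key_dim   = ssm_d_state * ssm_n_group;    // 2048
    const int64_t value_dim = ssm_d_state * ssm_dt_rank;    // 4096
    const int64_t conv_dim  = key_dim * 2 + value_dim;      // 8192, conv over q/k/v
    const int64_t qkvz_dim  = key_dim * 2 + value_dim * 2;  // 12288, fused input projection
    const int64_t ba_dim    = ssm_dt_rank * 2;              // 64, per-V-head beta and alpha

    std::printf("ssm_conv1d: {%lld, %lld}, ssm_in: {n_embd, %lld}, ssm_beta_alpha: {n_embd, %lld}\n",
                (long long) ssm_d_conv, (long long) conv_dim,
                (long long) qkvz_dim, (long long) ba_dim);
    return 0;
}
```

The extra `value_dim` in `qkvz_dim` relative to `conv_dim` is presumably a gate stream z, matching the `qkvz` naming, while the convolution only covers the q/k/v part.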
```diff
@@ -6684,6 +6775,7 @@ void llama_model::print_info() const {
         arch == LLM_ARCH_FALCON_H1 ||
         arch == LLM_ARCH_PLAMO2 ||
         arch == LLM_ARCH_GRANITE_HYBRID ||
+        arch == LLM_ARCH_QWEN3NEXT ||
         arch == LLM_ARCH_NEMOTRON_H) {
         LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
         LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
```
```diff
@@ -7425,7 +7517,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
         case LLM_ARCH_PANGU_EMBED:
             {
                 llm = std::make_unique<llm_build_pangu_embedded>(*this, params);
-            }break;
+            } break;
+        case LLM_ARCH_QWEN3NEXT:
+            {
+                llm = std::make_unique<llm_build_qwen3next>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
```
```diff
@@ -7652,6 +7748,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_COGVLM:
         case LLM_ARCH_PANGU_EMBED:
         case LLM_ARCH_AFMOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
```
package/src/llama.cpp/src/llama-model.h

```diff
@@ -113,6 +113,7 @@ enum llm_type {
     LLM_TYPE_16B_A1B,
     LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
+    LLM_TYPE_80B_A3B, // Qwen3 Next
     LLM_TYPE_100B_A6B,
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
     LLM_TYPE_230B_A10B, // Minimax M2
```
```diff
@@ -309,6 +310,9 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b = nullptr;
 
+    // qwen3next
+    struct ggml_tensor * ssm_beta_alpha = nullptr;
+
     // rwkv
     struct ggml_tensor * time_mix_w1 = nullptr;
     struct ggml_tensor * time_mix_w2 = nullptr;
```
package/src/llama.cpp/src/llama-quant.cpp

```diff
@@ -681,7 +681,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
-        }
+        }
+
+        if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
         }
```
```diff
@@ -726,13 +728,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
-        int32_t
+        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
         if (llama_model_has_encoder(&model)) {
-            // now
+            // now n_layer_attn is the number of attention layers in the encoder
             // for each decoder block, there are 2 attention layers
-
+            n_layer_attn += 2 * model.hparams.dec_n_layer;
         }
-
+
+        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
+        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
+
+        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+
+        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
```
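The reworked assertion makes the quantizer's sanity check aware of hybrid models: recurrent (linear attention) layers contribute no `attn_v.weight`, so they are subtracted from the expected count. A toy sketch of that bookkeeping, assuming (as the subtraction of `n_layer_recr` implies) that the per-layer KV-head array is populated even for the linear layers:

```cpp
// Toy sketch (hypothetical 8-layer hybrid, not a real config) of the count
// check above: n_layer_attn is derived from the KV-head array, n_layer_recr
// from the recurrent-layer flags, and only full-attention layers are expected
// to contribute an attn_v.weight tensor.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

int main() {
    constexpr int n_layer = 8;
    std::array<uint32_t, n_layer> n_head_kv = {4, 4, 4, 4, 4, 4, 4, 4};  // non-zero everywhere
    std::array<bool, n_layer> recurrent     = {true, true, true, false,
                                               true, true, true, false}; // 3:1 interleave

    const int32_t n_layer_attn = n_layer - std::count(n_head_kv.begin(), n_head_kv.end(), 0u); // 8
    const int32_t n_layer_recr = std::count(recurrent.begin(), recurrent.end(), true);         // 6
    const int32_t pruned_attention_w = 0;

    // a quantization pass over this toy model sees one attn_v.weight per full-attention layer
    const int32_t n_attention_wv = 2;
    assert(n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr);
    return 0;
}
```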
package/src/llama.cpp/src/models/lfm2.cpp

```diff
@@ -9,6 +9,8 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
     ggml_tensor * cur = build_inp_embd(model.tok_embd);
     cb(cur, "model.embed_tokens", -1);
 
+    ggml_build_forward_expand(gf, cur);
+
     ggml_tensor * inp_pos = build_inp_pos();
     auto * inp_hybrid = build_inp_mem_hybrid();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
```
```diff
@@ -40,12 +42,12 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params
         cur = ggml_add(ctx0, cur, ffn_out);
     }
 
-    cur = build_norm(cur, model.
-    cb(cur, "
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
     cur = build_lora_mm(model.output, cur);
-    cb(cur, "
+    cb(cur, "result_output", -1);
 
     res->t_logits = cur;
 
```
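Besides renaming the final callbacks to the conventional `result_norm`/`result_output`, the lfm2 builder now calls `ggml_build_forward_expand` on the embedding output. The diff does not state the motivation; the call itself simply forces a tensor and everything it depends on into the compute graph as explicit nodes. A minimal sketch of that pattern, with placeholder shapes and memory sizes (not the lfm2 values):

```cpp
// Minimal sketch of the ggml_build_forward_expand pattern used above: the call
// adds a tensor and its dependency chain to the graph before further ops are
// built on top of it. Shapes and sizes here are placeholders.
#include "ggml.h"

int main() {
    ggml_init_params ip = { /*.mem_size =*/ 16 * 1024 * 1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    ggml_context * ctx = ggml_init(ip);

    ggml_tensor * embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8); // [n_embd, n_tokens]
    ggml_cgraph * gf   = ggml_new_graph(ctx);

    // pin the embedding output as a graph node before building the rest
    ggml_build_forward_expand(gf, embd);

    // later ops are expanded on top as usual
    ggml_tensor * cur = ggml_rms_norm(ctx, embd, 1e-6f);
    ggml_build_forward_expand(gf, cur);

    ggml_free(ctx);
    return 0;
}
```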
package/src/llama.cpp/src/models/models.h

```diff
@@ -2,8 +2,9 @@
 
 #include "../llama-model.h"
 #include "../llama-graph.h"
-#include "../llama-memory-recurrent.h"
 
+// TODO: remove in follow-up PR - move to .cpp files
+#include "../llama-memory-recurrent.h"
 #include <cmath>
 
 struct llm_graph_context_mamba : public llm_graph_context {
```
```diff
@@ -421,7 +422,56 @@ struct llm_build_qwen3vl : public llm_graph_context {
 struct llm_build_qwen3vlmoe : public llm_graph_context {
     llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params);
 };
+struct llm_build_qwen3next : public llm_graph_context_mamba {
+    llm_build_qwen3next(const llama_model & model, const llm_graph_params & params);
+private:
+    ggml_tensor * build_layer_attn(
+        llm_graph_input_attn_kv * inp_attn,
+        ggml_tensor * cur,
+        ggml_tensor * inp_pos,
+        int il);
+
+    ggml_tensor * build_layer_attn_linear(
+        llm_graph_input_rs * inp,
+        ggml_tensor * cur,
+        ggml_tensor * causal_mask,
+        ggml_tensor * identity,
+        int il);
 
+    ggml_tensor * build_layer_ffn(
+        ggml_tensor * cur,
+        int il);
+
+    ggml_tensor * build_delta_net_recurrent(
+        ggml_tensor * q,
+        ggml_tensor * k,
+        ggml_tensor * v,
+        ggml_tensor * g,
+        ggml_tensor * beta,
+        ggml_tensor * state,
+        ggml_tensor * causal_mask,
+        ggml_tensor * identity,
+        int il);
+
+    ggml_tensor * build_delta_net_chunking(
+        ggml_tensor * q,
+        ggml_tensor * k,
+        ggml_tensor * v,
+        ggml_tensor * g,
+        ggml_tensor * beta,
+        ggml_tensor * state,
+        ggml_tensor * causal_mask,
+        ggml_tensor * identity,
+        int il);
+
+    ggml_tensor * build_norm_gated(
+        ggml_tensor * input,
+        ggml_tensor * weights,
+        ggml_tensor * gate,
+        int layer);
+
+    const llama_model & model;
+};
 
 struct llm_build_qwen : public llm_graph_context {
     llm_build_qwen(const llama_model & model, const llm_graph_params & params);
```
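The new `llm_build_qwen3next` builder declares separate helpers for the full-attention path, the linear-attention path, two delta-net formulations (a strictly sequential recurrence and a chunked variant), and a gated norm. As a rough reference for what the delta-net helpers compute, here is a scalar sketch of the per-head gated delta-rule recurrence, following the published Gated DeltaNet formulation; this is not the package's implementation, which builds the same update out of ggml tensor ops, uses the `causal_mask`/`identity` inputs for the chunked form, and may differ in gating and normalization details:

```cpp
// Reference-level sketch of the per-head gated delta-rule recurrence that the
// build_delta_net_* helpers are named after:
//   S_t = a_t * S_{t-1} + beta_t * (v_t - a_t * S_{t-1} k_t) k_t^T,  o_t = S_t q_t
// with a_t = exp(g_t), g_t <= 0, acting as a forget gate. Dimensions are
// per-head; batching, chunking, and normalization are omitted.
#include <cmath>
#include <vector>

struct DeltaNetHead {
    int d_k, d_v;
    std::vector<float> S; // d_v x d_k state, row-major

    DeltaNetHead(int d_k, int d_v) : d_k(d_k), d_v(d_v), S(d_v * d_k, 0.0f) {}

    // one token step; q and k have d_k elements, v and the returned output have d_v
    std::vector<float> step(const float * q, const float * k, const float * v,
                            float g, float beta) {
        const float a = std::exp(g); // log-space decay gate
        std::vector<float> o(d_v, 0.0f);
        for (int i = 0; i < d_v; ++i) {
            // decayed retrieval: (a * S_{t-1} k_t)_i
            float retr = 0.0f;
            for (int j = 0; j < d_k; ++j) retr += a * S[i * d_k + j] * k[j];
            const float delta = v[i] - retr; // delta-rule prediction error
            // state update: decay, then write the correction along k_t
            for (int j = 0; j < d_k; ++j) {
                S[i * d_k + j] = a * S[i * d_k + j] + beta * delta * k[j];
            }
            // read-out with the query
            for (int j = 0; j < d_k; ++j) o[i] += S[i * d_k + j] * q[j];
        }
        return o;
    }
};
```

The chunked variant presumably exists so that long prompts can be processed block by block with matrix-matrix products rather than a strict token-by-token scan, which is what the separate `build_delta_net_recurrent` and `build_delta_net_chunking` declarations suggest.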