@fugood/llama.node 1.4.2 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -1
- package/lib/binding.js +3 -0
- package/lib/binding.ts +10 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +25 -11
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/CMakeLists.txt +21 -6
- package/src/llama.cpp/common/CMakeLists.txt +6 -0
- package/src/llama.cpp/common/arg.cpp +83 -22
- package/src/llama.cpp/common/chat-parser.cpp +40 -0
- package/src/llama.cpp/common/chat-peg-parser.cpp +110 -0
- package/src/llama.cpp/common/chat-peg-parser.h +105 -0
- package/src/llama.cpp/common/chat.cpp +40 -29
- package/src/llama.cpp/common/chat.h +10 -1
- package/src/llama.cpp/common/common.cpp +70 -7
- package/src/llama.cpp/common/common.h +23 -5
- package/src/llama.cpp/common/download.cpp +18 -8
- package/src/llama.cpp/common/download.h +3 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +18 -27
- package/src/llama.cpp/common/log.h +19 -12
- package/src/llama.cpp/common/peg-parser.cpp +1712 -0
- package/src/llama.cpp/common/peg-parser.h +459 -0
- package/src/llama.cpp/common/unicode.cpp +64 -0
- package/src/llama.cpp/common/unicode.h +22 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +52 -48
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +98 -12
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +30 -1
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-graph.cpp +3 -6
- package/src/llama.cpp/src/llama-hparams.h +2 -2
- package/src/llama.cpp/src/llama-impl.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model.cpp +54 -6
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/llama-vocab.cpp +1 -2
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/models/mistral3.cpp +160 -0
- package/src/llama.cpp/src/models/models.h +4 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
package/src/llama.cpp/src/llama-model.cpp

@@ -423,8 +423,8 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, llama_split_mode s
 }
 
 struct llama_model::impl {
-    impl() {}
-    ~impl() {}
+    impl() = default;
+    ~impl() = default;
 
     uint64_t n_elements = 0;
 
@@ -461,7 +461,7 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model() {}
+llama_model::~llama_model() = default;
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
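Note on the `= default` changes in this file (and the matching one in llama-vocab.cpp further down): `llama_model` keeps its `impl` behind a `std::unique_ptr`, so the destructor still has to be defined in the .cpp file where `impl` is a complete type; defaulting it there is equivalent to the old empty body, just more idiomatic. A minimal sketch of the pattern, using hypothetical `widget`/`widget::impl` names rather than anything from llama.cpp:

```cpp
// widget.h
#include <memory>

struct widget {
    widget();
    ~widget();                   // declared only; impl is incomplete here
private:
    struct impl;                 // forward declaration (pimpl)
    std::unique_ptr<impl> pimpl;
};

// widget.cpp
struct widget::impl {
    int state = 0;
};

widget::widget() : pimpl(new impl()) {}
widget::~widget() = default;     // defaulted where impl is complete, so
                                 // unique_ptr<impl>'s deleter can be instantiated
```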
@@ -663,8 +663,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.swa_type = LLAMA_SWA_TYPE_NONE;
                     hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope
                 } else {
-                    hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED;
-                    hparams.n_swa    = 8192;
+                    hparams.swa_type                = LLAMA_SWA_TYPE_CHUNKED;
+                    hparams.n_swa                   = 8192;
+                    hparams.n_attn_temp_floor_scale = 8192;
+                    hparams.f_attn_temp_scale       = 0.1f;
                     hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
                 }
 
@@ -1626,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
@@ -2247,6 +2253,42 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
+
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
+                if (hparams.f_attn_temp_scale != 0.0f) {
+                    hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
+                    if (hparams.n_attn_temp_floor_scale == 0) {
+                        throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
+                    }
+                }
+
+                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
+                //       but may need further verification with other values
+                if (hparams.rope_yarn_log_mul != 0.0f) {
+                    float factor = 1.0f / hparams.rope_freq_scale_train;
+                    float mscale = 1.0f;
+                    float mscale_all_dims = hparams.rope_yarn_log_mul;
+                    static auto get_mscale = [](float scale, float mscale) {
+                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+                    };
+                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+                }
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    case 34: type = LLM_TYPE_8B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }
 
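For orientation on the `get_mscale` lambda added above: it is the YaRN magnitude-scaling term, `mscale(s, m) = 1` for `s <= 1` and `0.1 * m * ln(s) + 1` otherwise, and `yarn_attn_factor` is the ratio of that term evaluated at `mscale` versus `mscale_all_dims`. A small standalone sketch with made-up values (the numbers are illustrative only, not taken from any released model):

```cpp
#include <cmath>
#include <cstdio>

// Same shape as the lambda in the diff: YaRN magnitude scaling.
static float get_mscale(float scale, float mscale) {
    return scale <= 1.0f ? 1.0f : (0.1f * mscale * std::log(scale) + 1.0f);
}

int main() {
    // Illustrative values: a 4x context extension (rope_freq_scale_train = 0.25)
    // and rope_yarn_log_mul = 0.5 standing in for mscale_all_dims.
    const float factor          = 1.0f / 0.25f; // = 4.0
    const float mscale          = 1.0f;
    const float mscale_all_dims = 0.5f;

    const float yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
    // get_mscale(4, 1.0) = 0.1 * 1.0 * ln(4) + 1 ≈ 1.1386
    // get_mscale(4, 0.5) = 0.1 * 0.5 * ln(4) + 1 ≈ 1.0693
    // ratio ≈ 1.065
    printf("yarn_attn_factor = %f\n", yarn_attn_factor);
    return 0;
}
```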
@@ -2560,6 +2602,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_MISTRAL3:
             {
                 tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6487,7 +6530,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
@@ -7522,6 +7565,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique<llm_build_qwen3next>(*this, params);
             } break;
+        case LLM_ARCH_MISTRAL3:
+            {
+                llm = std::make_unique<llm_build_mistral3>(*this, params);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -7690,6 +7737,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
package/src/llama.cpp/src/llama-quant.cpp

@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
package/src/llama.cpp/src/llama-vocab.cpp

@@ -3253,8 +3253,7 @@ void llama_vocab::impl::print_info() const {
 llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
 }
 
-llama_vocab::~llama_vocab() {
-}
+llama_vocab::~llama_vocab() = default;
 
 void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
package/src/llama.cpp/src/models/deepseek2.cpp

@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);
 
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
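The `inp_attn_scale` tensor multiplied into `Qcur` above carries a per-position factor; `build_inp_attn_scale()` itself lives in llama-graph.cpp and is not part of this diff. As a rough sketch of the Llama-4-style rule it is assumed to implement (a logarithm of the position bucket controlled by the two hparams set earlier; the exact off-by-one details are unverified here):

```cpp
#include <cmath>
#include <cstdio>

// Hedged sketch, not the llama.cpp implementation: positions beyond the
// "floor scale" window get their query vectors scaled up slightly, which
// sharpens attention at long context.
static float attn_temp_scale(long long pos, unsigned n_attn_temp_floor_scale, float f_attn_temp_scale) {
    return 1.0f + f_attn_temp_scale * std::log(std::floor((float) pos / (float) n_attn_temp_floor_scale) + 1.0f);
}

int main() {
    const unsigned floor_scale = 8192; // value set for LLM_ARCH_LLAMA4 above
    const float    scale       = 0.1f; // value set for LLM_ARCH_LLAMA4 above

    const long long positions[] = {0, 8191, 8192, 65536};
    for (long long pos : positions) {
        printf("pos=%6lld scale=%.4f\n", pos, attn_temp_scale(pos, floor_scale, scale));
    }
    return 0;
}
```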
package/src/llama.cpp/src/models/mistral3.cpp

@@ -0,0 +1,160 @@
+#include "models.h"
+
+llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    // (optional) temperature tuning
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // self-attention
+        {
+            // rope freq factors for llama3; may return nullptr for llama2 and other models
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+            // compute Q and K and RoPE them
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+            cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+            cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+            cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+            );
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
+            cur = build_attn(inp_attn,
+                    model.layers[il].wo, model.layers[il].bo,
+                    Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // feed-forward network (non-MoE)
+        if (model.layers[il].ffn_gate_inp == nullptr) {
+
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
+                    model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
+                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            // MoE branch
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_moe_ffn(cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    nullptr,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    false, 0.0,
+                    LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                    il);
+            cb(cur, "ffn_moe_out", il);
+        }
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        cur = build_cvec(cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+    cur = inpL;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // lm_head
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
package/src/llama.cpp/src/models/models.h

@@ -322,6 +322,10 @@ struct llm_build_minimax_m2 : public llm_graph_context {
     llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params);
 };
 
+struct llm_build_mistral3 : public llm_graph_context {
+    llm_build_mistral3(const llama_model & model, const llm_graph_params & params);
+};
+
 struct llm_build_mpt : public llm_graph_context {
     llm_build_mpt(const llama_model & model, const llm_graph_params & params);
 };
package/src/llama.cpp/src/unicode.cpp

@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
    size_t start = 0;