@fugood/llama.node 1.4.3 → 1.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/lib/index.js +9 -0
- package/lib/index.ts +10 -0
- package/package.json +15 -15
- package/src/LlamaContext.cpp +24 -0
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/arg.cpp +19 -7
- package/src/llama.cpp/common/common.cpp +46 -2
- package/src/llama.cpp/common/common.h +7 -0
- package/src/llama.cpp/common/log.cpp +3 -26
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/include/ggml-zendnn.h +22 -0
- package/src/llama.cpp/ggml/include/ggml.h +22 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +333 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +51 -125
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +38 -11
- package/src/llama.cpp/src/llama-model.cpp +4 -0
- package/src/llama.cpp/src/llama-quant.cpp +0 -29
- package/src/llama.cpp/src/models/deepseek2.cpp +18 -0
- package/src/llama.cpp/src/unicode.cpp +2 -2
package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp:

```diff
@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }

+static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
+    return (coord + size) % size; // adding size avoids negative number weirdness
+}
+
 // ggml_compute_forward_conv_2d

+
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
                                               const ggml_tensor * kernel, // [KW, KH, IC, OC]
                                               const ggml_tensor * src,    // [W, H, C, N]
```
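The new `ggml_wrap_around` helper folds a source coordinate that has drifted below zero (or past the end) back into `[0, size)`. In C++, `%` keeps the sign of the dividend, so `-1 % 5` evaluates to `-1`; adding `size` before taking the modulo keeps the result non-negative for any `coord >= -size`, which is the "negative number weirdness" the comment refers to. A minimal standalone sketch of the same idea (illustrative only, not part of the package):

```cpp
#include <cstdint>
#include <cstdio>

// Same idea as the new ggml_wrap_around helper: fold a coordinate that may be
// slightly negative or >= size back into [0, size). Adding size first keeps
// the left operand of % non-negative for any coord >= -size.
static inline int64_t wrap_around(int64_t coord, int64_t size) {
    return (coord + size) % size;
}

int main() {
    // a row of width 5: index -1 wraps to 4, index 5 wraps back to 0
    for (int64_t c : {-1, 0, 3, 5}) {
        std::printf("coord %2lld -> %lld\n", (long long) c, (long long) wrap_around(c, 5));
    }
    return 0;
}
```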
```diff
@@ -7591,6 +7596,7 @@ void ggml_compute_forward_upscale(

 // ggml_compute_forward_pad

+template<bool circular_t>
 static void ggml_compute_forward_pad_f32(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
```
```diff
@@ -7615,23 +7621,40 @@ static void ggml_compute_forward_pad_f32(
     const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
     const int32_t rp3 = ggml_get_op_params_i32(dst, 7);

-
     // TODO: optimize

     for (int64_t i2 = 0; i2 < ne2; ++i2) {
         for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
-                        && (i1 >= lp1 && i1 < ne1 - rp1) \
-                        && (i2 >= lp2 && i2 < ne2 - rp2) \
-                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
-                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                    // circular means wrap around on a torus, so x and y loop around
+                    if constexpr (circular_t) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
+                        const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
+                        const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
+                        const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
+
+                        const int64_t src_idx =
+                            src_i3*nb03 +
+                            src_i2*nb02 +
+                            src_i1*nb01 +
+                            src_i0*nb00;
+
                         const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
-                        dst_ptr[dst_idx] = 0;
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                            && (i1 >= lp1 && i1 < ne1 - rp1) \
+                            && (i2 >= lp2 && i2 < ne2 - rp2) \
+                            && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                            const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                            const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+                            dst_ptr[dst_idx] = *src_ptr;
+                        } else {
+                            dst_ptr[dst_idx] = 0;
+                        }
                     }
                 }
             }
```
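With the new `template<bool circular_t>` parameter, the circular branch reads the source index modulo each source extent (`ne00`..`ne03`), so the padded region repeats the tensor as if it were wrapped on a torus, while the `else` branch keeps the previous behaviour: copy the interior and zero-fill everything outside it. A small 1-D sketch of the two behaviours (illustrative helper, not ggml API):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative 1-D version of the two code paths above: pad a row of n
// elements with lp elements on the left and rp on the right.
static std::vector<float> pad_1d(const std::vector<float> & src, int64_t lp, int64_t rp, bool circular) {
    const int64_t n = (int64_t) src.size();
    std::vector<float> dst(n + lp + rp);
    for (int64_t i = 0; i < (int64_t) dst.size(); ++i) {
        const int64_t s = i - lp; // position in source coordinates
        if (circular) {
            dst[i] = src[((s % n) + n) % n];            // wrap around, like ggml_wrap_around
        } else {
            dst[i] = (s >= 0 && s < n) ? src[s] : 0.0f; // zero outside the source
        }
    }
    return dst;
}

int main() {
    const std::vector<float> src = {1, 2, 3};
    for (bool circ : {false, true}) {
        std::printf("%s:", circ ? "circular" : "zero    ");
        for (float v : pad_1d(src, 2, 2, circ)) {
            std::printf(" %g", v);
        }
        std::printf("\n"); // zero: 0 0 1 2 3 0 0, circular: 2 3 1 2 3 1 2
    }
    return 0;
}
```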
```diff
@@ -7639,16 +7662,20 @@ static void ggml_compute_forward_pad_f32(
     }
 }

+
 void ggml_compute_forward_pad(
         const ggml_compute_params * params,
         ggml_tensor * dst) {
-
     const ggml_tensor * src0 = dst->src[0];
-
+    const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_pad_f32(params, dst);
+                if (circular) {
+                    ggml_compute_forward_pad_f32<true>(params, dst);
+                } else {
+                    ggml_compute_forward_pad_f32<false>(params, dst);
+                }
             } break;
         default:
             {
```
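The dispatcher reads the circular flag once, from op param index 8, and then selects one of the two template instantiations; inside each instantiation the `if constexpr (circular_t)` branch is resolved at compile time, so the hot loop carries no per-element flag check. A generic sketch of that hoist-the-branch pattern (unrelated to ggml's types):

```cpp
#include <cstdio>

// The runtime flag is read once, then the hot loop is instantiated twice and
// the branch inside it is resolved at compile time by if constexpr.
template <bool wrap>
static void fill(float * dst, int n) {
    for (int i = 0; i < n; ++i) {
        if constexpr (wrap) {
            dst[i] = (float) (i % 4); // "wrapping" variant
        } else {
            dst[i] = 0.0f;            // "zero fill" variant
        }
    }
}

static void fill_dispatch(float * dst, int n, bool wrap) {
    if (wrap) {
        fill<true>(dst, n);
    } else {
        fill<false>(dst, n);
    }
}

int main() {
    float buf[8];
    fill_dispatch(buf, 8, true);
    for (float v : buf) {
        std::printf("%g ", v); // 0 1 2 3 0 1 2 3
    }
    std::printf("\n");
    return 0;
}
```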
package/src/llama.cpp/src/llama-model.cpp:

```diff
@@ -1628,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale, false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
```
package/src/llama.cpp/src/llama-quant.cpp:

```diff
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;

     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }

-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;

@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }

     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;

```
package/src/llama.cpp/src/models/deepseek2.cpp:

```diff
@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);

+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Vcur = kv_cmpr;
             cb(Vcur, "Vcur", il);

+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
             ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
             cb(Kcur, "Kcur", il);

+            if (inp_attn_scale) {
+                // apply llama 4 temperature scaling
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                cb(Qcur, "Qcur_attn_temp_scaled", il);
+            }
+
             // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
             cur = build_attn(inp_attn,
                     model.layers[il].wo, NULL,
```
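In both MLA paths the deepseek2 graph now multiplies `Qcur` by a per-position `inp_attn_scale` tensor, but only when `f_attn_temp_scale` is non-zero, i.e. when the optional keys loaded in `llama-model.cpp` above are present. The values produced by `build_inp_attn_scale()` are not part of this diff; the sketch below assumes the Llama-4-style temperature-tuning formula that the in-code comment refers to, and only illustrates how the two new hyperparameters could combine into a per-position scale:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Assumed (not shown in this diff): a Llama-4-style per-position attention
// temperature scale built from f_attn_temp_scale and n_attn_temp_floor_scale.
// It grows logarithmically with position, in steps of the floor scale, and is
// exactly 1.0 at position 0 (a no-op for short contexts).
static float attn_temp_scale(int64_t pos, float f_attn_temp_scale, uint32_t n_attn_temp_floor_scale) {
    return std::log(std::floor((float)(pos + 1) / (float) n_attn_temp_floor_scale) + 1.0f)
               * f_attn_temp_scale + 1.0f;
}

int main() {
    for (int64_t pos : {0, 8192, 65536, 262144}) {
        std::printf("pos %6lld -> scale %.4f\n", (long long) pos, attn_temp_scale(pos, 0.1f, 8192));
    }
    return 0;
}
```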
package/src/llama.cpp/src/unicode.cpp:

```diff
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &

 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c

 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
```