@fugood/llama.node 1.0.0-beta.6 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +12 -0
- package/lib/index.js +10 -0
- package/lib/index.ts +17 -1
- package/package.json +14 -14
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +7 -3
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +49 -6
- package/src/LlamaContext.h +1 -0
- package/src/RerankWorker.h +26 -0
- package/src/common.hpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/src/llama.cpp/include/llama.h +6 -3
- package/src/llama.cpp/src/llama-arch.cpp +54 -0
- package/src/llama.cpp/src/llama-arch.h +17 -0
- package/src/llama.cpp/src/llama-batch.cpp +20 -7
- package/src/llama.cpp/src/llama-chat.cpp +11 -6
- package/src/llama.cpp/src/llama-context.cpp +0 -1
- package/src/llama.cpp/src/llama-graph.cpp +19 -4
- package/src/llama.cpp/src/llama-graph.h +14 -2
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
- package/src/llama.cpp/src/llama-kv-cells.h +33 -9
- package/src/llama.cpp/src/llama-model.cpp +518 -1
- package/src/llama.cpp/src/llama-model.h +22 -0
- package/src/llama.cpp/src/llama-quant.cpp +87 -5
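The only per-file diff reproduced below is package/src/llama.cpp/src/llama-quant.cpp (+87 -5), which adds layer pruning during quantization: a new prune_layers field on llama_model_quantize_params lets the caller drop whole transformer blocks (blk.N.*) while quantizing, with the remaining blocks renumbered and any imatrix entries remapped to match. A minimal caller-side sketch (not part of this package) follows, assuming prune_layers is a raw pointer that is read back as const std::vector<int> *, as the static_cast in the diff below implies:

// Hypothetical caller-side sketch, not part of the package: quantize a GGUF model
// while pruning blocks 20 and 21. Names follow the public llama.cpp API; the
// prune_layers field is assumed to be consumed as const std::vector<int> *,
// matching the cast in llama-quant.cpp below.
#include <vector>
#include "llama.h"

int main() {
    std::vector<int> prune = {20, 21};  // block indices to drop, ascending

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    params.prune_layers = &prune;

    // returns 0 on success
    return llama_model_quantize("model-f16.gguf", "model-q4_k_m-pruned.gguf", &params);
}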
package/src/llama.cpp/src/llama-quant.cpp

@@ -1,5 +1,4 @@
 #include "llama-quant.h"
-
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
+    if (prune.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const int blk = std::stoi(match[1]);
+        std::string new_name = orig_name;
+
+        if (mapped.count(blk)) {
+            // Already mapped, do nothing
+        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
+            mapped[blk] = "";
+        } else if (blk < prune.front()) {
+            mapped[blk] = std::to_string(blk);
+            next_id = blk + 1;
+        } else {
+            mapped[blk] = std::to_string(next_id);
+            ++next_id;
+        }
+
+        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
+    }
+
+    return orig_name;
+}
+
+static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
+    if (mapped.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const std::string blk(match[1]);
+        std::string new_name = orig_name;
+
+        for (const auto & p : mapped) {
+            if (p.second == blk) {
+                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
+                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
+            }
+        }
+        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
+    }
+
+    return orig_name;
+}
+
 struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
-    } else if (name == "token_embd.weight") {
+    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
         } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
+    std::vector<int> prune_list = {};
+    if (params->prune_layers) {
+        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+    }
+
     // copy the KV pairs from the input file
     gguf_set_kv (ctx_out.get(), ml.meta.get());
     gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }
 
+    std::map<int, std::string> mapped;
+    int blk_id = 0;
+    int pruned_attention_w = 0;
+
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
     tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
+        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
+        if (remapped_name.empty()) {
+            if (it.first.find("attn_v.weight") != std::string::npos ||
+                it.first.find("attn_qkv.weight") != std::string::npos ||
+                it.first.find("attn_kv_b.weight") != std::string::npos) {
+                pruned_attention_w++;
+            }
+            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
+            continue;
+        } else if (remapped_name != it.first) {
+            ggml_set_name(it.second.tensor, remapped_name.c_str());
+            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+        }
         tensors.push_back(&it.second);
     }
+    if (!prune_list.empty()) {
+        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
+    }
 
     // keep_split requires that the weights are sorted by split index
     if (params->keep_split) {
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (llama_model_has_encoder(&model)) {
             n_attn_layer *= 3;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
             gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
             gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
-            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(),
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
         }
     }
 
@@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
+        // these are very small (e.g. 4x4)
+        quantize &= name.find("altup") == std::string::npos;
+        quantize &= name.find("laurel") == std::string::npos;
+
+        // these are not too big so keep them as it is
+        quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -832,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         const float * imatrix = nullptr;
         if (imatrix_data) {
-            auto it = imatrix_data->find(tensor->name);
+            auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
             if (it == imatrix_data->end()) {
                 LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
             } else {
@@ -947,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.imatrix =*/ nullptr,
        /*.kv_overrides =*/ nullptr,
        /*.tensor_type =*/ nullptr,
+        /*.prune_layers =*/ nullptr
     };
 
     return result;
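
To make the renumbering behaviour easier to see in isolation, here is a self-contained sketch that reproduces remap_layer() from the diff above (with a few explanatory comments added) and drives it with a small main(). The surrounding program is illustrative only and is not part of the package.

// Standalone sketch of the block-remapping logic introduced in llama-quant.cpp.
// Compile with C++17 (the if-with-initializer form is used).
#include <algorithm>
#include <iostream>
#include <map>
#include <regex>
#include <string>
#include <vector>

static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune,
                               std::map<int, std::string> & mapped, int & next_id) {
    if (prune.empty()) {
        return orig_name;
    }

    static const std::regex pattern(R"(blk\.(\d+)\.)");
    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
        const int blk = std::stoi(match[1]);
        std::string new_name = orig_name;

        if (mapped.count(blk)) {
            // already decided for this block
        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
            mapped[blk] = "";                       // pruned block -> tensor is dropped
        } else if (blk < prune.front()) {
            mapped[blk] = std::to_string(blk);      // blocks before the first pruned one keep their id
            next_id = blk + 1;
        } else {
            mapped[blk] = std::to_string(next_id);  // later blocks are renumbered contiguously
            ++next_id;
        }

        return mapped[blk].empty() ? mapped[blk]
                                   : new_name.replace(match.position(1), match.length(1), mapped[blk]);
    }

    return orig_name;
}

int main() {
    // prune blocks 1 and 2 of a 5-block model
    const std::vector<int> prune = {1, 2};
    std::map<int, std::string> mapped;
    int next_id = 0;

    for (int i = 0; i < 5; ++i) {
        const std::string name = "blk." + std::to_string(i) + ".attn_v.weight";
        const std::string out  = remap_layer(name, prune, mapped, next_id);
        std::cout << name << " -> " << (out.empty() ? "(pruned)" : out) << "\n";
    }
}

With prune = {1, 2} this keeps blk.0 unchanged, marks blk.1 and blk.2 as pruned, and renumbers blk.3 and blk.4 to blk.1 and blk.2 — the contiguous layout the quantizer then writes out as remapped tensor names together with an updated LLM_KV_BLOCK_COUNT, while remap_imatrix() translates the new block ids back to the original ones when looking up importance-matrix data.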