@fugood/llama.node 1.0.0-beta.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/CMakeLists.txt +2 -0
  2. package/lib/binding.ts +12 -0
  3. package/lib/index.js +10 -0
  4. package/lib/index.ts +17 -1
  5. package/package.json +14 -14
  6. package/src/EmbeddingWorker.cpp +1 -1
  7. package/src/LlamaCompletionWorker.cpp +7 -3
  8. package/src/LlamaCompletionWorker.h +2 -0
  9. package/src/LlamaContext.cpp +49 -6
  10. package/src/LlamaContext.h +1 -0
  11. package/src/RerankWorker.h +26 -0
  12. package/src/common.hpp +1 -1
  13. package/src/llama.cpp/CMakeLists.txt +1 -1
  14. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  15. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  16. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +8 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +13 -12
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  28. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  29. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +59 -16
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +48 -48
  35. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +15 -14
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  40. package/src/llama.cpp/include/llama.h +6 -3
  41. package/src/llama.cpp/src/llama-arch.cpp +54 -0
  42. package/src/llama.cpp/src/llama-arch.h +17 -0
  43. package/src/llama.cpp/src/llama-batch.cpp +20 -7
  44. package/src/llama.cpp/src/llama-chat.cpp +11 -6
  45. package/src/llama.cpp/src/llama-context.cpp +0 -1
  46. package/src/llama.cpp/src/llama-graph.cpp +19 -4
  47. package/src/llama.cpp/src/llama-graph.h +14 -2
  48. package/src/llama.cpp/src/llama-hparams.h +6 -0
  49. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +28 -2
  50. package/src/llama.cpp/src/llama-kv-cells.h +33 -9
  51. package/src/llama.cpp/src/llama-model.cpp +518 -1
  52. package/src/llama.cpp/src/llama-model.h +22 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +87 -5
package/src/llama.cpp/src/llama-quant.cpp
@@ -1,5 +1,4 @@
 #include "llama-quant.h"
-
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-model-loader.h"
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
+static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
+    if (prune.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const int blk = std::stoi(match[1]);
+        std::string new_name = orig_name;
+
+        if (mapped.count(blk)) {
+            // Already mapped, do nothing
+        } else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
+            mapped[blk] = "";
+        } else if (blk < prune.front()) {
+            mapped[blk] = std::to_string(blk);
+            next_id = blk + 1;
+        } else {
+            mapped[blk] = std::to_string(next_id);
+            ++next_id;
+        }
+
+        return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
+    }
+
+    return orig_name;
+}
+
+static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
+    if (mapped.empty()) {
+        return orig_name;
+    }
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
+        const std::string blk(match[1]);
+        std::string new_name = orig_name;
+
+        for (const auto & p : mapped) {
+            if (p.second == blk) {
+                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
+                return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
+            }
+        }
+        GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
+    }
+
+    return orig_name;
+}
+
 struct quantize_state_impl {
     const llama_model & model;
     const llama_model_quantize_params * params;
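The two helpers added above implement layer pruning during quantization: remap_layer renumbers the surviving blk.N tensor names so the output model stays contiguous, and remap_imatrix maps the new names back to the original block numbers when importance-matrix data is looked up. As a rough, standalone sketch of the renumbering rule (not code from this package; remap_layer is file-local, so remap_like below only mirrors its behaviour), pruning blocks {1, 3} of a 5-block model works out like this:

    // Standalone sketch: mirrors the renumbering rule of remap_layer above.
    // Assumes a non-empty, ascending prune list (the real helper returns early when empty).
    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <regex>
    #include <string>
    #include <vector>

    static std::string remap_like(const std::string & name, const std::vector<int> & prune,
                                  std::map<int, std::string> & mapped, int & next_id) {
        static const std::regex pattern(R"(blk\.(\d+)\.)");
        std::smatch match;
        if (!std::regex_search(name, match, pattern)) {
            return name;
        }
        const int blk = std::stoi(match[1]);
        if (!mapped.count(blk)) {
            if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
                mapped[blk] = "";                        // pruned: caller skips this tensor
            } else if (blk < prune.front()) {
                mapped[blk] = std::to_string(blk);       // before the first pruned block: unchanged
                next_id = blk + 1;
            } else {
                mapped[blk] = std::to_string(next_id++); // after a gap: renumber to stay contiguous
            }
        }
        std::string out = name;
        return mapped[blk].empty() ? "" : out.replace(match.position(1), match.length(1), mapped[blk]);
    }

    int main() {
        std::vector<int> prune = {1, 3};
        std::map<int, std::string> mapped;
        int next_id = 0;
        for (int i = 0; i < 5; ++i) {
            const std::string name = "blk." + std::to_string(i) + ".attn_norm.weight";
            const std::string remapped = remap_like(name, prune, mapped, next_id);
            std::printf("%-26s -> %s\n", name.c_str(), remapped.empty() ? "(pruned)" : remapped.c_str());
        }
        // Expected: blk.0 kept as-is, blk.1 and blk.3 pruned, blk.2 -> blk.1, blk.4 -> blk.2
    }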
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                new_type = GGML_TYPE_Q6_K;
            }
        }
-    } else if (name == "token_embd.weight") {
+    } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };
 
+    std::vector<int> prune_list = {};
+    if (params->prune_layers) {
+        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+    }
+
    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
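The prune list reaches this function through the opaque prune_layers pointer on llama_model_quantize_params and is read back as a const std::vector<int> *, as the cast above shows. A minimal, hedged usage sketch (the file names and chosen ftype are illustrative, not taken from this package):

    // Sketch: requesting layer pruning alongside quantization via the public API.
    #include <vector>
    #include "llama.h"

    int main() {
        std::vector<int> prune = {20, 21, 22};              // block indices to drop

        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        qparams.prune_layers = &prune;                      // consumed inside llama_model_quantize_impl

        // returns 0 on success
        return llama_model_quantize("model-f16.gguf", "model-q4_k_m-pruned.gguf", &qparams) ? 1 : 0;
    }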
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        }
    }
 
+    std::map<int, std::string> mapped;
+    int blk_id = 0;
+    int pruned_attention_w = 0;
+
    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
+        const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
+        if (remapped_name.empty()) {
+            if (it.first.find("attn_v.weight") != std::string::npos ||
+                it.first.find("attn_qkv.weight") != std::string::npos ||
+                it.first.find("attn_kv_b.weight") != std::string::npos) {
+                pruned_attention_w++;
+            }
+            LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
+            continue;
+        } else if (remapped_name != it.first) {
+            ggml_set_name(it.second.tensor, remapped_name.c_str());
+            LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+        }
        tensors.push_back(&it.second);
    }
+    if (!prune_list.empty()) {
+        gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
+    }
 
    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
    }
 
    size_t total_size_org = 0;
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
-            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
        }
    }
 
@@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
+        // these are very small (e.g. 4x4)
+        quantize &= name.find("altup") == std::string::npos;
+        quantize &= name.find("laurel") == std::string::npos;
+
+        // these are not too big so keep them as it is
+        quantize &= name.find("per_layer_model_proj") == std::string::npos;
+
        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
@@ -832,7 +913,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
        const float * imatrix = nullptr;
        if (imatrix_data) {
-            auto it = imatrix_data->find(tensor->name);
+            auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
            if (it == imatrix_data->end()) {
                LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
            } else {
@@ -947,6 +1028,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
        /*.imatrix      =*/ nullptr,
        /*.kv_overrides =*/ nullptr,
        /*.tensor_type  =*/ nullptr,
+        /*.prune_layers =*/ nullptr
    };
 
    return result;