@fugood/llama.node 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +423 -186
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +23 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/sampling.cpp +1 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  19. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  31. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  42. package/src/llama.cpp/include/llama.h +23 -11
  43. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  44. package/src/llama.cpp/src/llama-arch.h +22 -0
  45. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  46. package/src/llama.cpp/src/llama-context.cpp +157 -0
  47. package/src/llama.cpp/src/llama-context.h +10 -0
  48. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  49. package/src/llama.cpp/src/llama-graph.h +10 -1
  50. package/src/llama.cpp/src/llama-hparams.h +17 -2
  51. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
  52. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  53. package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
  54. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  55. package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  56. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  57. package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
  58. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  59. package/src/llama.cpp/src/llama-memory.h +3 -0
  60. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  61. package/src/llama.cpp/src/llama-model.cpp +582 -45
  62. package/src/llama.cpp/src/llama-model.h +23 -1
  63. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  64. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  65. package/src/llama.cpp/src/llama-vocab.h +41 -40
  66. package/src/llama.cpp/src/unicode.h +43 -0
@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
@@ -2331,6 +2346,12 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
+    if (params.pooling_type != model->hparams.pooling_type) {
+        //user-specified pooling-type is different from the model default
+        LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
+                model->hparams.pooling_type, params.pooling_type);
+    }
+
     try {
         auto * ctx = new llama_context(*model, params);
         return ctx;
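This warning fires whenever the caller overrides the model's default pooling type. A minimal sketch of a call that would trigger it, using the public llama.h entry points and a placeholder model path:

    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path

        llama_context_params cparams = llama_context_default_params();
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN; // differs from the model default -> LLAMA_LOG_WARN above

        llama_context * ctx = llama_init_from_model(model, cparams);
        // ... use ctx ...
        llama_free(ctx);
        llama_model_free(model);
        return 0;
    }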
@@ -2765,6 +2786,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            " - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            " - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            " - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
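The new printer is the public entry point for the breakdown collected above; a usage sketch, assuming it is exposed through llama.h (which this release also modifies):

    // after creating a context, log per-device memory usage: one row per device,
    // plus Host and any remaining buffer types, with total / free /
    // self = model + context + compute and an "unaccounted" remainder
    void log_memory(llama_context * ctx) {
        llama_memory_breakdown_print(ctx);
    }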
@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
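Inside the library, the map returned by `memory_breakdown()` can also be consumed directly; a minimal internal sketch (not public API), using only the struct introduced above and existing ggml-backend helpers:

    void log_breakdown(const llama_context & lctx) {
        for (const auto & it : lctx.memory_breakdown()) {
            const llama_memory_breakdown_data & mb = it.second;
            const size_t self = mb.model + mb.context + mb.compute;
            LLAMA_LOG_INFO("%s: %zu bytes (model=%zu, context=%zu, compute=%zu)\n",
                    ggml_backend_buft_name(it.first), self, mb.model, mb.context, mb.compute);
        }
    }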
@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
     std::vector<int> target_pos(n_seqs_unq, -1);
     std::vector<int> target_row(n_seqs_unq, -1);
 
-    bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
+    const bool last = (
+        cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
+        (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+    );
 
     for (int i = 0; i < n_tokens; ++i) {
         const llama_pos pos = ubatch->pos[i];
@@ -920,15 +923,29 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         selection_probs = logits;
     }
 
+    if (arch == LLM_ARCH_GROVEMOE) {
+        selection_probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_biased", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
-    ggml_tensor * weights = ggml_get_rows(ctx0,
-            ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
+    if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
+        // TODO: Use scalar div instead when/if implemented
+        ggml_tensor * f_sel = ggml_cast(ctx0, selected_experts, GGML_TYPE_F32);
+        selected_experts = ggml_cast(ctx0, ggml_scale(ctx0, f_sel, 1.0f / float(hparams.n_group_experts)), GGML_TYPE_I32);
+        probs = ggml_reshape_3d(ctx0, probs, 1, hparams.n_expert, n_tokens);
+    } else {
+        probs = ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens);
+    }
+
+    ggml_tensor * weights = ggml_get_rows(ctx0, probs, selected_experts); // [1, n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights", il);
 
+
     if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
         weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens);
         weights = ggml_soft_max(ctx0, weights); // [n_expert_used, n_tokens]
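For GroveMoE, the routed expert indices are remapped to the smaller set of chunk experts by dividing by `n_group_experts`; because a scalar integer divide is not used here (see the TODO above), the graph emulates it with a cast to F32, a scale by `1 / n_group_experts`, and a cast back to I32. A conceptual sketch of the intended remapping, with illustrative values not taken from the diff:

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> experts_to_chunks(const std::vector<int32_t> & selected_experts, int32_t n_group_experts) {
        std::vector<int32_t> chunk_ids;
        chunk_ids.reserve(selected_experts.size());
        for (int32_t e : selected_experts) {
            chunk_ids.push_back(e / n_group_experts); // e.g. experts 0..3 -> chunk 0 when n_group_experts == 4
        }
        return chunk_ids;
    }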
@@ -952,6 +969,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(weights, "ffn_moe_weights_scaled", il);
     }
 
+    //call early so that topk-moe can be used
+    ggml_build_forward_expand(gf, weights);
+
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
 
     if (weight_before_ffn) {
@@ -1177,7 +1197,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_cls() const {
-    auto inp = std::make_unique<llm_graph_input_cls>(cparams);
+    auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
 
     auto & cur = inp->cls;
 
@@ -1833,6 +1853,23 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
     return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp));
 }
 
+void llm_graph_context::build_dense_out(
+        ggml_tensor * dense_2,
+        ggml_tensor * dense_3) const {
+    if (!cparams.embeddings || dense_2 == nullptr || dense_3 == nullptr) {
+        return;
+    }
+    ggml_tensor * cur = res->t_embd_pooled != nullptr ? res->t_embd_pooled : res->t_embd;
+    GGML_ASSERT(cur != nullptr && "missing t_embd_pooled/t_embd");
+
+    cur = ggml_mul_mat(ctx0, dense_2, cur);
+    cur = ggml_mul_mat(ctx0, dense_3, cur);
+    cb(cur, "result_embd_pooled", -1);
+    res->t_embd_pooled = cur;
+    ggml_build_forward_expand(gf, cur);
+}
+
+
 void llm_graph_context::build_pooling(
         ggml_tensor * cls,
         ggml_tensor * cls_b,
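`build_dense_out` applies the two sentence-transformers Dense projections to the pooled embedding, i.e. `out = dense_3 · (dense_2 · pooled)`. Assuming the usual ggml_mul_mat shape convention and the `dense_*_feat_*` hparams added later in this diff, the shapes chain as:

    pooled             : [n_embd, n_seqs]
    dense_2            : [dense_2_feat_in = n_embd, dense_2_feat_out]
    dense_2 · pooled   : [dense_2_feat_out, n_seqs]
    dense_3            : [dense_3_feat_in = dense_2_feat_out, dense_3_feat_out]
    result_embd_pooled : [dense_3_feat_out, n_seqs]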
@@ -1877,34 +1914,32 @@ void llm_graph_context::build_pooling(
         case LLAMA_POOLING_TYPE_RANK:
             {
                 ggml_tensor * inp_cls = build_inp_cls();
-                inp = ggml_get_rows(ctx0, inp, inp_cls);
+                cur = ggml_get_rows(ctx0, inp, inp_cls);
 
+                // classification head
+                // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
                 if (cls) {
-                    // classification head
-                    // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
-                    cur = ggml_mul_mat(ctx0, cls, inp);
+                    cur = ggml_mul_mat(ctx0, cls, cur);
                     if (cls_b) {
                         cur = ggml_add(ctx0, cur, cls_b);
                     }
                     cur = ggml_tanh(ctx0, cur);
+                }
 
-                    // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-                    // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
-                    if (cls_out) {
-                        cur = ggml_mul_mat(ctx0, cls_out, cur);
-                        if (cls_out_b) {
-                            cur = ggml_add(ctx0, cur, cls_out_b);
-                        }
-                    }
-                } else if (cls_out) {
-                    // Single layer classification head (direct projection)
-                    // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
-                    cur = ggml_mul_mat(ctx0, cls_out, inp);
+                // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
+                // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
+                // Single layer classification head (direct projection)
+                // https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
+                if (cls_out) {
+                    cur = ggml_mul_mat(ctx0, cls_out, cur);
                     if (cls_out_b) {
                         cur = ggml_add(ctx0, cur, cls_out_b);
                     }
-                } else {
-                    GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
+                }
+
+                // softmax for qwen3 reranker
+                if (arch == LLM_ARCH_QWEN3) {
+                    cur = ggml_soft_max(ctx0, cur);
                 }
             } break;
         default:
@@ -206,7 +206,7 @@ public:
 
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
-    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
+    llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
     virtual ~llm_graph_input_cls() = default;
 
     void set_input(const llama_ubatch * ubatch) override;
@@ -214,6 +214,7 @@ public:
     ggml_tensor * cls; // I32 [n_batch]
 
     const llama_cparams cparams;
+    const llm_arch arch;
 };
 
 class llm_graph_input_rs : public llm_graph_input_i {
@@ -813,6 +814,14 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    //
+    // dense (out)
+    //
+
+    void build_dense_out(
+            ggml_tensor * dense_2,
+            ggml_tensor * dense_3) const;
 };
 
 // TODO: better name
@@ -42,7 +42,7 @@ struct llama_hparams {
     uint32_t n_embd;
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
-    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
+    int32_t  n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
     uint32_t n_rot;
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -69,10 +69,13 @@ struct llama_hparams {
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
     uint32_t n_ff_shexp = 0;
+    uint32_t n_ff_chexp = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups = 0;
+    uint32_t n_group_experts = 0;
 
-    float expert_weights_scale = 0.0;
+    float expert_group_scale = 0.05f;
+    float expert_weights_scale = 0.0f;
     bool expert_weights_norm = false;
     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
     uint32_t moe_every_n_layers = 0;
@@ -166,6 +169,18 @@ struct llama_hparams {
     uint32_t laurel_rank = 64;
     uint32_t n_embd_altup = 256;
 
+    // needed for sentence-transformers dense layers
+    uint32_t dense_2_feat_in = 0;  // in_features of the 2_Dense
+    uint32_t dense_2_feat_out = 0; // out_features of the 2_Dense
+    uint32_t dense_3_feat_in = 0;  // in_features of the 3_Dense
+    uint32_t dense_3_feat_out = 0; // out_features of the 3_Dense
+
+    // xIELU
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_p;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_beta;
+    std::array<float, LLAMA_MAX_LAYERS> xielu_eps;
+
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 
@@ -212,7 +220,7 @@ bool llama_kv_cache_iswa::get_can_shift() const {
 }
 
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
        kv_base->state_write(io, seq_id, flags);
     }
 
@@ -220,7 +228,7 @@ void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id
 }
 
 void llama_kv_cache_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
        kv_base->state_read(io, seq_id, flags);
     }
 
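The rename from `LLAMA_STATE_SEQ_FLAGS_SWA_ONLY` to `LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY` reflects that the flag now covers more than SWA caches (the hybrid memory below honours it too). A hedged usage sketch, assuming the `_ext` sequence-state functions already declared in llama.h:

    #include "llama.h"
    #include <vector>

    // save only the partial (e.g. SWA) part of a sequence's state
    std::vector<uint8_t> save_partial_seq_state(llama_context * ctx, llama_seq_id seq_id) {
        const size_t n = llama_state_seq_get_size_ext(ctx, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        std::vector<uint8_t> buf(n);
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        return buf;
    }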
@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -123,11 +123,8 @@ llama_kv_cache::llama_kv_cache(
            throw std::runtime_error("failed to create ggml context for kv cache");
         }
 
-        ggml_tensor * k;
-        ggml_tensor * v;
-
-        k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
-        v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
+        ggml_tensor * k = ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream);
+        ggml_tensor * v = ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream);
 
         ggml_format_name(k, "cache_k_l%d", il);
         ggml_format_name(v, "cache_v_l%d", il);
@@ -473,6 +470,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch, false);
+                // TODO: non-sequential equal split can be done if using unified KV cache
+                // for simplicity, we always use sequential equal split for now
+                ubatch = balloc.split_equal(n_ubatch, true);
             }
 
             if (ubatch.n_tokens == 0) {
@@ -166,18 +168,26 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    GGML_UNUSED(flags);
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
 
-    mem_attn->state_write(io, seq_id);
-    mem_recr->state_write(io, seq_id);
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }
 
 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    GGML_UNUSED(flags);
-
-    mem_attn->state_read(io, seq_id);
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -136,6 +136,7 @@ void llama_memory_recurrent::clear(bool data) {
 }
 
 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling `llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;
 
     if (p0 < 0) {
@@ -156,7 +157,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
             // partial intersection is invalid
-            if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+            if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared
@@ -167,6 +169,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
            return false;
        }
    }
@@ -359,6 +362,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();
@@ -371,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch, false);
+                // TODO: non-sequential equal split can be done if using unified KV cache
+                // for simplicity, we always use sequential equal split for now
+                ubatch = balloc.split_equal(n_ubatch, true);
             }
 
             if (ubatch.n_tokens == 0) {
@@ -848,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
-
         seq_rm(dest_seq_id, -1, -1);
 
+        if (cell_count == 0) {
+            return true;
+        }
+
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
 
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there
@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 
@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //
@@ -465,6 +465,8 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
+
 
 llama_model_loader::llama_model_loader(
         const std::string & fname,
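The added `std::array<float, 512>` instantiation lets `get_key_or_arr` read per-layer float arrays (such as the xIELU parameters introduced in llama-hparams.h above) the same way the existing int/uint32 instantiations are used. A sketch of the call shape; `LLM_KV_XIELU_ALPHA_N` is a hypothetical key name, not taken from this diff:

    // inside hparams loading (sketch; key name is hypothetical)
    std::array<float, LLAMA_MAX_LAYERS> xielu_alpha_n = {};
    ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, xielu_alpha_n, hparams.n_layer, /*required =*/ false);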