@fugood/llama.node 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +37 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +20 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  60. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  61. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  62. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  63. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  64. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
  65. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  66. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
  67. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  68. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  69. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  70. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  71. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  72. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
  73. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  74. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  75. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  76. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  78. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  79. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  82. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
  83. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  84. package/src/llama.cpp/include/llama.h +86 -22
  85. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  86. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  87. package/src/llama.cpp/src/llama-adapter.h +11 -9
  88. package/src/llama.cpp/src/llama-arch.cpp +103 -16
  89. package/src/llama.cpp/src/llama-arch.h +18 -0
  90. package/src/llama.cpp/src/llama-batch.h +2 -2
  91. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  92. package/src/llama.cpp/src/llama-context.h +214 -77
  93. package/src/llama.cpp/src/llama-cparams.h +1 -0
  94. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  95. package/src/llama.cpp/src/llama-graph.h +574 -0
  96. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  97. package/src/llama.cpp/src/llama-hparams.h +9 -0
  98. package/src/llama.cpp/src/llama-io.cpp +15 -0
  99. package/src/llama.cpp/src/llama-io.h +35 -0
  100. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  101. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  102. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  103. package/src/llama.cpp/src/llama-memory.h +21 -0
  104. package/src/llama.cpp/src/llama-model.cpp +8244 -173
  105. package/src/llama.cpp/src/llama-model.h +34 -1
  106. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  107. package/src/llama.cpp/src/llama.cpp +51 -9984
  108. package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
  109. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  110. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -1,8 +1,4 @@
  find_package (Threads REQUIRED)
- find_program(GLSLC_EXECUTABLE glslc)
- if(NOT GLSLC_EXECUTABLE)
- message(FATAL_ERROR "glslc not found.")
- endif()

  set(TARGET vulkan-shaders-gen)
  add_executable(${TARGET} vulkan-shaders-gen.cpp)
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -426,14 +426,16 @@ void process_shaders() {
  }
  }

- string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
- string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+ string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
+ string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
+ string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});

  // Norms
  string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
  string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
  string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
  string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+ string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));

  string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
  string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
@@ -444,6 +446,7 @@ void process_shaders() {

  for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
  string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+ string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
  string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
  }

@@ -528,6 +531,8 @@ void process_shaders() {

  string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

+ string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));
+
  string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

  for (auto &c : compiles) {
package/src/llama.cpp/ggml/src/ggml.c
@@ -929,6 +929,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "RMS_NORM",
  "RMS_NORM_BACK",
  "GROUP_NORM",
+ "L2_NORM",

  "MUL_MAT",
  "MUL_MAT_ID",
@@ -977,6 +978,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "ADD_REL_POS",
  "RWKV_WKV6",
  "GATED_LINEAR_ATTN",
+ "RWKV_WKV7",

  "UNARY",

@@ -996,7 +998,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  "OPT_STEP_ADAMW",
  };

- static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+ static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "none",
@@ -1026,6 +1028,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "rms_norm(x)",
  "rms_norm_back(x)",
  "group_norm(x)",
+ "l2_norm(x)",

  "X*Y",
  "X[i]*Y",
@@ -1074,6 +1077,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "add_rel_pos(x)",
  "rwkv_wkv6(k, v, r, tf, td, s)",
  "gated_linear_attn(k, v, q, gate, s)",
+ "rwkv_wkv7(r, w, k, v, a, b, s)",

  "unary(x)",

@@ -1093,7 +1097,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  "adamw(x)",
  };

- static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
+ static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

  static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -2686,6 +2690,37 @@ struct ggml_tensor * ggml_group_norm_inplace(
  return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
  }

+ // ggml_l2_norm
+
+ static struct ggml_tensor * ggml_l2_norm_impl(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps,
+ bool inplace) {
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+ ggml_set_op_params_f32(result, 0, eps);
+
+ result->op = GGML_OP_L2_NORM;
+ result->src[0] = a;
+
+ return result;
+ }
+
+ struct ggml_tensor * ggml_l2_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_l2_norm_impl(ctx, a, eps, false);
+ }
+
+ struct ggml_tensor * ggml_l2_norm_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ float eps) {
+ return ggml_l2_norm_impl(ctx, a, eps, true);
+ }
+
  // ggml_mul_mat

  static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
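Note: the hunk above introduces GGML_OP_L2_NORM together with the ggml_l2_norm / ggml_l2_norm_inplace builders. The following is a hypothetical, minimal usage sketch that is not part of the package; the demo function, tensor sizes, and the location of the CPU graph helpers in ggml-cpu.h are assumptions based on current ggml headers.

    // hypothetical sketch: build and run a tiny graph that L2-normalizes each row
    #include "ggml.h"
    #include "ggml-cpu.h" // assumed location of ggml_graph_compute_with_ctx

    static void l2_norm_demo(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // 4 rows of 8 elements, all set to 2.0f
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
        float * xd = (float *) x->data;
        for (int64_t i = 0; i < ggml_nelements(x); ++i) {
            xd[i] = 2.0f;
        }

        // eps guards against division by zero, mirroring the other norm builders
        struct ggml_tensor * y = ggml_l2_norm(ctx, x, 1e-12f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        // assuming normalization runs along the first dimension, every element
        // of y is now 2 / sqrt(8 * 2*2) = 1/sqrt(8), roughly 0.354
        ggml_free(ctx);
    }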
@@ -4720,6 +4755,54 @@ struct ggml_tensor * ggml_gated_linear_attn(
  return result;
  }

+ // ggml_rwkv_wkv7
+
+ struct ggml_tensor * ggml_rwkv_wkv7(
+ struct ggml_context * ctx,
+ struct ggml_tensor * r,
+ struct ggml_tensor * w,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * state) {
+ GGML_ASSERT(ggml_is_contiguous(r));
+ GGML_ASSERT(ggml_is_contiguous(w));
+ GGML_ASSERT(ggml_is_contiguous(k));
+ GGML_ASSERT(ggml_is_contiguous(v));
+ GGML_ASSERT(ggml_is_contiguous(a));
+ GGML_ASSERT(ggml_is_contiguous(b));
+ GGML_ASSERT(ggml_is_contiguous(state));
+
+ const int64_t S = k->ne[0];
+ const int64_t H = k->ne[1];
+ const int64_t n_tokens = k->ne[2];
+ const int64_t n_seqs = state->ne[1];
+ {
+ GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens);
+ GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens);
+ GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+ GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens);
+ GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens);
+ GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
+ }
+
+ // concat output and new_state
+ const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+
+ result->op = GGML_OP_RWKV_WKV7;
+ result->src[0] = r;
+ result->src[1] = w;
+ result->src[2] = k;
+ result->src[3] = v;
+ result->src[4] = a;
+ result->src[5] = b;
+ result->src[6] = state;
+
+ return result;
+ }
+
  // ggml_unary

  static struct ggml_tensor * ggml_unary_impl(
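The shape contract of the new GGML_OP_RWKV_WKV7 operator can be read off the asserts above: the per-token inputs are [S, H, n_tokens] with head size S and H heads, the recurrent state carries S*S*H values per sequence, and the result packs the per-token output and the updated state into a single [S*H, n_tokens + S*n_seqs] tensor. A hypothetical shape sketch follows; it is not from the package, assumes an already initialized ggml_context, and uses illustrative sizes only.

    #include "ggml.h"

    // hypothetical helper: allocate WKV7 inputs of the expected shapes and build the op
    static struct ggml_tensor * wkv7_shape_demo(struct ggml_context * ctx) {
        enum { S = 64, H = 8, T = 4, N_SEQS = 1 }; // head size, heads, tokens, sequences

        // per-token inputs: [S, H, T]
        struct ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);
        struct ggml_tensor * w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);
        struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);
        struct ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);
        struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);
        struct ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, T);

        // recurrent state: S*S*H values per sequence
        struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S*S*H, N_SEQS);

        // result is [S*H, T + S*N_SEQS]: the first T columns hold the per-token
        // output, the remaining S*N_SEQS columns hold the updated state
        return ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s);
    }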
package/src/llama.cpp/include/llama.h
@@ -60,6 +60,7 @@ extern "C" {
  struct llama_model;
  struct llama_context;
  struct llama_sampler;
+ struct llama_kv_cache;

  typedef int32_t llama_pos;
  typedef int32_t llama_token;
@@ -469,7 +470,8 @@ extern "C" {
  DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

  LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
- LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+ LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
+ LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type

  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
@@ -586,7 +588,7 @@ extern "C" {
  // KV cache
  //

- // TODO: remove llama_kv_cache_view_* API
+ // TODO: start using struct llama_kv_cache

  // Information associated with an individual cell in the KV cache view.
  struct llama_kv_cache_view_cell {
@@ -641,13 +643,19 @@ extern "C" {

  // Returns the number of tokens in the KV cache (slow, use only for debug)
  // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
- LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+ LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
+
+ DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
+ "use llama_kv_self_n_tokens instead");

  // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
- LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+ LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
+
+ DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
+ "use llama_kv_self_used_cells instead");

  // Clear the KV cache - both cell info is erased and KV data is zeroed
- LLAMA_API void llama_kv_cache_clear(
+ LLAMA_API void llama_kv_self_clear(
  struct llama_context * ctx);

  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -655,7 +663,7 @@ extern "C" {
  // seq_id < 0 : match any sequence
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API bool llama_kv_cache_seq_rm(
+ LLAMA_API bool llama_kv_self_seq_rm(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
@@ -665,7 +673,7 @@ extern "C" {
  // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_cache_seq_cp(
+ LLAMA_API void llama_kv_self_seq_cp(
  struct llama_context * ctx,
  llama_seq_id seq_id_src,
  llama_seq_id seq_id_dst,
@@ -673,17 +681,17 @@ extern "C" {
  llama_pos p1);

  // Removes all tokens that do not belong to the specified sequence
- LLAMA_API void llama_kv_cache_seq_keep(
+ LLAMA_API void llama_kv_self_seq_keep(
  struct llama_context * ctx,
  llama_seq_id seq_id);

  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
  // If the KV cache is RoPEd, the KV data is updated accordingly:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_cache_update()
+ // - explicitly with llama_kv_self_update()
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_cache_seq_add(
+ LLAMA_API void llama_kv_self_seq_add(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
@@ -693,10 +701,10 @@ extern "C" {
  // Integer division of the positions by factor of `d > 1`
  // If the KV cache is RoPEd, the KV data is updated accordingly:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_cache_update()
+ // - explicitly with llama_kv_self_update()
  // p0 < 0 : [0, p1]
  // p1 < 0 : [p0, inf)
- LLAMA_API void llama_kv_cache_seq_div(
+ LLAMA_API void llama_kv_self_seq_div(
  struct llama_context * ctx,
  llama_seq_id seq_id,
  llama_pos p0,
@@ -704,24 +712,76 @@ extern "C" {
  int d);

  // Returns the largest position present in the KV cache for the specified sequence
- LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+ LLAMA_API llama_pos llama_kv_self_seq_pos_max(
  struct llama_context * ctx,
- llama_seq_id seq_id);
-
- // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
- // how to avoid this?
+ llama_seq_id seq_id);

  // Defragment the KV cache
  // This will be applied:
  // - lazily on next llama_decode()
- // - explicitly with llama_kv_cache_update()
- LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+ // - explicitly with llama_kv_self_update()
+ LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+
+ // Check if the context supports KV cache shifting
+ LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);

  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
- LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
+ LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_clear(
+ struct llama_context * ctx),
+ "use llama_kv_self_clear instead");
+
+ DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1),
+ "use llama_kv_self_seq_rm instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
+ struct llama_context * ctx,
+ llama_seq_id seq_id_src,
+ llama_seq_id seq_id_dst,
+ llama_pos p0,
+ llama_pos p1),
+ "use llama_kv_self_seq_cp instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
+ struct llama_context * ctx,
+ llama_seq_id seq_id),
+ "use llama_kv_self_seq_keep instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ llama_pos delta),
+ "use llama_kv_self_seq_add instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
+ struct llama_context * ctx,
+ llama_seq_id seq_id,
+ llama_pos p0,
+ llama_pos p1,
+ int d),
+ "use llama_kv_self_seq_div instead");
+
+ DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+ struct llama_context * ctx,
+ llama_seq_id seq_id),
+ "use llama_kv_self_seq_pos_max instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
+ "use llama_kv_self_defrag instead");
+
+ DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
+ "use llama_kv_self_can_shift instead");
+
+ DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
+ "use llama_kv_self_update instead");

- // Check if the context supports KV cache shifting
- LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);

  //
  // State / sessions
@@ -885,6 +945,10 @@ extern "C" {
  // If set to true, the model will only attend to the past tokens
  LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);

+ // Set whether the model is in warmup mode or not
+ // If true, all model tensors are activated during llama_decode() to load and cache their weights.
+ LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup);
+
  // Set abort callback
  LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);

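In short, the public llama_kv_cache_* surface is renamed to llama_kv_self_*, the old names are kept behind DEPRECATED wrappers, and llama_get_kv_self, llama_kv_self_can_shift, and llama_set_warmup are new in this release. The following is a hypothetical migration sketch, not from the package; it uses only the declarations introduced in the hunks above and assumes an existing llama_context.

    #include "llama.h"

    // hypothetical helper: maintain the KV cache of sequence 0 with the new names
    static void kv_self_migration_example(struct llama_context * ctx) {
        // old: llama_kv_cache_clear(ctx);  -- still compiles, but is now deprecated
        llama_kv_self_clear(ctx);

        // drop every token of sequence 0 (p0 < 0 and p1 < 0 select [0, inf))
        llama_kv_self_seq_rm(ctx, /*seq_id =*/ 0, /*p0 =*/ -1, /*p1 =*/ -1);

        // shift positions back by 8 only if the context supports it
        if (llama_kv_self_can_shift(ctx)) {
            llama_kv_self_seq_add(ctx, /*seq_id =*/ 0, /*p0 =*/ 0, /*p1 =*/ -1, /*delta =*/ -8);
            llama_kv_self_update(ctx); // apply the pending K-shift explicitly
        }

        // new in this release: warmup mode toggled around a dummy decode
        llama_set_warmup(ctx, true);
        // ... a llama_decode() call here would touch all model tensors ...
        llama_set_warmup(ctx, false);
    }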
package/src/llama.cpp/src/CMakeLists.txt
@@ -15,18 +15,21 @@ add_library(llama
  llama-chat.cpp
  llama-context.cpp
  llama-grammar.cpp
+ llama-graph.cpp
  llama-hparams.cpp
  llama-impl.cpp
+ llama-io.cpp
  llama-kv-cache.cpp
+ llama-memory.cpp
  llama-mmap.cpp
  llama-model-loader.cpp
  llama-model.cpp
  llama-quant.cpp
  llama-sampling.cpp
  llama-vocab.cpp
- unicode.h
- unicode.cpp
  unicode-data.cpp
+ unicode.cpp
+ unicode.h
  )

  target_include_directories(llama PUBLIC . ../include ../common)
package/src/llama.cpp/src/llama-adapter.cpp
@@ -4,14 +4,13 @@
  #include "llama-mmap.h"
  #include "llama-model.h"

- #include <algorithm>
  #include <map>
  #include <cassert>
  #include <stdexcept>

  // vec

- struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
  return nullptr;
  }
@@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
  return tensors[il];
  }

- struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
+ ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const {
  ggml_tensor * layer_dir = tensor_for(il);
  if (layer_dir != nullptr) {
  cur = ggml_add(ctx, cur, layer_dir);
@@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
- struct ggml_init_params params = {
+ ggml_init_params params = {
  /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
  return true;
  }

- int32_t llama_adapter_cvec::apply(
+ bool llama_adapter_cvec::apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
  // disable the current control vector (but leave allocated for later)
  layer_start = -1;
  layer_end = -1;
- return 0;
+ return true;
  }

  if (n_embd != (int) hparams.n_embd) {
  LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
- return 1;
+ return false;
  }

  if (tensors.empty()) {
  if (!init(model)) {
- return 1;
+ return false;
  }
  }

@@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
  }
  }

- return 0;
+ return true;
  }

  // lora

- llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+ llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
  const std::string name(w->name);

  const auto pos = ab_map.find(name);
@@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
  return nullptr;
  }

- static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
  LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

  ggml_context * ctx_init;
- struct gguf_init_params meta_gguf_params = {
+ gguf_init_params meta_gguf_params = {
  /* .no_alloc = */ true,
  /* .ctx = */ &ctx_init,
  };
@@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
  // add a new context
- struct ggml_init_params params = {
+ ggml_init_params params = {
  /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
  /*.mem_buffer =*/ NULL,
  /*.no_alloc =*/ true,
@@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
  }

- struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+ ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
  // validate tensor shape
  if (is_token_embd) {
  // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  }

  // save tensor to adapter
- struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
- struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+ ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+ ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
  ggml_set_name(tensor_a, w.a->name);
  ggml_set_name(tensor_b, w.b->name);
  adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  {
  llama_file gguf_file(path_lora, "rb");
  std::vector<uint8_t> read_buf;
- auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+ auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
  size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
  size_t size = ggml_nbytes(orig);
  read_buf.resize(size);
@@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
  LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
  }

- struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
- struct llama_adapter_lora * adapter = new llama_adapter_lora();
+ llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
+ llama_adapter_lora * adapter = new llama_adapter_lora();

  try {
  llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
  return nullptr;
  }

- void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+ void llama_adapter_lora_free(llama_adapter_lora * adapter) {
  delete adapter;
  }
package/src/llama.cpp/src/llama-adapter.h
@@ -15,11 +15,11 @@
  //
  //

  struct llama_adapter_cvec {
- struct ggml_tensor * tensor_for(int il) const;
+ ggml_tensor * tensor_for(int il) const;

- struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
+ ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int il) const;

- int32_t apply(
+ bool apply(
  const llama_model & model,
  const float * data,
  size_t len,
@@ -36,7 +36,7 @@ private:
  std::vector<ggml_context_ptr> ctxs;
  std::vector<ggml_backend_buffer_ptr> bufs;

- std::vector<struct ggml_tensor *> tensors; // per layer
+ std::vector<ggml_tensor *> tensors; // per layer
  };

  //
@@ -44,8 +44,8 @@
  //

  struct llama_adapter_lora_weight {
- struct ggml_tensor * a = nullptr;
- struct ggml_tensor * b = nullptr;
+ ggml_tensor * a = nullptr;
+ ggml_tensor * b = nullptr;

  // get actual scale based on rank and alpha
  float get_scale(float alpha, float adapter_scale) const {
@@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
  }

  llama_adapter_lora_weight() = default;
- llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+ llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
  };

  struct llama_adapter_lora {
  // map tensor name to lora_a_b
- std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+ std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;

  std::vector<ggml_context_ptr> ctxs;
  std::vector<ggml_backend_buffer_ptr> bufs;
@@ -70,5 +70,7 @@ struct llama_adapter_lora {
  llama_adapter_lora() = default;
  ~llama_adapter_lora() = default;

- llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+ llama_adapter_lora_weight * get_weight(ggml_tensor * w);
  };
+
+ using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
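The new llama_adapter_loras alias maps a loaded adapter to its scale. A hypothetical sketch, not from the package, of how such a map could be filled for two adapters:

    #include "llama-adapter.h"

    // hypothetical helper: collect two adapters with different strengths
    static llama_adapter_loras make_loras(llama_adapter_lora * base_style, llama_adapter_lora * extra) {
        llama_adapter_loras loras;
        loras[base_style] = 1.0f; // full strength
        loras[extra]      = 0.5f; // half strength
        return loras;
    }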