@fugood/llama.node 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +37 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +20 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  60. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  61. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  62. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  63. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  64. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
  65. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  66. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
  67. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  68. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  69. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  70. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  71. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  72. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
  73. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  74. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  75. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  76. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  78. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  79. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  82. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
  83. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  84. package/src/llama.cpp/include/llama.h +86 -22
  85. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  86. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  87. package/src/llama.cpp/src/llama-adapter.h +11 -9
  88. package/src/llama.cpp/src/llama-arch.cpp +103 -16
  89. package/src/llama.cpp/src/llama-arch.h +18 -0
  90. package/src/llama.cpp/src/llama-batch.h +2 -2
  91. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  92. package/src/llama.cpp/src/llama-context.h +214 -77
  93. package/src/llama.cpp/src/llama-cparams.h +1 -0
  94. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  95. package/src/llama.cpp/src/llama-graph.h +574 -0
  96. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  97. package/src/llama.cpp/src/llama-hparams.h +9 -0
  98. package/src/llama.cpp/src/llama-io.cpp +15 -0
  99. package/src/llama.cpp/src/llama-io.h +35 -0
  100. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  101. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  102. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  103. package/src/llama.cpp/src/llama-memory.h +21 -0
  104. package/src/llama.cpp/src/llama-model.cpp +8244 -173
  105. package/src/llama.cpp/src/llama-model.h +34 -1
  106. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  107. package/src/llama.cpp/src/llama.cpp +51 -9984
  108. package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
  109. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  110. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
Binary files changed (prebuilt llama-node.node and node.lib binaries): contents not shown.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.14",
+ "version": "0.3.16",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
package/src/llama.cpp/.github/workflows/build.yml CHANGED
@@ -676,6 +676,35 @@ jobs:
  -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
  cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

+ macOS-latest-cmake-visionos:
+ runs-on: macos-latest
+
+ steps:
+ - name: Clone
+ id: checkout
+ uses: actions/checkout@v4
+
+ - name: Dependencies
+ id: depends
+ continue-on-error: true
+ run: |
+ brew update
+
+ - name: Build
+ id: cmake_build
+ run: |
+ sysctl -a
+ cmake -B build -G Xcode \
+ -DGGML_METAL_USE_BF16=ON \
+ -DGGML_METAL_EMBED_LIBRARY=ON \
+ -DLLAMA_BUILD_EXAMPLES=OFF \
+ -DLLAMA_BUILD_TESTS=OFF \
+ -DLLAMA_BUILD_SERVER=OFF \
+ -DCMAKE_SYSTEM_NAME=visionOS \
+ -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+ cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
  macOS-latest-swift:
  runs-on: macos-latest

@@ -1379,7 +1408,7 @@ jobs:
  id: pack_artifacts
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
  run: |
- zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+ zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

  - name: Upload artifacts
  if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -29,6 +29,8 @@ else()
  set(LLAMA_STANDALONE OFF)
  endif()

+ option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+
  if (EMSCRIPTEN)
  set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -145,7 +147,13 @@ endif()
  # 3rd-party
  #

- if (NOT TARGET ggml)
+ if (LLAMA_USE_SYSTEM_GGML)
+ message(STATUS "Using system-provided libggml, skipping ggml build")
+ find_package(ggml REQUIRED)
+ add_library(ggml ALIAS ggml::ggml)
+ endif()
+
+ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
  add_subdirectory(ggml)
  # ... otherwise assume ggml is added by a parent CMakeLists.txt
  endif()
package/src/llama.cpp/cmake/common.cmake CHANGED
@@ -1,3 +1,5 @@
+ include("ggml/cmake/common.cmake")
+
  function(llama_add_compile_flags)
  if (LLAMA_FATAL_WARNINGS)
  if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_env("LLAMA_ARG_CTX_SIZE"));
  add_opt(common_arg(
  {"-n", "--predict", "--n-predict"}, "N",
- string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+ string_format(
+ ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+ ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+ : "number of tokens to predict (default: %d, -1 = infinity)",
+ params.n_predict),
  [](common_params & params, int value) {
  params.n_predict = value;
  }
@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  }
  ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+ add_opt(common_arg(
+ {"-sysf", "--system-prompt-file"}, "FNAME",
+ "a file containing the system prompt (default: none)",
+ [](common_params & params, const std::string & value) {
+ std::ifstream file(value);
+ if (!file) {
+ throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+ }
+ std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+ if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+ params.system_prompt.pop_back();
+ }
+ }
+ ).set_examples({LLAMA_EXAMPLE_MAIN}));
  add_opt(common_arg(
  {"--in-file"}, "FNAME",
  "an input file (repeat to specify multiple files)",
@@ -1871,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  [](common_params & params, const std::string & value) {
  params.out_file = value;
  }
- ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
  add_opt(common_arg(
  {"-ofreq", "--output-frequency"}, "N",
  string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
package/src/llama.cpp/common/common.cpp CHANGED
@@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

- if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
- LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+ if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+ LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
  params.ctx_shift = false;
  }

@@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (params.warmup) {
  LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

+ llama_set_warmup(lctx, true);
+
  std::vector<llama_token> tmp;
  llama_token bos = llama_vocab_bos(vocab);
  llama_token eos = llama_vocab_eos(vocab);
@@ -1060,9 +1062,10 @@ struct common_init_result common_init_from_params(common_params & params) {
  if (llama_model_has_decoder(model)) {
  llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
  }
- llama_kv_cache_clear(lctx);
+ llama_kv_self_clear(lctx);
  llama_synchronize(lctx);
  llama_perf_context_reset(lctx);
+ llama_set_warmup(lctx, false);
  }

  iparams.model.reset(model);
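Note: besides the llama_kv_self_clear() rename, the common.cpp hunks above wrap the dummy warmup decode in the new llama_set_warmup() toggle. A minimal, hypothetical C++ sketch of the same pattern for downstream callers (context and vocab creation omitted; the real helper also handles models without BOS/EOS tokens):

    #include "llama.h"
    #include <algorithm>
    #include <vector>

    // Hypothetical helper mirroring the warmup pattern in common_init_from_params().
    static void warmup_context(llama_context * ctx, const llama_vocab * vocab, int32_t n_batch) {
        llama_set_warmup(ctx, true);                 // mark the context as warming up, per the diff

        std::vector<llama_token> tmp = { llama_vocab_bos(vocab), llama_vocab_eos(vocab) };
        llama_decode(ctx, llama_batch_get_one(tmp.data(), std::min((int32_t) tmp.size(), n_batch)));

        llama_kv_self_clear(ctx);                    // drop the dummy tokens from the KV cache
        llama_synchronize(ctx);
        llama_perf_context_reset(ctx);

        llama_set_warmup(ctx, false);                // back to normal decoding
    }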
package/src/llama.cpp/common/speculative.cpp CHANGED
@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
  result.reserve(params.n_draft);

  if (reuse_n == 0) {
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  prompt.clear();
  } else {
@@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
  }

  if (reuse_i > 0) {
- llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
- llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+ llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+ llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

  prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
  }

  if (reuse_n < (int) prompt.size()) {
- llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+ llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);

  prompt.erase(prompt.begin() + reuse_n, prompt.end());
  }
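This rename is the pattern that repeats through the remaining example diffs: every llama_kv_cache_* call that operates on a context becomes llama_kv_self_* (clear, seq_rm, seq_add, seq_cp, seq_keep, seq_div, seq_pos_max, defrag, update, can_shift). A hedged migration sketch for downstream code, reusing the prefix-reuse logic from common_speculative_gen_draft() above; the helper name is hypothetical:

    #include "llama.h"

    // Keep the reusable part of sequence 0, shift it to the front, and drop everything
    // after reuse_n, using the renamed KV-cache API from this update.
    static void reuse_cached_prefix(llama_context * ctx, llama_pos reuse_i, llama_pos reuse_n) {
        if (reuse_i > 0) {
            llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);            // was llama_kv_cache_seq_rm
            llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); // was llama_kv_cache_seq_add
        }
        llama_kv_self_seq_rm(ctx, 0, reuse_n, -1);                // remove tokens past reuse_n
    }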
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp CHANGED
@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {

  const auto t_pp_start = ggml_time_us();

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
  LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +141,7 @@
  if (is_pp_shared) {
  for (int32_t i = 1; i < pl; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }
  }
package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp CHANGED
@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
  }

  static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
  fprintf(stderr, "%s : failed to eval\n", __func__);
  return false;
package/src/llama.cpp/examples/embedding/embedding.cpp CHANGED
@@ -38,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
  const struct llama_model * model = llama_get_model(ctx);

  // clear previous kv_cache values (irrelevant for embeddings)
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // run model
  LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
package/src/llama.cpp/examples/gritlm/gritlm.cpp CHANGED
@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
  }

  // clear previous kv_cache values (irrelevant for embeddings)
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  llama_set_embeddings(ctx, true);
  llama_set_causal_attn(ctx, false);

@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

  llama_token eos_token = llama_vocab_eos(vocab);

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);
  llama_set_embeddings(ctx, false);
  llama_set_causal_attn(ctx, true);
package/src/llama.cpp/examples/imatrix/imatrix.cpp CHANGED
@@ -495,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);
package/src/llama.cpp/examples/infill/infill.cpp CHANGED
@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
  LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+ llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

  n_past -= n_discard;
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp CHANGED
@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {

  test t(inst, lmodel, ctx);

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // cool off before the test
  if (params.delay) {
@@ -1618,7 +1618,7 @@
  }

  for (int i = 0; i < params.reps; i++) {
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  uint64_t t_start = get_time_ns();
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp CHANGED
@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
  }

  batch->logits[batch->n_tokens - 1] = true;
- llama_kv_cache_clear(context);
+ llama_kv_self_clear(context);

  const auto t_pp_start = ggml_time_us();
  if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

  LOGi("Benchmark text generation (tg)");

- llama_kv_cache_clear(context);
+ llama_kv_self_clear(context);
  const auto t_tg_start = ggml_time_us();
  for (i = 0; i < tg; i++) {

@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

  const auto t_tg_end = ggml_time_us();

- llama_kv_cache_clear(context);
+ llama_kv_self_clear(context);

  const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
  const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
  extern "C"
  JNIEXPORT void JNICALL
  Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
- llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+ llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
  }
package/src/llama.cpp/examples/llava/gemma3-cli.cpp CHANGED
@@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
  }
  if (line == "/clear") {
  ctx.n_past = 0;
- llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+ llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
  LOG("Chat history cleared\n\n");
  continue;
  }
package/src/llama.cpp/examples/lookahead/lookahead.cpp CHANGED
@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
  llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

  for (int s = 1; s < W + G + 1; ++s) {
- llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
  }

  const auto t_enc_end = ggml_time_us();
@@ -438,17 +438,17 @@

  // KV cache management
  // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
- llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
+ llama_kv_self_seq_rm(ctx, -1, n_past, -1);

  if (seq_id_best != 0) {
  // if a verification token matched, we keep the best sequence and remove the rest
  // this leads to some KV cache fragmentation
- llama_kv_cache_seq_keep(ctx, seq_id_best);
- llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
- llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
+ llama_kv_self_seq_keep(ctx, seq_id_best);
+ llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
+ llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);

  for (int s = 1; s < W + G + 1; ++s) {
- llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
  }
  }
  }
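The lookahead hunk above also shows the renamed sequence operations used to collapse the winning draft sequence back into sequence 0. A condensed, hypothetical C++ sketch of that step (seq_id_best and the W + G + 1 sequence count are taken from the example):

    #include "llama.h"

    // Keep only the best sequence, move it into sequence 0, and re-fan-out the
    // lookahead/verification sequences, using the renamed llama_kv_self_* calls.
    static void collapse_to_best(llama_context * ctx, llama_seq_id seq_id_best, int n_seqs) {
        llama_kv_self_seq_keep(ctx, seq_id_best);            // drop all other sequences
        llama_kv_self_seq_cp  (ctx, seq_id_best, 0, -1, -1); // copy the winner into sequence 0
        llama_kv_self_seq_rm  (ctx, seq_id_best, -1, -1);    // remove the now-duplicated winner

        for (llama_seq_id s = 1; s < n_seqs; ++s) {
            llama_kv_self_seq_cp(ctx, 0, s, -1, -1);         // share sequence 0 with the other slots
        }
    }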
package/src/llama.cpp/examples/lookup/lookup.cpp CHANGED
@@ -192,7 +192,7 @@ int main(int argc, char ** argv){

  // KV cache management
  // clean the cache of draft tokens that weren't accepted
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+ llama_kv_self_seq_rm(ctx, 0, n_past, -1);

  common_batch_clear(batch_tgt);
  common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
package/src/llama.cpp/examples/main/main.cpp CHANGED
@@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
  }

  // remove any "future" tokens that we might have inherited from the previous session
- llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+ llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
  }

  LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
  LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

  n_past -= n_discard;

@@ -626,9 +626,9 @@ int main(int argc, char ** argv) {
  LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
  LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

- llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
- llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
- llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+ llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+ llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+ llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

  n_past -= bd;
package/src/llama.cpp/examples/parallel/parallel.cpp CHANGED
@@ -202,7 +202,7 @@ int main(int argc, char ** argv) {

  // assign the system KV cache to all parallel sequences
  for (int32_t i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("\n");
@@ -234,9 +234,9 @@
  if (batch.n_tokens == 0) {
  // all sequences have ended - clear the entire KV cache
  for (int i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_rm(ctx, i, -1, -1);
+ llama_kv_self_seq_rm(ctx, i, -1, -1);
  // but keep the system prompt
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -372,8 +372,8 @@
  }

  // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
- llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
- llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+ llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);

  const auto t_main_end = ggml_time_us();
package/src/llama.cpp/examples/passkey/passkey.cpp CHANGED
@@ -133,11 +133,11 @@ int main(int argc, char ** argv) {
  const int ib = i/n_batch - 1;
  const int bd = n_batch_grp*(n_grp - 1);

- llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
- llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
+ llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }

  common_batch_clear(batch);
@@ -167,12 +167,12 @@ int main(int argc, char ** argv) {

  LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

  common_batch_clear(batch);

@@ -198,12 +198,12 @@ int main(int argc, char ** argv) {
  if (n_discard > 0) {
  LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }
  }
package/src/llama.cpp/examples/perplexity/perplexity.cpp CHANGED
@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  for (int j = 0; j < num_batches; ++j) {
  const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
  }

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp CHANGED
@@ -1,6 +1,6 @@
  #include "ggml.h"
  #include "llama.h"
- #include "llama-context.h"
+ #include "llama-model.h"
  #include "common.h"

  #include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
  }
  }

- const auto & tensors = llama_internal_get_tensor_map(ctx);
+ const auto & tensors = llama_internal_get_tensor_map(model);

  // check layer tensors
  int included_layers = 0;
package/src/llama.cpp/examples/retrieval/retrieval.cpp CHANGED
@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

  static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
  // clear previous kv_cache values (irrelevant for embeddings)
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // run model
  LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);