@fugood/llama.node 0.3.14 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/.github/workflows/build.yml +30 -1
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/arg.cpp +20 -2
- package/src/llama.cpp/common/common.cpp +6 -3
- package/src/llama.cpp/common/speculative.cpp +4 -4
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +6 -6
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/run.cpp +91 -46
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +37 -15
- package/src/llama.cpp/examples/server/utils.hpp +3 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/tts/tts.cpp +20 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +24 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
- package/src/llama.cpp/ggml/src/ggml.c +85 -2
- package/src/llama.cpp/include/llama.h +86 -22
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +103 -16
- package/src/llama.cpp/src/llama-arch.h +18 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -110
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-model.cpp +8244 -173
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama.cpp +51 -9984
- package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
package/bin/** (all 16 prebuilt binaries listed above, including package/bin/win32/arm64/node.lib and package/bin/win32/x64/node.lib)
CHANGED
Binary files, contents not shown
package/package.json
CHANGED

package/src/llama.cpp/.github/workflows/build.yml
CHANGED

@@ -676,6 +676,35 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
   macOS-latest-swift:
     runs-on: macos-latest

@@ -1379,7 +1408,7 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework
+          zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
package/src/llama.cpp/CMakeLists.txt
CHANGED

@@ -29,6 +29,8 @@ else()
     set(LLAMA_STANDALONE OFF)
 endif()

+option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF)
+
 if (EMSCRIPTEN)
     set(BUILD_SHARED_LIBS_DEFAULT OFF)

@@ -145,7 +147,13 @@ endif()
 # 3rd-party
 #

-if (NOT TARGET ggml)
+if (LLAMA_USE_SYSTEM_GGML)
+    message(STATUS "Using system-provided libggml, skipping ggml build")
+    find_package(ggml REQUIRED)
+    add_library(ggml ALIAS ggml::ggml)
+endif()
+
+if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     add_subdirectory(ggml)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
@@ -849,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1871,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.out_file = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
package/src/llama.cpp/common/common.cpp
CHANGED

@@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }

@@ -1033,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

+        llama_set_warmup(lctx, true);
+
         std::vector<llama_token> tmp;
         llama_token bos = llama_vocab_bos(vocab);
         llama_token eos = llama_vocab_eos(vocab);
@@ -1060,9 +1062,10 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
+        llama_set_warmup(lctx, false);
     }

     iparams.model.reset(model);
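The common.cpp change above pairs the new llama_set_warmup() toggle with the renamed llama_kv_self_clear(). Below is a minimal sketch of the same warm-up pattern for code that embeds the library directly; the helper name warmup_context and the single-BOS batch are illustrative assumptions, not part of this package.

#include "llama.h"
#include <vector>

// Hypothetical helper: run one throw-away decode so weights and buffers are
// paged in, without leaving tokens in the KV cache or skewing perf counters.
// Assumes the model's vocab defines a BOS token.
static void warmup_context(llama_model * model, llama_context * ctx) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_set_warmup(ctx, true);            // mark the next decode as warm-up only

    std::vector<llama_token> tmp = { llama_vocab_bos(vocab) };
    llama_decode(ctx, llama_batch_get_one(tmp.data(), (int32_t) tmp.size()));

    llama_kv_self_clear(ctx);               // renamed from llama_kv_cache_clear()
    llama_synchronize(ctx);
    llama_perf_context_reset(ctx);
    llama_set_warmup(ctx, false);
}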
package/src/llama.cpp/common/speculative.cpp
CHANGED

@@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
     result.reserve(params.n_draft);

     if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         prompt.clear();
     } else {
@@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
     }

     if (reuse_i > 0) {
-        llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
-        llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
+        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);

         prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
     }

     if (reuse_n < (int) prompt.size()) {
-        llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+        llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);

         prompt.erase(prompt.begin() + reuse_n, prompt.end());
     }
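The speculative helper now trims and shifts the draft context with the llama_kv_self_* sequence calls. A compact sketch of that reuse step follows, assuming a single sequence 0; the function name and the reuse_i/reuse_n parameters are illustrative.

#include "llama.h"

// Keep only the reused window of a previously decoded draft prompt:
// tokens [reuse_i, reuse_i + reuse_n) stay, everything else is dropped,
// and the kept tokens are shifted so they start at position 0 again.
static void reuse_draft_prefix(llama_context * ctx, int reuse_i, int reuse_n) {
    if (reuse_i > 0) {
        llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);            // drop the unused head
        llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); // shift the rest back by reuse_i
    }
    llama_kv_self_seq_rm(ctx, 0, reuse_n, -1);                // drop anything past the reused part
}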
package/src/llama.cpp/examples/batched-bench/batched-bench.cpp
CHANGED

@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {

         const auto t_pp_start = ggml_time_us();

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
             LOG_ERR("%s: llama_decode() failed\n", __func__);
@@ -141,7 +141,7 @@ int main(int argc, char ** argv) {

         if (is_pp_shared) {
             for (int32_t i = 1; i < pl; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
             }
         }

package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp
CHANGED

@@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
package/src/llama.cpp/examples/embedding/embedding.cpp
CHANGED

@@ -38,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
     const struct llama_model * model = llama_get_model(ctx);

     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);

     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
package/src/llama.cpp/examples/gritlm/gritlm.cpp
CHANGED

@@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     }

     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

@@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

     llama_token eos_token = llama_vocab_eos(vocab);

-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
     llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);

package/src/llama.cpp/examples/imatrix/imatrix.cpp
CHANGED

@@ -495,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         const auto t_start = std::chrono::high_resolution_clock::now();

         // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         llama_batch batch = llama_batch_init(n_batch, 0, 1);

package/src/llama.cpp/examples/infill/infill.cpp
CHANGED

@@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
             LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                     n_past, n_left, n_ctx, params.n_keep, n_discard);

-            llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
-            llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+            llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
+            llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

             n_past -= n_discard;

package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
CHANGED

@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {

         test t(inst, lmodel, ctx);

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         // cool off before the test
         if (params.delay) {
@@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) {
         }

         for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_clear(ctx);
+            llama_kv_self_clear(ctx);

             uint64_t t_start = get_time_ns();

package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp
CHANGED

@@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
     }

     batch->logits[batch->n_tokens - 1] = true;
-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);

     const auto t_pp_start = ggml_time_us();
     if (llama_decode(context, *batch) != 0) {
@@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

     LOGi("Benchmark text generation (tg)");

-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);
     const auto t_tg_start = ggml_time_us();
     for (i = 0; i < tg; i++) {

@@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(

     const auto t_tg_end = ggml_time_us();

-    llama_kv_cache_clear(context);
+    llama_kv_self_clear(context);

     const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
     const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
 }
package/src/llama.cpp/examples/llava/gemma3-cli.cpp
CHANGED

@@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
             }
             if (line == "/clear") {
                 ctx.n_past = 0;
-                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
                 LOG("Chat history cleared\n\n");
                 continue;
             }
package/src/llama.cpp/examples/lookahead/lookahead.cpp
CHANGED

@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
     llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));

     for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
     }

     const auto t_enc_end = ggml_time_us();
@@ -438,17 +438,17 @@ int main(int argc, char ** argv) {

         // KV cache management
         // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-        llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
+        llama_kv_self_seq_rm(ctx, -1, n_past, -1);

         if (seq_id_best != 0) {
             // if a verification token matched, we keep the best sequence and remove the rest
             // this leads to some KV cache fragmentation
-            llama_kv_cache_seq_keep(ctx, seq_id_best);
-            llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
-            llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);
+            llama_kv_self_seq_keep(ctx, seq_id_best);
+            llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1);
+            llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1);

             for (int s = 1; s < W + G + 1; ++s) {
-                llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
             }
         }
     }
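The lookahead hunk collapses the accepted branch back onto sequence 0 before fanning out again. A short sketch of that pattern under the renamed API follows; the function name, seq_best, and n_streams are illustrative, not part of the package.

#include "llama.h"

// Keep only the accepted branch, make it sequence 0, then re-share it with
// the other speculative streams for the next step.
static void collapse_to_best(llama_context * ctx, llama_seq_id seq_best, int n_streams) {
    if (seq_best == 0) {
        return; // the accepted branch is already sequence 0, nothing to move
    }
    llama_kv_self_seq_keep(ctx, seq_best);             // drop every other sequence
    llama_kv_self_seq_cp  (ctx, seq_best, 0, -1, -1);  // copy it onto sequence 0
    llama_kv_self_seq_rm  (ctx, seq_best, -1, -1);     // remove the now-duplicate id

    for (llama_seq_id s = 1; s < n_streams; ++s) {
        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);       // fan the shared prefix back out
    }
}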
package/src/llama.cpp/examples/lookup/lookup.cpp
CHANGED

@@ -192,7 +192,7 @@ int main(int argc, char ** argv){

         // KV cache management
         // clean the cache of draft tokens that weren't accepted
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_past, -1);

         common_batch_clear(batch_tgt);
         common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
package/src/llama.cpp/examples/main/main.cpp
CHANGED

@@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
         }

         // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+        llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
     }

     LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
                 LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                         n_past, n_left, n_ctx, params.n_keep, n_discard);

-                llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+                llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

                 n_past -= n_discard;

@@ -626,9 +626,9 @@ int main(int argc, char ** argv) {
                 LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                 LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

-                llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
-                llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
-                llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+                llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+                llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+                llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

                 n_past -= bd;

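The first two main.cpp hunks are the standard context shift: discard a block of tokens after the kept prefix and slide the remainder back so positions stay contiguous. A sketch of that step with illustrative names (shift_context is not a function in the package):

#include "llama.h"

// Free n_discard KV cells after the first n_keep tokens of sequence 0 and
// shift the surviving tokens left so the position space has no hole.
static int shift_context(llama_context * ctx, int n_past, int n_keep, int n_discard) {
    llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_past, -n_discard);
    return n_past - n_discard;  // the caller's new n_past
}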
package/src/llama.cpp/examples/parallel/parallel.cpp
CHANGED

@@ -202,7 +202,7 @@ int main(int argc, char ** argv) {

     // assign the system KV cache to all parallel sequences
     for (int32_t i = 1; i <= n_clients; ++i) {
-        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
     }

     LOG_INF("\n");
@@ -234,9 +234,9 @@ int main(int argc, char ** argv) {
         if (batch.n_tokens == 0) {
             // all sequences have ended - clear the entire KV cache
             for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                llama_kv_self_seq_rm(ctx, i, -1, -1);
                 // but keep the system prompt
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
             }

             LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -372,8 +372,8 @@ int main(int argc, char ** argv) {
                 }

                 // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
-                llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+                llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);

                 const auto t_main_end = ggml_time_us();

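parallel.cpp keeps the system prompt in sequence 0 and lets every client sequence reference it. A sketch of resetting one client while preserving that shared prefix; the helper name and client_seq are illustrative:

#include "llama.h"

// Drop everything a client generated, then re-attach the shared system
// prompt held in sequence 0 (the cells are shared, not duplicated).
static void reset_client(llama_context * ctx, llama_seq_id client_seq) {
    llama_kv_self_seq_rm(ctx, client_seq, -1, -1);
    llama_kv_self_seq_cp(ctx, 0, client_seq, -1, -1);
}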
package/src/llama.cpp/examples/passkey/passkey.cpp
CHANGED

@@ -133,11 +133,11 @@ int main(int argc, char ** argv) {
             const int ib = i/n_batch - 1;
             const int bd = n_batch_grp*(n_grp - 1);

-            llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
-            llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_cache_update  (ctx);
+            llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
+            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+            llama_kv_self_update (ctx);

-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
         }

         common_batch_clear(batch);
@@ -167,12 +167,12 @@ int main(int argc, char ** argv) {

             LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

-            llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-            //llama_kv_cache_defrag (ctx);
-            llama_kv_cache_update (ctx);
+            llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+            //llama_kv_self_defrag (ctx);
+            llama_kv_self_update (ctx);

-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

             common_batch_clear(batch);

@@ -198,12 +198,12 @@ int main(int argc, char ** argv) {
         if (n_discard > 0) {
             LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

-            llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
-            //llama_kv_cache_defrag (ctx);
-            llama_kv_cache_update (ctx);
+            llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+            //llama_kv_self_defrag (ctx);
+            llama_kv_self_update (ctx);

-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
         }
     }

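passkey.cpp now re-derives n_past from the cache after shifting instead of tracking it by hand. A sketch of the discard-and-requery step; the helper name and parameters are illustrative, not part of the package.

#include "llama.h"

// Shift out n_discard positions after n_keep on sequence 0, apply the
// deferred shift, and ask the cache for its new maximum position.
static int shrink_and_requery(llama_context * ctx, int n_ctx, int n_keep, int n_discard) {
    llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
    llama_kv_self_update(ctx);                        // apply the pending update now
    return llama_kv_self_seq_pos_max(ctx, 0) + 1;     // new n_past
}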
package/src/llama.cpp/examples/perplexity/perplexity.cpp
CHANGED

@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
         const auto t_start = std::chrono::high_resolution_clock::now();

         // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
         const auto t_start = std::chrono::high_resolution_clock::now();

         // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
             return;
         }

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
             return;
         }

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
             return;
         }

-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         // decode all tasks [i0, i1)
         if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
         }

         // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);

         llama_batch batch = llama_batch_init(n_batch, 0, 1);

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp
CHANGED

@@ -1,6 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
-#include "llama-context.h"
+#include "llama-model.h"
 #include "common.h"

 #include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
         }
     }

-    const auto & tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(model);

     // check layer tensors
     int included_layers = 0;
package/src/llama.cpp/examples/retrieval/retrieval.cpp
CHANGED

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);

     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);