@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -4,7 +4,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "llama.h"
-#include "chat
+#include "chat.h"
 
 #include <cstdio>
 #include <cstring>
@@ -31,8 +31,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
-
 static llama_context ** g_ctx;
 static llama_model ** g_model;
 static common_sampler ** g_smpl;
@@ -47,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
     (void) argc;
 
     LOG("\nexample usage:\n");
-    LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
-    LOG("\n chat (conversation): %s -m your_model.gguf -
+    LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
+    LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
     LOG("\n");
 }
 
@@ -158,7 +156,7 @@ int main(int argc, char ** argv) {
     }
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
-    auto chat_templates =
+    auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
@@ -201,7 +199,7 @@ int main(int argc, char ** argv) {
     }
 
     // auto enable conversation mode if chat template is available
-    const bool has_chat_template = chat_templates.
+    const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
     if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
         if (has_chat_template) {
             LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@@ -219,7 +217,11 @@
     // print chat template example in conversation mode
     if (params.conversation_mode) {
         if (params.enable_chat_template) {
-
+            if (!params.prompt.empty() && params.system_prompt.empty()) {
+                LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
+            }
+
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
         } else {
             LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
@@ -263,21 +265,45 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
+    bool waiting_for_first_input = false;
     auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
-        common_chat_msg new_msg
-
-
+        common_chat_msg new_msg;
+        new_msg.role = role;
+        new_msg.content = content;
+        auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        chat_msgs.push_back(new_msg);
         LOG_DBG("formatted: '%s'\n", formatted.c_str());
         return formatted;
    };
 
+    std::string prompt;
    {
-
-
-
+        if (params.conversation_mode && params.enable_chat_template) {
+            if (!params.system_prompt.empty()) {
+                // format the system prompt (will use template default if empty)
+                chat_add_and_format("system", params.system_prompt);
+            }
+
+            if (!params.prompt.empty()) {
+                // format and append the user prompt
+                chat_add_and_format("user", params.prompt);
+            } else {
+                waiting_for_first_input = true;
+            }
+
+            if (!params.system_prompt.empty() || !params.prompt.empty()) {
+                common_chat_templates_inputs inputs;
+                inputs.messages = chat_msgs;
+                inputs.add_generation_prompt = !params.prompt.empty();
+
+                prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
+            }
+        } else {
            // otherwise use the prompt as is
-
-
+            prompt = params.prompt;
+        }
+
+        if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
            LOG_DBG("tokenize the prompt\n");
            embd_inp = common_tokenize(ctx, prompt, true, true);
        } else {
@@ -290,7 +316,7 @@ int main(int argc, char ** argv) {
    }
 
    // Should not run without any tokens
-    if (embd_inp.empty()) {
+    if (!waiting_for_first_input && embd_inp.empty()) {
        if (add_bos) {
            embd_inp.push_back(llama_vocab_bos(vocab));
            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
@@ -328,7 +354,7 @@ int main(int argc, char ** argv) {
        }
 
        // remove any "future" tokens that we might have inherited from the previous session
-
+        llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
    }
 
    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -350,7 +376,12 @@ int main(int argc, char ** argv) {
    }
 
    if (params.conversation_mode) {
-        params.
+        if (params.single_turn && !params.prompt.empty()) {
+            params.interactive = false;
+            params.interactive_first = false;
+        } else {
+            params.interactive_first = true;
+        }
    }
 
    // enable interactive mode if interactive start is specified
@@ -474,8 +505,8 @@ int main(int argc, char ** argv) {
        LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
        LOG_INF( "%s", control_message);
-        if (params.conversation_mode && params.enable_chat_template && params.
-            LOG_INF( " -
+        if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
+            LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
        }
        LOG_INF("\n");
 
@@ -571,8 +602,8 @@ int main(int argc, char ** argv) {
            LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-
-
+            llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
            n_past -= n_discard;
 
@@ -595,9 +626,9 @@ int main(int argc, char ** argv) {
            LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
            LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
 
-
-
-
+            llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+            llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+            llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
 
            n_past -= bd;
 
@@ -755,11 +786,14 @@ int main(int argc, char ** argv) {
 
                // check for reverse prompt using special tokens
                llama_token last_token = common_sampler_last(smpl);
-
-                if (
-
+                for (auto token : antiprompt_token) {
+                    if (token == last_token) {
+                        if (params.interactive) {
+                            is_interacting = true;
+                        }
+                        is_antiprompt = true;
+                        break;
                    }
-                    is_antiprompt = true;
                }
 
                if (is_antiprompt) {
@@ -768,7 +802,7 @@ int main(int argc, char ** argv) {
            }
 
            // deal with end of generation tokens in interactive mode
-            if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+            if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
                LOG_DBG("found an EOG token\n");
 
                if (params.interactive) {
@@ -788,12 +822,17 @@ int main(int argc, char ** argv) {
            }
 
            // if current token is not EOG, we add it to current assistant message
-            if (params.conversation_mode) {
+            if (params.conversation_mode && !waiting_for_first_input) {
                const auto id = common_sampler_last(smpl);
                assistant_ss << common_token_to_piece(ctx, id, false);
+
+                if (!prompt.empty()) {
+                    prompt.clear();
+                    is_interacting = false;
+                }
            }
 
-            if (n_past > 0 && is_interacting) {
+            if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
                LOG_DBG("waiting for user input\n");
 
                if (params.conversation_mode) {
@@ -883,11 +922,17 @@ int main(int argc, char ** argv) {
                input_echo = false; // do not echo this again
            }
 
-            if (n_past > 0) {
+            if (n_past > 0 || waiting_for_first_input) {
                if (is_interacting) {
                    common_sampler_reset(smpl);
                }
                is_interacting = false;
+
+                if (waiting_for_first_input && params.single_turn) {
+                    params.interactive = false;
+                    params.interactive_first = false;
+                }
+                waiting_for_first_input = false;
            }
        }
 
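The main.cpp hunks above move the example from the removed common/chat.hpp helpers to the new common_chat_* API declared in common/chat.h: templates are loaded once with common_chat_templates_init, single turns are rendered with common_chat_format_single, and the whole conversation with common_chat_templates_apply. A minimal standalone sketch of that flow, inferred only from the calls visible in this diff; the helper name build_initial_prompt, the include paths, and the exact types are assumptions rather than code shipped in the package:

```cpp
#include "chat.h"    // common_chat_* API introduced in this release (path assumed)
#include "common.h"  // common_params (assumed)

#include <string>
#include <vector>

// Hypothetical helper: render a system + user turn into one prompt string,
// the way main.cpp does after this change.
static std::string build_initial_prompt(const llama_model * model, const common_params & params) {
    // load the model's built-in template, or the override passed on the command line
    auto tmpls = common_chat_templates_init(model, params.chat_template);

    std::vector<common_chat_msg> msgs;

    common_chat_msg sys;
    sys.role    = "system";
    sys.content = params.system_prompt;   // set with -sys "You are a helpful assistant"
    msgs.push_back(sys);

    common_chat_msg usr;
    usr.role    = "user";
    usr.content = params.prompt;          // set with -p "..."
    msgs.push_back(usr);

    common_chat_templates_inputs inputs;
    inputs.messages              = msgs;
    inputs.add_generation_prompt = true;  // append the assistant header so decoding can start

    // render the whole conversation into a single prompt string for tokenization
    return common_chat_templates_apply(tmpls.get(), inputs).prompt;
}
```
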
package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -12,6 +12,7 @@
 #include <string>
 #include <vector>
 #include <ctime>
+#include <algorithm>
 
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
@@ -201,7 +202,7 @@ int main(int argc, char ** argv) {
 
    // assign the system KV cache to all parallel sequences
    for (int32_t i = 1; i <= n_clients; ++i) {
-
+        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
    }
 
    LOG_INF("\n");
@@ -233,9 +234,9 @@ int main(int argc, char ** argv) {
        if (batch.n_tokens == 0) {
            // all sequences have ended - clear the entire KV cache
            for (int i = 1; i <= n_clients; ++i) {
-
+                llama_kv_self_seq_rm(ctx, i, -1, -1);
                // but keep the system prompt
-
+                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
            }
 
            LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -371,8 +372,8 @@ int main(int argc, char ** argv) {
                }
 
                // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-
-
+                llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
 
                const auto t_main_end = ggml_time_us();
 
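parallel.cpp keeps its logic but switches to the renamed llama_kv_self_* cache calls. The pattern it implements, keep the shared system prompt in sequence 0 and re-seed each client sequence from it, can be sketched as below; the call signatures are copied from the lines above, while the wrapper function and its arguments are assumptions for illustration:

```cpp
// Sketch only: reset every client sequence while preserving the shared system prompt.
// `ctx` and `n_clients` are assumed to be set up by the caller.
static void reset_client_sequences(llama_context * ctx, int32_t n_clients) {
    for (int32_t i = 1; i <= n_clients; ++i) {
        // drop everything the client sequence currently holds...
        llama_kv_self_seq_rm(ctx, i, -1, -1);
        // ...then copy the cached system prompt from sequence 0 back into it
        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
    }
}
```
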
package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -7,6 +7,7 @@
 #include <cstdio>
 #include <string>
 #include <vector>
+#include <algorithm>
 
 static void print_usage(int, char ** argv) {
     LOG("\nexample usage:\n");
@@ -132,11 +133,11 @@ int main(int argc, char ** argv) {
            const int ib = i/n_batch - 1;
            const int bd = n_batch_grp*(n_grp - 1);
 
-
-
-
+            llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
+            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+            llama_kv_self_update (ctx);
 
-            n_past =
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
 
        common_batch_clear(batch);
@@ -166,12 +167,12 @@ int main(int argc, char ** argv) {
 
        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
 
-
-
-        //
-
+        llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+        //llama_kv_self_defrag (ctx);
+        llama_kv_self_update (ctx);
 
-        n_past =
+        n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
 
        common_batch_clear(batch);
 
@@ -197,12 +198,12 @@ int main(int argc, char ** argv) {
        if (n_discard > 0) {
            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
 
-
-
-            //
-
+            llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+            //llama_kv_self_defrag (ctx);
+            llama_kv_self_update (ctx);
 
-            n_past =
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
    }
 
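passkey.cpp applies the same rename to its context-shifting code. The recurring sequence, drop a window of n_discard positions after the first n_keep tokens, shift the tail back, apply the pending update, then re-read the sequence length, looks roughly like this; a sketch assuming ctx, n_keep, n_ctx and n_discard are computed as in the example, with the wrapper function itself being hypothetical:

```cpp
// Sketch only: free n_discard cache positions after the first n_keep tokens of sequence 0.
static int shift_kv_cache(llama_context * ctx, int n_keep, int n_ctx, int n_discard) {
    llama_kv_self_seq_rm (ctx, 0, n_keep, n_keep + n_discard);            // evict the window
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); // shift the tail back
    llama_kv_self_update (ctx);                                           // apply the pending shift

    // new logical length of sequence 0
    return llama_kv_self_seq_pos_max(ctx, 0) + 1;
}
```
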
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-
+    llama_kv_self_clear(ctx);
 
     llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
     const auto t_start = std::chrono::high_resolution_clock::now();
 
     // clear the KV cache
-
+    llama_kv_self_clear(ctx);
 
     for (int j = 0; j < num_batches; ++j) {
         const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
            return;
        }
 
-
+        llama_kv_self_clear(ctx);
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
            return;
        }
 
-
+        llama_kv_self_clear(ctx);
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
            return;
        }
 
-
+        llama_kv_self_clear(ctx);
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
    }
 
    // clear the KV cache
-
+    llama_kv_self_clear(ctx);
 
    llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
-#include "llama-
+#include "llama-model.h"
 #include "common.h"
 
 #include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
        }
    }
 
-    const auto & tensors = llama_internal_get_tensor_map(
+    const auto & tensors = llama_internal_get_tensor_map(model);
 
    // check layer tensors
    int included_layers = 0;
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
-
+    llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);