@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/examples/main/main.cpp

@@ -4,7 +4,7 @@
  #include "log.h"
  #include "sampling.h"
  #include "llama.h"
- #include "chat-template.hpp"
+ #include "chat.h"

  #include <cstdio>
  #include <cstring>
@@ -31,8 +31,6 @@
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif

- static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
-
  static llama_context ** g_ctx;
  static llama_model ** g_model;
  static common_sampler ** g_smpl;
@@ -47,8 +45,8 @@ static void print_usage(int argc, char ** argv) {
  (void) argc;

  LOG("\nexample usage:\n");
- LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
- LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]);
  LOG("\n");
  }

@@ -158,7 +156,7 @@ int main(int argc, char ** argv) {
  }

  const llama_vocab * vocab = llama_model_get_vocab(model);
- auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
+ auto chat_templates = common_chat_templates_init(model, params.chat_template);

  LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

@@ -201,7 +199,7 @@ int main(int argc, char ** argv) {
  }

  // auto enable conversation mode if chat template is available
- const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
+ const bool has_chat_template = common_chat_templates_was_explicit(chat_templates.get());
  if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
  if (has_chat_template) {
  LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@@ -219,7 +217,11 @@ int main(int argc, char ** argv) {
  // print chat template example in conversation mode
  if (params.conversation_mode) {
  if (params.enable_chat_template) {
- LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
+ if (!params.prompt.empty() && params.system_prompt.empty()) {
+ LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
+ }
+
+ LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
  } else {
  LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
  }
@@ -263,21 +265,45 @@ int main(int argc, char ** argv) {

  std::vector<llama_token> embd_inp;

+ bool waiting_for_first_input = false;
  auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
- common_chat_msg new_msg{role, content, {}};
- auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
- chat_msgs.push_back({role, content, {}});
+ common_chat_msg new_msg;
+ new_msg.role = role;
+ new_msg.content = content;
+ auto formatted = common_chat_format_single(chat_templates.get(), chat_msgs, new_msg, role == "user", g_params->use_jinja);
+ chat_msgs.push_back(new_msg);
  LOG_DBG("formatted: '%s'\n", formatted.c_str());
  return formatted;
  };

+ std::string prompt;
  {
- auto prompt = (params.conversation_mode && params.enable_chat_template)
- // format the system prompt in conversation mode (fallback to default if empty)
- ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+ if (params.conversation_mode && params.enable_chat_template) {
+ if (!params.system_prompt.empty()) {
+ // format the system prompt (will use template default if empty)
+ chat_add_and_format("system", params.system_prompt);
+ }
+
+ if (!params.prompt.empty()) {
+ // format and append the user prompt
+ chat_add_and_format("user", params.prompt);
+ } else {
+ waiting_for_first_input = true;
+ }
+
+ if (!params.system_prompt.empty() || !params.prompt.empty()) {
+ common_chat_templates_inputs inputs;
+ inputs.messages = chat_msgs;
+ inputs.add_generation_prompt = !params.prompt.empty();
+
+ prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt;
+ }
+ } else {
  // otherwise use the prompt as is
- : params.prompt;
- if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
+ prompt = params.prompt;
+ }
+
+ if (params.interactive_first || !prompt.empty() || session_tokens.empty()) {
  LOG_DBG("tokenize the prompt\n");
  embd_inp = common_tokenize(ctx, prompt, true, true);
  } else {
@@ -290,7 +316,7 @@ int main(int argc, char ** argv) {
  }

  // Should not run without any tokens
- if (embd_inp.empty()) {
+ if (!waiting_for_first_input && embd_inp.empty()) {
  if (add_bos) {
  embd_inp.push_back(llama_vocab_bos(vocab));
  LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
@@ -328,7 +354,7 @@ int main(int argc, char ** argv) {
  }

  // remove any "future" tokens that we might have inherited from the previous session
- llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+ llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
  }

  LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@@ -350,7 +376,12 @@ int main(int argc, char ** argv) {
  }

  if (params.conversation_mode) {
- params.interactive_first = true;
+ if (params.single_turn && !params.prompt.empty()) {
+ params.interactive = false;
+ params.interactive_first = false;
+ } else {
+ params.interactive_first = true;
+ }
  }

  // enable interactive mode if interactive start is specified
@@ -474,8 +505,8 @@ int main(int argc, char ** argv) {
  LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
  LOG_INF( "%s", control_message);
- if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
- LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+ if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) {
+ LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n");
  }
  LOG_INF("\n");

@@ -571,8 +602,8 @@ int main(int argc, char ** argv) {
  LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+ llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

  n_past -= n_discard;

@@ -595,9 +626,9 @@ int main(int argc, char ** argv) {
  LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
  LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

- llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
- llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
- llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
+ llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd);
+ llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
+ llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);

  n_past -= bd;

@@ -755,11 +786,14 @@ int main(int argc, char ** argv) {

  // check for reverse prompt using special tokens
  llama_token last_token = common_sampler_last(smpl);
- if (std::find(antiprompt_token.begin(), antiprompt_token.end(), last_token) != antiprompt_token.end()) {
- if (params.interactive) {
- is_interacting = true;
+ for (auto token : antiprompt_token) {
+ if (token == last_token) {
+ if (params.interactive) {
+ is_interacting = true;
+ }
+ is_antiprompt = true;
+ break;
  }
- is_antiprompt = true;
  }

  if (is_antiprompt) {
@@ -768,7 +802,7 @@ int main(int argc, char ** argv) {
  }

  // deal with end of generation tokens in interactive mode
- if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
+ if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
  LOG_DBG("found an EOG token\n");

  if (params.interactive) {
@@ -788,12 +822,17 @@ int main(int argc, char ** argv) {
  }

  // if current token is not EOG, we add it to current assistant message
- if (params.conversation_mode) {
+ if (params.conversation_mode && !waiting_for_first_input) {
  const auto id = common_sampler_last(smpl);
  assistant_ss << common_token_to_piece(ctx, id, false);
+
+ if (!prompt.empty()) {
+ prompt.clear();
+ is_interacting = false;
+ }
  }

- if (n_past > 0 && is_interacting) {
+ if ((n_past > 0 || waiting_for_first_input) && is_interacting) {
  LOG_DBG("waiting for user input\n");

  if (params.conversation_mode) {
@@ -883,11 +922,17 @@ int main(int argc, char ** argv) {
  input_echo = false; // do not echo this again
  }

- if (n_past > 0) {
+ if (n_past > 0 || waiting_for_first_input) {
  if (is_interacting) {
  common_sampler_reset(smpl);
  }
  is_interacting = false;
+
+ if (waiting_for_first_input && params.single_turn) {
+ params.interactive = false;
+ params.interactive_first = false;
+ }
+ waiting_for_first_input = false;
  }
  }

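The main.cpp hunks above replace the old common_chat_templates_from_model(...) / *chat_templates.template_default API with an opaque handle: common_chat_templates_init(...), accessed via .get() and rendered through common_chat_templates_apply() or common_chat_format_single(). The following is a minimal sketch of the new call pattern, based only on the calls visible in this diff; the helper name build_initial_prompt and its parameters are illustrative, not part of the package:

// Sketch of the reworked chat-template API used by the updated main.cpp above.
// Assumes llama.h, common.h and chat.h from this llama.cpp revision are on the include path.
#include "llama.h"
#include "common.h"
#include "chat.h"

#include <string>
#include <vector>

// Render the initial conversation prompt: an optional system message plus an
// optional first user message, mirroring the logic added in main.cpp.
static std::string build_initial_prompt(const llama_model * model,
                                        const std::string & system_prompt,
                                        const std::string & user_prompt) {
    // the templates are now an opaque smart-pointer handle
    auto tmpls = common_chat_templates_init(model, /* template override */ "");

    std::vector<common_chat_msg> msgs;
    if (!system_prompt.empty()) {
        common_chat_msg sys;
        sys.role    = "system";
        sys.content = system_prompt;
        msgs.push_back(sys);
    }
    if (!user_prompt.empty()) {
        common_chat_msg usr;
        usr.role    = "user";
        usr.content = user_prompt;
        msgs.push_back(usr);
    }

    // per-call arguments are now grouped into an inputs struct
    common_chat_templates_inputs inputs;
    inputs.messages              = msgs;
    inputs.add_generation_prompt = !user_prompt.empty();

    // apply() renders the whole conversation; .prompt is what gets tokenized
    return common_chat_templates_apply(tmpls.get(), inputs).prompt;
}

The same handle also feeds common_chat_format_example(tmpls.get(), use_jinja) and common_chat_format_single(tmpls.get(), ...), as shown in the hunks above.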
package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -12,6 +12,7 @@
  #include <string>
  #include <vector>
  #include <ctime>
+ #include <algorithm>

  // trim whitespace from the beginning and end of a string
  static std::string trim(const std::string & str) {
@@ -201,7 +202,7 @@ int main(int argc, char ** argv) {

  // assign the system KV cache to all parallel sequences
  for (int32_t i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("\n");
@@ -233,9 +234,9 @@ int main(int argc, char ** argv) {
  if (batch.n_tokens == 0) {
  // all sequences have ended - clear the entire KV cache
  for (int i = 1; i <= n_clients; ++i) {
- llama_kv_cache_seq_rm(ctx, i, -1, -1);
+ llama_kv_self_seq_rm(ctx, i, -1, -1);
  // but keep the system prompt
- llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
  }

  LOG_INF("%s: clearing the KV cache\n", __func__);
@@ -371,8 +372,8 @@ int main(int argc, char ** argv) {
  }

  // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
- llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1);
- llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+ llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1);
+ llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);

  const auto t_main_end = ggml_time_us();

package/src/llama.cpp/examples/passkey/passkey.cpp

@@ -7,6 +7,7 @@
  #include <cstdio>
  #include <string>
  #include <vector>
+ #include <algorithm>

  static void print_usage(int, char ** argv) {
  LOG("\nexample usage:\n");
@@ -132,11 +133,11 @@ int main(int argc, char ** argv) {
  const int ib = i/n_batch - 1;
  const int bd = n_batch_grp*(n_grp - 1);

- llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
- llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd);
+ llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }

  common_batch_clear(batch);
@@ -166,12 +167,12 @@ int main(int argc, char ** argv) {

  LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;

  common_batch_clear(batch);

@@ -197,12 +198,12 @@ int main(int argc, char ** argv) {
  if (n_discard > 0) {
  LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);

- llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
- //llama_kv_cache_defrag (ctx);
- llama_kv_cache_update (ctx);
+ llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
+ llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
+ //llama_kv_self_defrag (ctx);
+ llama_kv_self_update (ctx);

- n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+ n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
  }
  }

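In the parallel.cpp and passkey.cpp hunks above (and in main.cpp earlier), every llama_kv_cache_* call is renamed to llama_kv_self_* with identical arguments: seq_rm, seq_add, seq_div, seq_cp, update, and seq_pos_max. Below is a minimal sketch of the context-shift pattern from the passkey hunks under the new names; the helper shift_context is illustrative, not from the package:

// Sketch only: llama_kv_cache_* -> llama_kv_self_* rename, same signatures.
// Mirrors the "shifting KV cache" pattern in the passkey.cpp hunks above.
#include "llama.h"

// Discard n_discard positions after the first n_keep, slide the rest back,
// apply the pending K/V updates, and return the new n_past.
static int shift_context(llama_context * ctx, int n_keep, int n_discard, int n_ctx) {
    llama_kv_self_seq_rm (ctx, 0, n_keep,             n_keep + n_discard); // was llama_kv_cache_seq_rm
    llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);  // was llama_kv_cache_seq_add
    llama_kv_self_update (ctx);                                            // was llama_kv_cache_update

    return llama_kv_self_seq_pos_max(ctx, 0) + 1;                          // was llama_kv_cache_seq_pos_max
}

The clear and copy variants follow the same rename (llama_kv_self_clear, llama_kv_self_seq_cp), as the perplexity.cpp and retrieval.cpp hunks below show.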
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);

@@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
  const auto t_start = std::chrono::high_resolution_clock::now();

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  for (int j = 0; j < num_batches; ++j) {
  const int batch_start = start + j * n_batch;
@@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
  return;
  }

- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
  }

  // clear the KV cache
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  llama_batch batch = llama_batch_init(n_batch, 0, 1);

package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -8,6 +8,7 @@
  #include <unordered_map>
  #include <fstream>
  #include <cmath>
+ #include <cctype>

  struct quant_option {
  std::string name;
package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,6 @@
  #include "ggml.h"
  #include "llama.h"
- #include "llama-context.h"
+ #include "llama-model.h"
  #include "common.h"

  #include <algorithm>
@@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
  }
  }

- const auto & tensors = llama_internal_get_tensor_map(ctx);
+ const auto & tensors = llama_internal_get_tensor_map(model);

  // check layer tensors
  int included_layers = 0;
package/src/llama.cpp/examples/retrieval/retrieval.cpp

@@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke

  static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
  // clear previous kv_cache values (irrelevant for embeddings)
- llama_kv_cache_clear(ctx);
+ llama_kv_self_clear(ctx);

  // run model
  LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);