@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
@@ -0,0 +1,135 @@
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+ #pragma once
+
+ #include "common.h"
+ #include <string>
+ #include <vector>
+
+ struct common_chat_templates;
+
+ struct common_chat_tool_call {
+ std::string name;
+ std::string arguments;
+ std::string id;
+ };
+
+ struct common_chat_msg_content_part {
+ std::string type;
+ std::string text;
+ };
+
+ struct common_chat_msg {
+ std::string role;
+ std::string content;
+ std::vector<common_chat_msg_content_part> content_parts = {};
+ std::vector<common_chat_tool_call> tool_calls = {};
+ std::string reasoning_content;
+ std::string tool_name;
+ std::string tool_call_id;
+ };
+
+ struct common_chat_tool {
+ std::string name;
+ std::string description;
+ std::string parameters;
+ };
+
+ enum common_chat_tool_choice {
+ COMMON_CHAT_TOOL_CHOICE_AUTO,
+ COMMON_CHAT_TOOL_CHOICE_REQUIRED,
+ COMMON_CHAT_TOOL_CHOICE_NONE,
+ };
+
+ enum common_chat_format {
+ COMMON_CHAT_FORMAT_CONTENT_ONLY,
+ COMMON_CHAT_FORMAT_GENERIC,
+ COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+ COMMON_CHAT_FORMAT_LLAMA_3_X,
+ COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+ COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+ COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+ COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+ COMMON_CHAT_FORMAT_HERMES_2_PRO,
+ COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
+ COMMON_CHAT_FORMAT_COMMAND_R7B,
+ COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+
+ COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+ };
+
+ struct common_chat_templates_inputs {
+ std::vector<common_chat_msg> messages;
+ std::string grammar;
+ std::string json_schema;
+ bool add_generation_prompt = true;
+ bool use_jinja = true;
+ // Parameters below only supported when use_jinja is true
+ std::vector<common_chat_tool> tools;
+ common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+ bool parallel_tool_calls = false;
+ bool extract_reasoning = true;
+ };
+
+ struct common_chat_params {
+ common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+ std::string prompt;
+ std::string grammar;
+ bool grammar_lazy = false;
+ std::vector<common_grammar_trigger> grammar_triggers;
+ std::vector<std::string> preserved_tokens;
+ std::vector<std::string> additional_stops;
+ };
+
+ // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+ void common_chat_templates_free(struct common_chat_templates * tmpls);
+
+ struct common_chat_templates_deleter { void operator()(common_chat_templates * tmpls) { common_chat_templates_free(tmpls); } };
+
+ typedef std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter> common_chat_templates_ptr;
+
+ common_chat_templates_ptr common_chat_templates_init(
+ const struct llama_model * model,
+ const std::string & chat_template_override,
+ const std::string & bos_token_override = "",
+ const std::string & eos_token_override = "");
+
+ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+ const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+
+
+ struct common_chat_params common_chat_templates_apply(
+ const struct common_chat_templates * tmpls,
+ const struct common_chat_templates_inputs & inputs);
+
+ // Format single message, while taking into account the position of that message in chat history
+ std::string common_chat_format_single(
+ const struct common_chat_templates * tmpls,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
+ bool add_ass,
+ bool use_jinja);
+
+ // Returns an example of formatted chat
+ std::string common_chat_format_example(
+ const struct common_chat_templates * tmpls,
+ bool use_jinja);
+
+ std::string common_chat_format_name(common_chat_format format);
+ common_chat_msg common_chat_parse( const std::string & input, common_chat_format format);
+
+ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
+
+ // Parses a JSON array of messages in OpenAI's chat completion API format.
+ // T can be std::string containing JSON or nlohmann::ordered_json
+ template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
+ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+
+ // Parses a JSON array of tools in OpenAI's chat completion tool call API format.
+ // T can be std::string containing JSON or nlohmann::ordered_json
+ template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
+ template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
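Note: the header above replaces the old chat-template helpers in common.h/common.cpp with an opaque common_chat_templates handle. A minimal usage sketch (not part of this diff; it assumes an already loaded llama_model * and omits sampling and error handling):

    #include "chat.h"

    // Hypothetical helper: render a prompt with the model's built-in template.
    std::string build_prompt(const llama_model * model) {
        // Empty override -> use the template embedded in the GGUF, falling back to chatml.
        common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* chat_template_override = */ "");

        common_chat_templates_inputs inputs;
        inputs.use_jinja             = true;
        inputs.add_generation_prompt = true;

        common_chat_msg user_msg;
        user_msg.role    = "user";
        user_msg.content = "Hello";
        inputs.messages.push_back(user_msg);

        // `params` also carries the (possibly lazy) grammar, its triggers and extra stop strings.
        common_chat_params params = common_chat_templates_apply(tmpls.get(), inputs);
        return params.prompt;
    }

    // After generation, the raw model output would be parsed back into a structured message:
    //     common_chat_msg reply = common_chat_parse(generated_text, params.format);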
@@ -10,10 +10,7 @@
  // Change JSON_ASSERT from assert() to GGML_ASSERT:
  #define JSON_ASSERT GGML_ASSERT
  #include "json.hpp"
- #include "json-schema-to-grammar.h"
  #include "llama.h"
- #include "chat.hpp"
- #include "chat-template.hpp"

  #include <algorithm>
  #include <cinttypes>
@@ -485,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std::
  s = std::move(builder);
  }

+ std::string regex_escape(const std::string & s) {
+ static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
+ return std::regex_replace(s, special_chars, "\\$0");
+ }
+
  std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
  std::ostringstream result;
  for (size_t i = 0; i < values.size(); ++i) {
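For reference, the new regex_escape() helper backslash-escapes every regex metacharacter in its input (the "\\$0" replacement prefixes each matched character with a backslash). A hypothetical call, with an input string not taken from this diff:

    std::string escaped = regex_escape("[TOOL_CALLS]");   // -> "\[TOOL_CALLS\]"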
@@ -1768,174 +1770,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
  return text;
  }

- //
- // Chat template utils
- //
-
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
- if (use_jinja) {
- try {
- auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
- common_chat_inputs inputs;
- inputs.messages = json::array({{
- {"role", "user"},
- {"content", "test"},
- }});
- common_chat_params_init(chat_template, inputs);
- return true;
- } catch (const std::exception & e) {
- LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
- return false;
- }
- }
- llama_chat_message chat[] = {{"user", "test"}};
- const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
- }
-
- std::string common_chat_apply_template(
- const common_chat_template & tmpl,
- const std::vector<common_chat_msg> & msgs,
- bool add_ass,
- bool use_jinja) {
- if (use_jinja) {
- auto messages = json::array();
- for (const auto & msg : msgs) {
- messages.push_back({{"role", msg.role}, {"content", msg.content}});
- }
- common_chat_inputs inputs;
- inputs.messages = messages;
- inputs.add_generation_prompt = add_ass;
- return common_chat_params_init(tmpl, inputs).prompt;
- }
-
- int alloc_size = 0;
- std::vector<llama_chat_message> chat;
- for (const auto & msg : msgs) {
- chat.push_back({msg.role.c_str(), msg.content.c_str()});
- alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
- }
-
- std::vector<char> buf(alloc_size);
-
- // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-
- // error: chat template is not supported
- if (res < 0) {
- // if the custom "tmpl" is not supported, we throw an error
- // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
- throw std::runtime_error("this custom template is not supported");
- }
-
- // if it turns out that our buffer is too small, we resize it
- if ((size_t) res > buf.size()) {
- buf.resize(res);
- res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
- }
-
- std::string formatted_chat(buf.data(), res);
- return formatted_chat;
- }
-
- std::string common_chat_format_single(
- const common_chat_template & tmpl,
- const std::vector<common_chat_msg> & past_msg,
- const common_chat_msg & new_msg,
- bool add_ass,
- bool use_jinja) {
- std::ostringstream ss;
- auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
- std::vector<common_chat_msg> chat_new(past_msg);
- // if the past_msg ends with a newline, we must preserve it in the formatted version
- if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
- ss << "\n";
- };
- // format chat with new_msg
- chat_new.push_back(new_msg);
- auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
- // get the diff part
- ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
- return ss.str();
- }
-
- std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
- std::vector<common_chat_msg> msgs = {
- {"system", "You are a helpful assistant", {}},
- {"user", "Hello", {}},
- {"assistant", "Hi there", {}},
- {"user", "How are you?", {}},
- };
- return common_chat_apply_template(tmpl, msgs, true, use_jinja);
- }
-
- #define CHATML_TEMPLATE_SRC \
- "{%- for message in messages -%}\n" \
- " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
- "{%- endfor -%}\n" \
- "{%- if add_generation_prompt -%}\n" \
- " {{- '<|im_start|>assistant\n' -}}\n" \
- "{%- endif -%}"
-
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
- {
- std::string default_template_src;
- std::string template_tool_use_src;
-
- bool has_explicit_template = !chat_template_override.empty();
- if (chat_template_override.empty()) {
- auto str = llama_model_chat_template(model, /* name */ nullptr);
- if (str) {
- default_template_src = str;
- has_explicit_template = true;
- }
- str = llama_model_chat_template(model, /* name */ "tool_use");
- if (str) {
- template_tool_use_src = str;
- has_explicit_template = true;
- }
- } else {
- default_template_src = chat_template_override;
- }
- if (default_template_src.empty() || default_template_src == "chatml") {
- if (!template_tool_use_src.empty()) {
- default_template_src = template_tool_use_src;
- } else {
- default_template_src = CHATML_TEMPLATE_SRC;
- }
- }
- auto vocab = llama_model_get_vocab(model);
- const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
- if (token == LLAMA_TOKEN_NULL) {
- if (default_template_src.find(jinja_variable_name) != std::string::npos
- || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
- LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
- }
- return std::string();
- } else {
- return common_token_to_piece(vocab, token, true);
- }
- };
- auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
- auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
- try {
- return {
- has_explicit_template,
- std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
- template_tool_use_src.empty()
- ? nullptr
- : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
- };
- } catch (const std::exception & e) {
- LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
- return {
- has_explicit_template,
- std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
- nullptr,
- };
- }
- }
-
  //
  // KV cache utils
  //
@@ -2196,3 +2030,25 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
  return result;
  }

+ template <>
+ json common_grammar_trigger::to_json() const {
+ json out {
+ {"type", (int) type},
+ {"value", value},
+ };
+ if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+ out["token"] = (int) token;
+ }
+ return out;
+ }
+
+ template <>
+ common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
+ common_grammar_trigger out;
+ out.type = (common_grammar_trigger_type) in.at("type").get<int>();
+ out.value = in.at("value").get<std::string>();
+ if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+ out.token = (llama_token) in.at("token").get<int>();
+ }
+ return out;
+ }
@@ -110,9 +110,21 @@ enum common_conversation_mode {
  COMMON_CONVERSATION_MODE_AUTO = 2,
  };

+ enum common_grammar_trigger_type {
+ COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START,
+ };
+
  struct common_grammar_trigger {
- std::string word;
- bool at_start;
+ common_grammar_trigger_type type;
+ std::string value;
+ llama_token token = LLAMA_TOKEN_NULL;
+
+ // T can only be nlohmann::ordered_json
+ template <class T> T to_json() const;
+ template <class T> static common_grammar_trigger from_json(const T & in);
  };

  // sampling parameters
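The common.cpp hunk above supplies the nlohmann::ordered_json specializations of these new to_json/from_json members. A hypothetical round trip of a lazy-grammar trigger (all identifiers are from this diff; the `json` alias comes from common.cpp):

    common_grammar_trigger trig;
    trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
    trig.value = "<tool_call>";

    json j = trig.to_json<json>();   // {"type": 1, "value": "<tool_call>"} -- no "token" field for non-token triggers
    common_grammar_trigger back = common_grammar_trigger::from_json<json>(j);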
@@ -140,6 +152,7 @@ struct common_params_sampling {
  int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
  int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
  int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+ float top_n_sigma = -1.00f;// -1.0 = disabled
  float mirostat_tau = 5.00f; // target entropy
  float mirostat_eta = 0.10f; // learning rate
  bool ignore_eos = false;
@@ -162,8 +175,7 @@ struct common_params_sampling {

  std::string grammar; // optional BNF-like grammar to constrain sampling
  bool grammar_lazy = false;
- std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
- std::vector<llama_token> grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+ std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
  std::set<llama_token> preserved_tokens;

  std::vector<llama_logit_bias> logit_bias; // logit biases to apply
@@ -177,10 +189,10 @@ struct common_params_speculative {

  int32_t n_ctx = 0; // draft context size
  int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
- int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+ int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
  int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
  float p_split = 0.1f; // speculative decoding split probability
- float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+ float p_min = 0.75f; // minimum speculative decoding probability (greedy)

  struct cpu_params cpuparams;
  struct cpu_params cpuparams_batch;
@@ -199,9 +211,16 @@ struct common_params_vocoder {
  std::string model = ""; // model path // NOLINT
  std::string model_url = ""; // model url to download // NOLINT

+ std::string speaker_file = ""; // speaker file path // NOLINT
+
  bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };

+ enum common_reasoning_format {
+ COMMON_REASONING_FORMAT_NONE,
+ COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+ };
+
  struct common_params {
  int32_t n_predict = -1; // new tokens to predict
  int32_t n_ctx = 4096; // context size
@@ -255,6 +274,7 @@ struct common_params {
  std::string hf_repo = ""; // HF repo // NOLINT
  std::string hf_file = ""; // HF file // NOLINT
  std::string prompt = ""; // NOLINT
+ std::string system_prompt = ""; // NOLINT
  std::string prompt_file = ""; // store the external prompt file name // NOLINT
  std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
  std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
@@ -292,6 +312,7 @@ struct common_params {
  bool kl_divergence = false; // compute KL divergence

  bool usage = false; // print usage
+ bool completion = false; // print source-able completion script
  bool use_color = false; // use color to distinguish generations and inputs
  bool special = false; // enable special token output
  bool interactive = false; // interactive mode
@@ -318,6 +339,8 @@ struct common_params {
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data

+ bool single_turn = false; // single turn chat conversation
+
  ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
  ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

@@ -346,6 +369,7 @@ struct common_params {
  std::string chat_template = ""; // NOLINT
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

  std::vector<std::string> api_keys;

@@ -383,8 +407,6 @@ struct common_params {
  int32_t i_pos = -1; // position of the passkey in the junk text

  // imatrix params
- std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
-
  int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
  int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
  int32_t i_chunk = 0; // start processing from this chunk
@@ -396,16 +418,16 @@ struct common_params {
  int n_pca_batch = 100;
  int n_pca_iterations = 1000;
  dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
- std::string cvector_outfile = "control_vector.gguf";
  std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
  std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

  bool spm_infill = false; // suffix/prefix/middle pattern for infill

- std::string lora_outfile = "ggml-lora-merged-f16.gguf";
-
  // batched-bench params
  bool batched_bench_output_jsonl = false;
+
+ // common params
+ std::string out_file; // output filename for all example programs
  };

  // call once at the start of a program if it uses libcommon
@@ -424,13 +446,13 @@ bool set_process_priority(enum ggml_sched_priority prio);
  //

  #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ # if defined(__MINGW32__) && !defined(__clang__)
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ # else
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ # endif
  #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ # define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
  #endif

  LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -445,6 +467,8 @@ std::string string_repeat(const std::string & str, size_t n);

  void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

+ std::string regex_escape(const std::string & s);
+
  template<class T>
  static std::vector<T> string_split(const std::string & str, char delim) {
  static_assert(!std::is_same<T, std::string>::value, "Please use the specialized version for std::string");
@@ -608,62 +632,6 @@ std::string common_detokenize(
  const std::vector<llama_token> & tokens,
  bool special = true);

- //
- // Chat template utils
- //
-
- struct common_tool_call {
- std::string name;
- std::string arguments;
- std::string id;
- };
-
- // same with llama_chat_message, but uses std::string
- struct common_chat_msg {
- std::string role;
- std::string content;
- std::vector<common_tool_call> tool_calls;
- std::string tool_plan = "";
- };
-
- // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
-
- namespace minja {
- class chat_template;
- }
-
- typedef minja::chat_template common_chat_template;
-
- struct common_chat_templates {
- bool has_explicit_template; // Model had builtin template or template overridde was specified.
- std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
- std::unique_ptr<common_chat_template> template_tool_use;
- };
-
- // CPP wrapper for llama_chat_apply_template
- // If the built-in template is not supported, we default to chatml
- // If the custom "tmpl" is not supported, we throw an error
- std::string common_chat_apply_template(
- const common_chat_template & tmpl,
- const std::vector<common_chat_msg> & chat,
- bool add_ass,
- bool use_jinja);
-
- // Format single message, while taking into account the position of that message in chat history
- std::string common_chat_format_single(
- const common_chat_template & tmpl,
- const std::vector<common_chat_msg> & past_msg,
- const common_chat_msg & new_msg,
- bool add_ass,
- bool use_jinja);
-
- // Returns an example of formatted chat
- std::string common_chat_format_example(
- const common_chat_template & tmpl, bool use_jinja);
-
- common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
-
  //
  // KV cache utils
  //
@@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  throw std::runtime_error("At least one of min_value or max_value must be set");
  }

- const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
+ const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}";

  struct BuiltinRule {
  std::string content;
@@ -764,11 +764,10 @@
  public:
  SchemaConverter(
  const std::function<json(const std::string &)> & fetch_json,
- bool dotall,
- bool compact_spaces)
+ bool dotall)
  : _fetch_json(fetch_json), _dotall(dotall)
  {
- _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE;
+ _rules["space"] = SPACE_RULE;
  }

  void resolve_refs(json & schema, const std::string & url) {
@@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
  }

  std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
- SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces);
+ SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
  common_grammar_builder builder {
  /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
  return converter._add_rule(name, rule);
@@ -16,7 +16,6 @@ struct common_grammar_builder {

  struct common_grammar_options {
  bool dotall = false;
- bool compact_spaces = false;
  };

  std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
@@ -254,10 +254,10 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
  };
  }

- return new llama_sampler{
+ return llama_sampler_init(
  /* .iface = */ &llama_sampler_llg_i,
- /* .ctx = */ ctx,
- };
+ /* .ctx = */ ctx
+ );
  }

  #else
@@ -1,5 +1,6 @@
  #include "log.h"

+ #include <chrono>
  #include <condition_variable>
  #include <cstdarg>
  #include <cstdio>
@@ -2,6 +2,7 @@

  #include "ggml.h" // for ggml_log_level

+ #define LOG_CLR_TO_EOL "\033[K\r"
  #define LOG_COL_DEFAULT "\033[0m"
  #define LOG_COL_BOLD "\033[1m"
  #define LOG_COL_RED "\033[31m"
@@ -14,7 +15,7 @@

  #ifndef __GNUC__
  # define LOG_ATTRIBUTE_FORMAT(...)
- #elif defined(__MINGW32__)
+ #elif defined(__MINGW32__) && !defined(__clang__)
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
  #else
  # define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))