cui-llama.rn 1.4.2 → 1.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +93 -114
  2. package/android/src/main/CMakeLists.txt +5 -0
  3. package/android/src/main/build-arm64/CMakeCache.txt +429 -0
  4. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +81 -0
  5. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +101 -0
  6. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  7. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  8. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +15 -0
  9. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +904 -0
  10. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  11. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +919 -0
  12. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  13. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +431 -0
  14. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +16 -0
  15. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +165 -0
  16. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +297 -0
  17. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +1 -0
  18. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +1 -0
  19. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +1 -0
  20. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +1 -0
  21. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +1 -0
  22. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +1 -0
  23. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +1 -0
  24. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +8 -0
  25. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +1 -0
  26. package/android/src/main/build-arm64/CMakeFiles/progress.marks +1 -0
  27. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  28. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +58 -0
  29. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  30. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +756 -0
  31. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  32. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +709 -0
  33. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  34. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +714 -0
  35. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  36. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +62 -0
  37. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  38. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +708 -0
  39. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  40. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +113 -0
  41. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  42. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +713 -0
  43. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  44. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +763 -0
  45. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  46. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +61 -0
  47. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  48. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +707 -0
  49. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  50. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +104 -0
  51. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  52. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +714 -0
  53. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  54. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +723 -0
  55. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +62 -0
  56. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +722 -0
  57. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +89 -0
  58. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +2 -0
  59. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +2 -0
  60. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +2 -0
  61. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +17 -0
  62. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +41 -0
  63. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +62 -0
  64. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +722 -0
  65. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +89 -0
  66. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +2 -0
  67. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +2 -0
  68. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +2 -0
  69. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +17 -0
  70. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +41 -0
  71. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +62 -0
  72. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +722 -0
  73. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +89 -0
  74. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +2 -0
  75. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +2 -0
  76. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +2 -0
  77. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +17 -0
  78. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +41 -0
  79. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +62 -0
  80. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +722 -0
  81. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +89 -0
  82. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +2 -0
  83. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +2 -0
  84. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +2 -0
  85. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +17 -0
  86. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +41 -0
  87. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +62 -0
  88. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +722 -0
  89. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +89 -0
  90. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +2 -0
  91. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +2 -0
  92. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +2 -0
  93. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +17 -0
  94. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +41 -0
  95. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +62 -0
  96. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +722 -0
  97. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +89 -0
  98. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +2 -0
  99. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +2 -0
  100. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +2 -0
  101. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +17 -0
  102. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +41 -0
  103. package/android/src/main/build-arm64/Makefile +1862 -0
  104. package/android/src/main/build-arm64/cmake_install.cmake +66 -0
  105. package/android/src/main/java/com/rnllama/LlamaContext.java +92 -18
  106. package/android/src/main/java/com/rnllama/RNLlama.java +37 -4
  107. package/android/src/main/jni-utils.h +6 -0
  108. package/android/src/main/jni.cpp +287 -31
  109. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  110. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  111. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  112. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  113. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  114. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  115. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  116. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  117. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +7 -2
  118. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +7 -2
  119. package/cpp/chat-template.hpp +529 -0
  120. package/cpp/chat.cpp +1085 -0
  121. package/cpp/chat.hpp +55 -0
  122. package/cpp/common.cpp +159 -36
  123. package/cpp/common.h +64 -19
  124. package/cpp/ggml-alloc.c +1 -13
  125. package/cpp/ggml-common.h +0 -2
  126. package/cpp/ggml-cpu-impl.h +6 -12
  127. package/cpp/ggml-cpu-quants.c +937 -340
  128. package/cpp/ggml-cpu.c +207 -113
  129. package/cpp/ggml-cpu.cpp +4 -6
  130. package/cpp/ggml-cpu.h +1 -1
  131. package/cpp/ggml-metal.h +66 -66
  132. package/cpp/ggml-metal.m +141 -23
  133. package/cpp/ggml.c +24 -14
  134. package/cpp/ggml.h +2 -2
  135. package/cpp/json-schema-to-grammar.cpp +46 -66
  136. package/cpp/json-schema-to-grammar.h +15 -1
  137. package/cpp/llama-arch.cpp +7 -2
  138. package/cpp/llama-arch.h +3 -1
  139. package/cpp/llama-chat.cpp +10 -1
  140. package/cpp/llama-chat.h +1 -0
  141. package/cpp/llama-grammar.cpp +86 -6
  142. package/cpp/llama-grammar.h +22 -1
  143. package/cpp/llama-impl.h +6 -6
  144. package/cpp/llama-kv-cache.h +1 -1
  145. package/cpp/llama-mmap.h +1 -0
  146. package/cpp/llama-model-loader.cpp +1 -1
  147. package/cpp/llama-model.cpp +32 -6
  148. package/cpp/llama-sampling.cpp +178 -61
  149. package/cpp/llama-vocab.cpp +8 -3
  150. package/cpp/llama.cpp +188 -128
  151. package/cpp/llama.h +27 -10
  152. package/cpp/log.cpp +32 -10
  153. package/cpp/log.h +12 -1
  154. package/cpp/minja.hpp +2883 -0
  155. package/cpp/rn-llama.cpp +82 -5
  156. package/cpp/rn-llama.h +16 -1
  157. package/cpp/sampling.cpp +68 -41
  158. package/cpp/sampling.h +3 -0
  159. package/cpp/sgemm.cpp +9 -8
  160. package/cpp/unicode.cpp +9 -2
  161. package/ios/CMakeLists.txt +6 -0
  162. package/ios/RNLlama.h +0 -8
  163. package/ios/RNLlama.mm +27 -3
  164. package/ios/RNLlamaContext.h +10 -1
  165. package/ios/RNLlamaContext.mm +269 -57
  166. package/jest/mock.js +21 -2
  167. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  168. package/lib/commonjs/grammar.js +3 -0
  169. package/lib/commonjs/grammar.js.map +1 -1
  170. package/lib/commonjs/index.js +87 -13
  171. package/lib/commonjs/index.js.map +1 -1
  172. package/lib/module/NativeRNLlama.js.map +1 -1
  173. package/lib/module/grammar.js +3 -0
  174. package/lib/module/grammar.js.map +1 -1
  175. package/lib/module/index.js +86 -13
  176. package/lib/module/index.js.map +1 -1
  177. package/lib/typescript/NativeRNLlama.d.ts +107 -2
  178. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  179. package/lib/typescript/grammar.d.ts.map +1 -1
  180. package/lib/typescript/index.d.ts +32 -7
  181. package/lib/typescript/index.d.ts.map +1 -1
  182. package/llama-rn.podspec +1 -1
  183. package/package.json +2 -2
  184. package/src/NativeRNLlama.ts +115 -3
  185. package/src/grammar.ts +3 -0
  186. package/src/index.ts +138 -21
package/cpp/chat.hpp ADDED
@@ -0,0 +1,55 @@
+ // Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+
+ #pragma once
+
+ #include "common.h"
+ #include "json.hpp"
+ #include <optional>
+ #include <string>
+ #include <vector>
+
+ using json = nlohmann::ordered_json;
+
+ struct common_chat_inputs {
+     json messages;
+     json tools;
+     json tool_choice;
+     json json_schema;
+     bool parallel_tool_calls;
+     bool stream;
+     std::string grammar;
+     bool add_generation_prompt = true;
+     bool extract_reasoning = true;
+ };
+
+ enum common_chat_format {
+     COMMON_CHAT_FORMAT_CONTENT_ONLY,
+     COMMON_CHAT_FORMAT_GENERIC,
+     COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+     COMMON_CHAT_FORMAT_LLAMA_3_X,
+     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+     COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+     COMMON_CHAT_FORMAT_HERMES_2_PRO,
+     COMMON_CHAT_FORMAT_COMMAND_R7B,
+     COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+
+     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
+ };
+
+ struct common_chat_params {
+     common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+     json prompt;
+     std::string grammar;
+     bool grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_triggers;
+     std::vector<std::string> preserved_tokens;
+     std::vector<std::string> additional_stops;
+ };
+
+ struct common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & params);
+ std::string common_chat_format_name(common_chat_format format);
+ common_chat_msg common_chat_parse(const std::string & input, common_chat_format format);
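For orientation, a minimal usage sketch of the API this header declares (illustrative only, not code shipped in the package; the helper name, template source and BOS/EOS strings are placeholders). The actual wiring is visible in the common.cpp hunks below.

```cpp
// Hypothetical example of driving the new chat API declared in chat.hpp.
#include "chat.hpp"           // common_chat_inputs, common_chat_params_init, ...
#include "chat-template.hpp"  // minja::chat_template (aka common_chat_template)

// Render a prompt (plus grammar constraints) for a set of messages and tools.
static std::string render_prompt(const std::string & template_src,
                                 const json & messages,
                                 const json & tools) {
    // Placeholder BOS/EOS strings; real callers take them from the model vocab.
    common_chat_template tmpl(template_src, /* bos */ "<s>", /* eos */ "</s>");

    common_chat_inputs inputs;
    inputs.messages              = messages; // OpenAI-style [{role, content}, ...]
    inputs.tools                 = tools;    // optional tool declarations
    inputs.add_generation_prompt = true;

    // params.format is later passed to common_chat_parse() to interpret the model output.
    common_chat_params params = common_chat_params_init(tmpl, inputs);
    return params.prompt.get<std::string>();
}
```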
package/cpp/common.cpp CHANGED
@@ -12,6 +12,8 @@
  #include "json.hpp"
  #include "json-schema-to-grammar.h"
  #include "llama.h"
+ #include "chat.hpp"
+ #include "chat-template.hpp"

  #include <algorithm>
  #include <cinttypes>
@@ -489,6 +491,48 @@ void string_replace_all(std::string & s, const std::string & search, const std::
      s = std::move(builder);
  }

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
+     std::ostringstream result;
+     for (size_t i = 0; i < values.size(); ++i) {
+         if (i > 0) {
+             result << separator;
+         }
+         result << values[i];
+     }
+     return result.str();
+ }
+
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
+     std::vector<std::string> parts;
+     size_t start = 0;
+     size_t end = str.find(delimiter);
+
+     while (end != std::string::npos) {
+         parts.push_back(str.substr(start, end - start));
+         start = end + delimiter.length();
+         end = str.find(delimiter, start);
+     }
+
+     parts.push_back(str.substr(start));
+
+     return parts;
+ }
+
+ std::string string_repeat(const std::string & str, size_t n) {
+     if (n == 0) {
+         return "";
+     }
+
+     std::string result;
+     result.reserve(str.length() * n);
+
+     for (size_t i = 0; i < n; ++i) {
+         result += str;
+     }
+
+     return result;
+ }
+
  std::string string_from(bool value) {
      return value ? "true" : "false";
  }
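A quick illustration of what the new helpers are expected to do (example only, not code from the package):

```cpp
// Expected behaviour of the string helpers added above.
#include <cassert>
#include "common.h"  // declares string_split, string_join, string_repeat

static void string_helpers_example() {
    std::vector<std::string> parts = string_split("a,b,c", ",");  // {"a", "b", "c"}
    assert(parts.size() == 3);

    assert(string_join(parts, " | ") == "a | b | c");
    assert(string_repeat("ab", 3)    == "ababab");
}
```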
@@ -1526,67 +1570,80 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
  // Chat template utils
  //

- std::string common_get_builtin_chat_template(const struct llama_model * model) {
-     const char * ptr_tmpl = llama_model_chat_template(model);
-     return ptr_tmpl == nullptr ? "" : ptr_tmpl;
- }
-
- bool common_chat_verify_template(const std::string & tmpl) {
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
+     if (use_jinja) {
+         try {
+             auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
+             common_chat_inputs inputs;
+             inputs.messages = json::array({{
+                 {"role", "user"},
+                 {"content", "test"},
+             }});
+             common_chat_params_init(chat_template, inputs);
+             return true;
+         } catch (const std::exception & e) {
+             LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
+             return false;
+         }
+     }
      llama_chat_message chat[] = {{"user", "test"}};
      const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
      return res >= 0;
  }

- std::string common_chat_apply_template(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_apply_template(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & msgs,
-         bool add_ass) {
+         bool add_ass,
+         bool use_jinja) {
+     if (use_jinja) {
+         auto messages = json::array();
+         for (const auto & msg : msgs) {
+             messages.push_back({{"role", msg.role}, {"content", msg.content}});
+         }
+         common_chat_inputs inputs;
+         inputs.messages = messages;
+         inputs.add_generation_prompt = add_ass;
+         return common_chat_params_init(tmpl, inputs).prompt;
+     }
+
      int alloc_size = 0;
-     bool fallback = false; // indicate if we must fallback to default chatml
      std::vector<llama_chat_message> chat;
      for (const auto & msg : msgs) {
          chat.push_back({msg.role.c_str(), msg.content.c_str()});
          alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
      }

-     const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
      std::vector<char> buf(alloc_size);

      // run the first time to get the total output length
-     int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+     int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());

      // error: chat template is not supported
      if (res < 0) {
-         if (ptr_tmpl != nullptr) {
-             // if the custom "tmpl" is not supported, we throw an error
-             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
-             throw std::runtime_error("this custom template is not supported");
-         }
-
-         // If the built-in template is not supported, we default to chatml
-         res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-         fallback = true;
+         // if the custom "tmpl" is not supported, we throw an error
+         // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
+         throw std::runtime_error("this custom template is not supported");
      }

      // if it turns out that our buffer is too small, we resize it
      if ((size_t) res > buf.size()) {
          buf.resize(res);
-         res = llama_chat_apply_template(
-             fallback ? "chatml" : ptr_tmpl,
-             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+         res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
      }

      std::string formatted_chat(buf.data(), res);
      return formatted_chat;
  }

- std::string common_chat_format_single(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_format_single(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & past_msg,
          const common_chat_msg & new_msg,
-         bool add_ass) {
+         bool add_ass,
+         bool use_jinja) {
      std::ostringstream ss;
-     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+     auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
      std::vector<common_chat_msg> chat_new(past_msg);
      // if the past_msg ends with a newline, we must preserve it in the formatted version
      if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1594,21 +1651,87 @@ std::string common_chat_format_single(const struct llama_model * model,
      };
      // format chat with new_msg
      chat_new.push_back(new_msg);
-     auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
+     auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
      // get the diff part
      ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
      return ss.str();
  }

- std::string common_chat_format_example(const struct llama_model * model,
-         const std::string & tmpl) {
+ std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
      std::vector<common_chat_msg> msgs = {
-         {"system", "You are a helpful assistant"},
-         {"user", "Hello"},
-         {"assistant", "Hi there"},
-         {"user", "How are you?"},
+         {"system", "You are a helpful assistant", {}},
+         {"user", "Hello", {}},
+         {"assistant", "Hi there", {}},
+         {"user", "How are you?", {}},
      };
-     return common_chat_apply_template(model, tmpl, msgs, true);
+     return common_chat_apply_template(tmpl, msgs, true, use_jinja);
+ }
+
+ #define CHATML_TEMPLATE_SRC \
+     "{%- for message in messages -%}\n" \
+     "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
+     "{%- endfor -%}\n" \
+     "{%- if add_generation_prompt -%}\n" \
+     "  {{- '<|im_start|>assistant\n' -}}\n" \
+     "{%- endif -%}"
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
+ {
+     std::string default_template_src;
+     std::string template_tool_use_src;
+
+     bool has_explicit_template = !chat_template_override.empty();
+     if (chat_template_override.empty()) {
+         auto str = llama_model_chat_template(model, /* name */ nullptr);
+         if (str) {
+             default_template_src = str;
+             has_explicit_template = true;
+         }
+         str = llama_model_chat_template(model, /* name */ "tool_use");
+         if (str) {
+             template_tool_use_src = str;
+             has_explicit_template = true;
+         }
+     } else {
+         default_template_src = chat_template_override;
+     }
+     if (default_template_src.empty() || default_template_src == "chatml") {
+         if (!template_tool_use_src.empty()) {
+             default_template_src = template_tool_use_src;
+         } else {
+             default_template_src = CHATML_TEMPLATE_SRC;
+         }
+     }
+     auto vocab = llama_model_get_vocab(model);
+     const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
+         if (token == LLAMA_TOKEN_NULL) {
+             if (default_template_src.find(jinja_variable_name) != std::string::npos
+                 || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
+                 LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
+             }
+             return std::string();
+         } else {
+             return common_token_to_piece(vocab, token, true);
+         }
+     };
+     auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
+     auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+     try {
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
+             template_tool_use_src.empty()
+                 ? nullptr
+                 : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
+         };
+     } catch (const std::exception & e) {
+         LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
+         return {
+             has_explicit_template,
+             std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
+             nullptr,
+         };
+     }
  }

  //
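A hypothetical sketch of how the pieces above fit together once a model is loaded (`model` and `use_jinja` are assumed inputs; not code from the package):

```cpp
// Resolve the model's chat template(s) and render the canned example chat.
static std::string render_example(const struct llama_model * model, bool use_jinja) {
    // Falls back to the built-in ChatML template when the GGUF ships none.
    common_chat_templates templates = common_chat_templates_from_model(model, /* override */ "");

    // template_default is always set; template_tool_use only if the model provides one.
    const common_chat_template & tmpl = *templates.template_default;

    // use_jinja = true renders through minja, otherwise via llama_chat_apply_template().
    return common_chat_format_example(tmpl, use_jinja);
}
```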
package/cpp/common.h CHANGED
@@ -4,6 +4,7 @@

  #include "llama-cpp.h"

+ #include <set>
  #include <string>
  #include <vector>
  #include <sstream>
@@ -120,6 +121,11 @@ enum common_conversation_mode {
      COMMON_CONVERSATION_MODE_AUTO = 2,
  };

+ struct common_grammar_trigger {
+     std::string word;
+     bool at_start;
+ };
+
  // sampling parameters
  struct common_params_sampling {
      uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -145,6 +151,7 @@ struct common_params_sampling {
      int32_t dry_allowed_length = 2; // tokens extending repetitions beyond this receive penalty
      int32_t dry_penalty_last_n = -1; // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
      int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+     float top_n_sigma = -1.00f; // -1.0 = disabled
      float mirostat_tau = 5.00f; // target entropy
      float mirostat_eta = 0.10f; // learning rate
      bool ignore_eos = false;
@@ -165,7 +172,11 @@ struct common_params_sampling {
      COMMON_SAMPLER_TYPE_TEMPERATURE,
  };

-     std::string grammar; // optional BNF-like grammar to constrain sampling
+     std::string                         grammar; // optional BNF-like grammar to constrain sampling
+     bool                                grammar_lazy = false;
+     std::vector<common_grammar_trigger> grammar_trigger_words; // optional trigger words to trigger lazy grammar
+     std::vector<llama_token>            grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens.
+     std::set<llama_token>               preserved_tokens;

      std::vector<llama_logit_bias> logit_bias; // logit biases to apply

@@ -203,6 +214,11 @@ struct common_params_vocoder {
      bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
  };

+ enum common_reasoning_format {
+     COMMON_REASONING_FORMAT_NONE,
+     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+ };
+
  struct common_params {

      void * progress_callback_user_data = nullptr;
@@ -297,6 +313,7 @@ struct common_params {
      bool kl_divergence = false; // compute KL divergence

      bool usage = false; // print usage
+     bool completion = false; // print source-able completion script
      bool use_color = false; // use color to distinguish generations and inputs
      bool special = false; // enable special token output
      bool interactive = false; // interactive mode
@@ -349,7 +366,9 @@ struct common_params {
      std::string hostname = "127.0.0.1";
      std::string public_path = ""; // NOLINT
      std::string chat_template = ""; // NOLINT
+     bool use_jinja = false; // NOLINT
      bool enable_chat_template = true;
+     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;

      std::vector<std::string> api_keys;

@@ -428,13 +447,13 @@ bool set_process_priority(enum lm_ggml_sched_priority prio);
  //

  #ifdef __GNUC__
- #ifdef __MINGW32__
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #    if defined(__MINGW32__) && !defined(__clang__)
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #    else
+ #        define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #    endif
  #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
- #endif
- #else
- #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #    define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
  #endif

  LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
@@ -443,6 +462,10 @@ std::string string_format(const char * fmt, ...);
  std::string string_strip(const std::string & str);
  std::string string_get_sortable_timestamp();

+ std::string string_join(const std::vector<std::string> & values, const std::string & separator);
+ std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
+ std::string string_repeat(const std::string & str, size_t n);
+
  void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

  template<class T>
@@ -534,6 +557,7 @@ struct llama_model * common_load_model_from_hf(
      const std::string & local_path,
      const std::string & hf_token,
      const struct llama_model_params & params);
+
  std::pair<std::string, std::string> common_get_hf_file(
      const std::string & hf_repo_with_tag,
      const std::string & hf_token);
@@ -615,36 +639,57 @@ std::string common_detokenize(
  // Chat template utils
  //

+ struct common_tool_call {
+     std::string name;
+     std::string arguments;
+     std::string id;
+ };
+
  // same with llama_chat_message, but uses std::string
  struct common_chat_msg {
      std::string role;
      std::string content;
+     std::vector<common_tool_call> tool_calls;
+     std::string reasoning_content = "";
  };

- // Get the built-in chat template for the model. Return empty string if not present.
- std::string common_get_builtin_chat_template(const struct llama_model * model);
-
  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool common_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+
+ namespace minja {
+     class chat_template;
+ }
+
+ typedef minja::chat_template common_chat_template;
+
+ struct common_chat_templates {
+     bool has_explicit_template; // Model had builtin template or template overridde was specified.
+     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+     std::unique_ptr<common_chat_template> template_tool_use;
+ };

  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string common_chat_apply_template(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_apply_template(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & chat,
-         bool add_ass);
+         bool add_ass,
+         bool use_jinja);

  // Format single message, while taking into account the position of that message in chat history
- std::string common_chat_format_single(const struct llama_model * model,
-         const std::string & tmpl,
+ std::string common_chat_format_single(
+         const common_chat_template & tmpl,
          const std::vector<common_chat_msg> & past_msg,
          const common_chat_msg & new_msg,
-         bool add_ass);
+         bool add_ass,
+         bool use_jinja);

  // Returns an example of formatted chat
- std::string common_chat_format_example(const struct llama_model * model,
-         const std::string & tmpl);
+ std::string common_chat_format_example(
+         const common_chat_template & tmpl, bool use_jinja);
+
+ common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);

  //
  // KV cache utils
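An illustrative sketch of how a caller might populate the new lazy-grammar and tool-call fields declared above (placeholder values, not code from the package):

```cpp
// Lazy grammar: only constrain sampling once a trigger word has been produced.
static void configure_lazy_grammar(common_params_sampling & sparams) {
    sparams.grammar      = "root ::= \"yes\" | \"no\"";  // toy grammar; normally produced by common_chat_params_init()
    sparams.grammar_lazy = true;
    sparams.grammar_trigger_words.push_back({ /* word */ "<tool_call>", /* at_start */ false });
}

// A chat message carrying a tool call, of the kind common_chat_parse() fills in.
static common_chat_msg make_tool_call_msg() {
    common_chat_msg msg;
    msg.role = "assistant";
    msg.tool_calls.push_back({ /* name */ "get_weather",
                               /* arguments */ "{\"location\":\"Paris\"}",
                               /* id */ "call_0" });
    return msg;
}
```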
package/cpp/ggml-alloc.c CHANGED
@@ -989,19 +989,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
              this_size = LM_GGML_PAD(lm_ggml_backend_buft_get_alloc_size(buft, t), alignment);
          }

-         if (this_size > max_size) {
-             LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                     __func__, t->name,
-                     lm_ggml_backend_buft_name(buft),
-                     this_size, max_size);
-             for (size_t i = 0; i < n_buffers; i++) {
-                 lm_ggml_backend_buffer_free(buffers[i]);
-             }
-             free(buffers);
-             return NULL;
-         }
-
-         if ((cur_buf_size + this_size) > max_size) {
+         if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) {
              // allocate tensors in the current buffer
              if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                  return NULL;
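The effect of this change, reduced to plain sizes (an illustrative sketch, not the ggml code): a tensor larger than max_size no longer aborts allocation up front; it now starts a buffer of its own, and the subsequent backend allocation call is presumably left to succeed or fail on its own.

```cpp
// Toy model of the revised grouping rule in lm_ggml_backend_alloc_ctx_tensors_from_buft.
#include <cstddef>
#include <vector>

static std::vector<std::vector<size_t>> group_into_buffers(const std::vector<size_t> & sizes,
                                                           size_t max_size) {
    std::vector<std::vector<size_t>> buffers(1);
    size_t cur = 0;
    for (size_t s : sizes) {
        // New condition: only flush when the current buffer is non-empty,
        // so an oversized tensor is allowed to occupy a buffer alone.
        if (cur > 0 && cur + s > max_size) {
            buffers.emplace_back();
            cur = 0;
        }
        buffers.back().push_back(s);
        cur += s;
    }
    return buffers;
}
```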
package/cpp/ggml-common.h CHANGED
@@ -473,7 +473,6 @@ LM_GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
      240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
  LM_GGML_TABLE_END()

- //#if __CUDA_ARCH__ >= LM_GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
  LM_GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
      0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
      0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
@@ -508,7 +507,6 @@ LM_GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
      0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
      0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
  LM_GGML_TABLE_END()
- //#endif


  LM_GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
package/cpp/ggml-cpu-impl.h CHANGED
@@ -360,21 +360,15 @@ inline static int32x4_t lm_ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t
  #endif

  #if defined(__loongarch_asx)
-
- typedef union {
-     int32_t i;
-     float f;
- } ft_union;
-
  /* float type data load instructions */
- static __m128 __lsx_vreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+ static __m128 __lsx_vreplfr2vr_s(const float val) {
+     v4f32 res = {val, val, val, val};
+     return (__m128)res;
  }

- static __m256 __lasx_xvreplfr2vr_s(float val) {
-     ft_union fi_tmpval = {.f = val};
-     return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+ static __m256 __lasx_xvreplfr2vr_s(const float val) {
+     v8f32 res = {val, val, val, val, val, val, val, val};
+     return (__m256)res;
  }
  #endif
