npm - @novastera-oss/llamarn - Versions diffs - 0.2.1 → 0.2.3 - Mend

@novastera-oss/llamarn 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (268) hide show

package/cpp/llama.cpp/src/llama-model.h CHANGED Viewed

@@ -96,6 +96,8 @@ enum llm_type {
     LLM_TYPE_235B_A22B,
 };
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
 struct llama_layer_posnet {
     // resnet
     struct ggml_tensor * norm1   = nullptr;
@@ -396,7 +398,10 @@ struct llama_model {
     const struct ggml_tensor * get_tensor(const char * name) const;
-    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
+    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
+    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
     // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface

package/cpp/llama.cpp/src/llama-quant.cpp CHANGED Viewed

@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
         {}
 };
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 // unless the user specifies a type
                 if (params->tensor_types) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
-                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                            if (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                            if  (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
+                                break; // if two or more types are specified for the tensor, first match wins
                             }
-                            new_type = qtype;
-                            break;
                         }
                     }
                 }
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
             }

package/cpp/llama.cpp/src/llama-sampling.cpp CHANGED Viewed

@@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
         }
         // if we have enough values the operation was a success
-        if (filtered_tokens.size() >= ctx->min_keep) {
+        if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
             memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
             cur_p->size = filtered_tokens.size();
             min_p_applied = true;
@@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
         cum_sum += cur_p->data[idx].p;
         // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
+        if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
             last_idx = i + 1;
             break;
         }

package/cpp/llama.cpp/src/llama-vocab.cpp CHANGED Viewed

@@ -1,5 +1,7 @@
 #include "llama-vocab.h"
+#include "ggml.h"
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
@@ -833,7 +835,7 @@ struct llm_tokenizer_ugm_session {
         }
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
         // at the beginning tokenization score is zero
         tokenization_results[0] = { vocab.token_unk(), 0, 0 };
@@ -865,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                     const double challenger_score = current_best.score_sum + token_score;
                     struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                     if (challenger_score > current_champ.score_sum) {
-                        struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                        struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                         current_champ = challenger;
                     }
                 }
@@ -879,7 +881,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -1005,7 +1007,7 @@ private:
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
-        float score_sum;
+        double score_sum;
     };
     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
@@ -1234,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
+    std::string tokenizer_model;
+    std::string tokenizer_pre;
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
@@ -1369,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     // determine vocab type
     {
-        std::string tokenizer_model;
-        std::string tokenizer_pre;
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);
@@ -1466,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
             if (precompiled_charsmap_keyidx != -1) {
-                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+                GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+                const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                 const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                 precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -2789,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
+std::string llama_vocab::get_tokenizer_model() const {
+    return pimpl->tokenizer_model;
+}
+std::string llama_vocab::get_tokenizer_pre() const {
+    return pimpl->tokenizer_pre;
+}
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -3011,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+    return result;
+}
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
 int32_t llama_vocab::tokenize(
                   const char * text,
                      int32_t   text_len,

package/cpp/llama.cpp/src/llama-vocab.h CHANGED Viewed

@@ -21,6 +21,9 @@ struct llama_vocab {
     void load(llama_model_loader & ml, const LLM_KV & kv);
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
     enum llama_vocab_type     get_type()     const;
     enum llama_vocab_pre_type get_pre_type() const;
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+    std::vector<char> get_precompiled_charsmap() const;
     int32_t tokenize(
                    const char * text,

package/cpp/llama.cpp/src/llama.cpp CHANGED Viewed

@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"
 #include "ggml.h"
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
 //
 // chat templates
 //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }

package/cpp/rn-completion.cpp CHANGED Viewed

@@ -298,7 +298,8 @@ CompletionResult run_completion(
         }
         const int64_t t_end_generation = ggml_time_us();
-        const double generation_time_ms = (t_end_generation - t_start_generation) / 1000.0;
+        // Note: keeping generation_time_ms for future timing measurements
+        // const double generation_time_ms = (t_end_generation - t_start_generation) / 1000.0;
         // Set the result
         result.content = state.generated_text;
@@ -349,8 +350,9 @@ CompletionResult run_chat_completion(
         common_chat_templates_inputs template_inputs;
         template_inputs.messages = chat_msgs;
         template_inputs.add_generation_prompt = true;
-        template_inputs.use_jinja = options.use_jinja;
-        template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
+        template_inputs.use_jinja = rn_ctx->params.use_jinja;
+        // Note: extract_reasoning field doesn't exist in current llama.cpp version
+        // template_inputs.extract_reasoning = true; // Default to true to extract reasoning content if available
         // Add grammar if present in options
         if (!options.grammar.empty()) {
@@ -389,6 +391,31 @@ CompletionResult run_chat_completion(
         result = run_completion(rn_ctx, cmpl_options, callback);
         if (result.success) {
+            // Parse the generated content for tool calls and structured responses
+            common_chat_msg parsed_msg;
+            bool has_parsed_content = false;
+            // Only parse if we have tools available and the response isn't empty
+            if (!template_inputs.tools.empty() && !result.content.empty()) {
+                try {
+                    // Construct the chat syntax for parsing using the format from template application
+                    common_chat_syntax syntax;
+                    syntax.format = chat_params.format;  // Use format from template, not from params
+                    syntax.reasoning_format = rn_ctx->params.reasoning_format;
+                    syntax.reasoning_in_content = true;
+                    syntax.thinking_forced_open = false;
+                    syntax.parse_tool_calls = true;
+                    // Parse the generated content for tool calls
+                    parsed_msg = common_chat_parse(result.content, false, syntax);
+                    has_parsed_content = true;
+                } catch (const std::exception& e) {
+                    // If parsing fails, treat as regular content
+                    has_parsed_content = false;
+                }
+            }
             // Create OpenAI-compatible response
             json response = {
                 {"id", gen_chatcmplid()},
@@ -401,11 +428,39 @@ CompletionResult run_chat_completion(
             json choice = {
                 {"index", 0},
                 {"message", {
-                    {"role", "assistant"},
-                    {"content", result.content}
+                    {"role", "assistant"}
                 }},
                 {"finish_reason", "stop"}
             };
+            // Add parsed content and tool calls if available
+            if (has_parsed_content && !parsed_msg.tool_calls.empty()) {
+                // Set content to the parsed content (may be null for tool-only responses)
+                if (!parsed_msg.content.empty()) {
+                    choice["message"]["content"] = parsed_msg.content;
+                } else {
+                    choice["message"]["content"] = nullptr;
+                }
+                // Add tool calls to the message
+                json tool_calls = json::array();
+                for (const auto& tool_call : parsed_msg.tool_calls) {
+                    json tc = {
+                        {"id", tool_call.id.empty() ? ("call_" + std::to_string(std::rand())) : tool_call.id},
+                        {"type", "function"},
+                        {"function", {
+                            {"name", tool_call.name},
+                            {"arguments", tool_call.arguments}
+                        }}
+                    };
+                    tool_calls.push_back(tc);
+                }
+                choice["message"]["tool_calls"] = tool_calls;
+                choice["finish_reason"] = "tool_calls";
+            } else {
+                // Regular text response
+                choice["message"]["content"] = has_parsed_content ? parsed_msg.content : result.content;
+            }
             choices.push_back(choice);
             response["choices"] = choices;

package/ios/include/chat.h CHANGED Viewed

@@ -3,6 +3,8 @@
 #pragma once
 #include "common.h"
+#include <functional>
+#include <chrono>
 #include <string>
 #include <vector>
@@ -12,11 +14,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };
 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };
 struct common_chat_msg {
@@ -27,6 +37,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+    template <class T> T to_json_oaicompat() const;
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+struct common_chat_msg_diff {
+    // std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+        && tool_call_index == other.tool_call_index
+        && tool_call_delta == other.tool_call_delta;
+    }
 };
 struct common_chat_tool {
@@ -48,14 +103,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -70,7 +122,9 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    bool extract_reasoning     = true;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
+    std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 struct common_chat_params {
@@ -78,11 +132,21 @@ struct common_chat_params {
     std::string                         prompt;
     std::string                         grammar;
     bool                                grammar_lazy = false;
+    bool                                thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string>            preserved_tokens;
     std::vector<std::string>            additional_stops;
 };
+struct common_chat_syntax {
+    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool                     reasoning_in_content  = false;
+    bool                     thinking_forced_open  = false;
+    bool                     parse_tool_calls      = true;
+};
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
@@ -119,8 +183,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);
-std::string               common_chat_format_name(common_chat_format format);
-common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
+const char*               common_chat_format_name(common_chat_format format);
+const char*               common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -133,3 +198,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);

package/ios/include/common/minja/chat-template.hpp CHANGED Viewed

@@ -13,10 +13,12 @@
 #include <chrono>
 #include <cstddef>
 #include <cstdio>
+#include <ctime>
 #include <exception>
 #include <iomanip>
 #include <memory>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <vector>
@@ -393,8 +395,8 @@ class chat_template {
             for (const auto & message_ : adjusted_messages) {
                 auto message = message_;
-                if (!message.contains("role") || !message.contains("content")) {
-                    throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
+                if (!message.contains("role") || (!message.contains("content") && !message.contains("tool_calls"))) {
+                    throw std::runtime_error("message must have 'role' and one of 'content' or 'tool_calls' fields: " + message.dump());
                 }
                 std::string role = message.at("role");
@@ -415,7 +417,6 @@ class chat_template {
                         }
                     }
                     if (polyfill_tool_calls) {
-                        auto content = message.at("content");
                         auto tool_calls = json::array();
                         for (const auto & tool_call : message.at("tool_calls")) {
                             if (tool_call.at("type") != "function") {
@@ -434,8 +435,11 @@ class chat_template {
                         auto obj = json {
                             {"tool_calls", tool_calls},
                         };
-                        if (!content.is_null() && !content.empty()) {
-                            obj["content"] = content;
+                        if (message.contains("content")) {
+                            auto content = message.at("content");
+                            if (!content.is_null() && !content.empty()) {
+                                obj["content"] = content;
+                            }
                         }
                         message["content"] = obj.dump(2);
                         message.erase("tool_calls");