npm - @novastera-oss/llamarn - Versions diffs - 0.2.5 → 0.2.6 - Mend

@novastera-oss/llamarn 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (123)

package/cpp/rn-completion.cpp CHANGED Viewed

@@ -152,6 +152,16 @@ CompletionResult run_completion(
         common_params_sampling sampling_params = params.sampling;
         if (!options.grammar.empty()) {
             sampling_params.grammar = options.grammar;
+            // Force grammar_lazy to false whenever tools are present to ensure strict JSON format enforcement
+            if (!options.tools.empty()) {
+                sampling_params.grammar_lazy = false;
+            } else {
+                sampling_params.grammar_lazy = options.grammar_lazy;
+            }
+            // Pass grammar_triggers if any were provided by chat_params and passed via options
+            if (!options.grammar_triggers.empty()) {
+                sampling_params.grammar_triggers = options.grammar_triggers;
+            }
         }
         // Parse tool_choice
@@ -228,9 +238,9 @@ CompletionResult run_completion(
                 return result;
             }
-            // Only accept tokens during prompt processing if no grammar is present
-            // Grammar-based sampling needs to start fresh from the generation phase
-            if (sampling_params.grammar.empty()) {
+            // For lazy grammars, we need to accept prompt tokens to properly set up the grammar state
+            // For non-lazy grammars, we only accept if no grammar is present (grammar needs clean state)
+            if (sampling_params.grammar.empty() || sampling_params.grammar_lazy) {
                 common_sampler_accept(state.sampler, token, true);
             }
             state.n_past++;
@@ -238,6 +248,12 @@ CompletionResult run_completion(
         result.n_prompt_tokens = state.prompt_tokens.size();
+        // If using a non-lazy grammar, ensure the sampler is in a clean state for the grammar
+        if (!sampling_params.grammar.empty() && !sampling_params.grammar_lazy) {
+            common_sampler_free(state.sampler);
+            state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
+        }
         // Start generating tokens
         const int64_t t_start_generation = ggml_time_us();
@@ -338,6 +354,15 @@ CompletionResult run_chat_completion(
     std::function<bool(const std::string&, bool)> callback) {
     CompletionResult result;
+    // Log incoming tools via callback
+    /*
+    if (callback) {
+        std::string tools_json_str = options.tools.dump(2);
+        std::string debug_msg = "[DEBUG RN_COMPLETION_OPTIONS_TOOLS] options.tools JSON: " + tools_json_str;
+        callback(debug_msg, false); // false for is_done
+    }
+    */
+    completion_state state;
     if (!rn_ctx || !rn_ctx->model || !rn_ctx->ctx) {
         result.success = false;
@@ -372,9 +397,9 @@ CompletionResult run_chat_completion(
         // Parse tools if present
         if (data.contains("tools") && !data["tools"].empty()) {
             template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
-            // Check if parallel tool calls are allowed (advanced feature)
-            template_inputs.parallel_tool_calls = data.contains("parallel_tool_calls") ?
-                json_value(data, "parallel_tool_calls", false) : false;
+            // Force parallel_tool_calls to true if tools are present, as this generally
+            // aligns with grammars expecting a list of tool calls.
+            template_inputs.parallel_tool_calls = true;
         }
         // Parse tool_choice if present
@@ -388,13 +413,42 @@ CompletionResult run_chat_completion(
         // Apply template
         const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
-        // Set up completion options
         CompletionOptions cmpl_options = options;
         cmpl_options.prompt = chat_params.prompt;
-        // Apply grammar if needed
         if (!chat_params.grammar.empty()) {
             cmpl_options.grammar = chat_params.grammar;
+            // Always force grammar_lazy to false when tools are present
+            if (!template_inputs.tools.empty()) {
+                cmpl_options.grammar_lazy = false;
+            } else {
+                // Only use chat_params.grammar_lazy if no tools are present
+                cmpl_options.grammar_lazy = chat_params.grammar_lazy;
+            }
+            // Default to grammar_triggers provided by chat_params
+            cmpl_options.grammar_triggers = chat_params.grammar_triggers;
+            bool original_grammar_lazy = chat_params.grammar_lazy; // Store original for logging
+            // Add a debug log to observe final grammar_lazy and grammar_triggers
+            /*
+            if (callback) {
+                std::string tool_choice_str;
+                switch (template_inputs.tool_choice) {
+                    case COMMON_CHAT_TOOL_CHOICE_AUTO: tool_choice_str = "auto"; break;
+                    case COMMON_CHAT_TOOL_CHOICE_NONE: tool_choice_str = "none"; break;
+                    case COMMON_CHAT_TOOL_CHOICE_REQUIRED: tool_choice_str = "required"; break;
+                    default: tool_choice_str = "unknown"; break;
+                }
+                std::string debug_msg = "[DEBUG CHAT_PARAMS] grammar_lazy: " +
+                                      std::string(cmpl_options.grammar_lazy ? "true" : "false") +
+                                      " | grammar_triggers_count: " + std::to_string(cmpl_options.grammar_triggers.size()) + // Log triggers from cmpl_options
+                                      " | For Tool Choice: " + tool_choice_str +
+                                      " | Parallel Tool Calls: " + std::string(template_inputs.parallel_tool_calls ? "true" : "false") +
+                                      " | Original chat_params.grammar_lazy: " + std::string(original_grammar_lazy ? "true" : "false"); // Log original lazy
+                callback(debug_msg, false);
+            }
+            */
         }
         // Run standard completion with the processed prompt
@@ -481,3 +535,4 @@ CompletionResult run_chat_completion(
 } // namespace facebook::react

package/cpp/rn-utils.hpp CHANGED Viewed

@@ -10,7 +10,7 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include "nlohmann/json.hpp"
 #include "base64.hpp"
 #include "chat.h"
@@ -66,6 +66,7 @@ struct CompletionOptions {
     int seed = -1;
     json tools;         // tools for function calling
     std::string tool_choice = "auto"; // tool choice mode: "auto", "none", or "required"
+    std::vector<common_grammar_trigger> grammar_triggers; // For lazy grammar
     // Convert to JSON for the completion API
     json to_json() const {
@@ -98,6 +99,12 @@ struct CompletionOptions {
             j["tools"] = tools;
             j["tool_choice"] = tool_choice;
         }
+        // Add grammar_triggers if available (mainly for internal use, not direct API option)
+        if (!grammar_triggers.empty()) {
+            // This part is tricky as json can't directly hold common_grammar_trigger easily.
+            // For now, we'll skip adding it to the generic to_json() as it's passed internally.
+            // If it were needed for an API, we'd need a proper serialization for grammar_triggers.
+        }
         return j;
     }

package/ios/include/common/minja/chat-template.hpp CHANGED Viewed

@@ -22,7 +22,7 @@
 #include <string>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

package/ios/include/common/minja/minja.hpp CHANGED Viewed

@@ -29,7 +29,7 @@
 #include <utility>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

package/ios/include/json-schema-to-grammar.h CHANGED Viewed

@@ -1,9 +1,9 @@
 #pragma once
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+#include <functional>
+#include <string>
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);

package/ios/include/llama.h CHANGED Viewed

@@ -259,9 +259,9 @@ extern "C" {
         llama_token  *  token;
         float        *  embd;
         llama_pos    *  pos;
-        int32_t      *  n_seq_id;
-        llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int32_t      *  n_seq_id; // TODO: remove, should belong to only 1 sequence
+        llama_seq_id ** seq_id;   // TODO: become llama_seq_id * seq_id;
+        int8_t       *  logits;   // TODO: rename this to "output"
     } llama_batch;
     enum llama_model_kv_override_type {
@@ -366,6 +366,8 @@ extern "C" {
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
     // model quantization parameters
@@ -502,6 +504,7 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
@@ -652,7 +655,6 @@ extern "C" {
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_self_seq_add(
@@ -665,7 +667,6 @@ extern "C" {
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
     LLAMA_API void llama_kv_self_seq_div(
@@ -677,12 +678,14 @@ extern "C" {
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
     LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
@@ -691,14 +694,15 @@ extern "C" {
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
     // Check if the context supports KV cache shifting
     LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
     //
     // State / sessions