npm - @novastera-oss/llamarn - Versions diffs - 0.2.5 → 0.2.7 - Mend

@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (225)

package/cpp/rn-completion.cpp CHANGED

@@ -1,4 +1,4 @@
-#include "rn-llama.hpp"
+#include "rn-llama.h"
 // Suppress unused function warnings from llama.cpp headers
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-function"
@@ -7,7 +7,7 @@
 #include "llama.h"
 #include "sampling.h"
 #pragma GCC diagnostic pop
-#include "rn-utils.hpp"
+#include "rn-utils.h"
 #include <string>
 #include <vector>
@@ -152,6 +152,16 @@ CompletionResult run_completion(
         common_params_sampling sampling_params = params.sampling;
         if (!options.grammar.empty()) {
             sampling_params.grammar = options.grammar;
+            // Force grammar_lazy to false whenever tools are present to ensure strict JSON format enforcement
+            if (!options.tools.empty()) {
+                sampling_params.grammar_lazy = false;
+            } else {
+                sampling_params.grammar_lazy = options.grammar_lazy;
+            }
+            // Pass grammar_triggers if any were provided by chat_params and passed via options
+            if (!options.grammar_triggers.empty()) {
+                sampling_params.grammar_triggers = options.grammar_triggers;
+            }
         }
         // Parse tool_choice
@@ -228,9 +238,9 @@ CompletionResult run_completion(
                 return result;
             }
-            // Only accept tokens during prompt processing if no grammar is present
-            // Grammar-based sampling needs to start fresh from the generation phase
-            if (sampling_params.grammar.empty()) {
+            // For lazy grammars, we need to accept prompt tokens to properly set up the grammar state
+            // For non-lazy grammars, we only accept if no grammar is present (grammar needs clean state)
+            if (sampling_params.grammar.empty() || sampling_params.grammar_lazy) {
                 common_sampler_accept(state.sampler, token, true);
             }
             state.n_past++;
@@ -238,6 +248,12 @@ CompletionResult run_completion(
         result.n_prompt_tokens = state.prompt_tokens.size();
+        // If using a non-lazy grammar, ensure the sampler is in a clean state for the grammar
+        if (!sampling_params.grammar.empty() && !sampling_params.grammar_lazy) {
+            common_sampler_free(state.sampler);
+            state.sampler = common_sampler_init(rn_ctx->model, sampling_params);
+        }
         // Start generating tokens
         const int64_t t_start_generation = ggml_time_us();
@@ -338,6 +354,15 @@ CompletionResult run_chat_completion(
     std::function<bool(const std::string&, bool)> callback) {
     CompletionResult result;
+    // Log incoming tools via callback
+    /*
+    if (callback) {
+        std::string tools_json_str = options.tools.dump(2);
+        std::string debug_msg = "[DEBUG RN_COMPLETION_OPTIONS_TOOLS] options.tools JSON: " + tools_json_str;
+        callback(debug_msg, false); // false for is_done
+    }
+    */
+    completion_state state;
     if (!rn_ctx || !rn_ctx->model || !rn_ctx->ctx) {
         result.success = false;
@@ -372,9 +397,9 @@ CompletionResult run_chat_completion(
         // Parse tools if present
         if (data.contains("tools") && !data["tools"].empty()) {
             template_inputs.tools = common_chat_tools_parse_oaicompat(data["tools"]);
-            // Check if parallel tool calls are allowed (advanced feature)
-            template_inputs.parallel_tool_calls = data.contains("parallel_tool_calls") ?
-                json_value(data, "parallel_tool_calls", false) : false;
+            // Force parallel_tool_calls to true if tools are present, as this generally
+            // aligns with grammars expecting a list of tool calls.
+            template_inputs.parallel_tool_calls = true;
         }
         // Parse tool_choice if present
@@ -388,13 +413,42 @@ CompletionResult run_chat_completion(
         // Apply template
         const auto& chat_params = common_chat_templates_apply(rn_ctx->chat_templates.get(), template_inputs);
-        // Set up completion options
         CompletionOptions cmpl_options = options;
         cmpl_options.prompt = chat_params.prompt;
-        // Apply grammar if needed
         if (!chat_params.grammar.empty()) {
             cmpl_options.grammar = chat_params.grammar;
+            // Always force grammar_lazy to false when tools are present
+            if (!template_inputs.tools.empty()) {
+                cmpl_options.grammar_lazy = false;
+            } else {
+                // Only use chat_params.grammar_lazy if no tools are present
+                cmpl_options.grammar_lazy = chat_params.grammar_lazy;
+            }
+            // Default to grammar_triggers provided by chat_params
+            cmpl_options.grammar_triggers = chat_params.grammar_triggers;
+            bool original_grammar_lazy = chat_params.grammar_lazy; // Store original for logging
+            // Add a debug log to observe final grammar_lazy and grammar_triggers
+            /*
+            if (callback) {
+                std::string tool_choice_str;
+                switch (template_inputs.tool_choice) {
+                    case COMMON_CHAT_TOOL_CHOICE_AUTO: tool_choice_str = "auto"; break;
+                    case COMMON_CHAT_TOOL_CHOICE_NONE: tool_choice_str = "none"; break;
+                    case COMMON_CHAT_TOOL_CHOICE_REQUIRED: tool_choice_str = "required"; break;
+                    default: tool_choice_str = "unknown"; break;
+                }
+                std::string debug_msg = "[DEBUG CHAT_PARAMS] grammar_lazy: " +
+                                      std::string(cmpl_options.grammar_lazy ? "true" : "false") +
+                                      " | grammar_triggers_count: " + std::to_string(cmpl_options.grammar_triggers.size()) + // Log triggers from cmpl_options
+                                      " | For Tool Choice: " + tool_choice_str +
+                                      " | Parallel Tool Calls: " + std::string(template_inputs.parallel_tool_calls ? "true" : "false") +
+                                      " | Original chat_params.grammar_lazy: " + std::string(original_grammar_lazy ? "true" : "false"); // Log original lazy
+                callback(debug_msg, false);
+            }
+            */
         }
         // Run standard completion with the processed prompt
@@ -481,3 +535,4 @@ CompletionResult run_chat_completion(
 } // namespace facebook::react

package/cpp/{rn-llama.hpp → rn-llama.h} RENAMED

@@ -10,7 +10,7 @@
 #include "json-schema-to-grammar.h"
 #pragma GCC diagnostic pop
-#include "rn-utils.hpp"
+#include "rn-utils.h"
 #include <functional>
 #include <mutex>

package/cpp/{rn-utils.hpp → rn-utils.h} RENAMED

@@ -10,7 +10,7 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include "nlohmann/json.hpp"
 #include "base64.hpp"
 #include "chat.h"
@@ -66,6 +66,7 @@ struct CompletionOptions {
     int seed = -1;
     json tools;         // tools for function calling
     std::string tool_choice = "auto"; // tool choice mode: "auto", "none", or "required"
+    std::vector<common_grammar_trigger> grammar_triggers; // For lazy grammar
     // Convert to JSON for the completion API
     json to_json() const {
@@ -98,6 +99,12 @@ struct CompletionOptions {
             j["tools"] = tools;
             j["tool_choice"] = tool_choice;
         }
+        // Add grammar_triggers if available (mainly for internal use, not direct API option)
+        if (!grammar_triggers.empty()) {
+            // This part is tricky as json can't directly hold common_grammar_trigger easily.
+            // For now, we'll skip adding it to the generic to_json() as it's passed internally.
+            // If it were needed for an API, we'd need a proper serialization for grammar_triggers.
+        }
         return j;
     }

package/ios/include/chat.h CHANGED

@@ -70,7 +70,7 @@ struct common_chat_msg {
 };
 struct common_chat_msg_diff {
-    // std::string reasoning_content_delta;
+    std::string reasoning_content_delta;
     std::string content_delta;
     size_t tool_call_index = std::string::npos;
     common_chat_tool_call tool_call_delta;

package/ios/include/common/minja/chat-template.hpp CHANGED

@@ -22,7 +22,7 @@
 #include <string>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

package/ios/include/common/minja/minja.hpp CHANGED

@@ -29,7 +29,7 @@
 #include <utility>
 #include <vector>
-#include <json.hpp>
+#include <nlohmann/json.hpp>
 using json = nlohmann::ordered_json;

package/ios/include/common.h CHANGED

@@ -199,6 +199,9 @@ struct common_params_speculative {
     float   p_split      =  0.1f; // speculative decoding split probability
     float   p_min        = 0.75f; // minimum speculative decoding probability (greedy)
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -215,7 +218,8 @@ struct common_params_vocoder {
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 struct common_params {
@@ -354,7 +358,6 @@ struct common_params {
     int32_t embd_normalize = 2;     // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out   = "";    // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep   = "\n";  // separator of embeddings
-    bool reranking         = false; // enable reranking support on server
     // server params
     int32_t port           = 8080;         // server listens on this network port

package/ios/include/json-schema-to-grammar.h CHANGED

@@ -1,9 +1,9 @@
 #pragma once
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
+#include <nlohmann/json_fwd.hpp>
+#include <functional>
+#include <string>
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
                                    bool force_gbnf = false);

package/ios/include/llama.h CHANGED

@@ -61,7 +61,10 @@ extern "C" {
     struct llama_model;
     struct llama_context;
     struct llama_sampler;
-    struct llama_kv_cache;
+    typedef struct llama_memory_i * llama_memory_t;
+    struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
     typedef bool (*llama_progress_callback)(float progress, void * user_data);
-    // Input data for llama_decode
+    // Input data for llama_encode/llama_decode
     // A llama_batch object can contain input about one or many sequences
     // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
     //
     // - token  : the token ids of the input (used when embd is NULL)
     // - embd   : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
     // - pos    : the positions of the respective token in the sequence
-    //            (if set to NULL, the token position will be tracked automatically by llama_decode)
+    //            (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
     // - seq_id : the sequence to which the respective token belongs
     //            (if set to NULL, the sequence ID will be assumed to be 0)
     // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
-    //            (if set to NULL, only the logits for last token will be returned)
+    //            (if set to NULL:
+    //               - if embeddings: all tokens are output
+    //               - if not:        only the last token is output
+    //            )
     //
     typedef struct llama_batch {
         int32_t n_tokens;
@@ -261,7 +267,7 @@ extern "C" {
         llama_pos    *  pos;
         int32_t      *  n_seq_id;
         llama_seq_id ** seq_id;
-        int8_t       *  logits; // TODO: rename this to "output"
+        int8_t       *  logits;   // TODO: rename this to "output"
     } llama_batch;
     enum llama_model_kv_override_type {
@@ -366,6 +372,8 @@ extern "C" {
         bool no_perf;     // measure performance timings
         bool op_offload;  // offload host tensor operations to device
         bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
+                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
     };
     // model quantization parameters
@@ -491,9 +499,11 @@ extern "C" {
     DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
     LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
+    LLAMA_API           llama_memory_t   llama_get_memory  (const struct llama_context * ctx);
     LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
+    DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
     LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
     LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@@ -502,10 +512,18 @@ extern "C" {
     LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);
     LLAMA_API int32_t llama_model_n_head_kv  (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_swa      (const struct llama_model * model);
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+    // Returns the number of classifier outputs (only valid for classifier models)
+    // Undefined behavior for non-classifier models
+    LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
+    // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
+    LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
     LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
     LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
@@ -606,7 +624,81 @@ extern "C" {
                          int32_t   il_end);
     //
-    // KV cache
+    // Memory
+    //
+    // Clear the memory contents
+    // If data == true, the data buffers will also be cleared together with the metadata
+    LLAMA_API void llama_memory_clear(
+            llama_memory_t mem,
+                      bool data);
+    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
+    // seq_id < 0 : match any sequence
+    // p0 < 0     : [0,  p1]
+    // p1 < 0     : [p0, inf)
+    LLAMA_API bool llama_memory_seq_rm(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1);
+    // Copy all tokens that belong to the specified sequence to another sequence
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_cp(
+            llama_memory_t mem,
+              llama_seq_id seq_id_src,
+              llama_seq_id seq_id_dst,
+                 llama_pos p0,
+                 llama_pos p1);
+    // Removes all tokens that do not belong to the specified sequence
+    LLAMA_API void llama_memory_seq_keep(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_add(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                 llama_pos delta);
+    // Integer division of the positions by factor of `d > 1`
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_memory_seq_div(
+            llama_memory_t mem,
+              llama_seq_id seq_id,
+                 llama_pos p0,
+                 llama_pos p1,
+                       int d);
+    // Returns the smallest position present in the memory for the specified sequence
+    // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_min(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+    // Returns the largest position present in the memory for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
+    // Return -1 if the sequence is empty
+    LLAMA_API llama_pos llama_memory_seq_pos_max(
+            llama_memory_t mem,
+              llama_seq_id seq_id);
+    // Check if the memory supports shifting
+    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
+    //
+    // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
     //
     // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -619,93 +711,103 @@ extern "C" {
                "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
     // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_self_clear(
-            struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_clear(
+                struct llama_context * ctx),
+            "Use llama_memory_clear() instead");
     // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
     // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
     // seq_id < 0 : match any sequence
     // p0 < 0     : [0,  p1]
     // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_self_seq_rm(
+    DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
-                       llama_pos   p1);
+                       llama_pos   p1),
+            "Use llama_memory_seq_rm() instead");
     // Copy all tokens that belong to the specified sequence to another sequence
     // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_cp(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
             struct llama_context * ctx,
                     llama_seq_id   seq_id_src,
                     llama_seq_id   seq_id_dst,
                        llama_pos   p0,
-                       llama_pos   p1);
+                       llama_pos   p1),
+            "Use llama_memory_seq_cp() instead");
     // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_self_seq_keep(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_keep() instead");
     // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_add(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
                        llama_pos   p1,
-                       llama_pos   delta);
+                       llama_pos   delta),
+            "Use llama_memory_seq_add() instead");
     // Integer division of the positions by factor of `d > 1`
     // If the KV cache is RoPEd, the KV data is updated accordingly:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
     // p0 < 0 : [0,  p1]
     // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_self_seq_div(
+    DEPRECATED(void llama_kv_self_seq_div(
             struct llama_context * ctx,
                     llama_seq_id   seq_id,
                        llama_pos   p0,
                        llama_pos   p1,
-                             int   d);
+                             int   d),
+            "Use llama_memory_seq_div() instead");
     // Returns the smallest position present in the KV cache for the specified sequence
     // This is typically non-zero only for SWA caches
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_min(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_pos_min() instead");
     // Returns the largest position present in the KV cache for the specified sequence
+    // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
     // Return -1 if the sequence is empty
-    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
+    DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
             struct llama_context * ctx,
-                    llama_seq_id   seq_id);
+                    llama_seq_id   seq_id),
+            "Use llama_memory_seq_pos_max() instead");
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
+            "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
     // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
+    DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
+            "use llama_memory_can_shift() instead");
     // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
-    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
+    DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
+            "simply remove this call, updates are applied lazily on the next llama_decode()");
     //
     // State / sessions
     //
     // Returns the *actual* size in bytes of the state
-    // (logits, embedding and kv_cache)
+    // (logits, embedding and memory)
     // Only use when saving the state, not when restoring it, otherwise the size may be too small.
     LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
     LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -761,12 +863,12 @@ extern "C" {
                           size_t   n_token_count),
         "use llama_state_save_file instead");
-    // Get the exact size needed to copy the KV cache of a single sequence
+    // Get the exact size needed to copy the state of a single sequence
     LLAMA_API size_t llama_state_seq_get_size(
             struct llama_context * ctx,
                     llama_seq_id   seq_id);
-    // Copy the KV cache of a single sequence into the specified buffer
+    // Copy the state of a single sequence into the specified buffer
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
                          uint8_t * dst,
@@ -832,16 +934,16 @@ extern "C" {
     // For encode-decoder contexts, processes the batch using the encoder.
     // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     //   0 - success
-    // < 0 - error. the KV cache state is restored to the state before this call
+    // < 0 - error. the memory state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
               struct llama_batch   batch);
     // Process a batch of tokens.
-    // Requires KV cache.
+    // Requires the context to have a memory.
     // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
-    // Upon non-zero return values, the KV cache state is restored to the state before this call
+    // Upon non-zero return values, the memory state is restored to the state before this call
     //    0 - success
     //    1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     //    2 - aborted
@@ -862,8 +964,8 @@ extern "C" {
     // Get the number of threads used for prompt and batch processing (multiple token).
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
-    // Set whether the model is in embeddings mode or not
-    // If true, embeddings will be returned but logits will not
+    // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
     // Set whether to use causal attention or not
@@ -912,7 +1014,7 @@ extern "C" {
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
     // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);