cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h
ADDED
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "nlohmann/json.hpp"
+
+// Healing marker (empty if the JSON was fully parsed / wasn't healed).
+struct common_healing_marker {
+    // Raw marker.
+    std::string marker;
+
+    // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format).
+    std::string json_dump_marker;
+};
+
+// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string)
+struct common_json {
+    nlohmann::ordered_json json;
+
+    common_healing_marker healing_marker;
+};
+
+// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty.
+//
+// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON.
+// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker.
+// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format).
+//
+// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again).
+bool common_json_parse(
+    const std::string & input,
+    const std::string & healing_marker,
+    common_json & out);
+
+// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds.
+bool common_json_parse(
+    std::string::const_iterator & it,
+    const std::string::const_iterator & end,
+    const std::string & healing_marker,
+    common_json & out);
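The header comments above describe a round trip: heal a truncated JSON string so it parses, then use `json_dump_marker` to cut the healed dump back down to the original prefix. A minimal usage sketch, assuming only the behavior documented in the comments above; the `$magic$` marker and the truncated tool-call fragment are invented for illustration:

```cpp
#include <cassert>
#include <string>

#include "json-partial.h"

int main() {
    // A truncated tool-call fragment, as a model might stream it.
    const std::string partial = "{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Par";

    common_json out;

    // Heal with a marker that cannot occur in the input.
    if (common_json_parse(partial, "$magic$", out)) {
        // out.json is now a complete, parseable object ...
        const std::string dump = out.json.dump();

        // ... and, per the comment above, cutting the dump at
        // json_dump_marker recovers the partial prefix (modulo formatting).
        assert(dump.find(out.healing_marker.json_dump_marker) != std::string::npos);
    }

    return 0;
}
```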
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h
CHANGED
@@ -1,9 +1,9 @@
 #pragma once
 
-#include "
-
-#
-#include
+#include "nlohmann/json_fwd.hpp"
+
+#include <functional>
+#include <string>
 
 std::string json_schema_to_grammar(const nlohmann::ordered_json & schema,
     bool force_gbnf = false);
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h
CHANGED
@@ -24,6 +24,7 @@ enum llm_arch {
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
+    LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -45,6 +46,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_GEMMA2,
     LLM_ARCH_GEMMA3,
+    LLM_ARCH_GEMMA3N,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -76,6 +78,8 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_DOTS1,
+    LLM_ARCH_ARCEE,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -148,6 +152,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_LAYER_INDICES,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -190,13 +195,13 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
-    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -213,6 +218,8 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,
 
+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -263,6 +270,22 @@ enum llm_tensor {
     LLM_TENSOR_LAYER_OUT_NORM,
     LLM_TENSOR_POST_ATTN_NORM,
     LLM_TENSOR_POST_MLP_NORM,
+    LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n
+    LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n
+    LLM_TENSOR_PER_LAYER_INP_GATE,   // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ,       // gemma3n
+    LLM_TENSOR_PER_LAYER_PROJ_NORM,  // gemma3n
+    LLM_TENSOR_PER_LAYER_POST_NORM,  // gemma3n
+    LLM_TENSOR_ALTUP_PROJ,           // gemma3n
+    LLM_TENSOR_ALTUP_UNEMBD_PROJ,    // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_CORRECT_SCALE,  // gemma3n
+    LLM_TENSOR_ALTUP_PREDICT_COEF,   // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER,         // gemma3n
+    LLM_TENSOR_ALTUP_ROUTER_NORM,    // gemma3n
+    LLM_TENSOR_LAUREL_L,             // gemma3n
+    LLM_TENSOR_LAUREL_R,             // gemma3n
+    LLM_TENSOR_LAUREL_POST_NORM,     // gemma3n
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -435,3 +458,6 @@ const char * llm_arch_name(llm_arch arch);
 llm_arch llm_arch_from_string(const std::string & name);
 
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid   (const llm_arch & arch);
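The two new predicates line up with the memory split visible in the file list above (llama-kv-cache-unified*, llama-memory-recurrent, llama-memory-hybrid). A speculative sketch of the dispatch they enable; only the predicate signatures come from this diff, and the mapping to memory classes is inferred from the new headers:

```cpp
#include "llama-arch.h"

// Speculative dispatch sketch, not code from this package: which of the
// new memory implementations a given architecture would plausibly get.
const char * memory_kind_for(const llm_arch & arch) {
    if (llm_arch_is_hybrid(arch)) {
        return "llama_memory_hybrid";    // attention + recurrent layers mixed
    }
    if (llm_arch_is_recurrent(arch)) {
        return "llama_memory_recurrent"; // mamba / RWKV-style state, no KV cache
    }
    return "llama_kv_cache_unified";     // regular transformer KV cache (optionally iSWA)
}
```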
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h
CHANGED
@@ -2,88 +2,146 @@
 
 #include "llama.h"
 
+#include "llama-cparams.h"
+
 #include <array>
 #include <vector>
+#include <set>
+#include <bitset>
+#include <unordered_map>
 
-//
-//
+// keep this struct lightweight
+// it points to data in `llama_batch_allocr`
 struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?
 
-    uint32_t n_tokens;
-    uint32_t n_seq_tokens; // tokens per sequence
-    uint32_t n_seqs;
-
-
-
-
-
-
-
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_seq_tokens; // tokens per sequence set
+    uint32_t n_seqs;       // sequence sets in the ubatch
+    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
+
+    // seq_id_unq: unique sequence ids in the ubatch
+    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+    //             used for extracting sequence pooled embeddings
+
+    //                          // size               | idx | val
+    llama_token  *  token;      // [n_tokens]         | i   | id, token
+    float        *  embd;       // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;     // [n_tokens]         | i   | -
 };
 
-
-
+// a helper for sanitizing, fulfilling and splitting a batch
+class llama_batch_allocr {
+public:
+    llama_batch_allocr(uint32_t n_pos_per_embd);
 
-
+    // sanitize and auto-gen missing data in the input batch
+    // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
+    bool init(
+            const llama_batch & batch_inp,
+            const llama_vocab & vocab,
+            const llama_memory_i * memory,
+            uint32_t n_embd,
+            bool output_all);
 
-
-    size_t length;
-};
+    const llama_batch & get_batch() const;
 
-
-
-    // tokens left in this batch
-    size_t n_tokens;
+    uint32_t get_n_tokens()  const;
+    uint32_t get_n_outputs() const;
 
-
+    // the array of output indices in the order they were encountered during the ubatch splitting
+    std::vector<int32_t> & get_out_ids();
 
-
+    // min/max positions of each sequence in the current ubatch
+    llama_pos seq_pos_min(llama_seq_id seq_id) const;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const;
 
-    //
-
-    // batch indices of the output
-    std::vector<int64_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    // call once before splitting the batch to reset the internal state
+    void split_reset();
 
-
+    // simple split, unknown number of sequence sets of unequal lengths
+    llama_ubatch split_simple(uint32_t n_ubatch);
 
-    //
-
-    std::vector<float> ubatch_embd;
-    std::vector<llama_pos> ubatch_pos;
-    std::vector<int32_t> ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t> ubatch_output;
+    // make ubatches of equal-length sequences sets
+    llama_ubatch split_equal(uint32_t n_ubatch);
 
-
+    // sequence-set-wise split - each ubatch contains a single sequence-set
+    llama_ubatch split_seq(uint32_t n_ubatch);
 
-
+    // a helper method for creating a well-defined ubatch of tokens
+    // TODO: support embeddings if needed in the future
+    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
 
-
-
+private:
+    void clear();
 
-    //
-    llama_ubatch
+    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+    // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
+    llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);
 
-    //
-
+    // for debugging, start with LLAMA_BATCH_DEBUG=2
+    void ubatch_print(const llama_ubatch & ubatch, int debug);
 
-
-
-
+    llama_batch batch;
+
+    // only for debugging purposes
+    const llama_vocab * vocab;
+
+    // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
+    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+    const uint32_t n_pos_per_embd;
 
-
-
-    struct llama_batch batch;
+    uint32_t n_embd;
+    uint32_t n_outputs;
 
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id *> seq_id;
-    std::vector<
+    std::vector<llama_seq_id>   seq_id_unq;
+    std::vector<int32_t>        seq_idx;
+    std::vector<int8_t>         output;
+
+    using pos_set_t = std::set<llama_pos>;
+    using seq_cpl_t = std::vector<bool>;
+
+    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
+    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+    using idx_vec_t = std::vector<int32_t>;
+    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
 
-    //
-
+    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
+
+    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
+
+    // batch indices of the output
+    std::vector<int32_t> out_ids;
+
+    // used[i] indicates if token i has already been used in a previous ubatch
+    std::vector<bool> used;
+
+    // llama_ubatch points to this data:
+    struct ubatch {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id>   seq_id_unq;
+        std::vector<int32_t>        seq_idx;
+        std::vector<int8_t>         output;
+    };
+
+    // current splitting state:
+    std::vector<ubatch> ubatches;
+
+    int debug;
 };
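Read together, the comments above imply a reset-then-consume protocol: `init` sanitizes the logical batch, `split_reset` clears the splitting state, and repeated `split_*` calls hand out ubatches until an empty one signals exhaustion. A hypothetical driver loop under those assumptions; the `batch`, `vocab`, and size arguments are placeholders, and this API is internal to the llama.cpp core, not part of the package's public surface:

```cpp
#include "llama-batch.h"

// Hypothetical consumer of the new allocator API -- inferred from the
// declarations above, not taken from the package.
static bool consume_batch(const llama_batch & batch, const llama_vocab & vocab,
                          uint32_t n_embd, uint32_t n_ubatch) {
    llama_batch_allocr balloc(/*n_pos_per_embd =*/ 1);

    // sanitize the batch; with no memory module, positions must come from the batch itself
    if (!balloc.init(batch, vocab, /*memory =*/ nullptr, n_embd, /*output_all =*/ false)) {
        return false;
    }

    balloc.split_reset(); // must precede any split_* call

    while (true) {
        // simple split: consecutive chunks of at most n_ubatch tokens
        llama_ubatch ubatch = balloc.split_simple(n_ubatch);
        if (ubatch.n_tokens == 0) {
            break; // entire batch consumed (see the ubatch_add comment)
        }
        // ... run the ubatch through the model graph ...
    }

    return true;
}
```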
package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h
CHANGED
@@ -1,7 +1,6 @@
 #pragma once
 
 #include "llama.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"
@@ -13,11 +12,14 @@
 #include <vector>
 
 struct llama_model;
-
+class llama_batch_allocr;
 
 class llama_io_read_i;
 class llama_io_write_i;
 
+struct llama_memory_i;
+struct llama_memory_context_i;
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -44,10 +46,12 @@ struct llama_context {
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;
 
-
-    const llama_kv_cache * get_kv_self() const;
+    llama_memory_t get_memory() const;
 
-
+    // return true of the KV cache was updated
+    // TODO: remove
+    bool kv_self_update(bool optimize);
+    void kv_self_defrag_sched();
 
     enum llama_pooling_type pooling_type() const;
 
@@ -88,8 +92,18 @@ struct llama_context {
             int32_t il_start,
             int32_t il_end);
 
-
-
+    // process a single ubatch with a specific graph type
+    // if memory_context is provided, it will be applied first to the context's memory
+    // ret contains the status of the graph computation
+    // returns nullptr only if ret != LM_GGML_STATUS_SUCCESS
+    llm_graph_result_ptr process_ubatch(
+            const llama_ubatch & ubatch,
+            llm_graph_type gtype,
+            llama_memory_context_i * mctx,
+            lm_ggml_status & ret);
+
+    int encode(const llama_batch & batch_inp);
+    int decode(const llama_batch & batch_inp);
 
     //
     // state save/load
@@ -167,7 +181,7 @@ private:
 
     // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-
+    uint32_t output_reserve(int32_t n_outputs);
 
     //
     // graph
@@ -180,16 +194,18 @@ public:
     lm_ggml_cgraph * graph_init();
 
     // returns the result of lm_ggml_backend_sched_graph_compute_async execution
-    lm_ggml_status graph_compute(
-
-
+    lm_ggml_status graph_compute(lm_ggml_cgraph * gf, bool batched);
+
+    // reserve a graph with a dummy ubatch of the specified size
+    lm_ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
 
 private:
     llm_graph_result_ptr graph_build(
-
-
-
-
+            lm_ggml_context * ctx,
+            lm_ggml_cgraph * gf,
+            const llama_ubatch & ubatch,
+            llm_graph_type gtype,
+            const llama_memory_context_i * mctx);
 
     llm_graph_cb graph_get_cb() const;
 
@@ -214,6 +230,9 @@ private:
 
     std::unique_ptr<llama_memory_i> memory;
 
+    // TODO: temporary, until the llama_kv_self_defrag() API is removed
+    bool memory_force_optimize = false;
+
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits = nullptr;
@@ -227,8 +246,10 @@ private:
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;
 
-
-
+    // reuse the batch_allocr to avoid unnecessary memory allocations
+    std::unique_ptr<llama_batch_allocr> balloc;
+
+    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
 
     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 
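The new `balloc` member and `process_ubatch` entry point tie the two headers together: `decode` now splits the logical batch with `llama_batch_allocr` and pushes each ubatch through the graph. A speculative end-to-end sketch, not the package's implementation; it uses only names declared in these hunks plus `LLM_GRAPH_TYPE_DECODER` from llama-graph.h, and the error codes are invented:

```cpp
#include "llama-batch.h"
#include "llama-context.h"

// Speculative decode flow, not the package's code.
static int decode_flow(llama_context & ctx, llama_batch_allocr & balloc, uint32_t n_ubatch) {
    balloc.split_reset();

    while (true) {
        llama_ubatch ubatch = balloc.split_simple(n_ubatch);
        if (ubatch.n_tokens == 0) {
            break; // logical batch fully consumed
        }

        lm_ggml_status status;
        // nullptr memory context: operate on the context's current memory state
        llm_graph_result_ptr res = ctx.process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, nullptr, status);
        if (res == nullptr) {
            return -1; // graph computation failed; `status` holds the lm_ggml error
        }
    }

    // logits / embeddings are then read back via output_ids
    return 0;
}
```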