cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0

llama-batch.h:

@@ -2,88 +2,146 @@

  #include "llama.h"

+ #include "llama-cparams.h"
+
  #include <array>
  #include <vector>
+ #include <set>
+ #include <bitset>
+ #include <unordered_map>

- // very similar to llama_batch,
- // but has more metadata about sequences
+ // keep this struct lightweight
+ // it points to data in `llama_batch_allocr`
  struct llama_ubatch {
  bool equal_seqs;
  // TODO: whole_seqs for embeddings?

- uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
- uint32_t n_seq_tokens; // tokens per sequence
- uint32_t n_seqs;
-
- llama_token * token; // [n_tokens]
- float * embd; // [n_embd, n_tokens]
- llama_pos * pos; // [n_tokens]
- int32_t * n_seq_id; // [n_seqs]
- llama_seq_id ** seq_id; // [n_seqs]
- int8_t * output; // [n_tokens]
+ uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
+ uint32_t n_seq_tokens; // tokens per sequence set
+ uint32_t n_seqs; // sequence sets in the ubatch
+ uint32_t n_seqs_unq; // unique sequence ids in the ubatch
+
+ // seq_id_unq: unique sequence ids in the ubatch
+ // seq_idx: indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+ // used for extracting sequence pooled embeddings
+
+ // // size | idx | val
+ llama_token * token; // [n_tokens] | i | id, token
+ float * embd; // [n_embd, n_tokens] | i | embd
+ llama_pos * pos; // [n_tokens] | i | pos
+ int32_t * n_seq_id; // [n_tokens] | i | -
+ llama_seq_id ** seq_id; // [n_tokens] | s | s0, s1, seq_id
+ llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
+ int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
+ int8_t * output; // [n_tokens] | i | -
  };

- struct llama_sbatch_seq {
- int32_t n_seq_id;
+ // a helper for sanitizing, fulfilling and splitting a batch
+ class llama_batch_allocr {
+ public:
+ llama_batch_allocr(uint32_t n_pos_per_embd);

- llama_seq_id * seq_id;
+ // sanitize and auto-gen missing data in the input batch
+ // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
+ bool init(
+ const llama_batch & batch_inp,
+ const llama_vocab & vocab,
+ const llama_memory_i * memory,
+ uint32_t n_embd,
+ bool output_all);

- size_t offset;
- size_t length;
- };
+ const llama_batch & get_batch() const;

- // sequence-length-aware batch splitting
- struct llama_sbatch {
- // tokens left in this batch
- size_t n_tokens;
+ uint32_t get_n_tokens() const;
+ uint32_t get_n_outputs() const;

- size_t n_embd;
+ // the array of output indices in the order they were encountered during the ubatch splitting
+ std::vector<int32_t> & get_out_ids();

- bool logits_all; // TODO: remove once lctx.logits_all is removed too
+ // min/max positions of each sequence in the current ubatch
+ llama_pos seq_pos_min(llama_seq_id seq_id) const;
+ llama_pos seq_pos_max(llama_seq_id seq_id) const;

- // sorted indices into the batch
- std::vector<int64_t> ids;
- // batch indices of the output
- std::vector<int64_t> out_ids;
- std::vector<llama_sbatch_seq> seq;
+ // call once before splitting the batch to reset the internal state
+ void split_reset();

- const llama_batch * batch = nullptr;
+ // simple split, unknown number of sequence sets of unequal lengths
+ llama_ubatch split_simple(uint32_t n_ubatch);

- // buffers for the ubatch
- std::vector<llama_token> ubatch_token;
- std::vector<float> ubatch_embd;
- std::vector<llama_pos> ubatch_pos;
- std::vector<int32_t> ubatch_n_seq_id;
- std::vector<llama_seq_id *> ubatch_seq_id;
- std::vector<int8_t> ubatch_output;
+ // make ubatches of equal-length sequences sets
+ llama_ubatch split_equal(uint32_t n_ubatch);

- llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
+ // sequence-set-wise split - each ubatch contains a single sequence-set
+ llama_ubatch split_seq(uint32_t n_ubatch);

- void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length);
+ // a helper method for creating a well-defined ubatch of tokens
+ // TODO: support embeddings if needed in the future
+ llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);

- // simple split, unknown number of sequences of unequal lengths
- llama_ubatch split_simple(size_t n_ubatch);
+ private:
+ void clear();

- // make batches of equal-length sequences
- llama_ubatch split_equal(size_t n_ubatch);
+ // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+ // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
+ llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);

- // sequence-wise split
- llama_ubatch split_seq(size_t n_ubatch);
+ // for debugging, start with LLAMA_BATCH_DEBUG=2
+ void ubatch_print(const llama_ubatch & ubatch, int debug);

- llama_sbatch() = default;
- llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
- };
+ llama_batch batch;
+
+ // only for debugging purposes
+ const llama_vocab * vocab;
+
+ // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
+ // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+ const uint32_t n_pos_per_embd;

- // temporary allocate memory for the input batch if needed
- struct llama_batch_allocr {
- struct llama_batch batch;
+ uint32_t n_embd;
+ uint32_t n_outputs;

  std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
  std::vector<llama_pos> pos;
  std::vector<int32_t> n_seq_id;
  std::vector<llama_seq_id *> seq_id;
- std::vector<int8_t> logits;
+ std::vector<llama_seq_id> seq_id_unq;
+ std::vector<int32_t> seq_idx;
+ std::vector<int8_t> output;
+
+ using pos_set_t = std::set<llama_pos>;
+ using seq_cpl_t = std::vector<bool>;
+
+ std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
+ std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+ using idx_vec_t = std::vector<int32_t>;
+ using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;

- // optionally fulfill the batch returned by llama_batch_get_one
- llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
+ std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
+
+ std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
+
+ // batch indices of the output
+ std::vector<int32_t> out_ids;
+
+ // used[i] indicates if token i has already been used in a previous ubatch
+ std::vector<bool> used;
+
+ // llama_ubatch points to this data:
+ struct ubatch {
+ std::vector<llama_token> token;
+ std::vector<float> embd;
+ std::vector<llama_pos> pos;
+ std::vector<int32_t> n_seq_id;
+ std::vector<llama_seq_id *> seq_id;
+ std::vector<llama_seq_id> seq_id_unq;
+ std::vector<int32_t> seq_idx;
+ std::vector<int8_t> output;
+ };
+
+ // current splitting state:
+ std::vector<ubatch> ubatches;
+
+ int debug;
  };
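
The hunk above replaces the old llama_sbatch splitter with the new llama_batch_allocr helper. Purely as a rough illustration pieced together from the declarations shown in this hunk, here is a usage sketch; the surrounding variables (batch_inp, vocab, memory, n_embd, n_ubatch) and the calling context are assumptions, not part of this diff:

    // sketch: sanitize the input batch, then split it into micro-batches
    llama_batch_allocr balloc(/*n_pos_per_embd=*/1);

    if (!balloc.init(batch_inp, vocab, memory, n_embd, /*output_all=*/false)) {
        return -1; // the input batch failed validation
    }

    balloc.split_reset();

    while (true) {
        llama_ubatch ubatch = balloc.split_simple(n_ubatch);
        if (ubatch.n_tokens == 0) {
            break; // the whole batch has been consumed
        }
        // ... build and compute the graph for this ubatch ...
    }

split_equal() and split_seq() follow the same loop shape but, per the comments above, group tokens into equal-length sequence sets or into a single sequence set per ubatch.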

llama-chat.h:

@@ -43,6 +43,7 @@ enum llm_chat_template {
  LLM_CHAT_TEMPLATE_BAILING,
  LLM_CHAT_TEMPLATE_LLAMA4,
  LLM_CHAT_TEMPLATE_SMOLVLM,
+ LLM_CHAT_TEMPLATE_DOTS1,
  LLM_CHAT_TEMPLATE_UNKNOWN,
  };


llama-context.h:

@@ -1,7 +1,6 @@
  #pragma once

  #include "llama.h"
- #include "llama-batch.h"
  #include "llama-cparams.h"
  #include "llama-graph.h"
  #include "llama-adapter.h"
@@ -13,11 +12,14 @@
  #include <vector>

  struct llama_model;
- struct llama_kv_cache;
+ class llama_batch_allocr;

  class llama_io_read_i;
  class llama_io_write_i;

+ struct llama_memory_i;
+ struct llama_memory_context_i;
+
  struct llama_context {
  // init scheduler and compute buffers, reserve worst-case graphs
  llama_context(
@@ -44,10 +46,12 @@ struct llama_context {
  uint32_t n_threads() const;
  uint32_t n_threads_batch() const;

- llama_kv_cache * get_kv_self();
- const llama_kv_cache * get_kv_self() const;
+ llama_memory_t get_memory() const;

- void kv_self_update();
+ // return true of the KV cache was updated
+ // TODO: remove
+ bool kv_self_update(bool optimize);
+ void kv_self_defrag_sched();

  enum llama_pooling_type pooling_type() const;

@@ -88,8 +92,18 @@ struct llama_context {
  int32_t il_start,
  int32_t il_end);

- int encode(llama_batch & inp_batch);
- int decode(llama_batch & inp_batch);
+ // process a single ubatch with a specific graph type
+ // if memory_context is provided, it will be applied first to the context's memory
+ // ret contains the status of the graph computation
+ // returns nullptr only if ret != LM_GGML_STATUS_SUCCESS
+ llm_graph_result_ptr process_ubatch(
+ const llama_ubatch & ubatch,
+ llm_graph_type gtype,
+ llama_memory_context_i * mctx,
+ lm_ggml_status & ret);
+
+ int encode(const llama_batch & batch_inp);
+ int decode(const llama_batch & batch_inp);

  //
  // state save/load
@@ -167,7 +181,7 @@ private:

  // Make sure enough space is available for outputs.
  // Returns max number of outputs for which space was reserved.
- int32_t output_reserve(int32_t n_outputs);
+ uint32_t output_reserve(int32_t n_outputs);

  //
  // graph
@@ -180,16 +194,18 @@ public:
  lm_ggml_cgraph * graph_init();

  // returns the result of lm_ggml_backend_sched_graph_compute_async execution
- lm_ggml_status graph_compute(
- lm_ggml_cgraph * gf,
- bool batched);
+ lm_ggml_status graph_compute(lm_ggml_cgraph * gf, bool batched);
+
+ // reserve a graph with a dummy ubatch of the specified size
+ lm_ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);

  private:
  llm_graph_result_ptr graph_build(
- lm_ggml_context * ctx,
- lm_ggml_cgraph * gf,
- const llama_ubatch & ubatch,
- llm_graph_type gtype);
+ lm_ggml_context * ctx,
+ lm_ggml_cgraph * gf,
+ const llama_ubatch & ubatch,
+ llm_graph_type gtype,
+ const llama_memory_context_i * mctx);

  llm_graph_cb graph_get_cb() const;

@@ -214,6 +230,9 @@ private:

  std::unique_ptr<llama_memory_i> memory;

+ // TODO: temporary, until the llama_kv_self_defrag() API is removed
+ bool memory_force_optimize = false;
+
  // decode output (2-dimensional array: [n_outputs][n_vocab])
  size_t logits_size = 0; // capacity (of floats) for logits
  float * logits = nullptr;
@@ -227,8 +246,10 @@
  // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
  std::map<llama_seq_id, std::vector<float>> embd_seq;

- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
- int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
+ // reuse the batch_allocr to avoid unnecessary memory allocations
+ std::unique_ptr<llama_batch_allocr> balloc;
+
+ uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

  std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

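
In these llama-context.h hunks, encode() and decode() now take the batch by const reference and the per-ubatch work is routed through process_ubatch(). A minimal sketch of the error-handling contract stated in the hunk above, assuming an already-prepared ubatch, a memory context mctx, and the LLM_GRAPH_TYPE_DECODER graph type (these surrounding pieces are assumptions, not shown in this diff):

    lm_ggml_status status = LM_GGML_STATUS_SUCCESS;

    llm_graph_result_ptr res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx, status);
    if (!res) {
        // per the comment in the hunk, a null result implies status != LM_GGML_STATUS_SUCCESS
        return status == LM_GGML_STATUS_ABORTED ? 2 : -3; // illustrative error mapping
    }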

llama-cparams.h:

@@ -4,6 +4,8 @@

  #include <cstdint>

+ #define LLAMA_MAX_SEQ 64
+
  struct llama_cparams {
  uint32_t n_ctx; // context size used during inference
  uint32_t n_batch;
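
LLAMA_MAX_SEQ now lives in llama-cparams.h and bounds the per-token sequence sets that llama_batch_allocr tracks (seq_set_t = std::bitset<LLAMA_MAX_SEQ> in the llama-batch.h hunk above). Purely as an illustration of that relationship, with a made-up helper name and assuming <bitset> and the llama typedefs are in scope:

    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>; // one bit per sequence id, as in llama-batch.h

    // illustrative helper: mark the sequences a single token belongs to
    static seq_set_t token_seq_set(const llama_seq_id * seq_ids, int32_t n_seq_id) {
        seq_set_t res;
        for (int32_t s = 0; s < n_seq_id; ++s) {
            res.set(seq_ids[s]); // each id must be < LLAMA_MAX_SEQ
        }
        return res;
    }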

llama-graph.h:

@@ -17,10 +17,12 @@ struct lm_ggml_tensor;
  struct llama_ubatch;
  struct llama_cparams;

- class llama_memory_i;
- class llama_kv_cache_unified;
- class llama_kv_cache_unified_iswa;
- class llama_kv_cache_recurrent;
+ struct llama_memory_context_i;
+
+ class llama_kv_cache_unified_context;
+ class llama_kv_cache_unified_iswa_context;
+ class llama_memory_recurrent_context;
+ class llama_memory_hybrid_context;

  // certain models (typically multi-modal) can produce different types of graphs
  enum llm_graph_type {
@@ -35,6 +37,7 @@ enum llm_ffn_op_type {
  LLM_FFN_RELU,
  LLM_FFN_RELU_SQR,
  LLM_FFN_SWIGLU,
+ LLM_FFN_GEGLU,
  };

  enum llm_ffn_gate_type {
@@ -92,14 +95,14 @@ public:

  class llm_graph_input_pos : public llm_graph_input_i {
  public:
- llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+ llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
  virtual ~llm_graph_input_pos() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * pos = nullptr; // I32 [n_batch]

- const int64_t n_pos_per_embd = 1;
+ const uint32_t n_pos_per_embd = 1;
  };

  // temperature tuning, used by llama4
@@ -133,7 +136,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
  public:
  llm_graph_input_pos_bucket_kv(
  const llama_hparams & hparams,
- const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
+ const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
  virtual ~llm_graph_input_pos_bucket_kv() = default;

  void set_input(const llama_ubatch * ubatch) override;
@@ -141,7 +144,8 @@ public:
  lm_ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

  const llama_hparams & hparams;
- const llama_kv_cache_unified * kv_self;
+
+ const llama_kv_cache_unified_context * mctx;
  };

  class llm_graph_input_out_ids : public llm_graph_input_i {
@@ -186,28 +190,16 @@ public:
  const llama_cparams & cparams;
  };

- class llm_graph_input_s_copy : public llm_graph_input_i {
+ class llm_graph_input_rs : public llm_graph_input_i {
  public:
- llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
- virtual ~llm_graph_input_s_copy() = default;
+ llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
+ virtual ~llm_graph_input_rs() = default;

  void set_input(const llama_ubatch * ubatch) override;

  lm_ggml_tensor * s_copy; // I32 [kv_size]

- const llama_kv_cache_recurrent * kv_self;
- };
-
- class llm_graph_input_s_mask : public llm_graph_input_i {
- public:
- llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
- virtual ~llm_graph_input_s_mask() = default;
-
- void set_input(const llama_ubatch * ubatch) override;
-
- lm_ggml_tensor * s_mask; // F32 [1, n_kv]
-
- const llama_kv_cache_recurrent * kv_self;
+ const llama_memory_recurrent_context * mctx;
  };

  class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -247,10 +239,10 @@ public:
  llm_graph_input_attn_kv_unified(
  const llama_hparams & hparams,
  const llama_cparams & cparams,
- const llama_kv_cache_unified * kv_self) :
+ const llama_kv_cache_unified_context * mctx) :
  hparams(hparams),
  cparams(cparams),
- kv_self(kv_self) {
+ mctx(mctx) {
  }
  ~llm_graph_input_attn_kv_unified() = default;

@@ -264,7 +256,7 @@ public:
  const llama_hparams & hparams;
  const llama_cparams & cparams;

- const llama_kv_cache_unified * kv_self;
+ const llama_kv_cache_unified_context * mctx;
  };

  class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
@@ -272,10 +264,10 @@ public:
  llm_graph_input_attn_kv_unified_iswa(
  const llama_hparams & hparams,
  const llama_cparams & cparams,
- const llama_kv_cache_unified_iswa * kv_self) :
+ const llama_kv_cache_unified_iswa_context * mctx) :
  hparams(hparams),
  cparams(cparams),
- kv_self(kv_self) {
+ mctx(mctx) {
  }
  ~llm_graph_input_attn_kv_unified_iswa() = default;

@@ -292,7 +284,7 @@ public:
  const llama_hparams & hparams;
  const llama_cparams & cparams;

- const llama_kv_cache_unified_iswa * kv_self;
+ const llama_kv_cache_unified_iswa_context * mctx;
  };

  class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -310,6 +302,44 @@ public:
  const llama_cross * cross = nullptr;
  };

+ class llm_graph_input_mem_hybrid : public llm_graph_input_i {
+ public:
+ llm_graph_input_mem_hybrid(
+ const llama_hparams & hparams,
+ const llama_cparams & cparams,
+ const llama_memory_hybrid_context * mctx) :
+ hparams(hparams),
+ cparams(cparams),
+ mctx(mctx) {
+ }
+ virtual ~llm_graph_input_mem_hybrid() = default;
+
+ void set_input(const llama_ubatch * ubatch) override;
+
+ lm_ggml_tensor * s_copy; // I32 [kv_size]
+
+ lm_ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ lm_ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch]
+ lm_ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch]
+
+ const llama_hparams & hparams;
+ const llama_cparams & cparams;
+
+ const llama_memory_hybrid_context * mctx;
+ };
+
+ // TODO: remove this when lm_ggml_scale_add is implemented
+ class llm_graph_input_one : public llm_graph_input_i {
+ public:
+ llm_graph_input_one() {}
+ virtual ~llm_graph_input_one() = default;
+
+ void set_input(const llama_ubatch *) override;
+
+ lm_ggml_tensor * one = nullptr; // F32
+ };
+
  //
  // llm_graph_result
  //
@@ -383,12 +413,12 @@ struct llm_graph_params {
  lm_ggml_backend_sched_t sched;
  lm_ggml_backend_t backend_cpu;

- const llama_adapter_cvec * cvec;
- const llama_adapter_loras * loras;
- const llama_memory_i * memory;
- const llama_cross * cross;
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;

- int32_t n_outputs;
+ uint32_t n_outputs;

  const llm_graph_cb & cb;
  };
@@ -422,8 +452,8 @@ struct llm_graph_context {
  const float norm_eps;
  const float norm_rms_eps;

- const int32_t n_tokens;
- const int32_t n_outputs;
+ const int64_t n_tokens;
+ const int64_t n_outputs;
  const int32_t n_ctx_orig; // yarn

  const enum llama_pooling_type pooling_type;
@@ -435,10 +465,10 @@

  lm_ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

- const llama_adapter_cvec * cvec;
- const llama_adapter_loras * loras;
- const llama_memory_i * memory;
- const llama_cross * cross;
+ const llama_adapter_cvec * cvec;
+ const llama_adapter_loras * loras;
+ const llama_memory_context_i * mctx;
+ const llama_cross * cross;

  const llm_graph_cb & cb_func;

@@ -446,8 +476,6 @@

  llm_graph_context(const llm_graph_params & params);

- int64_t n_pos_per_embd() const;
-
  void cb(lm_ggml_tensor * cur, const char * name, int il) const;

  //
@@ -518,14 +546,14 @@ struct llm_graph_context {
  lm_ggml_tensor * build_inp_out_ids() const;
  lm_ggml_tensor * build_inp_mean() const;
  lm_ggml_tensor * build_inp_cls() const;
- lm_ggml_tensor * build_inp_s_copy() const;
- lm_ggml_tensor * build_inp_s_mask() const;

  lm_ggml_tensor * build_inp_cross_embd() const;
  lm_ggml_tensor * build_inp_pos_bucket_enc() const;
  lm_ggml_tensor * build_inp_pos_bucket_dec() const;
  lm_ggml_tensor * build_pos_bias(lm_ggml_tensor * pos_bucket, lm_ggml_tensor * attn_rel_b) const;

+ llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+
  //
  // attention
  //
@@ -572,14 +600,15 @@ struct llm_graph_context {

  llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;

+ // note: if k_cur or v_cur are not provided, they will not be stored in the memory
  lm_ggml_tensor * build_attn(
  llm_graph_input_attn_kv_unified_iswa * inp,
  lm_ggml_cgraph * gf,
  lm_ggml_tensor * wo,
  lm_ggml_tensor * wo_b,
  lm_ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
- lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
- lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+ lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
  lm_ggml_tensor * kq_b,
  lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
  float kq_scale,
@@ -600,23 +629,62 @@ struct llm_graph_context {
  float kq_scale,
  int il) const;

+ lm_ggml_tensor * build_attn(
+ llm_graph_input_mem_hybrid * inp,
+ lm_ggml_cgraph * gf,
+ lm_ggml_tensor * wo,
+ lm_ggml_tensor * wo_b,
+ lm_ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+ lm_ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+ lm_ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+ lm_ggml_tensor * kq_b,
+ lm_ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+ float kq_scale,
+ int il) const;
  //
  // recurrent
  //

- lm_ggml_tensor * build_copy_mask_state(
- lm_ggml_cgraph * gf,
- lm_ggml_tensor * s,
- lm_ggml_tensor * state_copy,
- lm_ggml_tensor * state_mask,
- int32_t n_state,
- int32_t n_seqs) const;
+ // TODO: avoid notion of "kv"
+ // TODO: move this implementation to llama_memory_recurrent.
+ // this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
+ // when moving, avoid passing `lm_ggml_cgraph` - only pass `lm_ggml_context`. would likely need to split the
+ // implementation in 2 separate methods. the goal is to avoid calling `lm_ggml_build_forward_expand` in
+ // `llama_memory_recurrent`
+ lm_ggml_tensor * build_rs(
+ lm_ggml_cgraph * gf,
+ lm_ggml_tensor * s,
+ lm_ggml_tensor * state_copy,
+ int32_t state_size,
+ int32_t n_seqs,
+ uint32_t n_kv,
+ uint32_t kv_head,
+ uint32_t kv_size,
+ int32_t rs_zero,
+ bool avoid_copies = false) const;
+
+ llm_graph_input_rs * build_rs_inp() const;
+
+ lm_ggml_tensor * build_rs(
+ llm_graph_input_rs * inp,
+ lm_ggml_cgraph * gf,
+ lm_ggml_tensor * s,
+ int32_t state_size,
+ int32_t n_seqs,
+ bool avoid_copies = false) const;
+
+ lm_ggml_tensor * build_rs(
+ llm_graph_input_mem_hybrid * inp,
+ lm_ggml_cgraph * gf,
+ lm_ggml_tensor * s,
+ int32_t state_size,
+ int32_t n_seqs,
+ bool avoid_copies = false) const;

  lm_ggml_tensor * build_rwkv_token_shift_load(
- lm_ggml_cgraph * gf,
- lm_ggml_tensor * state_copy,
- lm_ggml_tensor * state_mask,
- const llama_ubatch & ubatch,
+ llm_graph_input_rs * inp,
+ lm_ggml_cgraph * gf,
+ const llama_ubatch & ubatch,
  int il) const;

  lm_ggml_tensor * build_rwkv_token_shift_store(