npm - @novastera-oss/llamarn - Versions diffs - 0.2.7 → 0.2.9 - Mend

@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186)

package/cpp/llama.cpp/src/llama-memory-hybrid.cpp CHANGED Viewed

@@ -32,7 +32,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     mem_attn(new llama_kv_cache_unified(
         model,
         filter_attn == nullptr ?
-            [&](int32_t il) { return !model.hparams.is_recurrent(il); }
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
             : filter_attn,
         type_k,
         type_v,
@@ -47,7 +47,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     mem_recr(new llama_memory_recurrent(
         model,
         filter_recr == nullptr ?
-            [&](int32_t il) { return model.hparams.is_recurrent(il); }
+            [&](int32_t il) { return hparams.is_recurrent(il); }
             : filter_recr,
         type_r,
         type_s,
@@ -56,50 +56,57 @@ llama_memory_hybrid::llama_memory_hybrid(
         n_seq_max
     )) {}
-llama_memory_state_ptr llama_memory_hybrid::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled) {
+llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    do {
+        balloc.split_reset();
-    // since this includes a recurrent cache, we cannot use split_simple
-    auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
+        // follow the recurrent pattern for creating the ubatch splits
+        std::vector<llama_ubatch> ubatches;
-    // follow the recurrent pattern for creating the ubatch splits
-    std::vector<llama_ubatch> ubatches;
-    while (sbatch.n_tokens > 0) {
-        llama_ubatch ubatch;
+        while (true) {
+            llama_ubatch ubatch;
-        if (embd_pooled) {
-            // Pooled embeddings cannot be split across ubatches (yet)
-            ubatch = sbatch.split_seq(n_ubatch);
-        } else {
-            ubatch = sbatch.split_equal(n_ubatch);
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch);
+            }
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+            ubatches.push_back(std::move(ubatch)); // NOLINT
         }
-        ubatches.push_back(ubatch);
-    }
+        // prepare the recurrent batches first
+        if (!mem_recr->prepare(ubatches)) {
+            // TODO: will the recurrent cache be in an undefined state at this point?
+            LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+            return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        }
-    // prepare the recurrent batches first
-    if (!mem_recr->prepare(ubatches)) {
-        // TODO: will the recurrent cache be in an undefined state at this point?
-        LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
-        return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        // prepare the attention cache
+        auto heads_attn = mem_attn->prepare(ubatches);
+        if (heads_attn.empty()) {
+            LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
+            return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        }
-    // prepare the attention cache
-    auto heads_attn = mem_attn->prepare(ubatches);
-    if (heads_attn.empty()) {
-        LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
-        return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        return std::make_unique<llama_memory_hybrid_context>(
+                this, std::move(heads_attn), std::move(ubatches));
+    } while(false);
-    return std::make_unique<llama_memory_hybrid_state>(
-        this, std::move(sbatch), std::move(heads_attn), std::move(ubatches));
+    return std::make_unique<llama_memory_hybrid_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
-llama_memory_state_ptr llama_memory_hybrid::init_full() {
-    return std::make_unique<llama_memory_hybrid_state>(this);
+llama_memory_context_ptr llama_memory_hybrid::init_full() {
+    return std::make_unique<llama_memory_hybrid_context>(this);
 }
-llama_memory_state_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
-    return std::make_unique<llama_memory_hybrid_state>(this, lctx, optimize);
+llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_memory_hybrid_context>(this, lctx, optimize);
 }
 bool llama_memory_hybrid::get_can_shift() const {
@@ -169,41 +176,39 @@ llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
     return mem_recr.get();
 }
-llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_status status) : status(status) {}
+llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_status status) : status(status) {}
-llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_hybrid * mem) :
-    state_attn(mem->get_mem_attn()->init_full()),
-    state_recr(mem->get_mem_recr()->init_full()),
-    status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
+llama_memory_hybrid_context::llama_memory_hybrid_context(llama_memory_hybrid * mem) :
+    ctx_attn(mem->get_mem_attn()->init_full()),
+    ctx_recr(mem->get_mem_recr()->init_full()),
+    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
-llama_memory_hybrid_state::llama_memory_hybrid_state(
+llama_memory_hybrid_context::llama_memory_hybrid_context(
         llama_memory_hybrid * mem,
               llama_context * lctx,
                        bool   optimize) :
-    state_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
-    state_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
-    status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
+    ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
+    ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
-llama_memory_hybrid_state::llama_memory_hybrid_state(
+llama_memory_hybrid_context::llama_memory_hybrid_context(
               llama_memory_hybrid * mem,
-                     llama_sbatch   sbatch,
             std::vector<uint32_t>   heads_attn,
         std::vector<llama_ubatch>   ubatches) :
-    sbatch(std::move(sbatch)),
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    state_attn(new llama_kv_cache_unified_state(mem->get_mem_attn(), {}, std::move(heads_attn), this->ubatches)),
-    state_recr(new llama_memory_recurrent_state(mem->get_mem_recr(), {},                        this->ubatches)),
-    status(LLAMA_MEMORY_STATUS_SUCCESS) {
+    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
+    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(),                        this->ubatches)),
+    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
-bool llama_memory_hybrid_state::next() {
+bool llama_memory_hybrid_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-    state_attn->next();
-    state_recr->next();
+    ctx_attn->next();
+    ctx_recr->next();
     if (++i_next >= ubatches.size()) {
         return false;
@@ -212,36 +217,30 @@ bool llama_memory_hybrid_state::next() {
     return true;
 }
-bool llama_memory_hybrid_state::apply() {
+bool llama_memory_hybrid_context::apply() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
     bool res = true;
-    res = res & state_attn->apply();
-    res = res & state_recr->apply();
+    res = res & ctx_attn->apply();
+    res = res & ctx_recr->apply();
     return res;
 }
-std::vector<int64_t> & llama_memory_hybrid_state::out_ids() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-    return sbatch.out_ids;
-}
-llama_memory_status llama_memory_hybrid_state::get_status() const {
+llama_memory_status llama_memory_hybrid_context::get_status() const {
     return status;
 }
-const llama_ubatch & llama_memory_hybrid_state::get_ubatch() const {
+const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
     return ubatches[i_next];
 }
-const llama_kv_cache_unified_state * llama_memory_hybrid_state::get_state_attn() const {
-    return static_cast<const llama_kv_cache_unified_state *>(state_attn.get());
+const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
+    return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
 }
-const llama_memory_recurrent_state * llama_memory_hybrid_state::get_state_recr() const {
-    return static_cast<const llama_memory_recurrent_state *>(state_recr.get());
+const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
+    return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
 }

package/cpp/llama.cpp/src/llama-memory-hybrid.h CHANGED Viewed

@@ -49,14 +49,14 @@ public:
     // llama_memory_i
     //
-    llama_memory_state_ptr init_batch(
-            const llama_batch & batch,
+    llama_memory_context_ptr init_batch(
+            llama_batch_allocr & balloc,
             uint32_t n_ubatch,
-            bool embd_pooled) override;
+            bool embd_all) override;
-    llama_memory_state_ptr init_full() override;
+    llama_memory_context_ptr init_full() override;
-    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;
     bool get_can_shift() const override;
@@ -90,54 +90,49 @@ private:
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
 };
-class llama_memory_hybrid_state : public llama_memory_state_i {
+class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
     // init failure
-    explicit llama_memory_hybrid_state(llama_memory_status status);
+    explicit llama_memory_hybrid_context(llama_memory_status status);
     // init full
-    explicit llama_memory_hybrid_state(llama_memory_hybrid * mem);
+    explicit llama_memory_hybrid_context(llama_memory_hybrid * mem);
     // init update
-    explicit llama_memory_hybrid_state(
+    explicit llama_memory_hybrid_context(
         llama_memory_hybrid * mem,
               llama_context * lctx,
                        bool   optimize);
     // init success
-    llama_memory_hybrid_state(
+    llama_memory_hybrid_context(
               llama_memory_hybrid * mem,
-                     llama_sbatch   sbatch,
             std::vector<uint32_t>   heads_attn,
         std::vector<llama_ubatch>   ubatches);
-    ~llama_memory_hybrid_state() = default;
+    ~llama_memory_hybrid_context() = default;
     bool next()  override;
     bool apply() override;
-    std::vector<int64_t> & out_ids() override;
     llama_memory_status  get_status() const override;
     const llama_ubatch & get_ubatch() const override;
     //
-    // llama_memory_hybrid_state
+    // llama_memory_hybrid_context
     //
-    const llama_kv_cache_unified_state * get_state_attn() const;
-    const llama_memory_recurrent_state * get_state_recr() const;
+    const llama_kv_cache_unified_context * get_attn() const;
+    const llama_memory_recurrent_context * get_recr() const;
 private:
-    llama_sbatch sbatch;
     // the index of the next ubatch to process
     size_t i_next = 0;
     std::vector<llama_ubatch> ubatches;
-    const llama_memory_state_ptr state_attn;
-    const llama_memory_state_ptr state_recr;
+    const llama_memory_context_ptr ctx_attn;
+    const llama_memory_context_ptr ctx_recr;
     const llama_memory_status status;
 };

package/cpp/llama.cpp/src/llama-memory-recurrent.cpp CHANGED Viewed

@@ -362,40 +362,47 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
-llama_memory_state_ptr llama_memory_recurrent::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_all) {
-    auto sbatch = llama_sbatch(batch, hparams.n_embd, false);
+llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    do {
+        balloc.split_reset();
-    std::vector<llama_ubatch> ubatches;
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            llama_ubatch ubatch;
-    while (sbatch.n_tokens > 0) {
-        llama_ubatch ubatch;
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch);
+            }
-        if (embd_all) {
-            // if all tokens are output, split by sequence
-            ubatch = sbatch.split_seq(n_ubatch);
-        } else {
-            ubatch = sbatch.split_equal(n_ubatch);
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+            ubatches.push_back(std::move(ubatch)); // NOLINT
         }
-        ubatches.push_back(ubatch);
-    }
+        if (!prepare(ubatches)) {
+            break;
+        }
-    if (!prepare(ubatches)) {
-        return std::make_unique<llama_memory_recurrent_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    } while (false);
-    return std::make_unique<llama_memory_recurrent_state>(this, std::move(sbatch), std::move(ubatches));
+    return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
-llama_memory_state_ptr llama_memory_recurrent::init_full() {
-    return std::make_unique<llama_memory_recurrent_state>(this);
+llama_memory_context_ptr llama_memory_recurrent::init_full() {
+    return std::make_unique<llama_memory_recurrent_context>(this);
 }
-llama_memory_state_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) {
+llama_memory_context_ptr llama_memory_recurrent::init_update(llama_context * lctx, bool optimize) {
     GGML_UNUSED(lctx);
     GGML_UNUSED(optimize);
-    return std::make_unique<llama_memory_recurrent_state>(LLAMA_MEMORY_STATUS_NO_UPDATE);
+    return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_NO_UPDATE);
 }
 bool llama_memory_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
@@ -423,9 +430,8 @@ bool llama_memory_recurrent::prepare(const std::vector<llama_ubatch> & ubatches)
 }
 bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
-    const uint32_t n_seqs = ubatch.n_seqs;
     const uint32_t n_seq_tokens = ubatch.n_seq_tokens;
+    const uint32_t n_seqs       = ubatch.n_seqs;
     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
@@ -445,9 +451,11 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // everything should fit if all seq_ids are smaller than the max
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        const uint32_t n_seq_id = ubatch.n_seq_id[s];
+        const uint32_t i = s*n_seq_tokens; // first token of sequence set s
+        const uint32_t n_seq_id = ubatch.n_seq_id[i];
         for (uint32_t j = 0; j < n_seq_id; ++j) {
-            const llama_seq_id seq_id = ubatch.seq_id[s][j];
+            const llama_seq_id seq_id = ubatch.seq_id[i][j];
             if (seq_id < 0 || (uint32_t) seq_id >= size) {
                 // too big seq_id
@@ -506,7 +514,8 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // find usable cell range
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        const llama_seq_id seq_id = ubatch.seq_id[s][0];
+        const uint32_t i = s*n_seq_tokens;
+        const llama_seq_id seq_id = ubatch.seq_id[i][0];
         auto & seq_meta = cells[seq_id];
         bool has_cell = false;
         if (seq_meta.tail >= 0) {
@@ -530,7 +539,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
             seq_meta.tail = next_empty_cell;
             // find next empty cell
             if (s + 1 < n_seqs) {
-                for (uint32_t i = 0; i < size; ++i) {
+                for (uint32_t j = 0; j < size; ++j) {
                     next_empty_cell += 1;
                     if (next_empty_cell >= size) { next_empty_cell -= size; }
                     auto & cell = cells[next_empty_cell];
@@ -544,8 +553,9 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // gather and re-order
     for (uint32_t s = 0; s < n_seqs; ++s) {
+        const uint32_t i = s*n_seq_tokens;
         const int32_t dst_id = s + min;
-        const int32_t src_id = cells[ubatch.seq_id[s][0]].tail;
+        const int32_t src_id = cells[ubatch.seq_id[i][0]].tail;
         if (dst_id != src_id) {
             auto & dst_cell = cells[dst_id];
             auto & src_cell = cells[src_id];
@@ -555,8 +565,8 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
             std::swap(dst_cell.seq_id, src_cell.seq_id);
             // swap tails
-            for (uint32_t i = 0; i < size; ++i) {
-                int32_t & tail = cells[i].tail;
+            for (uint32_t j = 0; j < size; ++j) {
+                int32_t & tail = cells[j].tail;
                 if (tail == src_id) {
                     tail = dst_id;
                 } else if (tail == dst_id) {
@@ -568,7 +578,8 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // update the pos of the used seqs
     for (uint32_t s = 0; s < n_seqs; ++s) {
-        const llama_pos last_pos = ubatch.pos[n_seq_tokens * s + n_seq_tokens - 1];
+        const uint32_t i = s*n_seq_tokens;
+        const llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
         const int32_t cell_id = s + min;
         auto & cell = cells[cell_id];
@@ -576,12 +587,12 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
             // What should happen when the pos backtracks or skips a value?
             // Clearing the state mid-batch would require special-casing which isn't done.
             LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
-                __func__, last_pos, cell.pos, ubatch.seq_id[s][0], n_seq_tokens);
+                __func__, last_pos, cell.pos, ubatch.seq_id[i][0], n_seq_tokens);
         }
         cell.pos = last_pos;
         cell.seq_id.clear();
-        for (int32_t j = 0; j < ubatch.n_seq_id[s]; ++j) {
-            const llama_seq_id seq_id = ubatch.seq_id[s][j];
+        for (int32_t j = 0; j < ubatch.n_seq_id[i]; ++j) {
+            const llama_seq_id seq_id = ubatch.seq_id[i][j];
             cell.seq_id.insert(seq_id);
             cells[seq_id].tail = cell_id;
         }
@@ -827,12 +838,9 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell
         seq_rm(dest_seq_id, -1, -1);
-        llama_sbatch sbatch;
-        llama_ubatch batch = sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
+        llama_batch_allocr balloc(hparams.n_pos_per_embd());
-        batch.n_tokens = cell_count;
-        batch.n_seq_tokens = cell_count;
-        batch.n_seqs = 1;
+        llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
         for (uint32_t i = 0; i < cell_count; ++i) {
             llama_pos pos;
@@ -846,12 +854,12 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell
                 return false;
             }
-            batch.pos[i] = pos;
+            ubatch.pos[i] = pos;
         }
-        batch.n_seq_id[0] = 1;
-        batch.seq_id[0] = &dest_seq_id;
+        ubatch.n_seq_id[0] = 1;
+        ubatch.seq_id[0] = &dest_seq_id;
-        if (!find_slot(batch)) {
+        if (!find_slot(ubatch)) {
             LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
             return false;
         }
@@ -859,8 +867,8 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell
         // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
         // Assume that this is one contiguous block of cells
         GGML_ASSERT(head + cell_count <= size);
-        GGML_ASSERT(cells[head].pos == batch.pos[0]);
-        GGML_ASSERT(cells[head + cell_count - 1].pos == batch.pos[cell_count - 1]);
+        GGML_ASSERT(cells[head].pos == ubatch.pos[0]);
+        GGML_ASSERT(cells[head + cell_count - 1].pos == ubatch.pos[cell_count - 1]);
         GGML_ASSERT(cells[head].has_seq_id(dest_seq_id));
         GGML_ASSERT(cells[head + cell_count - 1].has_seq_id(dest_seq_id));
     } else {
@@ -1037,23 +1045,22 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
 }
 //
-// llama_memory_recurrent_state
+// llama_memory_recurrent_context
 //
-llama_memory_recurrent_state::llama_memory_recurrent_state(llama_memory_status status) : status(status) {}
+llama_memory_recurrent_context::llama_memory_recurrent_context(llama_memory_status status) : status(status) {}
-llama_memory_recurrent_state::llama_memory_recurrent_state(
+llama_memory_recurrent_context::llama_memory_recurrent_context(
         llama_memory_recurrent * mem) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), is_full(true) {
 }
-llama_memory_recurrent_state::llama_memory_recurrent_state(
+llama_memory_recurrent_context::llama_memory_recurrent_context(
         llama_memory_recurrent * mem,
-        llama_sbatch sbatch,
-        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), sbatch(std::move(sbatch)), ubatches(std::move(ubatches)) {}
+        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), mem(mem), ubatches(std::move(ubatches)) {}
-llama_memory_recurrent_state::~llama_memory_recurrent_state() = default;
+llama_memory_recurrent_context::~llama_memory_recurrent_context() = default;
-bool llama_memory_recurrent_state::next() {
+bool llama_memory_recurrent_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
     if (++i_next >= ubatches.size()) {
@@ -1063,7 +1070,7 @@ bool llama_memory_recurrent_state::next() {
     return true;
 }
-bool llama_memory_recurrent_state::apply() {
+bool llama_memory_recurrent_context::apply() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
     mem->find_slot(ubatches[i_next]);
@@ -1071,46 +1078,40 @@ bool llama_memory_recurrent_state::apply() {
     return true;
 }
-std::vector<int64_t> & llama_memory_recurrent_state::out_ids() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-    return sbatch.out_ids;
-}
-llama_memory_status llama_memory_recurrent_state::get_status() const {
+llama_memory_status llama_memory_recurrent_context::get_status() const {
     return status;
 }
-const llama_ubatch & llama_memory_recurrent_state::get_ubatch() const {
+const llama_ubatch & llama_memory_recurrent_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
     return ubatches[i_next];
 }
-uint32_t llama_memory_recurrent_state::get_n_rs() const {
+uint32_t llama_memory_recurrent_context::get_n_rs() const {
     return is_full ? mem->size : mem->n;
 }
-uint32_t llama_memory_recurrent_state::get_head() const {
+uint32_t llama_memory_recurrent_context::get_head() const {
     return is_full ? 0 : mem->head;
 }
-int32_t llama_memory_recurrent_state::get_rs_z() const {
+int32_t llama_memory_recurrent_context::get_rs_z() const {
     return is_full ? 0 : mem->rs_z;
 }
-uint32_t llama_memory_recurrent_state::get_size() const {
+uint32_t llama_memory_recurrent_context::get_size() const {
     return mem->size;
 }
-ggml_tensor * llama_memory_recurrent_state::get_r_l(int32_t il) const {
+ggml_tensor * llama_memory_recurrent_context::get_r_l(int32_t il) const {
     return mem->r_l[il];
 }
-ggml_tensor * llama_memory_recurrent_state::get_s_l(int32_t il) const {
+ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
     return mem->s_l[il];
 }
-int32_t llama_memory_recurrent_state::s_copy(int i) const {
+int32_t llama_memory_recurrent_context::s_copy(int i) const {
     return  mem->cells[i + mem->head].src0;
 }