cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
package/cpp/llama-context.cpp
@@ -6,7 +6,6 @@
  #include "llama-model.h"
  #include "llama-kv-cache.h"
 
- #include <cassert>
  #include <cstring>
  #include <stdexcept>
  #include <cinttypes>
@@ -113,7 +112,7 @@ llama_context::llama_context(
  }
 
  if (n_ctx_per_seq > hparams.n_ctx_train) {
- LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+ LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
  __func__, n_ctx_per_seq, hparams.n_ctx_train);
  }
 
@@ -176,44 +175,13 @@ llama_context::llama_context(
  }
 
  // init the memory module
- // TODO: for now, always create a unified KV cache
  if (!hparams.vocab_only) {
- kv_self.reset(static_cast<llama_kv_cache_unified *>(model.create_memory()));
+ llama_memory_params params_mem = {
+ /*.type_k =*/ params.type_k,
+ /*.type_v =*/ params.type_v,
+ };
 
- LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-
- cparams.n_ctx = LM_GGML_PAD(cparams.n_ctx, kv_self->get_padding(cparams));
-
- LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
- uint32_t kv_size = cparams.n_ctx;
- lm_ggml_type type_k = params.type_k;
- lm_ggml_type type_v = params.type_v;
-
- if (llama_model_is_recurrent(&model)) {
- // Mamba needs at least as many KV cells as there are sequences kept at any time
- kv_size = std::max((uint32_t) 1, params.n_seq_max);
- // it's probably best to keep as much precision as possible for the states
- type_k = LM_GGML_TYPE_F32; // required by lm_ggml_ssm_conv for Mamba's conv_states
- type_v = LM_GGML_TYPE_F32; // required by lm_ggml_ssm_scan for Mamba's ssm_states
- }
-
- LM_GGML_ASSERT(hparams.n_embd_head_k % lm_ggml_blck_size(type_k) == 0);
- LM_GGML_ASSERT(hparams.n_embd_head_v % lm_ggml_blck_size(type_v) == 0);
-
- if (!kv_self->init(model, cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
- throw std::runtime_error("failed to initialize self-attention cache");
- }
-
- {
- const size_t memory_size_k = kv_self->size_k_bytes();
- const size_t memory_size_v = kv_self->size_v_bytes();
-
- LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
- (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
- lm_ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
- lm_ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
- }
+ memory.reset(model.create_memory(params_mem, cparams));
  }
 
  // init backends
@@ -304,7 +272,9 @@ llama_context::llama_context(
  int n_nodes_tg = -1;
 
  // simulate full KV cache
- kv_self->n = kv_self->size;
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
+ kv_self->set_full();
 
  cross.v_embd.clear();
 
@@ -426,6 +396,18 @@ const llama_model & llama_context::get_model() const {
  return model;
  }
 
+ const llama_cparams & llama_context::get_cparams() const {
+ return cparams;
+ }
+
+ lm_ggml_backend_sched_t llama_context::get_sched() const {
+ return sched.get();
+ }
+
+ lm_ggml_context * llama_context::get_ctx_compute() const {
+ return ctx_compute.get();
+ }
+
  uint32_t llama_context::n_ctx() const {
  return cparams.n_ctx;
  }
@@ -455,345 +437,21 @@ uint32_t llama_context::n_threads_batch() const {
  }
 
  llama_kv_cache * llama_context::get_kv_self() {
- return kv_self.get();
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+ return kv_self;
  }
 
  const llama_kv_cache * llama_context::get_kv_self() const {
- return kv_self.get();
- }
-
- lm_ggml_tensor * llama_context::build_rope_shift(
- lm_ggml_context * ctx0,
- lm_ggml_tensor * cur,
- lm_ggml_tensor * shift,
- lm_ggml_tensor * factors,
- float freq_base,
- float freq_scale,
- lm_ggml_backend_buffer * bbuf) const {
- const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
-
- const auto & yarn_ext_factor = cparams.yarn_ext_factor;
- const auto & yarn_attn_factor = cparams.yarn_attn_factor;
- const auto & yarn_beta_fast = cparams.yarn_beta_fast;
- const auto & yarn_beta_slow = cparams.yarn_beta_slow;
-
- const auto & hparams = model.hparams;
-
- const auto & n_rot = hparams.n_rot;
- const auto & rope_type = hparams.rope_type;
-
- lm_ggml_tensor * tmp;
-
- if (lm_ggml_is_quantized(cur->type)) {
- // dequantize to f32 -> RoPE -> quantize back
- tmp = lm_ggml_cast(ctx0, cur, LM_GGML_TYPE_F32);
-
- if (bbuf) {
- for (const auto & backend : backends) {
- // Figure out which backend KV cache belongs to
- if (lm_ggml_backend_supports_buft(backend.get(), lm_ggml_backend_buffer_get_type(bbuf))) {
- lm_ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
- break;
- }
- }
- }
-
- tmp = lm_ggml_rope_ext_inplace(ctx0, tmp,
- shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
-
- tmp = lm_ggml_cpy(ctx0, tmp, cur);
- } else {
- // we rotate only the first n_rot dimensions
- tmp = lm_ggml_rope_ext_inplace(ctx0, cur,
- shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
- }
-
- return tmp;
- }
-
- class llm_graph_input_k_shift : public llm_graph_input_i {
- public:
- llm_graph_input_k_shift(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
- virtual ~llm_graph_input_k_shift() = default;
-
- void set_input(const llama_ubatch * ubatch) override;
-
- lm_ggml_tensor * k_shift; // I32 [kv_size]
-
- const llama_kv_cache_unified * kv_self;
- };
-
- void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
- LM_GGML_UNUSED(ubatch);
-
- if (k_shift) {
- assert(lm_ggml_backend_buffer_is_host(k_shift->buffer));
-
- int32_t * data = (int32_t *) k_shift->data;
-
- for (uint32_t i = 0; i < kv_self->size; ++i) {
- data[i] = kv_self->cells[i].delta;
- }
- }
- }
-
- llm_graph_result_ptr llama_context::build_kv_self_shift(
- lm_ggml_context * ctx0,
- lm_ggml_cgraph * gf) const {
- auto res = std::make_unique<llm_graph_result>();
-
- const auto & hparams = model.hparams;
-
- const auto & n_layer = hparams.n_layer;
-
- const auto & n_embd_head_k = hparams.n_embd_head_k;
- //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
- //LM_GGML_ASSERT(kv_self->size == n_ctx);
-
- auto inp = std::make_unique<llm_graph_input_k_shift>(kv_self.get());
-
- inp->k_shift = lm_ggml_new_tensor_1d(ctx0, LM_GGML_TYPE_I32, cparams.n_ctx);
- lm_ggml_set_input(inp->k_shift);
-
- for (uint32_t il = 0; il < n_layer; ++il) {
- const int64_t n_head_kv = hparams.n_head_kv(il);
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-
- const bool is_swa = hparams.is_swa(il);
-
- // note: the swa rope params could become part of the cparams in the future
- // if we decide to make them configurable, like the non-sliding ones
- const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
- const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale;
-
- lm_ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il);
-
- lm_ggml_tensor * k =
- lm_ggml_view_3d(ctx0, kv_self->k_l[il],
- n_embd_head_k, n_head_kv, kv_self->size,
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k),
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- 0);
-
- lm_ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
-
- lm_ggml_build_forward_expand(gf, cur);
- }
-
- res->add_input(std::move(inp));
-
- return res;
- }
-
- llm_graph_result_ptr llama_context::build_kv_self_defrag(
- lm_ggml_context * ctx0,
- lm_ggml_cgraph * gf) const {
- auto res = std::make_unique<llm_graph_result>();
-
- const auto & hparams = model.hparams;
-
- const auto & ids = kv_self->defrag_info.ids;
-
- #if 0
- // CPU defrag
- //
- // TODO: optimizations are possible:
- // - multiple threads
- // - avoid copying to the host memory when already there
- //
- // likely not worth the effort, as we have lm_ggml_graph based defrag
- //
-
- const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
- const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
- const uint32_t kv_size = size;
-
- std::vector<uint8_t> buf_k;
- std::vector<uint8_t> buf_v;
-
- for (uint32_t il = 0; il < n_layer; ++il) {
- const size_t k_size_row = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa);
- const size_t k_size = lm_ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
- const size_t v_size_el = lm_ggml_type_size(v_l[il]->type);
- const size_t v_size = lm_ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
- buf_k.resize(k_size);
- buf_v.resize(v_size);
-
- lm_ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
- lm_ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
- // batch move [i, i+nm) to [id, id+nm)
- // note: cells can move only to a lower index
- for (uint32_t i = 0; i < n_kv; ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == n_kv) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < n_kv && ids[i + nm] == id + nm) {
- nm++;
- }
-
- // move keys
- {
- const int64_t os = i*k_size_row;
- const int64_t od = id*k_size_row;
-
- memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
- }
-
- // move values (note: they are transposed)
- {
- const int64_t os = i;
- const int64_t od = id;
-
- for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
- memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
- }
- }
-
- i += nm - 1;
- }
-
- lm_ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
- lm_ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
- }
- #else
- for (uint32_t i = 0; i < ids.size(); ++i) {
- const uint32_t id = ids[i];
-
- if (i == id || id == ids.size()) {
- continue;
- }
-
- uint32_t nm = 1;
-
- while (i + nm < ids.size() && ids[i + nm] == id + nm) {
- nm++;
- }
-
- for (uint32_t il = 0; il < hparams.n_layer; ++il) { // NOLINT
- const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
- const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
- lm_ggml_tensor * view_k_src = lm_ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*i));
-
- lm_ggml_tensor * view_k_dst = lm_ggml_view_2d(ctx0, kv_self->k_l[il],
- n_embd_k_gqa, nm,
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
- lm_ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa*id));
-
- lm_ggml_tensor * view_v_src;
- lm_ggml_tensor * view_v_dst;
-
- if (cparams.flash_attn) {
- // NOTE: the V cache is not transposed when using flash attention
- view_v_src = lm_ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
- lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*i));
-
- view_v_dst = lm_ggml_view_2d(ctx0, kv_self->v_l[il],
- n_embd_v_gqa, nm,
- lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa),
- lm_ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa*id));
- } else {
- view_v_src = lm_ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
- lm_ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- lm_ggml_row_size(kv_self->v_l[il]->type, i));
-
- view_v_dst = lm_ggml_view_2d(ctx0, kv_self->v_l[il],
- nm, n_embd_v_gqa,
- lm_ggml_row_size(kv_self->v_l[il]->type, kv_self->size),
- lm_ggml_row_size(kv_self->v_l[il]->type, id));
- }
-
- lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, view_k_src, view_k_dst));
- lm_ggml_build_forward_expand(gf, lm_ggml_cpy(ctx0, view_v_src, view_v_dst));
- }
-
- i += nm - 1;
- }
-
- //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
- #endif
-
- return res;
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+ return kv_self;
  }
 
  void llama_context::kv_self_update() {
- auto & kv = kv_self;
-
  bool need_reserve = false;
 
- if (kv->has_shift) {
- if (!kv->get_can_shift()) {
- LM_GGML_ABORT("The current context does not support K-shift");
- }
-
- LLAMA_LOG_DEBUG("%s: applying K-shift\n", __func__);
-
- // apply K-shift if needed
- if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
- lm_ggml_backend_sched_reset(sched.get());
-
- auto * gf = graph_init();
-
- auto res = build_kv_self_shift(ctx_compute.get(), gf);
-
- lm_ggml_backend_sched_alloc_graph(sched.get(), gf);
-
- res->set_inputs(nullptr);
-
- graph_compute(gf, false);
-
- need_reserve = true;
- }
-
- {
- kv->has_shift = false;
-
- for (uint32_t i = 0; i < kv->size; ++i) {
- kv->cells[i].delta = 0;
- }
- }
- }
-
- // defragment the KV cache if needed
- if (kv->do_defrag) {
- LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
- if (kv->defrag_prepare(graph_max_nodes())) {
- lm_ggml_backend_sched_reset(sched.get());
-
- auto * gf = graph_init();
-
- auto res = build_kv_self_defrag(ctx_compute.get(), gf);
-
- lm_ggml_backend_sched_alloc_graph(sched.get(), gf);
-
- res->set_inputs(nullptr);
-
- graph_compute(gf, false);
-
- need_reserve = true;
- }
-
- kv->do_defrag = false;
- }
+ need_reserve = kv_self->update(*this);
 
  // reserve a worst case graph if needed
  if (need_reserve) {
@@ -804,7 +462,7 @@ void llama_context::kv_self_update() {
  uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
  // simulate full KV cache
- kv_self->n = kv_self->size;
+ kv_self->set_full();
 
  llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
@@ -825,9 +483,6 @@ enum llama_pooling_type llama_context::pooling_type() const {
  }
 
  float * llama_context::get_logits() {
- // reorder logits for backward compatibility
- output_reorder();
-
  return logits;
  }
 
@@ -870,9 +525,6 @@ float * llama_context::get_logits_ith(int32_t i) {
  }
 
  float * llama_context::get_embeddings() {
- // reorder embeddings for backward compatibility
- output_reorder();
-
  return embd;
  }
 
@@ -1024,8 +676,8 @@ int llama_context::encode(llama_batch & inp_batch) {
  }
 
  // temporary allocate memory for the input batch if needed
- // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
+ // note: during encode, we always pass the full sequence starting from pos = 0
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
 
  const llama_batch & batch = batch_allocr.batch;
  const int32_t n_tokens = batch.n_tokens;
@@ -1054,7 +706,7 @@ int llama_context::encode(llama_batch & inp_batch) {
 
  const int64_t n_embd = hparams.n_embd;
 
- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
 
  const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
 
@@ -1188,9 +840,11 @@ int llama_context::decode(llama_batch & inp_batch) {
  return -1;
  }
 
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
  // temporary allocate memory for the input batch if needed
- // TODO: this is incorrect for multiple sequences because pos_max() is the maximum across all sequences
- llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->pos_max() + 1);
+ // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
+ llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
 
  const llama_batch & batch = batch_allocr.batch;
 
@@ -1202,7 +856,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  const int64_t n_tokens_all = batch.n_tokens;
  const int64_t n_embd = hparams.n_embd;
 
- llama_kv_cache_guard kv_guard(kv_self.get());
+ llama_kv_cache_guard kv_guard(kv_self);
 
  LM_GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
@@ -1243,11 +897,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  n_outputs_all = 1;
  }
 
- const bool logits_all = n_outputs_all == n_tokens_all;
-
- sbatch.from_batch(batch, n_embd,
- /* simple_split */ !kv_self->recurrent,
- /* logits_all */ logits_all);
+ llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
 
  // reserve output buffer
  if (output_reserve(n_outputs_all) < n_outputs_all) {
@@ -1261,22 +911,7 @@ int llama_context::decode(llama_batch & inp_batch) {
  int64_t n_outputs_prev = 0;
 
  while (sbatch.n_tokens > 0) {
- llama_ubatch ubatch = llama_ubatch();
-
- const auto & n_ubatch = cparams.n_ubatch;
-
- if (kv_self->recurrent) {
- if (embd_pooled) {
- // Pooled embeddings cannot be split across ubatches (yet)
- ubatch = sbatch.split_seq(cparams.n_ubatch);
- } else {
- // recurrent model architectures are easier to implement
- // with equal-length sequences
- ubatch = sbatch.split_equal(cparams.n_ubatch);
- }
- } else {
- ubatch = sbatch.split_simple(n_ubatch);
- }
+ llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
 
  // count the outputs in this u_batch
  {
@@ -1296,24 +931,12 @@ int llama_context::decode(llama_batch & inp_batch) {
  }
 
  // find KV slot
- {
- if (!kv_self->find_slot(ubatch)) {
- LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
-
- return 1;
- }
+ if (!kv_self->find_slot(ubatch)) {
+ LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
 
- if (!kv_self->recurrent) {
- // a heuristic, to avoid attending the full cache if it is not yet utilized
- // after enough generations, the benefit from this heuristic disappears
- // if we start defragmenting the cache, the benefit from this will be more important
- const uint32_t pad = kv_self->get_padding(cparams);
- kv_self->n = std::min(kv_self->size, std::max(pad, LM_GGML_PAD(kv_self->cell_max(), pad)));
- }
+ return 1;
  }
 
- //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self->n, kv_self->used, kv_self->head);
-
  lm_ggml_backend_sched_reset(sched.get());
  lm_ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
@@ -1427,43 +1050,68 @@ int llama_context::decode(llama_batch & inp_batch) {
  // finalize the batch processing
  kv_guard.commit();
 
+ // set to total number of outputs in the batch, for use in llama_get_logits_ith
+ n_outputs = n_outputs_all;
+
  // set output mappings
  {
  bool sorted_output = true;
 
- LM_GGML_ASSERT(sbatch.out_ids.size() == (size_t) n_outputs_all);
+ auto & out_ids = sbatch.out_ids;
+
+ LM_GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
 
  for (int64_t i = 0; i < n_outputs_all; ++i) {
- int64_t out_id = sbatch.out_ids[i];
+ int64_t out_id = out_ids[i];
  output_ids[out_id] = i;
  if (out_id != i) {
  sorted_output = false;
  }
  }
 
- if (sorted_output) {
- sbatch.out_ids.clear();
+ // make the outputs have the same order they had in the user-provided batch
+ // note: this is mostly relevant for recurrent models atm
+ if (!sorted_output) {
+ const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_embd = model.hparams.n_embd;
+
+ LM_GGML_ASSERT((size_t) n_outputs == out_ids.size());
+
+ // TODO: is there something more efficient which also minimizes swaps?
+ // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
+ for (int32_t i = 0; i < n_outputs - 1; ++i) {
+ int32_t j_min = i;
+ for (int32_t j = i + 1; j < n_outputs; ++j) {
+ if (out_ids[j] < out_ids[j_min]) {
+ j_min = j;
+ }
+ }
+ if (j_min == i) { continue; }
+ std::swap(out_ids[i], out_ids[j_min]);
+ if (logits_size > 0) {
+ for (uint32_t k = 0; k < n_vocab; k++) {
+ std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
+ }
+ }
+ if (embd_size > 0) {
+ for (uint32_t k = 0; k < n_embd; k++) {
+ std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
+ }
+ }
+ }
+ std::fill(output_ids.begin(), output_ids.end(), -1);
+ for (int32_t i = 0; i < n_outputs; ++i) {
+ output_ids[out_ids[i]] = i;
+ }
  }
  }
 
- // set to total number of outputs in the batch, for use in llama_get_logits_ith
- n_outputs = n_outputs_all;
-
  // wait for the computation to finish (automatically done when obtaining the model output)
  //synchronize();
 
  // decide if we need to defrag the kv cache
- if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
- // - do not defrag small contexts (i.e. < 2048 tokens)
- // - count the padding towards the number of used tokens
- const float fragmentation = kv_self->n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self->used + kv_self->get_padding(cparams))/float(kv_self->n)) : 0.0f;
-
- // queue defragmentation for next llama_kv_cache_update
- if (fragmentation > cparams.defrag_thold) {
- LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
-
- kv_self->defrag();
- }
+ if (cparams.defrag_thold > 0.0f) {
+ kv_self->defrag_sched(cparams.defrag_thold);
  }
 
  // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
@@ -1543,52 +1191,12 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
  // set all ids as invalid (negative)
  std::fill(output_ids.begin(), output_ids.end(), -1);
 
- lm_ggml_backend_buffer_clear(buf_output.get(), 0);
-
  this->n_outputs = 0;
  this->n_outputs_max = n_outputs_max;
 
  return n_outputs_max;
  }
 
- void llama_context::output_reorder() {
- auto & out_ids = sbatch.out_ids;
- if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
- const uint32_t n_embd = model.hparams.n_embd;
-
- LM_GGML_ASSERT((size_t) n_outputs == out_ids.size());
-
- // TODO: is there something more efficient which also minimizes swaps?
- // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
- for (int32_t i = 0; i < n_outputs - 1; ++i) {
- int32_t j_min = i;
- for (int32_t j = i + 1; j < n_outputs; ++j) {
- if (out_ids[j] < out_ids[j_min]) {
- j_min = j;
- }
- }
- if (j_min == i) { continue; }
- std::swap(out_ids[i], out_ids[j_min]);
- if (logits_size > 0) {
- for (uint32_t k = 0; k < n_vocab; k++) {
- std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
- }
- }
- if (embd_size > 0) {
- for (uint32_t k = 0; k < n_embd; k++) {
- std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
- }
- }
- }
- std::fill(output_ids.begin(), output_ids.end(), -1);
- for (int32_t i = 0; i < n_outputs; ++i) {
- output_ids[out_ids[i]] = i;
- }
- out_ids.clear();
- }
- }
-
  //
  // graph
  //
@@ -1625,7 +1233,7 @@ llm_graph_result_ptr llama_context::graph_build(
  /*.backend_cpu =*/ backend_cpu,
  /*.cvec =*/ &cvec,
  /*.loras =*/ &loras,
- /*.memory =*/ kv_self.get(),
+ /*.memory =*/ memory.get(),
  /*.cross =*/ &cross,
  /*.n_outputs =*/ n_outputs,
  /*.cb =*/ graph_get_cb(),
@@ -2029,8 +1637,6 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
  {
  LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);
 
- output_reorder();
-
  const auto n_outputs = this->n_outputs;
  const auto & output_ids = this->output_ids;
 
@@ -2084,6 +1690,8 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
  }
 
  LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
  kv_self->state_write(io);
 
  return io.n_bytes();
@@ -2168,6 +1776,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
  }
 
  LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
  kv_self->state_read(io);
 
  return io.n_bytes();
@@ -2176,6 +1786,8 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
  size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
  LM_GGML_UNUSED(seq_id);
 
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
  kv_self->state_write(io, seq_id);
 
  return io.n_bytes();
@@ -2184,6 +1796,8 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
  size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
  LM_GGML_UNUSED(seq_id);
 
+ llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
  kv_self->state_read(io, seq_id);
 
  return io.n_bytes();
@@ -2539,7 +2153,7 @@ void llama_kv_cache_seq_cp(
  llama_seq_id seq_id_dst,
  llama_pos p0,
  llama_pos p1) {
- return llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
+ llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
  }
 
  void llama_kv_self_seq_cp(
@@ -2553,14 +2167,14 @@ void llama_kv_self_seq_cp(
  return;
  }
 
- return kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+ kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
  }
 
  // deprecated
  void llama_kv_cache_seq_keep(
  llama_context * ctx,
  llama_seq_id seq_id) {
- return llama_kv_self_seq_keep(ctx, seq_id);
+ llama_kv_self_seq_keep(ctx, seq_id);
  }
 
  void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
@@ -2569,7 +2183,7 @@ void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
  return;
  }
 
- return kv->seq_keep(seq_id);
+ kv->seq_keep(seq_id);
  }
 
  // deprecated
@@ -2579,7 +2193,7 @@ void llama_kv_cache_seq_add(
  llama_pos p0,
  llama_pos p1,
  llama_pos delta) {
- return llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
+ llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
  }
 
  void llama_kv_self_seq_add(
@@ -2593,7 +2207,7 @@ void llama_kv_self_seq_add(
  return;
  }
 
- return kv->seq_add(seq_id, p0, p1, delta);
+ kv->seq_add(seq_id, p0, p1, delta);
  }
 
  // deprecated
@@ -2603,7 +2217,7 @@ void llama_kv_cache_seq_div(
  llama_pos p0,
  llama_pos p1,
  int d) {
- return llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
+ llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
  }
 
  void llama_kv_self_seq_div(
@@ -2617,7 +2231,7 @@ void llama_kv_self_seq_div(
  return;
  }
 
- return kv->seq_div(seq_id, p0, p1, d);
+ kv->seq_div(seq_id, p0, p1, d);
  }
 
  // deprecated
@@ -2636,7 +2250,7 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
 
  // deprecated
  void llama_kv_cache_defrag(llama_context * ctx) {
- return llama_kv_self_defrag(ctx);
+ llama_kv_self_defrag(ctx);
  }
 
  void llama_kv_self_defrag(llama_context * ctx) {
@@ -2645,7 +2259,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
  return;
  }
 
- return kv->defrag();
+ // force defrag
+ kv->defrag_sched(-1.0f);
  }
 
  // deprecated