@fugood/llama.node 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +12 -12
  3. package/src/llama.cpp/CMakeLists.txt +0 -1
  4. package/src/llama.cpp/common/arg.cpp +17 -0
  5. package/src/llama.cpp/common/chat.cpp +37 -20
  6. package/src/llama.cpp/common/chat.h +2 -0
  7. package/src/llama.cpp/common/common.h +4 -0
  8. package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
  9. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  10. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  11. package/src/llama.cpp/ggml/include/ggml.h +181 -10
  12. package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
  17. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
  20. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
  21. package/src/llama.cpp/include/llama.h +1 -0
  22. package/src/llama.cpp/src/llama-arch.cpp +108 -2
  23. package/src/llama.cpp/src/llama-arch.h +7 -0
  24. package/src/llama.cpp/src/llama-batch.cpp +27 -1
  25. package/src/llama.cpp/src/llama-batch.h +8 -1
  26. package/src/llama.cpp/src/llama-chat.cpp +15 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-graph.cpp +95 -81
  29. package/src/llama.cpp/src/llama-graph.h +43 -16
  30. package/src/llama.cpp/src/llama-hparams.cpp +2 -1
  31. package/src/llama.cpp/src/llama-hparams.h +1 -0
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
  36. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  37. package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
  38. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  39. package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
  40. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  41. package/src/llama.cpp/src/llama-memory.h +3 -0
  42. package/src/llama.cpp/src/llama-model.cpp +1374 -210
  43. package/src/llama.cpp/src/llama-model.h +3 -0
  44. package/src/llama.cpp/src/llama-vocab.cpp +8 -1
  45. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -156,6 +156,13 @@ llama_kv_cache_unified::llama_kv_cache_unified(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
+
+    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
+    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) : 0;
+
+    if (!supports_set_rows) {
+        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
+    }
 }
 
 void llama_kv_cache_unified::clear(bool data) {
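Note: the new write path is opt-in in this release. A minimal standalone sketch of the same environment-variable gate (illustrative only; just the variable name and default are taken from the diff):

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
    // LLAMA_SET_ROWS=1 opts in to the new ggml_set_rows() write path;
    // unset or 0 keeps the old ggml_cpy() path (the default in this release)
    const char * env = getenv("LLAMA_SET_ROWS");
    const int supports_set_rows = env ? atoi(env) : 0;

    if (!supports_set_rows) {
        fprintf(stderr, "LLAMA_SET_ROWS=0, using old ggml_cpy() method\n");
    }
    return 0;
}
```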
@@ -353,13 +360,18 @@ llama_memory_context_ptr llama_kv_cache_unified::init_batch(
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }
 
-        auto heads = prepare(ubatches);
-        if (heads.empty()) {
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
+        auto sinfos = prepare(ubatches);
+        if (sinfos.empty()) {
             break;
         }
 
         return std::make_unique<llama_kv_cache_unified_context>(
-                this, std::move(heads), std::move(ubatches));
+                this, std::move(sinfos), std::move(ubatches));
     } while (false);
 
     return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
@@ -402,12 +414,13 @@ llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lct
     return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo));
 }
 
-llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
-    llama_kv_cache_unified::ubatch_heads res;
+llama_kv_cache_unified::slot_info_vec_t llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
+    llama_kv_cache_unified::slot_info_vec_t res;
 
     struct state {
        uint32_t head_old; // old position of the head, before placing the ubatch
-       uint32_t head_new; // new position of the head, after placing the ubatch
+
+       slot_info sinfo; // slot info for the ubatch
 
        llama_kv_cells_unified cells; // copy of the old cells, before placing the ubatch
    };
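The state struct above is one entry of the recovery stack used by prepare(): each ubatch is applied speculatively so that subsequent ubatches see the cells it occupies, and afterwards everything is rolled back in reverse order. A simplified sketch of that save/apply/rollback pattern, with a plain vector standing in for llama_kv_cells_unified:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

int main() {
    std::vector<int32_t> cells(16, -1);                    // -1 = empty cell
    std::vector<std::vector<uint32_t>> ubatch_idxs = {{0, 1}, {2, 3}};

    // recovery stack: (cell indices, their previous contents)
    std::vector<std::pair<std::vector<uint32_t>, std::vector<int32_t>>> states;

    for (const auto & idxs : ubatch_idxs) {
        std::vector<int32_t> old;
        for (auto i : idxs) old.push_back(cells[i]);       // save the old state
        states.emplace_back(idxs, std::move(old));
        for (auto i : idxs) cells[i] = 0;                  // now emplace the ubatch
    }

    // iterate backwards and restore the cells to their original state
    for (auto it = states.rbegin(); it != states.rend(); ++it) {
        for (size_t j = 0; j < it->first.size(); ++j) {
            cells[it->first[j]] = it->second[j];
        }
    }
    return 0;
}
```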
@@ -418,26 +431,29 @@ llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
+        // non-continuous slots require support for ggml_set_rows()
+        const bool cont = supports_set_rows ? false : true;
+
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const int32_t head_new = find_slot(ubatch);
-        if (head_new < 0) {
+        const auto sinfo_new = find_slot(ubatch, cont);
+        if (sinfo_new.empty()) {
             success = false;
             break;
         }
 
         // remember the position that we found
-        res.push_back(head_new);
+        res.push_back(sinfo_new);
 
         // store the old state of the cells in the recovery stack
-        states.push_back({head, (uint32_t) head_new, cells.cp(head_new, ubatch.n_tokens)});
+        states.push_back({head, sinfo_new, cells.cp(sinfo_new.idxs)});
 
         // now emplace the ubatch
-        apply_ubatch(head_new, ubatch);
+        apply_ubatch(sinfo_new, ubatch);
     }
 
     // iterate backwards and restore the cells to their original state
     for (auto it = states.rbegin(); it != states.rend(); ++it) {
-        cells.set(it->head_new, it->cells);
+        cells.set(it->sinfo.idxs, it->cells);
         head = it->head_old;
     }
 
@@ -539,7 +555,7 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
     return updated;
 }
 
-int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
+llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
     const uint32_t n_tokens = ubatch.n_tokens;
 
     uint32_t head_cur = this->head;
@@ -552,7 +568,7 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 
     if (n_tokens > cells.size()) {
         LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %u\n", __func__, n_tokens, cells.size());
-        return -1;
+        return { };
     }
 
     if (debug > 0) {
@@ -615,15 +631,26 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
 
     uint32_t n_tested = 0;
 
+    // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
+    // for non-continuous slots, we test the tokens one by one
+    const uint32_t n_test = cont ? n_tokens : 1;
+
+    slot_info res;
+
+    auto & idxs = res.idxs;
+
+    idxs.reserve(n_tokens);
+
     while (true) {
-        if (head_cur + n_tokens > cells.size()) {
+        if (head_cur + n_test > cells.size()) {
             n_tested += cells.size() - head_cur;
             head_cur = 0;
             continue;
         }
 
-        bool found = true;
-        for (uint32_t i = 0; i < n_tokens; i++) {
+        for (uint32_t i = 0; i < n_test; i++) {
+            const auto idx = head_cur;
+
             //const llama_pos pos = ubatch.pos[i];
             //const llama_seq_id seq_id = ubatch.seq_id[i][0];
 
633
660
  // - (disabled) mask causally, if the sequence is the same as the one we are inserting
634
661
  // - mask SWA, using current max pos for that sequence in the cache
635
662
  // always insert in the cell with minimum pos
636
- bool can_use = cells.is_empty(head_cur + i);
663
+ bool can_use = cells.is_empty(idx);
637
664
 
638
- if (!can_use && cells.seq_count(head_cur + i) == 1) {
639
- const llama_pos pos_cell = cells.pos_get(head_cur + i);
665
+ if (!can_use && cells.seq_count(idx) == 1) {
666
+ const llama_pos pos_cell = cells.pos_get(idx);
640
667
 
641
668
  // (disabled) causal mask
642
669
  // note: it's better to purge any "future" tokens beforehand
643
- //if (cells.seq_has(head_cur + i, seq_id)) {
670
+ //if (cells.seq_has(idx, seq_id)) {
644
671
  // can_use = pos_cell >= pos;
645
672
  //}
646
673
 
647
674
  if (!can_use) {
648
- const llama_seq_id seq_id_cell = cells.seq_get(head_cur + i);
675
+ const llama_seq_id seq_id_cell = cells.seq_get(idx);
649
676
 
650
677
  // SWA mask
651
678
  if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
@@ -654,28 +681,39 @@ int32_t llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) const {
                 }
             }
 
-            if (!can_use) {
-                found = false;
-                head_cur += i + 1;
-                n_tested += i + 1;
+            head_cur++;
+            n_tested++;
+
+            if (can_use) {
+                idxs.push_back(idx);
+            } else {
                 break;
             }
         }
 
-        if (found) {
+        if (idxs.size() == n_tokens) {
             break;
         }
 
+        if (cont) {
+            idxs.clear();
+        }
+
         if (n_tested >= cells.size()) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
-            return -1;
+            return { };
         }
     }
 
-    return head_cur;
+    // we didn't find a suitable slot - return empty result
+    if (idxs.size() < n_tokens) {
+        res.clear();
+    }
+
+    return res;
 }
 
-void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch) {
+void llama_kv_cache_unified::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
     // keep track of the max sequence position that we would overwrite with this ubatch
    // for non-SWA cache, this would be always empty
    llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
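To summarize the find_slot() rewrite above: with cont == true the collected run is discarded whenever an unusable cell is hit, so only a contiguous block can succeed; with cont == false every usable cell is kept individually. A toy version over a plain occupancy vector (it ignores the real ring-buffer wrap-around and the SWA/sequence checks):

```cpp
#include <cstdint>
#include <optional>
#include <vector>

// collect n_tokens free cells; if cont, they must form one contiguous run
static std::optional<std::vector<uint32_t>> find_slot(
        const std::vector<bool> & used, uint32_t n_tokens, bool cont) {
    std::vector<uint32_t> idxs;
    for (uint32_t i = 0; i < used.size() && idxs.size() < n_tokens; ++i) {
        if (!used[i]) {
            idxs.push_back(i);
        } else if (cont) {
            idxs.clear(); // an occupied cell breaks the run - start over
        }
    }
    if (idxs.size() < n_tokens) {
        return std::nullopt; // no suitable slot - empty result
    }
    return idxs;
}

int main() {
    const std::vector<bool> used = {false, true, false, false, true, false};
    const auto a = find_slot(used, 3, /*cont =*/ false); // ok: cells {0, 2, 3}
    const auto b = find_slot(used, 3, /*cont =*/ true);  // fails: longest free run is 2
    return (a && !b) ? 0 : 1;
}
```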
@@ -683,22 +721,26 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
         seq_pos_max_rm[s] = -1;
     }
 
+    assert(ubatch.n_tokens == sinfo.idxs.size());
+
     for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
-        if (!cells.is_empty(head_cur + i)) {
-            assert(cells.seq_count(head_cur + i) == 1);
+        const auto idx = sinfo.idxs.at(i);
 
-            const llama_seq_id seq_id = cells.seq_get(head_cur + i);
-            const llama_pos   pos    = cells.pos_get(head_cur + i);
+        if (!cells.is_empty(idx)) {
+            assert(cells.seq_count(idx) == 1);
+
+            const llama_seq_id seq_id = cells.seq_get(idx);
+            const llama_pos   pos    = cells.pos_get(idx);
 
             seq_pos_max_rm[seq_id] = std::max(seq_pos_max_rm[seq_id], pos);
 
-            cells.rm(head_cur + i);
+            cells.rm(idx);
         }
 
-        cells.pos_set(head_cur + i, ubatch.pos[i]);
+        cells.pos_set(idx, ubatch.pos[i]);
 
         for (int32_t s = 0; s < ubatch.n_seq_id[i]; s++) {
-            cells.seq_add(head_cur + i, ubatch.seq_id[i][s]);
+            cells.seq_add(idx, ubatch.seq_id[i][s]);
         }
     }
 
@@ -719,7 +761,7 @@ void llama_kv_cache_unified::apply_ubatch(uint32_t head_cur, const llama_ubatch
     }
 
     // move the head at the end of the slot
-    head = head_cur + ubatch.n_tokens;
+    head = sinfo.idxs.back() + 1;
 }
 
 bool llama_kv_cache_unified::get_can_shift() const {
@@ -772,47 +814,133 @@ ggml_tensor * llama_kv_cache_unified::get_v(ggml_context * ctx, int32_t il, uint
             0);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const {
+ggml_tensor * llama_kv_cache_unified::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
+    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
+    k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
+
+    if (k_idxs && supports_set_rows) {
+        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    }
+
+    // TODO: fallback to old ggml_cpy() method for backwards compatibility
+    //       will be removed when ggml_set_rows() is adopted by all backends
+
     ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*hparams.n_embd_k_gqa(il),
-            ggml_row_size(k->type, hparams.n_embd_k_gqa(il))*head_cur);
+            n_tokens*n_embd_k_gqa,
+            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
 
     return ggml_cpy(ctx, k_cur, k_view);
 }
 
-ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const {
+ggml_tensor * llama_kv_cache_unified::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;
 
+    const int64_t n_embd_v_gqa = v->ne[0];
     const int64_t n_tokens = v_cur->ne[2];
 
-    v_cur = ggml_reshape_2d(ctx, v_cur, hparams.n_embd_v_gqa(il), n_tokens);
+    v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
+
+    if (v_idxs && supports_set_rows) {
+        if (!v_trans) {
+            return ggml_set_rows(ctx, v, v_cur, v_idxs);
+        }
+
+        // the row becomes a single element
+        ggml_tensor * v_view = ggml_reshape_3d(ctx, v, 1, v->ne[1], v->ne[0]);
+
+        // note: the V cache is transposed when not using flash attention
+        v_cur = ggml_permute(ctx, ggml_reshape_3d(ctx, v_cur, v_cur->ne[0], 1, v_cur->ne[1]), 2, 0, 1, 3);
+
+        // note: we can be more explicit here at the cost of extra cont
+        //       however, above we take advantage that a row of single element is always continuous regardless of the row stride
+        //v_cur = ggml_transpose(ctx, v_cur);
+        //v_cur = ggml_cont_3d(ctx, v_cur, 1, v_cur->ne[0], v_cur->ne[1]);
+
+        // we broadcast the KV indices n_embd_v_gqa times
+        // v      [1, n_kv,     n_embd_v_gqa]
+        // v_cur  [1, n_tokens, n_embd_v_gqa]
+        // v_idxs [n_tokens, 1, 1]
+        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+    }
+
+    // TODO: fallback to old ggml_cpy() method for backwards compatibility
+    //       will be removed when ggml_set_rows() is adopted by all backends
 
     ggml_tensor * v_view = nullptr;
 
     if (!v_trans) {
         v_view = ggml_view_1d(ctx, v,
-                n_tokens*hparams.n_embd_v_gqa(il),
-                ggml_row_size(v->type, hparams.n_embd_v_gqa(il))*head_cur);
+                n_tokens*n_embd_v_gqa,
+                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
     } else {
-        // note: the V cache is transposed when not using flash attention
-        v_view = ggml_view_2d(ctx, v, n_tokens, hparams.n_embd_v_gqa(il),
-                (v->ne[1])*ggml_element_size(v),
-                (head_cur)*ggml_element_size(v));
-
         v_cur = ggml_transpose(ctx, v_cur);
+
+        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
+                (v->ne[1]    )*ggml_element_size(v),
+                (sinfo.head())*ggml_element_size(v));
     }
 
     return ggml_cpy(ctx, v_cur, v_view);
 }
 
+ggml_tensor * llama_kv_cache_unified::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+    const uint32_t n_tokens = ubatch.n_tokens;
+
+    ggml_tensor * k_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+
+    ggml_set_input(k_idxs);
+
+    return k_idxs;
+}
+
+ggml_tensor * llama_kv_cache_unified::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+    const uint32_t n_tokens = ubatch.n_tokens;
+
+    ggml_tensor * v_idxs = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, n_tokens);
+
+    ggml_set_input(v_idxs);
+
+    return v_idxs;
+}
+
+void llama_kv_cache_unified::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+    if (!supports_set_rows) {
+        return;
+    }
+
+    const uint32_t n_tokens = ubatch->n_tokens;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    int64_t * data = (int64_t *) dst->data;
+
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        data[i] = sinfo.idxs.at(i);
+    }
+}
+
+void llama_kv_cache_unified::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
+    if (!supports_set_rows) {
+        return;
+    }
+
+    const uint32_t n_tokens = ubatch->n_tokens;
+
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+    int64_t * data = (int64_t *) dst->data;
+
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        data[i] = sinfo.idxs.at(i);
+    }
+}
+
 void llama_kv_cache_unified::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     const uint32_t n_tokens = ubatch->n_tokens;
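The primitive that enables the non-contiguous writes above is ggml_set_rows(ctx, dst, src, idxs): it scatters the rows of src into dst at the row indices supplied by the I64 tensor idxs, replacing one ggml_cpy() into a contiguous view per slot. A plain-loop model of its effect on row-major buffers (illustrative only, not the ggml kernel):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// dst has n_cols columns per row; row i of src is written to row idxs[i] of dst
static void set_rows_ref(std::vector<float> & dst, const std::vector<float> & src,
                         const std::vector<int64_t> & idxs, size_t n_cols) {
    for (size_t i = 0; i < idxs.size(); ++i) {
        for (size_t c = 0; c < n_cols; ++c) {
            dst[idxs[i]*n_cols + c] = src[i*n_cols + c];
        }
    }
}

int main() {
    const size_t n_cols = 2;
    std::vector<float>   k(8*n_cols, 0.0f);     // K buffer: 8 cells
    std::vector<float>   k_cur  = {1, 1, 2, 2}; // rows for 2 new tokens
    std::vector<int64_t> k_idxs = {5, 2};       // non-contiguous cell indices

    set_rows_ref(k, k_cur, k_idxs, n_cols);     // token 0 -> cell 5, token 1 -> cell 2

    printf("cell 2: %g %g | cell 5: %g %g\n",
           k[2*n_cols], k[2*n_cols + 1], k[5*n_cols], k[5*n_cols + 1]);
    return 0;
}
```

For the transposed V cache, the diff reshapes the destination so that each row is a single element; the same row scatter then addresses individual elements, with the indices broadcast across n_embd_v_gqa.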
 
@@ -1552,13 +1680,15 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell
         ubatch.seq_id[i] = &dest_seq_id;
     }
 
-    const auto head_cur = find_slot(ubatch);
-    if (head_cur < 0) {
+    const auto sinfo = find_slot(ubatch, true);
+    if (sinfo.empty()) {
         LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
         return false;
     }
 
-    apply_ubatch(head_cur, ubatch);
+    apply_ubatch(sinfo, ubatch);
+
+    const auto head_cur = sinfo.head();
 
     // keep the head at the old position because we will read the KV data into it in state_read_data()
     head = head_cur;
@@ -1744,7 +1874,11 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_stat
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
     n_kv = kv->get_size();
-    head = 0;
+
+    // create a dummy slot info - the actual data is irrelevant. we just need to build the graph
+    sinfos.resize(1);
+    sinfos[0].idxs.resize(1);
+    sinfos[0].idxs[0] = 0;
 }
 
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(
@@ -1759,8 +1893,8 @@ llama_kv_cache_unified_context::llama_kv_cache_unified_context(
 
 llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
-        llama_kv_cache_unified::ubatch_heads heads,
-        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) {
+        llama_kv_cache_unified::slot_info_vec_t sinfos,
+        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), sinfos(std::move(sinfos)), ubatches(std::move(ubatches)) {
 }
 
 llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
@@ -1768,7 +1902,7 @@ llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
 bool llama_kv_cache_unified_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
-    if (++i_next >= ubatches.size()) {
+    if (++i_cur >= ubatches.size()) {
         return false;
     }
 
@@ -1776,7 +1910,7 @@ bool llama_kv_cache_unified_context::next() {
 }
 
 bool llama_kv_cache_unified_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
 
     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {
@@ -1785,10 +1919,9 @@ bool llama_kv_cache_unified_context::apply() {
         return true;
     }
 
-    kv->apply_ubatch(heads[i_next], ubatches[i_next]);
+    kv->apply_ubatch(sinfos[i_cur], ubatches[i_cur]);
 
     n_kv = kv->get_n_kv();
-    head = heads[i_next];
 
     return true;
 }
@@ -1800,7 +1933,7 @@ llama_memory_status llama_kv_cache_unified_context::get_status() const {
 const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
-    return ubatches[i_next];
+    return ubatches[i_cur];
 }
 
 uint32_t llama_kv_cache_unified_context::get_n_kv() const {
@@ -1815,18 +1948,34 @@ ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t
     return kv->get_v(ctx, il, n_kv);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
-    return kv->cpy_k(ctx, k_cur, il, head);
+ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
+    return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const {
+    return kv->cpy_v(ctx, v_cur, v_idxs, il, sinfos[i_cur]);
+}
+
+ggml_tensor * llama_kv_cache_unified_context::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+    return kv->build_input_k_idxs(ctx, ubatch);
 }
 
-ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
-    return kv->cpy_v(ctx, v_cur, il, head);
+ggml_tensor * llama_kv_cache_unified_context::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
+    return kv->build_input_v_idxs(ctx, ubatch);
 }
 
 void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const {
     kv->set_input_k_shift(dst);
 }
 
+void llama_kv_cache_unified_context::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+    kv->set_input_k_idxs(dst, ubatch, sinfos[i_cur]);
+}
+
+void llama_kv_cache_unified_context::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+    kv->set_input_v_idxs(dst, ubatch, sinfos[i_cur]);
+}
+
 void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     kv->set_input_kq_mask(dst, ubatch, causal_attn);
 }
package/src/llama.cpp/src/llama-kv-cache-unified.h

@@ -24,8 +24,6 @@ public:
     // this callback is used to filter out layers that should not be included in the cache
     using layer_filter_cb = std::function<bool(int32_t il)>;
 
-    using ubatch_heads = std::vector<uint32_t>;
-
     struct defrag_info {
         bool empty() const {
             return ids.empty();
@@ -37,6 +35,32 @@ public:
         std::vector<uint32_t> ids;
     };
 
+    // for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
+    // KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]]
+    struct slot_info {
+        // data for ggml_set_rows
+        using idx_vec_t = std::vector<uint32_t>;
+
+        idx_vec_t idxs;
+
+        uint32_t head() const {
+            return idxs.at(0);
+        }
+
+        bool empty() const {
+            return idxs.empty();
+        }
+
+        void clear() {
+            idxs.clear();
+        }
+
+        // TODO: implement
+        //std::vector<idx_vec_t> seq_idxs;
+    };
+
+    using slot_info_vec_t = std::vector<slot_info>;
+
     llama_kv_cache_unified(
             const llama_model & model,
             layer_filter_cb  && filter,
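In effect, slot_info generalizes the old single head offset: token i of a ubatch is written to cell idxs[i], and head() reduces to the old behavior when the indices happen to be contiguous. A small usage sketch against a standalone copy of the struct:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// standalone copy of the struct from the diff, for illustration
struct slot_info {
    using idx_vec_t = std::vector<uint32_t>;

    idx_vec_t idxs;

    uint32_t head() const { return idxs.at(0); }
    bool empty() const { return idxs.empty(); }
    void clear() { idxs.clear(); }
};

int main() {
    slot_info sinfo;
    sinfo.idxs = {7, 9, 10}; // token[i] -> goes to cells[idxs[i]]

    for (size_t i = 0; i < sinfo.idxs.size(); ++i) {
        printf("token %zu -> cell %u\n", i, sinfo.idxs[i]);
    }

    // the legacy ggml_cpy() fallback only uses the first index as the slot head
    printf("head = %u\n", sinfo.head());
    return 0;
}
```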
@@ -102,30 +126,37 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
-    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const;
-    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const;
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
 
     //
     // preparation API
     //
 
-    // find places for the provided ubatches in the cache, returns the head locations
+    // find places for the provided ubatches in the cache, returns the slot infos
     // return empty vector on failure
-    ubatch_heads prepare(const std::vector<llama_ubatch> & ubatches);
+    slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
 
     bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
 
-    // return the cell position where we can insert the ubatch
-    // return -1 on failure to find a contiguous slot of kv cells
-    int32_t find_slot(const llama_ubatch & ubatch) const;
+    // find a slot of kv cells that can hold the ubatch
+    // if cont == true, then the slot must be continuous
+    // return empty slot_info on failure
+    slot_info find_slot(const llama_ubatch & ubatch, bool cont) const;
 
-    // emplace the ubatch context into slot: [head_cur, head_cur + ubatch.n_tokens)
-    void apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch);
+    // emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
+    void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
 
     //
-    // set_input API
+    // input API
     //
 
+    ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+    ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+
+    void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
+    void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
+
     void set_input_kq_mask  (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
     void set_input_k_shift  (ggml_tensor * dst) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
157
188
  // SWA
158
189
  const uint32_t n_swa = 0;
159
190
 
191
+ // env: LLAMA_KV_CACHE_DEBUG
160
192
  int debug = 0;
161
193
 
194
+ // env: LLAMA_SET_ROWS (temporary)
195
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14285
196
+ int supports_set_rows = false;
197
+
162
198
  const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
163
199
 
164
200
  std::vector<ggml_context_ptr> ctxs;
@@ -211,8 +247,8 @@ private:
 class llama_kv_cache_unified_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using ubatch_heads = llama_kv_cache_unified::ubatch_heads;
-    using defrag_info  = llama_kv_cache_unified::defrag_info;
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using defrag_info     = llama_kv_cache_unified::defrag_info;
 
     // used for errors
     llama_kv_cache_unified_context(llama_memory_status status);
@@ -231,7 +267,7 @@ public:
     // used to create a batch processing context from a batch
     llama_kv_cache_unified_context(
             llama_kv_cache_unified * kv,
-            ubatch_heads heads,
+            slot_info_vec_t sinfos,
             std::vector<llama_ubatch> ubatches);
 
     virtual ~llama_kv_cache_unified_context();
@@ -257,11 +293,16 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
 
     // store k_cur and v_cur in the cache based on the provided head location
-    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
-    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
+
+    ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
+    ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
 
-    void set_input_k_shift(ggml_tensor * dst) const;
+    void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+    void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
+    void set_input_k_shift (ggml_tensor * dst) const;
     void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
     void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
 
@@ -283,10 +324,10 @@ private:
     // batch processing context
     //
 
-    // the index of the next ubatch to process
-    size_t i_next = 0;
+    // the index of the cur ubatch to process
+    size_t i_cur = 0;
 
-    ubatch_heads heads;
+    slot_info_vec_t sinfos;
 
     std::vector<llama_ubatch> ubatches;
 
@@ -297,7 +338,4 @@ private:
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // as the cache gets filled, the benefit from this heuristic disappears
     int32_t n_kv;
-
-    // the beginning of the current slot in which the ubatch will be inserted
-    int32_t head;
 };