npm - @novastera-oss/llamarn - Versions diffs - 0.4.1 → 0.4.3-beta4 - Mend

@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (976) hide show

package/cpp/llama.cpp/src/llama-kv-cache.h CHANGED Viewed

@@ -19,8 +19,6 @@ struct llama_context;
 class llama_kv_cache : public llama_memory_i {
 public:
-    static uint32_t get_padding(const llama_cparams & cparams);
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -121,6 +119,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
     // state write/load
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
@@ -212,10 +212,11 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
+    // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
     // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
@@ -316,9 +317,17 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
     // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    //   - k_cur  [n_embd_head_k, n_head_k, n_tokens]
+    //   - k_idxs [n_tokens]
+    //   - v_cur  [n_embd_head_v, n_head_v, n_tokens]
+    //   - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;
+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    //   helps understand the implementation logic of cpy_k and cpy_v
     ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
     ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;

package/cpp/llama.cpp/src/llama-kv-cells.h CHANGED Viewed

@@ -5,9 +5,27 @@
 #include <bitset>
 #include <cassert>
-#include <vector>
-#include <set>
+#include <cstring>
 #include <map>
+#include <set>
+#include <vector>
+struct llama_kv_cell_ext {
+    // 2D spatial positions, typically used for M-RoPE
+    llama_pos x = 0;
+    llama_pos y = 0;
+    // return true if the current 2D spatial position is greater than other
+    bool is_2d_gt(llama_pos ox, llama_pos oy) const {
+        return (y > oy) || (y == oy && x > ox);
+    }
+    void reset() {
+        static_assert(std::is_trivially_copyable_v<llama_kv_cell_ext>);
+        memset(this, 0, sizeof(*this));
+    }
+};
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
@@ -16,6 +34,7 @@ public:
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {
             pos[i]   = -1;
+            ext[i].reset();
             shift[i] =  0;
             seq[i].reset();
         }
@@ -43,6 +62,7 @@ public:
     void resize(uint32_t n) {
         pos.resize(n);
+        ext.resize(n);
         shift.resize(n);
         seq.resize(n);
@@ -108,6 +128,7 @@ public:
             const auto idx = i + j;
             res.pos[j] = pos[idx];
+            res.ext[j] = ext[idx];
             res.seq[j] = seq[idx];
             assert(shift[idx] == 0);
@@ -126,6 +147,7 @@ public:
             const auto idx = idxs[j];
             res.pos[j] = pos[idx];
+            res.ext[j] = ext[idx];
             res.seq[j] = seq[idx];
             assert(shift[idx] == 0);
@@ -154,6 +176,7 @@ public:
             }
             pos[idx] = other.pos[j];
+            ext[idx] = other.ext[j];
             seq[idx] = other.seq[j];
             if (pos[idx] != -1) {
@@ -184,6 +207,7 @@ public:
             }
             pos[idx] = other.pos[j];
+            ext[idx] = other.ext[j];
             seq[idx] = other.seq[j];
             if (pos[idx] != -1) {
@@ -203,6 +227,7 @@ public:
         seq[i].reset();
         pos[i] = -1;
+        ext[i].reset();
         shift[i] = 0;
         used.erase(i);
@@ -221,6 +246,7 @@ public:
         if (seq[i].none()) {
             pos[i] = -1;
+            ext[i].reset();
             shift[i] = 0;
             used.erase(i);
@@ -250,6 +276,7 @@ public:
             seq[i].reset();
             pos[i] = -1;
+            ext[i].reset();
             shift[i] = 0;
             used.erase(i);
@@ -340,6 +367,13 @@ public:
         return pos[i];
     }
+    const llama_kv_cell_ext & ext_get(uint32_t i) const {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+        return ext[i];
+    }
     // note: call only if the cell is not empty
     llama_pos get_shift(uint32_t i) const {
         assert(i < pos.size());
@@ -368,6 +402,11 @@ public:
         used.insert(i);
     }
+    void ext_set(uint32_t i, llama_kv_cell_ext p) {
+        assert(i < ext.size());
+        ext[i] = p;
+    }
     // pos[i] = pos[i] + d
     // sets "has_shift" to true
     // note: call only if the cell is not empty
@@ -424,6 +463,9 @@ private:
     std::vector<llama_pos> pos;
+    // stores extra info per cell
+    std::vector<llama_kv_cell_ext> ext;
     // this array accumulates any applied shifts to the pos array since the last reset_shift() call
     // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
     //

package/cpp/llama.cpp/src/llama-memory-hybrid.cpp CHANGED Viewed

@@ -73,7 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch, false);
+                // TODO: non-sequential equal split can be done if using unified KV cache
+                //       for simplicity, we always use sequential equal split for now
+                ubatch = balloc.split_equal(n_ubatch, true);
             }
             if (ubatch.n_tokens == 0) {
@@ -166,18 +168,26 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
-    GGML_UNUSED(flags);
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
-    mem_attn->state_write(io, seq_id);
-    mem_recr->state_write(io, seq_id);
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_write(io, seq_id, flags);
+    }
+    mem_recr->state_write(io, seq_id, flags);
 }
 void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
-    GGML_UNUSED(flags);
-    mem_attn->state_read(io, seq_id);
-    mem_recr->state_read(io, seq_id);
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
+        mem_attn->state_read(io, seq_id, flags);
+    }
+    mem_recr->state_read(io, seq_id, flags);
 }
 llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {

package/cpp/llama.cpp/src/llama-memory-hybrid.h CHANGED Viewed

@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
     // state write/load
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;

package/cpp/llama.cpp/src/llama-memory-recurrent.cpp CHANGED Viewed

@@ -7,6 +7,7 @@
 #include <algorithm>
 #include <cassert>
+#include <cstring>
 #include <limits>
 #include <map>
 #include <stdexcept>
@@ -32,8 +33,15 @@ llama_memory_recurrent::llama_memory_recurrent(
     cells.clear();
     cells.resize(mem_size);
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
     // create a context for each buffer type
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -48,13 +56,12 @@ llama_memory_recurrent::llama_memory_recurrent(
                 return nullptr;
             }
-            ctx_map[buft] = ctx;
-            ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     };
     r_l.resize(n_layer);
@@ -93,17 +100,14 @@ llama_memory_recurrent::llama_memory_recurrent(
     }
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
-    for (auto it : ctx_map) {
-        auto * buft = it.first;
-        auto * ctx  = it.second;
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+    for (auto & [buft, ctx] : ctx_map) {
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
         if (!buf) {
             throw std::runtime_error("failed to allocate buffer for rs cache");
         }
         ggml_backend_buffer_clear(buf, 0);
         LLAMA_LOG_INFO("%s: %10s RS buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
-        bufs.emplace_back(buf);
+        ctxs_bufs.emplace_back(std::move(ctx), buf);
     }
     {
@@ -129,13 +133,14 @@ void llama_memory_recurrent::clear(bool data) {
     used = 0;
     if (data) {
-        for (auto & buf : bufs) {
+        for (auto & [_, buf] : ctxs_bufs) {
             ggml_backend_buffer_clear(buf.get(), 0);
         }
     }
 }
 bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    //printf("[DEBUG] calling llama_memory_recurrent::seq_rm` with `seq_id=%d, p0=%d, p1=%d`\n", seq_id, p0, p1);
     uint32_t new_head = size;
     if (p0 < 0) {
@@ -146,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         p1 = std::numeric_limits<llama_pos>::max();
     }
-    // models like Mamba or RWKV can't have a state partially erased
+    // models like Mamba or RWKV can't have a state partially erased at the end
+    // of the sequence because their state isn't preserved for previous tokens
     if (seq_id >= (int64_t) size) {
         // could be fatal
         return false;
@@ -155,8 +161,9 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
         int32_t & tail_id = cells[seq_id].tail;
         if (tail_id >= 0) {
             const auto & cell = cells[tail_id];
-            // partial intersection is invalid
-            if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
+            // partial intersection is invalid if it includes the final pos
+            if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
                 return false;
             }
             // invalidate tails which will be cleared
@@ -167,6 +174,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     } else {
         // seq_id is negative, then the range should include everything or nothing
         if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
+            //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: `seq_id` is negative, so returning false\n");
             return false;
         }
     }
@@ -359,6 +367,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const auto & [_, buf] : ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    }
+    return ret;
+}
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();
@@ -371,7 +387,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch, false);
+                // TODO: non-sequential equal split can be done if using unified KV cache
+                //       for simplicity, we always use sequential equal split for now
+                ubatch = balloc.split_equal(n_ubatch, true);
             }
             if (ubatch.n_tokens == 0) {
@@ -649,7 +667,7 @@ bool llama_memory_recurrent::get_can_shift() const {
 size_t llama_memory_recurrent::total_size() const {
     size_t size = 0;
-    for (const auto & buf : bufs) {
+    for (const auto & [_, buf] : ctxs_bufs) {
         size += ggml_backend_buffer_get_size(buf.get());
     }
@@ -848,9 +866,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
         seq_rm(dest_seq_id, -1, -1);
+        if (cell_count == 0) {
+            return true;
+        }
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);

package/cpp/llama.cpp/src/llama-memory-recurrent.h CHANGED Viewed

@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
+#include <map>
 #include <set>
 #include <vector>
@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
     bool prepare(const std::vector<llama_ubatch> & ubatches);
     // find a contiguous slot of memory cells and emplace the ubatch there
@@ -106,8 +109,8 @@ private:
     const uint32_t n_seq_max = 1;
-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // ggml contexts for the KV cache along with the allocated backend buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
     size_t total_size() const;

package/cpp/llama.cpp/src/llama-memory.h CHANGED Viewed

@@ -2,6 +2,7 @@
 #include "llama.h"
+#include <map>
 #include <memory>
 #include <functional>
@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
     //
     // state write/read
     //

package/cpp/llama.cpp/src/llama-model-loader.cpp CHANGED Viewed

@@ -465,6 +465,8 @@ namespace GGUFMeta {
     // TODO: this is not very clever - figure out something better
     template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr<std::array<float, 512>>(enum llm_kv kid, std::array<float, 512> & result, uint32_t n, bool required);
 llama_model_loader::llama_model_loader(
         const std::string & fname,