@fugood/llama.node 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +9 -2
  3. package/lib/index.ts +57 -30
  4. package/lib/version.js +2 -2
  5. package/lib/version.ts +2 -2
  6. package/package.json +14 -14
  7. package/src/LlamaContext.cpp +20 -0
  8. package/src/common.hpp +8 -1
  9. package/src/llama.cpp/common/arg.cpp +13 -4
  10. package/src/llama.cpp/common/chat.cpp +33 -2
  11. package/src/llama.cpp/common/common.cpp +0 -15
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/ggml/CMakeLists.txt +0 -1
  14. package/src/llama.cpp/ggml/include/ggml.h +25 -0
  15. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +316 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -3
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +142 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  22. package/src/llama.cpp/include/llama.h +1 -110
  23. package/src/llama.cpp/src/CMakeLists.txt +2 -2
  24. package/src/llama.cpp/src/llama-arch.cpp +19 -0
  25. package/src/llama.cpp/src/llama-arch.h +1 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +13 -2
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +5 -197
  29. package/src/llama.cpp/src/llama-context.h +2 -7
  30. package/src/llama.cpp/src/llama-cparams.h +0 -1
  31. package/src/llama.cpp/src/llama-graph.cpp +35 -57
  32. package/src/llama.cpp/src/llama-graph.h +36 -46
  33. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +47 -47
  34. package/src/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +26 -26
  35. package/src/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +88 -441
  36. package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +20 -43
  37. package/src/llama.cpp/src/llama-kv-cells.h +21 -21
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +5 -5
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +6 -6
  40. package/src/llama.cpp/src/llama-memory-recurrent.h +1 -1
  41. package/src/llama.cpp/src/llama-memory.h +3 -8
  42. package/src/llama.cpp/src/llama-model.cpp +449 -246
  43. package/src/llama.cpp/src/llama-model.h +2 -0
package/src/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h}

@@ -14,27 +14,16 @@ struct llama_model;
 struct llama_context;
 
 //
-// llama_kv_cache_unified
+// llama_kv_cache
 //
 
-class llama_kv_cache_unified : public llama_memory_i {
+class llama_kv_cache : public llama_memory_i {
 public:
     static uint32_t get_padding(const llama_cparams & cparams);
 
     // this callback is used to filter out layers that should not be included in the cache
     using layer_filter_cb = std::function<bool(int32_t il)>;
 
-    struct defrag_info {
-        bool empty() const {
-            return ids.empty();
-        }
-
-        // contains information about which cell moves where:
-        //  - cell i moves to ids[i]
-        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
-        std::vector<uint32_t> ids;
-    };
-
     struct stream_copy_info {
         bool empty() const {
             assert(ssrc.size() == sdst.size());
@@ -92,7 +81,7 @@ public:
 
     using slot_info_vec_t = std::vector<slot_info>;
 
-    llama_kv_cache_unified(
+    llama_kv_cache(
             const llama_model & model,
             layer_filter_cb && filter,
             ggml_type type_k,
@@ -106,7 +95,7 @@ public:
             uint32_t n_swa,
             llama_swa_type swa_type);
 
-    ~llama_kv_cache_unified() = default;
+    ~llama_kv_cache() = default;
 
     //
     // llama_memory_i
@@ -140,7 +129,7 @@ public:
     void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
-    // llama_kv_cache_unified specific API
+    // llama_kv_cache specific API
     //
 
     uint32_t get_size() const;
@@ -173,7 +162,7 @@ public:
     // return empty vector on failure
     slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
 
-    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
+    bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info);
 
     // find a slot of kv cells that can hold the ubatch
     // if cont == true, then the slot must be continuous
@@ -241,7 +230,7 @@ private:
     // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
     std::vector<uint32_t> v_heads;
 
-    std::vector<llama_kv_cells_unified> v_cells;
+    std::vector<llama_kv_cells> v_cells;
 
     // maps from a sequence id to a stream id
     std::vector<uint32_t> seq_to_stream;
@@ -254,9 +243,6 @@ private:
     // model layer id -> KV cache layer id
     std::unordered_map<int32_t, int32_t> map_layer_ids;
 
-    // return non-empty vector if cells have been moved
-    defrag_info defrag_prepare(int32_t n_max_nodes) const;
-
     size_t total_size() const;
 
     size_t size_k_bytes() const;
@@ -277,11 +263,6 @@ private:
             llm_graph_result * res,
             llama_context * lctx) const;
 
-    ggml_cgraph * build_graph_defrag(
-            llm_graph_result * res,
-            llama_context * lctx,
-            const defrag_info & dinfo) const;
-
     struct cell_ranges_t {
         uint32_t strm;
@@ -295,35 +276,33 @@ private:
     bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
 };
 
-class llama_kv_cache_unified_context : public llama_memory_context_i {
+class llama_kv_cache_context : public llama_memory_context_i {
 public:
     // some shorthands
-    using slot_info_vec_t  = llama_kv_cache_unified::slot_info_vec_t;
-    using defrag_info      = llama_kv_cache_unified::defrag_info;
-    using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
+    using slot_info_vec_t  = llama_kv_cache::slot_info_vec_t;
+    using stream_copy_info = llama_kv_cache::stream_copy_info;
 
     // used for errors
-    llama_kv_cache_unified_context(llama_memory_status status);
+    llama_kv_cache_context(llama_memory_status status);
 
     // used to create a full-cache context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv);
+    llama_kv_cache_context(
+            llama_kv_cache * kv);
 
     // used to create an update context
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
             llama_context * lctx,
             bool do_shift,
-            defrag_info dinfo,
             stream_copy_info sc_info);
 
     // used to create a batch procesing context from a batch
-    llama_kv_cache_unified_context(
-            llama_kv_cache_unified * kv,
+    llama_kv_cache_context(
+            llama_kv_cache * kv,
            slot_info_vec_t sinfos,
            std::vector<llama_ubatch> ubatches);
 
-    virtual ~llama_kv_cache_unified_context();
+    virtual ~llama_kv_cache_context();
 
     //
     // llama_memory_context_i
@@ -336,7 +315,7 @@ public:
     const llama_ubatch & get_ubatch() const override;
 
     //
-    // llama_kv_cache_unified_context specific API
+    // llama_kv_cache_context specific API
     //
 
     uint32_t get_n_kv() const;
@@ -365,7 +344,7 @@ public:
 private:
     llama_memory_status status;
 
-    llama_kv_cache_unified * kv;
+    llama_kv_cache * kv;
     llama_context * lctx;
 
     //
@@ -374,8 +353,6 @@ private:
 
     bool do_shift = false;
 
-    defrag_info dinfo;
-
     stream_copy_info sc_info;
 
     //
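
The upshot of the hunks above: the class drops its _unified suffix and loses the entire defrag path (defrag_info, defrag_prepare(), build_graph_defrag(), and the defrag argument to update()). A minimal sketch of a caller against the new update() signature; apart from llama_kv_cache::update() and stream_copy_info, every name below is an illustrative assumption, not something taken from this diff:

    // Hedged sketch, not from the diff: only llama_kv_cache::update() and
    // stream_copy_info come from the header above; the rest is assumed.
    static bool apply_pending_updates(llama_kv_cache & cache, llama_context * lctx) {
        const bool do_shift = true;                 // e.g. a pending K-shift after a seq_add
        llama_kv_cache::stream_copy_info sc_info;   // stream copies queued by seq_cp bookkeeping

        // 1.1.7 also threaded a defrag_info through here; the parameter is gone in 1.1.9.
        return cache.update(lctx, do_shift, sc_info);
    }
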
package/src/llama.cpp/src/llama-kv-cells.h

@@ -11,7 +11,7 @@
 
 // meta information about KV cells that can be part of multiple sequences at the same time
 // TODO: add unit tests
-class llama_kv_cells_unified {
+class llama_kv_cells {
 public:
     void reset() {
         for (uint32_t i = 0; i < pos.size(); ++i) {
@@ -77,30 +77,30 @@ public:
     }
 
     // move cell isrc to idst (used during defrag)
-    void mv(uint32_t isrc, uint32_t idst) {
-        assert(isrc < pos.size());
-        assert(idst < pos.size());
+    //void mv(uint32_t isrc, uint32_t idst) {
+    //    assert(isrc < pos.size());
+    //    assert(idst < pos.size());
 
-        assert(pos[idst] == -1);
-        assert(pos[isrc] != -1);
+    //    assert(pos[idst] == -1);
+    //    assert(pos[isrc] != -1);
 
-        pos  [idst] = pos  [isrc];
-        shift[idst] = shift[isrc];
-        seq  [idst] = seq  [isrc];
+    //    pos  [idst] = pos  [isrc];
+    //    shift[idst] = shift[isrc];
+    //    seq  [idst] = seq  [isrc];
 
-        pos  [isrc] = -1;
-        shift[isrc] = 0;
-        seq  [isrc].reset();
+    //    pos  [isrc] = -1;
+    //    shift[isrc] = 0;
+    //    seq  [isrc].reset();
 
-        used.erase (isrc);
-        used.insert(idst);
-    }
+    //    used.erase (isrc);
+    //    used.insert(idst);
+    //}
 
     // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
-    llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
+    llama_kv_cells cp(uint32_t i, uint32_t n) const {
         assert(i + n <= pos.size());
 
-        llama_kv_cells_unified res;
+        llama_kv_cells res;
 
         res.resize(n);
@@ -117,8 +117,8 @@ public:
     }
 
     // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
-        llama_kv_cells_unified res;
+    llama_kv_cells cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells res;
 
         res.resize(idxs.size());
@@ -135,7 +135,7 @@ public:
     }
 
     // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
-    void set(uint32_t i, const llama_kv_cells_unified & other) {
+    void set(uint32_t i, const llama_kv_cells & other) {
         assert(i + other.pos.size() <= pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
@@ -165,7 +165,7 @@ public:
     }
 
     // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
-    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells & other) {
         assert(idxs.size() == other.pos.size());
 
         for (uint32_t j = 0; j < other.pos.size(); ++j) {
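
Aside from the rename, the only change in this file is that mv(), the last defrag-only helper, is now commented out. The surviving cp()/set() pair implements the save/restore pattern the comments describe; a rough usage sketch with the renamed class (internal API, arbitrary sizes and indices):

    // Hedged sketch of the save/restore pattern (internal llama.cpp API).
    llama_kv_cells cells;
    cells.resize(64);

    // snapshot the state of cells [8, 8 + 16) ...
    llama_kv_cells backup = cells.cp(8, 16);

    // ... the cache mutates these cells while placing a ubatch ...

    // ... and on failure the same range is rolled back
    cells.set(8, backup);
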
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -30,7 +30,7 @@ llama_memory_hybrid::llama_memory_hybrid(
         layer_filter_cb && filter_attn,
         layer_filter_cb && filter_recr) :
     hparams(model.hparams),
-    mem_attn(new llama_kv_cache_unified(
+    mem_attn(new llama_kv_cache(
         model,
         filter_attn == nullptr ?
             [&](int32_t il) { return !hparams.is_recurrent(il); }
@@ -179,7 +179,7 @@ void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id,
     mem_recr->state_read(io, seq_id);
 }
 
-llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
+llama_kv_cache * llama_memory_hybrid::get_mem_attn() const {
     return mem_attn.get();
 }
 
@@ -210,7 +210,7 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
     ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }
@@ -248,8 +248,8 @@ const llama_ubatch & llama_memory_hybrid_context::get_ubatch() const {
     return ubatches[i_next];
 }
 
-const llama_kv_cache_unified_context * llama_memory_hybrid_context::get_attn() const {
-    return static_cast<const llama_kv_cache_unified_context *>(ctx_attn.get());
+const llama_kv_cache_context * llama_memory_hybrid_context::get_attn() const {
+    return static_cast<const llama_kv_cache_context *>(ctx_attn.get());
 }
 
 const llama_memory_recurrent_context * llama_memory_hybrid_context::get_recr() const {
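
The default filter_attn lambda in the first hunk is the core of the hybrid memory: each layer index is routed to exactly one of the two sub-caches. A self-contained sketch of that routing rule; the filter_recr counterpart is assumed by symmetry and does not appear in these hunks:

    #include <cstdint>
    #include <functional>

    // is_recurrent stands in for llama_hparams::is_recurrent(il).
    using layer_filter_cb = std::function<bool(int32_t)>;

    // attention layers -> llama_kv_cache (matches the lambda in the hunk above)
    layer_filter_cb default_filter_attn(layer_filter_cb is_recurrent) {
        return [=](int32_t il) { return !is_recurrent(il); };
    }

    // recurrent layers -> llama_memory_recurrent (assumed by symmetry)
    layer_filter_cb default_filter_recr(layer_filter_cb is_recurrent) {
        return [=](int32_t il) { return is_recurrent(il); };
    }
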
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -2,7 +2,7 @@
 
 #include "llama-batch.h"
 #include "llama-graph.h"
-#include "llama-kv-cache-unified.h"
+#include "llama-kv-cache.h"
 #include "llama-memory.h"
 #include "llama-memory-recurrent.h"
 
@@ -13,7 +13,7 @@
 // llama_memory_hybrid
 //
 
-// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
+// utilizes instances of llama_memory_recurrent and llama_kv_cache to
 // support models where each layer may be either attention-based or recurrent
 
 class llama_memory_hybrid : public llama_memory_i {
@@ -81,19 +81,19 @@ public:
     // llama_memory_hybrid specific API
     //
 
-    llama_kv_cache_unified * get_mem_attn() const;
+    llama_kv_cache * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;
 
 private:
     const llama_hparams & hparams;
 
-    const std::unique_ptr<llama_kv_cache_unified> mem_attn;
+    const std::unique_ptr<llama_kv_cache> mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
 };
 
 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
-    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+    using slot_info_vec_t = llama_kv_cache::slot_info_vec_t;
 
     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);
@@ -125,7 +125,7 @@ public:
     // llama_memory_hybrid_context
     //
 
-    const llama_kv_cache_unified_context * get_attn() const;
+    const llama_kv_cache_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;
 
 private:
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -12,7 +12,7 @@
 //
 
 // TODO: extract the cache state used for graph computation into llama_memory_recurrent_context_i
-//       see the implementation of llama_kv_cache_unified_context_i for an example how to do it
+//       see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
 
 
@@ -36,8 +36,8 @@ bool llama_memory_status_is_fail(llama_memory_status status);
36
36
 
37
37
  // the interface for managing the memory context during batch processing
38
38
  // this interface is implemented per memory type. see:
39
- // - llama_kv_cache_unified_context
40
- // - llama_kv_cache_unified_iswa_context
39
+ // - llama_kv_cache_context
40
+ // - llama_kv_cache_iswa_context
41
41
  // ...
42
42
  //
43
43
  // the only method that should mutate the memory and the memory context is llama_memory_i::apply()
@@ -77,7 +77,7 @@ struct llama_memory_i {
77
77
  // simulate full cache, used for allocating worst-case compute buffers
78
78
  virtual llama_memory_context_ptr init_full() = 0;
79
79
 
80
- // prepare for any pending memory updates, such as shifts, defrags, etc.
80
+ // prepare for any pending memory updates, such as shifts, copies, etc.
81
81
  // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
82
82
  virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;
83
83
 
@@ -109,8 +109,3 @@ struct llama_memory_i {
109
109
  };
110
110
 
111
111
  using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
112
-
113
- // TODO: temporary until the llama_kv_cache is removed from the public API
114
- struct llama_kv_cache : public llama_memory_i {
115
- virtual ~llama_kv_cache() = default;
116
- };
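
With defrag gone, init_update() now covers only shifts and stream copies, and removing the temporary llama_kv_cache shim at the bottom of the header frees that name for the renamed cache itself. A hedged sketch of a driver over this interface — get_status() and apply() are llama_memory_context_i members declared earlier in this header, and the status constants come from the same file, but the wrapper function itself is illustrative:

    // Hedged sketch: drive a pending memory update through llama_memory_i.
    static bool update_memory(llama_memory_i & mem, llama_context * lctx) {
        llama_memory_context_ptr mctx = mem.init_update(lctx, /*optimize=*/false);

        switch (mctx->get_status()) {
            case LLAMA_MEMORY_STATUS_NO_UPDATE:
                return true;           // nothing pending
            case LLAMA_MEMORY_STATUS_SUCCESS:
                return mctx->apply();  // perform the shifts / stream copies
            default:
                return false;          // FAILED_PREPARE / FAILED_COMPUTE
        }
    }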