@fugood/llama.node 1.1.6 → 1.1.7
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +124 -40
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +312 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +41 -7
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -17
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +1 -0
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp

@@ -194,14 +194,20 @@ bool llama_kv_cache_unified_iswa::get_can_shift() const {
     return kv_base->get_size() == kv_swa->get_size();
 }
 
-void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
-    kv_base->state_write(io, seq_id);
-    kv_swa->state_write(io, seq_id);
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_write(io, seq_id, flags);
+    }
+
+    kv_swa->state_write(io, seq_id, flags);
 }
 
-void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
-    kv_base->state_read(io, seq_id);
-    kv_swa->state_read(io, seq_id);
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    if ((flags & LLAMA_STATE_SEQ_FLAGS_SWA_ONLY) == 0) {
+        kv_base->state_read(io, seq_id, flags);
+    }
+
+    kv_swa->state_read(io, seq_id, flags);
 }
 
 llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
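Note: the iSWA cache pairs a full-attention base cache with a small sliding-window (SWA) cache, and the new llama_state_seq_flags argument lets callers skip the base cache entirely: with LLAMA_STATE_SEQ_FLAGS_SWA_ONLY set, only the sliding-window cells are serialized, so per-sequence checkpoints stay small. A minimal caller-side sketch, assuming the llama_state_seq_get_size_ext / llama_state_seq_get_data_ext entry points that the llama.h changes in this release (+25 lines) appear to pair with these flags:

    // Sketch: snapshot only the SWA portion of one sequence's KV state.
    // Assumes the *_ext state functions declared in the updated llama.h.
    #include <cstdint>
    #include <vector>
    #include "llama.h"

    static std::vector<uint8_t> save_swa_checkpoint(llama_context * ctx, llama_seq_id seq_id) {
        const llama_state_seq_flags flags = LLAMA_STATE_SEQ_FLAGS_SWA_ONLY;

        // size of the serialized state for this sequence, SWA cells only
        const size_t n = llama_state_seq_get_size_ext(ctx, seq_id, flags);

        std::vector<uint8_t> buf(n);
        llama_state_seq_get_data_ext(ctx, buf.data(), buf.size(), seq_id, flags);
        return buf;
    }

Restoring would go through the matching llama_state_seq_set_data_ext call with the same flag.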
package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h

@@ -56,8 +56,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified_iswa specific API
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
 }
 
 bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
-
-    auto & cells = v_cells[seq_to_stream[seq_id]];
-    auto & head = v_heads[seq_to_stream[seq_id]];
-
-    uint32_t new_head = cells.size();
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     if (p0 < 0) {
         p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
     }
 
     if (seq_id >= 0) {
+        auto & cells = v_cells[seq_to_stream[seq_id]];
+        auto & head = v_heads[seq_to_stream[seq_id]];
+
+        uint32_t new_head = cells.size();
+
         for (uint32_t i = 0; i < cells.size(); ++i) {
             if (!cells.pos_in(i, p0, p1)) {
                 continue;
@@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
                 }
             }
         }
+
+        // If we freed up a slot, set head to it so searching can start there.
+        if (new_head != cells.size() && new_head < head) {
+            head = new_head;
+        }
     } else {
         // match any sequence
-        for (uint32_t i = 0; i < cells.size(); ++i) {
-            if (!cells.pos_in(i, p0, p1)) {
-                continue;
-            }
+        for (uint32_t s = 0; s < n_stream; ++s) {
+            auto & cells = v_cells[s];
+            auto & head = v_heads[s];
 
-            cells.rm(i);
+            uint32_t new_head = cells.size();
 
-            if (new_head == cells.size()) {
-                new_head = i;
+            for (uint32_t i = 0; i < cells.size(); ++i) {
+                if (!cells.pos_in(i, p0, p1)) {
+                    continue;
+                }
+
+                cells.rm(i);
+
+                if (new_head == cells.size()) {
+                    new_head = i;
+                }
             }
-        }
-    }
 
-    // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cells.size() && new_head < head) {
-        head = new_head;
+            // If we freed up a slot, set head to it so searching can start there.
+            if (new_head != cells.size() && new_head < head) {
+                head = new_head;
+            }
+        }
     }
 
     return true;
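The seq_rm rework above relaxes the entry assertion so that seq_id == -1 is accepted as "match any sequence", and that branch now walks every stream with its own cells/head/new_head instead of resolving a single stream up front. Through the public API this is just a removal call with -1 as the sequence id; a minimal sketch, assuming the llama_get_memory / llama_memory_seq_rm entry points of current llama.h:

    // Sketch: drop cached tokens at positions >= n_keep in every sequence.
    // seq_id == -1 matches any sequence; p1 == -1 means "up to the end".
    #include "llama.h"

    static void trim_all_sequences(llama_context * ctx, llama_pos n_keep) {
        llama_memory_t mem = llama_get_memory(ctx);

        llama_memory_seq_rm(mem, /*seq_id =*/ -1, /*p0 =*/ n_keep, /*p1 =*/ -1);
    }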
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -738,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
 }
 
 llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
-    if (debug > 0) {
-        const auto & cells = v_cells[seq_to_stream[1]];
 
-        const uint32_t head_cur = v_heads[seq_to_stream[1]];
-
-        LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
-                __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+    if (debug > 0) {
+        for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
+            const auto seq_id = ubatch.seq_id_unq[s];
+            const auto stream_id = seq_to_stream[seq_id];
+            const auto & cells = v_cells[stream_id];
+            const uint32_t head_cur = v_heads[stream_id];
+
+            LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
+                    __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
+
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    if (cells.is_empty(i)) {
+                        ss += '.';
+                    } else {
+                        assert(cells.seq_count(i) >= 1);
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                if (cells.is_empty(i)) {
-                    ss += '.';
-                } else {
-                    assert(cells.seq_count(i) >= 1);
+                        if (cells.seq_count(i) == 1) {
+                            ss += std::to_string(cells.seq_get(i));
+                        } else {
+                            ss += 'M';
+                        }
+                    }
+                    if (i%256 == 255) {
+                        ss += " *";
+                        ss += '\n';
+                    }
+                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
+            }
 
-                    if (cells.seq_count(i) == 1) {
-                        ss += std::to_string(cells.seq_get(i));
+            if ((debug == 2 && n_swa > 0) || debug > 2) {
+                std::string ss;
+                for (uint32_t i = 0; i < cells.size(); ++i) {
+                    std::string cur;
+                    if (cells.is_empty(i)) {
+                        cur = '.';
                     } else {
-                        ss += 'M';
+                        cur = std::to_string(cells.pos_get(i));
+                    }
+                    const int n = cur.size();
+                    for (int j = 0; j < 5 - n; ++j) {
+                        cur += ' ';
+                    }
+                    ss += cur;
+                    if (i%256 == 255) {
+                        ss += " *";
+                    }
+                    if (i%64 == 63) {
+                        ss += '\n';
                     }
                 }
-                if (i%256 == 255) {
-                    ss += " *";
-                    ss += '\n';
-                }
+                LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
             }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        if ((debug == 2 && n_swa > 0) || debug > 2) {
-            std::string ss;
-            for (uint32_t i = 0; i < cells.size(); ++i) {
-                std::string cur;
-                if (cells.is_empty(i)) {
-                    cur = '.';
-                } else {
-                    cur = std::to_string(cells.pos_get(i));
-                }
-                const int n = cur.size();
-                for (int j = 0; j < 5 - n; ++j) {
-                    cur += ' ';
-                }
-                ss += cur;
-                if (i%256 == 255) {
-                    ss += " *";
-                }
-                if (i%64 == 63) {
-                    ss += '\n';
+            for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+                if (cells.seq_pos_min(s) < 0) {
+                    continue;
                 }
-            }
-            LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
-        }
 
-        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
-            if (cells.seq_pos_min(s) < 0) {
-                continue;
+                LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
             }
-
-            LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
         }
     }
 
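The reworked dump stays gated on the cache's debug level (upstream llama.cpp initializes it from the LLAMA_KV_CACHE_DEBUG environment variable); the difference is that it now prints one block per unique sequence in the ubatch, labeled with the stream id that seq_to_stream resolves to, instead of reading a single hard-coded stream.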
package/src/llama.cpp/src/llama-kv-cache-unified.cpp

@@ -1812,7 +1828,9 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
     return false;
 }
 
-void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     io.write(&n_stream, sizeof(n_stream));
 
     for (uint32_t s = 0; s < n_stream; ++s) {
@@ -1863,7 +1881,9 @@ void llama_kv_cache_unified::state_write(llama_io_write_i & io, llama_seq_id seq
     }
 }
 
-void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_kv_cache_unified::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
 
     uint32_t n_stream_cur;
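In both functions the new flags parameter exists only to satisfy the common memory interface, so it is explicitly marked unused. GGML_UNUSED is ggml's usual cast-to-void idiom for silencing unused-parameter warnings, along the lines of:

    // as defined in ggml.h
    #define GGML_UNUSED(x) (void)(x)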
package/src/llama.cpp/src/llama-kv-cache-unified.h

@@ -136,8 +136,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_kv_cache_unified specific API
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -165,12 +165,16 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
-void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     mem_attn->state_write(io, seq_id);
     mem_recr->state_write(io, seq_id);
 }
 
-void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     mem_attn->state_read(io, seq_id);
     mem_recr->state_read(io, seq_id);
 }
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -74,8 +74,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     //
     // llama_memory_hybrid specific API
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -680,7 +680,9 @@ size_t llama_memory_recurrent::size_s_bytes() const {
     return size_s_bytes;
 }
 
-void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
+    GGML_UNUSED(flags);
+
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
 
@@ -718,7 +720,9 @@ void llama_memory_recurrent::state_write(llama_io_write_i & io, llama_seq_id seq
     state_write_data(io, cell_ranges);
 }
 
-void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+void llama_memory_recurrent::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
+    GGML_UNUSED(flags);
+
     uint32_t cell_count;
     io.read_to(&cell_count, sizeof(cell_count));
 
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -63,8 +63,8 @@ public:
 
     // state write/load
 
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
-    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
+    void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) override;
 
     uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
     uint32_t size = 0; // total number of cells, shared across all sequences
package/src/llama.cpp/src/llama-memory.h

@@ -104,8 +104,8 @@ struct llama_memory_i {
     // state write/read
     //
 
-    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
-    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const = 0;
+    virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) = 0;
 };
 
 using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
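Every implementation above repeats the seq_id = -1, flags = 0 defaults declared here on the pure virtuals. That is deliberate: in C++, default arguments are bound statically to the declaration visible at the call site, not to the object's dynamic type, so a call through a llama_memory_i pointer always picks up these defaults no matter which override runs. A small self-contained illustration (names hypothetical, not from the diff):

    #include <cstdio>

    struct base_i {
        virtual ~base_i() = default;
        virtual void f(int flags = 1) const { std::printf("base: %d\n", flags); }
    };

    struct impl : base_i {
        // deliberately different default, for demonstration only
        void f(int flags = 2) const override { std::printf("impl: %d\n", flags); }
    };

    int main() {
        impl d;
        const base_i * b = &d;
        b->f(); // prints "impl: 1": impl's body runs, but the default value
                // comes from the static type base_i, not from impl
        return 0;
    }

Keeping the defaults identical across the interface and its implementations, as this diff does, sidesteps that mismatch.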
package/src/llama.cpp/src/llama-model.cpp

@@ -1095,6 +1095,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_537M; break;
                     case 26: type = LLM_TYPE_1B; break;
                     case 34: type = LLM_TYPE_4B; break;
                     case 48: type = LLM_TYPE_12B; break;
package/src/llama.cpp/src/llama-quant.cpp

@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
 
            // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 1
+#if 0
            if (new_type == GGML_TYPE_MXFP4) {
                auto * x = f32_data_03;
 
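The flip to #if 0 compiles out the temporary sanity check that verified the F16 -> MXFP4 round trip is lossless, while leaving the code in the tree for future debugging.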
package/src/llama.cpp/src/llama-vocab.cpp

@@ -2341,7 +2341,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
        // @ngxson : quick hack for gpt-oss, always render these tokens
        for (const auto & t : token_to_id) {
-           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+           if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            }
        }
@@ -2388,6 +2388,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
        if (has_return && has_call && has_end) {
            special_eog_ids.erase(end_id);
+           id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
            LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
        }
    }