@fugood/llama.node 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +12 -12
- package/src/llama.cpp/CMakeLists.txt +0 -1
- package/src/llama.cpp/common/arg.cpp +17 -0
- package/src/llama.cpp/common/chat.cpp +37 -20
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.h +4 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +7 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +181 -10
- package/src/llama.cpp/ggml/src/CMakeLists.txt +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +38 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1297 -211
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +33 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -9
- package/src/llama.cpp/include/llama.h +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +108 -2
- package/src/llama.cpp/src/llama-arch.h +7 -0
- package/src/llama.cpp/src/llama-batch.cpp +27 -1
- package/src/llama.cpp/src/llama-batch.h +8 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +95 -81
- package/src/llama.cpp/src/llama-graph.h +43 -16
- package/src/llama.cpp/src/llama-hparams.cpp +2 -1
- package/src/llama.cpp/src/llama-hparams.h +1 -0
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +28 -18
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +214 -65
- package/src/llama.cpp/src/llama-kv-cache-unified.h +62 -24
- package/src/llama.cpp/src/llama-kv-cells.h +62 -10
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +9 -4
- package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +34 -16
- package/src/llama.cpp/src/llama-memory.cpp +17 -0
- package/src/llama.cpp/src/llama-memory.h +3 -0
- package/src/llama.cpp/src/llama-model.cpp +1374 -210
- package/src/llama.cpp/src/llama-model.h +3 -0
- package/src/llama.cpp/src/llama-vocab.cpp +8 -1
- package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50

package/src/llama.cpp/src/llama-kv-cells.h

@@ -105,10 +105,30 @@ public:
         res.resize(n);

         for (uint32_t j = 0; j < n; ++j) {
-            res.pos[j] = pos[i + j];
-            res.seq[j] = seq[i + j];
+            const auto idx = i + j;

-            assert(shift[i + j] == 0);
+            res.pos[j] = pos[idx];
+            res.seq[j] = seq[idx];
+
+            assert(shift[idx] == 0);
+        }
+
+        return res;
+    }
+
+    // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
+    llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
+        llama_kv_cells_unified res;
+
+        res.resize(idxs.size());
+
+        for (uint32_t j = 0; j < idxs.size(); ++j) {
+            const auto idx = idxs[j];
+
+            res.pos[j] = pos[idx];
+            res.seq[j] = seq[idx];
+
+            assert(shift[idx] == 0);
         }

         return res;

@@ -119,26 +139,58 @@ public:
         assert(i + other.pos.size() <= pos.size());

         for (uint32_t j = 0; j < other.pos.size(); ++j) {
-            if (pos[i + j] == -1 && other.pos[j] != -1) {
+            const auto idx = i + j;
+
+            if (pos[idx] == -1 && other.pos[j] != -1) {
                 used.insert(i + j);
             }

-            if (pos[i + j] != -1 && other.pos[j] == -1) {
+            if (pos[idx] != -1 && other.pos[j] == -1) {
                 used.erase(i + j);
             }

-            if (pos[i + j] != -1) {
+            if (pos[idx] != -1) {
                 seq_pos_rm(i + j);
             }

-            pos[i + j] = other.pos[j];
-            seq[i + j] = other.seq[j];
+            pos[idx] = other.pos[j];
+            seq[idx] = other.seq[j];

-            if (pos[i + j] != -1) {
+            if (pos[idx] != -1) {
                 seq_pos_add(i + j);
             }

-            assert(shift[i + j] == 0);
+            assert(shift[idx] == 0);
+        }
+    }
+
+    // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
+    void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
+        assert(idxs.size() == other.pos.size());
+
+        for (uint32_t j = 0; j < other.pos.size(); ++j) {
+            const auto idx = idxs[j];
+
+            if (pos[idx] == -1 && other.pos[j] != -1) {
+                used.insert(idx);
+            }
+
+            if (pos[idx] != -1 && other.pos[j] == -1) {
+                used.erase(idx);
+            }
+
+            if (pos[idx] != -1) {
+                seq_pos_rm(idx);
+            }
+
+            pos[idx] = other.pos[j];
+            seq[idx] = other.seq[j];
+
+            if (pos[idx] != -1) {
+                seq_pos_add(idx);
+            }
+
+            assert(shift[idx] == 0);
         }
     }

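
The two hunks above add cp() and set() overloads that take an explicit list of cell indices instead of a contiguous [i, i + n) range, so callers can snapshot and later restore non-contiguous KV cells. The following standalone C++ sketch is not the real llama_kv_cells_unified class; it only models the copy-by-indices / restore-by-indices pattern (the cells_t type and the main() driver are hypothetical):

    // simplified, self-contained model of the "copy/set by index list" pattern
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct cells_t {
        std::vector<int32_t> pos; // -1 means the cell is empty
        std::vector<int32_t> seq;

        void resize(size_t n) { pos.assign(n, -1); seq.assign(n, 0); }

        // copy the state of cells idxs[0], idxs[1], ... into a new object
        cells_t cp(const std::vector<uint32_t> & idxs) const {
            cells_t res;
            res.resize(idxs.size());
            for (size_t j = 0; j < idxs.size(); ++j) {
                res.pos[j] = pos[idxs[j]];
                res.seq[j] = seq[idxs[j]];
            }
            return res;
        }

        // restore a previously copied state back into the same (non-contiguous) cells
        void set(const std::vector<uint32_t> & idxs, const cells_t & other) {
            assert(idxs.size() == other.pos.size());
            for (size_t j = 0; j < idxs.size(); ++j) {
                pos[idxs[j]] = other.pos[j];
                seq[idxs[j]] = other.seq[j];
            }
        }
    };

    int main() {
        cells_t cells;
        cells.resize(8);
        cells.pos[1] = 10;
        cells.pos[5] = 42;

        const std::vector<uint32_t> idxs = {1, 5}; // non-contiguous slots
        const cells_t backup = cells.cp(idxs);     // snapshot

        cells.pos[1] = -1;                         // clobber
        cells.pos[5] = -1;
        cells.set(idxs, backup);                   // restore

        printf("pos[1]=%d pos[5]=%d\n", (int) cells.pos[1], (int) cells.pos[5]);
        return 0;
    }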

package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
-                ubatch = balloc.split_equal(n_ubatch);
+                ubatch = balloc.split_equal(n_ubatch, false);
             }

             if (ubatch.n_tokens == 0) {

@@ -80,6 +80,11 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
             ubatches.push_back(std::move(ubatch)); // NOLINT
         }

+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
         // prepare the recurrent batches first
         if (!mem_recr->prepare(ubatches)) {
             // TODO: will the recurrent cache be in an undefined context at this point?

@@ -195,11 +200,11 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(

 llama_memory_hybrid_context::llama_memory_hybrid_context(
         llama_memory_hybrid * mem,
-        std::vector<uint32_t> heads_attn,
+        slot_info_vec_t sinfos_attn,
         std::vector<llama_ubatch> ubatches) :
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
     ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(), this->ubatches)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }

@@ -218,7 +223,7 @@ bool llama_memory_hybrid_context::next() {
 }

 bool llama_memory_hybrid_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));

     bool res = true;


package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -92,6 +92,8 @@ private:

 class llama_memory_hybrid_context : public llama_memory_context_i {
 public:
+    using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
+
     // init failure
     explicit llama_memory_hybrid_context(llama_memory_status status);


@@ -107,7 +109,7 @@ public:
     // init success
     llama_memory_hybrid_context(
             llama_memory_hybrid * mem,
-            std::vector<uint32_t> heads_attn,
+            slot_info_vec_t sinfos_attn,
             std::vector<llama_ubatch> ubatches);

     ~llama_memory_hybrid_context() = default;

package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -363,30 +363,40 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
 }

 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
-    std::vector<llama_ubatch> ubatches;
+    do {
+        balloc.split_reset();

-    while (true) {
-        llama_ubatch ubatch;
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            llama_ubatch ubatch;

-        if (embd_all) {
-            // if all tokens are output, split by sequence
-            ubatch = balloc.split_seq(n_ubatch);
-        } else {
-            ubatch = balloc.split_equal(n_ubatch);
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch, false);
+            }
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
         }

-        if (ubatch.n_tokens == 0) {
+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
             break;
         }

-        ubatches.push_back(std::move(ubatch)); // NOLINT
-    }
+        if (!prepare(ubatches)) {
+            break;
+        }

-    if (!prepare(ubatches)) {
-        return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
-    }
+        return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    } while (false);

-    return std::make_unique<llama_memory_recurrent_context>(this, std::move(ubatches));
+    return std::make_unique<llama_memory_recurrent_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }

 llama_memory_context_ptr llama_memory_recurrent::init_full() {
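
The rewritten init_batch() above wraps the splitting logic in a do { ... } while (false) block: any failure (no usable split, unused tokens, a failed prepare()) simply breaks out to the single FAILED_PREPARE return at the bottom, while the success path returns from inside the block. A minimal standalone sketch of that control-flow idiom, using hypothetical try_split() and prepare() stand-ins rather than the llama.cpp API:

    #include <cstdio>
    #include <vector>

    // hypothetical stand-ins for the real splitting/preparation steps
    static bool try_split(std::vector<int> & out) { out = {1, 2, 3}; return true; }
    static bool prepare(const std::vector<int> & batches) { return !batches.empty(); }

    static int init_batch() {
        do {
            std::vector<int> batches;

            if (!try_split(batches)) {
                break; // failed to find a suitable split
            }

            if (!prepare(batches)) {
                break; // preparation failed
            }

            return 0; // success path returns from inside the block
        } while (false);

        return -1; // single shared failure exit (FAILED_PREPARE in the diff)
    }

    int main() {
        printf("init_batch -> %d\n", init_batch());
        return 0;
    }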

@@ -1066,7 +1076,15 @@ bool llama_memory_recurrent_context::next() {
 }

 bool llama_memory_recurrent_context::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    assert(!llama_memory_status_is_fail(status));
+
+    // no ubatches -> this is an update
+    if (ubatches.empty()) {
+        // recurrent cache never performs updates
+        assert(status == LLAMA_MEMORY_STATUS_NO_UPDATE);
+
+        return true;
+    }

     mem->find_slot(ubatches[i_next]);


package/src/llama.cpp/src/llama-memory.cpp

@@ -40,3 +40,20 @@ llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_me
     // if either status has an update, then the combined status has an update
     return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
 }
+
+bool llama_memory_status_is_fail(llama_memory_status status) {
+    switch (status) {
+        case LLAMA_MEMORY_STATUS_SUCCESS:
+        case LLAMA_MEMORY_STATUS_NO_UPDATE:
+            {
+                return false;
+            }
+        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+            {
+                return true;
+            }
+    }
+
+    return false;
+}
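
The new llama_memory_status_is_fail() helper classifies the four status values so that the apply() implementations shown earlier can assert !llama_memory_status_is_fail(status), accepting both SUCCESS and NO_UPDATE, instead of requiring exactly SUCCESS. A standalone sketch of the same idea, with hypothetical enum names rather than the real llama.cpp identifiers:

    #include <cassert>

    // hypothetical mirror of the four memory status values used in the diff
    enum memory_status {
        STATUS_SUCCESS,
        STATUS_NO_UPDATE,
        STATUS_FAILED_PREPARE,
        STATUS_FAILED_COMPUTE,
    };

    static bool status_is_fail(memory_status s) {
        switch (s) {
            case STATUS_SUCCESS:
            case STATUS_NO_UPDATE:
                return false;
            case STATUS_FAILED_PREPARE:
            case STATUS_FAILED_COMPUTE:
                return true;
        }
        return false;
    }

    int main() {
        // apply() previously required status == SUCCESS; the helper also lets
        // a NO_UPDATE context pass, while still rejecting both failure states.
        memory_status status = STATUS_NO_UPDATE;
        assert(!status_is_fail(status));
        return 0;
    }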

package/src/llama.cpp/src/llama-memory.h

@@ -31,6 +31,9 @@ enum llama_memory_status {
 // useful for implementing hybrid memory types (e.g. iSWA)
 llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);

+// helper function for checking if a memory status indicates a failure
+bool llama_memory_status_is_fail(llama_memory_status status);
+
 // the interface for managing the memory context during batch processing
 // this interface is implemented per memory type. see:
 //   - llama_kv_cache_unified_context