@fugood/llama.node 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/arg.cpp +359 -310
  3. package/src/llama.cpp/common/chat.cpp +27 -15
  4. package/src/llama.cpp/common/common.cpp +1 -0
  5. package/src/llama.cpp/common/sampling.cpp +1 -0
  6. package/src/llama.cpp/ggml/CMakeLists.txt +37 -21
  7. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -1
  8. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +4 -2
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  12. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +17 -3
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +93 -862
  16. package/src/llama.cpp/include/llama.h +15 -11
  17. package/src/llama.cpp/src/llama-context.cpp +151 -0
  18. package/src/llama.cpp/src/llama-context.h +10 -0
  19. package/src/llama.cpp/src/llama-cparams.h +1 -1
  20. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +8 -0
  21. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  22. package/src/llama.cpp/src/llama-kv-cache.cpp +8 -0
  23. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  24. package/src/llama.cpp/src/llama-memory-hybrid.cpp +8 -0
  25. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  26. package/src/llama.cpp/src/llama-memory-recurrent.cpp +8 -0
  27. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  28. package/src/llama.cpp/src/llama-memory.h +3 -0
  29. package/src/llama.cpp/src/llama-model.cpp +14 -4
  30. package/src/llama.cpp/src/llama-model.h +5 -1
package/src/llama.cpp/include/llama.h

@@ -1329,24 +1329,25 @@ extern "C" {
     //
     // Performance utils
     //
-    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
     struct llama_perf_context_data {
-        double t_start_ms;
-        double t_load_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_p_eval;
-        int32_t n_eval;
-        int32_t n_reused; // number of times a ggml compute graph had been reused
+        // ms == milliseconds
+        double t_start_ms;  // absolute start time
+        double t_load_ms;   // time needed for loading the model
+        double t_p_eval_ms; // time needed for processing the prompt
+        double t_eval_ms;   // time needed for generating tokens
+
+        int32_t n_p_eval; // number of prompt tokens
+        int32_t n_eval;   // number of generated tokens
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {
-        double  t_sample_ms;
+        double  t_sample_ms; // time needed for sampling in ms
 
-        int32_t n_sample;
+        int32_t n_sample; // number of sampled tokens
     };
 
     LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
@@ -1358,6 +1359,9 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);
 
+    // print a breakdown of per-device memory use via LLAMA_LOG:
+    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
+
     //
     // training
     //
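
For orientation, here is a minimal sketch of how an application might exercise these utilities, assuming a fully initialized llama_context * ctx already exists (setup and error handling omitted; the reporting helper is hypothetical and not part of the package):

    #include "llama.h"
    #include <cstdio>

    // Hedged sketch: report timings and memory use for an existing context.
    static void report_usage(const struct llama_context * ctx) {
        const struct llama_perf_context_data perf = llama_perf_context(ctx);
        std::printf("prompt: %d tokens in %.2f ms, generation: %d tokens in %.2f ms\n",
                    perf.n_p_eval, perf.t_p_eval_ms, perf.n_eval, perf.t_eval_ms);

        // new in this release: per-device memory table, emitted through LLAMA_LOG
        llama_memory_breakdown_print(ctx);
    }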
package/src/llama.cpp/src/llama-context.cpp

@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
     n_reused = 0;
 }
 
+std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
+    for (const auto & buft_size : model.memory_breakdown()) {
+        ret[buft_size.first].model += buft_size.second;
+    }
+    for (const auto & buft_size : memory->memory_breakdown()) {
+        ret[buft_size.first].context += buft_size.second;
+    }
+    for (const auto & backend_ptr : backends) {
+        ggml_backend_t backend = backend_ptr.get();
+        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
+    }
+    return ret;
+}
+
 //
 // training
 //
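
The aggregation above leans on std::map::operator[] value-initializing a llama_memory_breakdown_data entry (all counters zero) on first access, so each source can simply add into its own field. A stripped-down illustration of that idiom, with string keys and made-up byte counts standing in for buffer types and real sizes:

    #include <cstdio>
    #include <map>
    #include <string>

    struct breakdown {
        size_t model = 0, context = 0, compute = 0; // bytes
    };

    int main() {
        std::map<std::string, breakdown> ret;          // key stands in for ggml_backend_buffer_type_t
        ret["CUDA0"].model   += 800ull * 1024 * 1024;  // operator[] inserts a zeroed entry first
        ret["CUDA0"].context += 256ull * 1024 * 1024;
        ret["CPU"].compute   +=  64ull * 1024 * 1024;
        for (const auto & kv : ret) {
            std::printf("%s: model=%zu context=%zu compute=%zu\n",
                        kv.first.c_str(), kv.second.model, kv.second.context, kv.second.compute);
        }
        return 0;
    }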
@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
     ctx->perf_reset();
 }
 
+void llama_memory_breakdown_print(const struct llama_context * ctx) {
+    const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
+
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+    std::vector<std::array<std::string, 9>> table_data;
+    table_data.reserve(devices.size());
+    const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
+    const std::string template_gpu    = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
+    const std::string template_other  = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
+
+    table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
+
+    constexpr size_t MiB = 1024 * 1024;
+    const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
+
+    // track seen buffer types to avoid double counting:
+    std::set<ggml_backend_buffer_type_t> seen_buffer_types;
+
+    // accumulative memory breakdown for each device and for host:
+    std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
+    llama_memory_breakdown_data mb_host;
+
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (ggml_backend_buft_is_host(buft)) {
+            mb_host.model   += mb.model;
+            mb_host.context += mb.context;
+            mb_host.compute += mb.compute;
+            seen_buffer_types.insert(buft);
+            continue;
+        }
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        if (dev) {
+            int i_dev = -1;
+            for (size_t i = 0; i < devices.size(); i++) {
+                if (devices[i] == dev) {
+                    i_dev = i;
+                    break;
+                }
+            }
+            if (i_dev != -1) {
+                mb_dev[i_dev].model   += mb.model;
+                mb_dev[i_dev].context += mb.context;
+                mb_dev[i_dev].compute += mb.compute;
+                seen_buffer_types.insert(buft);
+                continue;
+            }
+        }
+    }
+
+    // print memory breakdown for each device:
+    for (size_t i = 0; i < devices.size(); i++) {
+        ggml_backend_dev_t dev = devices[i];
+        llama_memory_breakdown_data mb = mb_dev[i];
+
+        const std::string name = ggml_backend_dev_name(dev);
+        std::string desc = ggml_backend_dev_description(dev);
+        for (const std::string & prefix : desc_prefixes_strip) {
+            if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
+                desc = desc.substr(prefix.length());
+            }
+        }
+
+        size_t free, total;
+        ggml_backend_dev_memory(dev, &free, &total);
+
+        const size_t self = mb.model + mb.context + mb.compute;
+        const size_t unaccounted = total - self - free;
+
+        table_data.push_back({
+            template_gpu,
+            "  - " + name + " (" + desc + ")",
+            std::to_string(total / MiB),
+            std::to_string(free / MiB),
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            std::to_string(unaccounted / MiB)});
+    }
+
+    // print memory breakdown for host:
+    {
+        const size_t self = mb_host.model + mb_host.context + mb_host.compute;
+        table_data.push_back({
+            template_other,
+            "  - Host",
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb_host.model / MiB),
+            std::to_string(mb_host.context / MiB),
+            std::to_string(mb_host.compute / MiB),
+            ""}); // unaccounted
+    }
+
+    // print memory breakdown for all remaining buffer types:
+    for (const auto & buft_mb : memory_breakdown) {
+        ggml_backend_buffer_type_t buft = buft_mb.first;
+        const llama_memory_breakdown_data & mb = buft_mb.second;
+        if (seen_buffer_types.count(buft) == 1) {
+            continue;
+        }
+        const std::string name = ggml_backend_buft_name(buft);
+        const size_t self = mb.model + mb.context + mb.compute;
+        table_data.push_back({
+            template_other,
+            "  - " + name,
+            "", // total
+            "", // free
+            std::to_string(self / MiB),
+            std::to_string(mb.model / MiB),
+            std::to_string(mb.context / MiB),
+            std::to_string(mb.compute / MiB),
+            ""}); // unaccounted
+        seen_buffer_types.insert(buft);
+    }
+
+    for (size_t j = 1; j < table_data[0].size(); j++) {
+        size_t max_len = 0;
+        for (const auto & td : table_data) {
+            max_len = std::max(max_len, td[j].length());
+        }
+        for (auto & td : table_data) {
+            td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
+        }
+    }
+    for (const auto & td : table_data) {
+        LLAMA_LOG_INFO(td[0].c_str(),
+            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
+            td[6].c_str(), td[7].c_str(), td[8].c_str());
+    }
+}
+
 //
 // training
 //
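
The final alignment loop above is terse: column 0 of each row holds the printf template, column 1 (the row label) is left-aligned by appending spaces, and every numeric column is right-aligned by inserting spaces at position 0. A standalone sketch of the same idiom with hypothetical row values (not code from the package; here column 0 is the label, so the index check differs accordingly):

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // column 0 is the label, the rest are numbers; values are illustrative only
        std::vector<std::vector<std::string>> rows = {
            {"memory breakdown [MiB]", "total", "self"},
            {"  - CUDA0 (RTX 4090)",   "24564", "9301"},
            {"  - Host",               "",      "1024"},
        };
        for (size_t j = 0; j < rows[0].size(); j++) {
            size_t max_len = 0;
            for (const auto & r : rows) {
                max_len = std::max(max_len, r[j].length());
            }
            for (auto & r : rows) {
                // label column: pad on the right; numeric columns: pad on the left
                r[j].insert(j == 0 ? r[j].length() : 0, max_len - r[j].length(), ' ');
            }
        }
        for (const auto & r : rows) {
            std::printf("| %s | %s | %s |\n", r[0].c_str(), r[1].c_str(), r[2].c_str());
        }
        return 0;
    }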
package/src/llama.cpp/src/llama-context.h

@@ -17,9 +17,17 @@ class llama_batch_allocr;
 class llama_io_read_i;
 class llama_io_write_i;
 
+// "memory" as in abstract memory for the context
 struct llama_memory_i;
 struct llama_memory_context_i;
 
+// "memory" as in physical memory for a buffer type, in bytes
+struct llama_memory_breakdown_data {
+    size_t model   = 0; // memory allocated for the model
+    size_t context = 0; // memory allocated for the context
+    size_t compute = 0; // memory allocated for temporary compute buffers
+};
+
 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -144,6 +152,8 @@ struct llama_context {
     llama_perf_context_data perf_get_data() const;
     void perf_reset();
 
+    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;
+
     //
     // training
     //
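
A hedged sketch of how code inside llama.cpp could consume this accessor; the helper name is hypothetical and the arithmetic simply restates the struct's semantics (self = model + context + compute, in bytes):

    #include "llama-context.h" // internal header declaring llama_context and llama_memory_breakdown_data

    // Hedged sketch (not part of the package): total bytes the context tracks for itself,
    // i.e. the sum of the "self" values that llama_memory_breakdown_print() reports.
    static size_t llama_context_self_bytes(const llama_context & lctx) {
        size_t total = 0;
        for (const auto & entry : lctx.memory_breakdown()) {
            const llama_memory_breakdown_data & mb = entry.second;
            total += mb.model + mb.context + mb.compute; // bytes tracked for this buffer type
        }
        return total;
    }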
package/src/llama.cpp/src/llama-cparams.h

@@ -4,7 +4,7 @@
 
 #include <cstdint>
 
-#define LLAMA_MAX_SEQ 64
+#define LLAMA_MAX_SEQ 256
 
 struct llama_cparams {
     uint32_t n_ctx;           // context size used during inference
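
LLAMA_MAX_SEQ is the compile-time cap on parallel sequences per context, so this bump lets a context be configured for up to 256 sequences. A hedged sketch of requesting more than the old limit through the public API (helper name and values are illustrative, error handling omitted):

    #include "llama.h"

    // Hypothetical helper: create a context for 128 parallel sequences,
    // which the previous LLAMA_MAX_SEQ of 64 would have rejected.
    static struct llama_context * make_multi_seq_context(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.n_ctx     = 128 * 4096; // total KV budget shared by all sequences (illustrative)
        cparams.n_seq_max = 128;        // must stay <= LLAMA_MAX_SEQ (now 256)
        return llama_init_from_model(model, cparams);
    }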
package/src/llama.cpp/src/llama-kv-cache-iswa.cpp

@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
     return kv_swa->seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
+    for (const auto & buft_size : kv_swa->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     GGML_UNUSED(embd_all);
 
package/src/llama.cpp/src/llama-kv-cache-iswa.h

@@ -56,6 +56,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_kv_cache::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
package/src/llama.cpp/src/llama-kv-cache.h

@@ -121,6 +121,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-hybrid.cpp

@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
     return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
+    for (const auto & buft_size : mem_recr->memory_breakdown()) {
+        mb[buft_size.first] += buft_size.second;
+    }
+    return mb;
+}
+
 void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     GGML_UNUSED(flags);
 
package/src/llama.cpp/src/llama-memory-hybrid.h

@@ -68,6 +68,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     // state write/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
package/src/llama.cpp/src/llama-memory-recurrent.cpp

@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
     return result;
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
     do {
         balloc.split_reset();
package/src/llama.cpp/src/llama-memory-recurrent.h

@@ -4,6 +4,7 @@
 #include "llama-graph.h"
 #include "llama-memory.h"
 
+#include <map>
 #include <set>
 #include <vector>
 
@@ -50,6 +51,8 @@ public:
     llama_pos seq_pos_min(llama_seq_id seq_id) const override;
     llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
+
     bool prepare(const std::vector<llama_ubatch> & ubatches);
 
     // find a contiguous slot of memory cells and emplace the ubatch there
package/src/llama.cpp/src/llama-memory.h

@@ -2,6 +2,7 @@
 
 #include "llama.h"
 
+#include <map>
 #include <memory>
 #include <functional>
 
@@ -108,6 +109,8 @@ struct llama_memory_i {
     virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
     virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
 
+    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;
+
     //
     // state write/read
     //
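
Every concrete memory implementation now has to provide this accessor. A hedged sketch of what an out-of-tree llama_memory_i implementation might look like, mirroring the in-tree overrides earlier in this diff; the class name my_memory and its bufs member (a container of ggml_backend_buffer_ptr) are hypothetical:

    // Hypothetical implementation, same pattern as llama_kv_cache::memory_breakdown():
    std::map<ggml_backend_buffer_type_t, size_t> my_memory::memory_breakdown() const {
        std::map<ggml_backend_buffer_type_t, size_t> ret;
        for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
            // attribute each buffer's allocated bytes to the buffer type that owns it
            ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
        }
        return ret;
    }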
package/src/llama.cpp/src/llama-model.cpp

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_1_7B: return "1.7B";
         case LLM_TYPE_1_8B: return "1.8B";
         case LLM_TYPE_2B:   return "2B";
+        case LLM_TYPE_2_6B: return "2.6B";
         case LLM_TYPE_2_8B: return "2.8B";
         case LLM_TYPE_2_9B: return "2.9B";
         case LLM_TYPE_3B:   return "3B";
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 for (uint32_t il = 0; il < hparams.n_layer; ++il) {
                     hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
                 }
-                switch (hparams.n_embd) {
-                    case 1024: type = LLM_TYPE_350M; break;
-                    case 1536: type = LLM_TYPE_700M; break;
-                    case 2048: type = LLM_TYPE_1_2B; break;
+                switch (hparams.n_ff()) {
+                    case  4608: type = LLM_TYPE_350M; break;
+                    case  6912: type = LLM_TYPE_700M; break;
+                    case  8192: type = LLM_TYPE_1_2B; break;
+                    case 10752: type = LLM_TYPE_2_6B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -6003,6 +6005,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
+    std::map<ggml_backend_buffer_type_t, size_t> ret;
+    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
+        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    }
+    return ret;
+}
+
 uint64_t llama_model::n_elements() const {
     return pimpl->n_elements;
 }
package/src/llama.cpp/src/llama-model.h

@@ -7,6 +7,7 @@
 #include "llama-memory.h"
 #include "llama-vocab.h"
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -58,6 +59,7 @@ enum llm_type {
     LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
+    LLM_TYPE_2_6B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
     LLM_TYPE_3B,
@@ -452,10 +454,12 @@ struct llama_model {
 
     std::string desc() const;
 
-    size_t size() const;
+    size_t size() const; // file size
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
+
     // total number of parameters in the model
     uint64_t n_elements() const;