@fugood/llama.node 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/lib/binding.ts +1 -0
  2. package/package.json +14 -14
  3. package/src/LlamaCompletionWorker.cpp +24 -4
  4. package/src/LlamaCompletionWorker.h +7 -1
  5. package/src/LlamaContext.cpp +2 -1
  6. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  7. package/src/llama.cpp/common/arg.cpp +37 -0
  8. package/src/llama.cpp/common/common.cpp +22 -6
  9. package/src/llama.cpp/common/common.h +14 -1
  10. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  11. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  12. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  16. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  18. package/src/llama.cpp/include/llama.h +13 -48
  19. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  20. package/src/llama.cpp/src/llama-arch.h +16 -1
  21. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  22. package/src/llama.cpp/src/llama-batch.h +24 -18
  23. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  24. package/src/llama.cpp/src/llama-chat.h +2 -0
  25. package/src/llama.cpp/src/llama-context.cpp +134 -95
  26. package/src/llama.cpp/src/llama-context.h +13 -16
  27. package/src/llama.cpp/src/llama-cparams.h +3 -2
  28. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  29. package/src/llama.cpp/src/llama-graph.h +162 -126
  30. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  31. package/src/llama.cpp/src/llama-hparams.h +11 -1
  32. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  34. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  35. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  36. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  37. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  38. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  39. package/src/llama.cpp/src/llama-model.h +18 -4
  40. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  41. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  42. package/src/llama.cpp/src/llama-vocab.h +43 -0
  43. package/src/llama.cpp/src/unicode.cpp +207 -0
  44. package/src/llama.cpp/src/unicode.h +2 -0
package/lib/binding.ts CHANGED
@@ -131,6 +131,7 @@ export type LlamaCompletionResult = {
   tokens_evaluated: number
   truncated: boolean
   context_full: boolean
+  audio_tokens?: Array<number>
   timings: {
     prompt_n: number
     prompt_ms: number
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.3",
+  "version": "1.0.5",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.3",
-    "@fugood/node-llama-linux-arm64": "1.0.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.3",
-    "@fugood/node-llama-win32-x64": "1.0.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.3",
-    "@fugood/node-llama-win32-arm64": "1.0.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.3",
-    "@fugood/node-llama-darwin-x64": "1.0.3",
-    "@fugood/node-llama-darwin-arm64": "1.0.3"
+    "@fugood/node-llama-linux-x64": "1.0.5",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.5",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.5",
+    "@fugood/node-llama-linux-arm64": "1.0.5",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.5",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.5",
+    "@fugood/node-llama-win32-x64": "1.0.5",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.5",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.5",
+    "@fugood/node-llama-win32-arm64": "1.0.5",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.5",
+    "@fugood/node-llama-darwin-x64": "1.0.5",
+    "@fugood/node-llama-darwin-arm64": "1.0.5"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -32,12 +32,15 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     bool thinking_forced_open,
     std::string reasoning_format,
     const std::vector<std::string> &media_paths,
-    const std::vector<llama_token> &guide_tokens)
+    const std::vector<llama_token> &guide_tokens,
+    bool has_vocoder,
+    tts_type tts_type_val)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
-      _media_paths(media_paths), _guide_tokens(guide_tokens) {
+      _media_paths(media_paths), _guide_tokens(guide_tokens),
+      _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -153,8 +156,7 @@ void LlamaCompletionWorker::Execute() {
     // For multimodal input, n_past might already be set
     // Only decode text tokens if we have any input left
     if (n_input > 0) {
-      int ret =
-          llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
       if (ret < 0) {
         SetError("Failed to decode token, code: " + std::to_string(ret));
         break;
@@ -171,6 +173,15 @@ void LlamaCompletionWorker::Execute() {
     }
     _next_token_uses_guide_token = (new_token_id == 198);
     common_sampler_accept(sampling.get(), new_token_id, true);
+
+    // Collect audio tokens for TTS if vocoder is enabled
+    if (_has_vocoder) {
+      if ((_tts_type == OUTETTS_V0_2 || _tts_type == OUTETTS_V0_3) &&
+          (new_token_id >= 151672 && new_token_id <= 155772)) {
+        _result.audio_tokens.push_back(new_token_id);
+      }
+    }
+
     // prepare the next batch
     embd->emplace_back(new_token_id);
     auto token = common_token_to_piece(ctx, new_token_id);
@@ -291,6 +302,15 @@ void LlamaCompletionWorker::OnOK() {
     result.Set("content", Napi::String::New(env, content.c_str()));
   }
 
+  // Add audio_tokens if vocoder is enabled and we have audio tokens
+  if (_has_vocoder && !_result.audio_tokens.empty()) {
+    auto audio_tokens = Napi::Array::New(env, _result.audio_tokens.size());
+    for (size_t i = 0; i < _result.audio_tokens.size(); i++) {
+      audio_tokens.Set(i, Napi::Number::New(env, _result.audio_tokens[i]));
+    }
+    result.Set("audio_tokens", audio_tokens);
+  }
+
   auto ctx = _sess->context();
   const auto timings_token = llama_perf_context(ctx);
 
package/src/LlamaCompletionWorker.h CHANGED
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "common.hpp"
+#include "tts_utils.h"
 #include <atomic>
 #include <functional>
 #include <napi.h>
@@ -23,7 +24,9 @@ public:
       bool thinking_forced_open,
       std::string reasoning_format,
       const std::vector<std::string> &media_paths = {},
-      const std::vector<llama_token> &guide_tokens = {});
+      const std::vector<llama_token> &guide_tokens = {},
+      bool has_vocoder = false,
+      tts_type tts_type_val = UNKNOWN);
 
   ~LlamaCompletionWorker();
 
@@ -52,6 +55,8 @@ private:
   bool _stop = false;
   Napi::ThreadSafeFunction _tsfn;
   bool _next_token_uses_guide_token = true;
+  bool _has_vocoder;
+  tts_type _tts_type;
   struct {
     size_t tokens_evaluated = 0;
     size_t tokens_predicted = 0;
@@ -62,5 +67,6 @@ private:
     bool stopped_words = false;
     std::string stopping_word;
     bool stopped_limited = false;
+    std::vector<llama_token> audio_tokens;
   } _result;
 };
package/src/LlamaContext.cpp CHANGED
@@ -917,7 +917,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 
   auto *worker =
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
-                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens);
+                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
+                                _has_vocoder, _tts_type);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.swa_full = true;
         }
     ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
     add_opt(common_arg(
         {"--no-context-shift"},
         string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -3423,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     return ctx_arg;
 }
package/src/llama.cpp/common/common.cpp CHANGED
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
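For reference, a small self-contained sketch (not taken from the package) of how the new string_remove_suffix helper behaves; it restates the two helpers from the hunk above so it compiles on its own, and the example strings are invented:

    #include <cstdio>
    #include <string>
    #include <string_view>

    // copies of the helpers shown in the hunk above
    static bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
    }

    static bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
        bool has_suffix = string_ends_with(str, suffix);
        if (has_suffix) {
            str = str.substr(0, str.size() - suffix.size());
        }
        return has_suffix;
    }

    int main() {
        std::string piece = "final answer</s>";

        bool stripped = string_remove_suffix(piece, "</s>"); // suffix present: stripped in place
        printf("%d '%s'\n", stripped, piece.c_str());        // prints: 1 'final answer'

        stripped = string_remove_suffix(piece, "</s>");      // suffix absent: string untouched
        printf("%d '%s'\n", stripped, piece.c_str());        // prints: 0 'final answer'
        return 0;
    }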
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
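For reference, a hedged sketch (not from the package) of why a -INFINITY entry per end-of-generation token implements ignore_eos; logit_bias_entry and apply_logit_bias below are local stand-ins that only mirror the shape of llama_logit_bias, and the real application happens inside the llama.cpp sampler:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // stand-in mirroring llama_logit_bias: a token id plus an additive bias
    struct logit_bias_entry { int token; float bias; };

    // adding -INFINITY to a token's logit drives its probability to zero,
    // so a biased EOG token can never be sampled
    static void apply_logit_bias(std::vector<float> & logits, const std::vector<logit_bias_entry> & biases) {
        for (const auto & lb : biases) {
            logits[lb.token] += lb.bias;
        }
    }

    int main() {
        std::vector<float> logits = { 0.1f, 2.0f, 0.5f };              // toy vocab of 3 tokens
        std::vector<logit_bias_entry> eog_bias = { { 1, -INFINITY } }; // pretend token 1 is EOG
        apply_logit_bias(logits, eog_bias);
        printf("%f %f %f\n", logits[0], logits[1], logits[2]);         // token 1 is now -inf
        return 0;
    }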
@@ -1158,6 +1173,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf    = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full   = params.swa_full;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
package/src/llama.cpp/common/common.h CHANGED
@@ -81,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;
 
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // print the parameters into a string
     std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+struct common_params_diffusion {
+    int32_t steps       = 64;    // number of diffusion steps
+    float   eps         = 1e-3f; // epsilon for timesteps
+    int32_t algorithm   = 0;     // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    float   alg_temp    = 0.0f;  // algorithm temperature
+    bool    visual_mode = false; // show progressive diffusion on screen
+};
+
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -269,6 +279,7 @@ struct common_params {
     struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
     struct common_params_vocoder     vocoder;
+    struct common_params_diffusion   diffusion;
 
     struct common_params_model model;
 
@@ -331,6 +342,7 @@ struct common_params {
     bool no_perf    = false; // disable performance metrics
     bool ctx_shift  = true;  // context shift on inifinite text generation
     bool swa_full   = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true;          // use mmap for faster loads
@@ -523,6 +535,7 @@ static bool string_starts_with(const std::string & str,
 
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -181,6 +181,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE          "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS         "ggml: run Vulkan tests" OFF)
+option(GGML_WEBGPU                   "ggml: use WebGPU" OFF)
+option(GGML_WEBGPU_DEBUG             "ggml: enable WebGPU debug output" OFF)
 option(GGML_METAL                    "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16           "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG             "ggml: disable Metal debugging" OFF)
@@ -270,6 +272,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-rpc.h
     include/ggml-sycl.h
     include/ggml-vulkan.h
+    include/ggml-webgpu.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
package/src/llama.cpp/ggml/include/ggml-webgpu.h ADDED
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_WEBGPU_NAME "WebGPU"
+
+// Needed for examples in ggml
+GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
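For reference, a minimal sketch (not from the package) of how the two declarations above would be used, assuming ggml was built with GGML_WEBGPU=ON; ggml_backend_name and ggml_backend_free come from ggml-backend.h:

    #include <cstdio>
    #include "ggml-backend.h"
    #include "ggml-webgpu.h"

    int main() {
        // initialize the WebGPU backend, like any other ggml backend
        ggml_backend_t backend = ggml_backend_webgpu_init();
        if (backend == NULL) {
            fprintf(stderr, "WebGPU backend unavailable\n");
            return 1;
        }
        printf("initialized backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }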
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -1297,6 +1297,19 @@ extern "C" {
             struct ggml_tensor  * a,
             float                 s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s,
+            float                 b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
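For reference, a minimal sketch (not from the package) that builds the new fused op in a throwaway ggml context; only graph construction is shown, and executing the graph would go through the usual ggml compute / ggml-backend path:

    #include "ggml.h"

    int main() {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

        // x = 2*a + 1 as a single node, instead of a ggml_scale followed by a ggml_add
        struct ggml_tensor * x = ggml_scale_bias(ctx, a, 2.0f, 1.0f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, x);

        ggml_free(ctx);
        return 0;
    }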
package/src/llama.cpp/ggml/src/CMakeLists.txt CHANGED
@@ -370,6 +370,7 @@ ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
+ggml_add_backend(WebGPU)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)