@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/LlamaCompletionWorker.cpp

@@ -1,7 +1,6 @@
 #include "LlamaCompletionWorker.h"
 #include "LlamaContext.h"
 
-
 size_t findStoppingStrings(const std::string &text,
                            const size_t last_token_size,
                            const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
 LlamaCompletionWorker::LlamaCompletionWorker(
     const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
     Napi::Function callback, common_params params,
-    std::vector<std::string> stop_words,
-    int32_t chat_format,
-    std::vector<std::string> image_paths)
+    std::vector<std::string> stop_words, int32_t chat_format,
+    const std::vector<std::string> &media_paths,
+    const std::vector<llama_token> &guide_tokens)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
-      _image_paths(image_paths) {
+      _media_paths(media_paths), _guide_tokens(guide_tokens) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
                                           "LlamaCompletionCallback", 0, 1);
@@ -64,34 +63,29 @@ void LlamaCompletionWorker::Execute() {
   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                             common_sampler_free};
 
-  // Process images if any are provided
-  if (!_image_paths.empty()) {
-    const auto* mtmd_ctx = _sess->get_mtmd_ctx();
-
+  // Process media if any are provided
+  if (!_media_paths.empty()) {
+    const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
     if (mtmd_ctx != nullptr) {
-      // Process the images and get the tokens
+      // Process the media and get the tokens
       try {
-        n_cur = process_image_prompt(
-          ctx,
-          mtmd_ctx,
-          _sess,
-          _params,
-          _image_paths
-        );
-      } catch (const std::exception& e) {
+        n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+      } catch (const std::exception &e) {
         SetError(e.what());
         _sess->get_mutex().unlock();
         return;
       }
-
+
       if (n_cur <= 0) {
-        SetError("Failed to process images");
+        SetError("Failed to process media");
         _sess->get_mutex().unlock();
         return;
       }
 
-      fprintf(stdout, "[DEBUG] Image processing successful, n_cur=%zu, tokens=%zu\n",
-              n_cur, _sess->tokens_ptr()->size());
+      fprintf(stdout,
+              "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+              n_cur, _sess->tokens_ptr()->size());
 
       n_input = _sess->tokens_ptr()->size();
       if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
     }
   } else {
     // Text-only path
-    std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+    std::vector<llama_token> prompt_tokens =
+        ::common_tokenize(ctx, _params.prompt, add_bos);
     n_input = prompt_tokens.size();
-
+
     if (_sess->tokens_ptr()->size() > 0) {
       n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
       if (n_cur == n_input) {
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
       _result.context_full = true;
       break;
     }
-
+
     const int n_left = n_cur - n_keep - 1;
     const int n_discard = n_left / 2;
 
@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
       n_cur -= n_discard;
       _result.truncated = true;
     }
-
+
    // For multimodal input, n_past might already be set
    // Only decode text tokens if we have any input left
    if (n_input > 0) {
-      int ret = llama_decode(
-          ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+      int ret =
+          llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
      if (ret < 0) {
        SetError("Failed to decode token, code: " + std::to_string(ret));
        break;
      }
    }
-
+
    // sample the next token
-    const llama_token new_token_id =
-        common_sampler_sample(sampling.get(), ctx, -1);
+    llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+    if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+        !llama_vocab_is_control(vocab, new_token_id) &&
+        !llama_vocab_is_eog(vocab, new_token_id)) {
+      new_token_id = _guide_tokens[0];
+      _guide_tokens.erase(_guide_tokens.begin());
+    }
+    _next_token_uses_guide_token = (new_token_id == 198);
    common_sampler_accept(sampling.get(), new_token_id, true);
    // prepare the next batch
    embd->emplace_back(new_token_id);
@@ -214,20 +215,15 @@ void LlamaCompletionWorker::Execute() {
 void LlamaCompletionWorker::OnOK() {
   auto env = Napi::AsyncWorker::Env();
   auto result = Napi::Object::New(env);
-  result.Set("tokens_evaluated", Napi::Number::New(env,
-  _result.tokens_evaluated));
+  result.Set("tokens_evaluated",
+             Napi::Number::New(env, _result.tokens_evaluated));
   result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
                                                    _result.tokens_predicted));
-  result.Set("truncated",
-             Napi::Boolean::New(env, _result.truncated));
-  result.Set("context_full",
-             Napi::Boolean::New(env, _result.context_full));
-  result.Set("text",
-             Napi::String::New(env, _result.text.c_str()));
-  result.Set("stopped_eos",
-             Napi::Boolean::New(env, _result.stopped_eos));
-  result.Set("stopped_words",
-             Napi::Boolean::New(env, _result.stopped_words));
+  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+  result.Set("text", Napi::String::New(env, _result.text.c_str()));
+  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
   result.Set("stopping_word",
              Napi::String::New(env, _result.stopping_word.c_str()));
   result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
   std::string content;
   if (!_stop) {
     try {
-      common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+      common_chat_msg message = common_chat_parse(
+          _result.text, static_cast<common_chat_format>(_chat_format));
       if (!message.reasoning_content.empty()) {
         reasoning_content = message.reasoning_content;
       }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
     result.Set("tool_calls", tool_calls);
   }
   if (!reasoning_content.empty()) {
-    result.Set("reasoning_content", Napi::String::New(env, reasoning_content.c_str()));
+    result.Set("reasoning_content",
+               Napi::String::New(env, reasoning_content.c_str()));
   }
   if (!content.empty()) {
     result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
   const auto timings_token = llama_perf_context(ctx);
 
   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
-  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
-  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
-  timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
-  timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
-  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
-  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
-  timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
-  timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+  timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                  timings_token.n_p_eval));
+  timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                   timings_token.t_p_eval_ms));
+  timingsResult.Set(
+      "prompt_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_p_eval_ms / timings_token.n_p_eval));
+  timingsResult.Set("prompt_per_second",
+                    Napi::Number::New(Napi::AsyncWorker::Env(),
+                                      1e3 / timings_token.t_p_eval_ms *
+                                          timings_token.n_p_eval));
+  timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                     timings_token.n_eval));
+  timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+                                                      timings_token.t_eval_ms));
+  timingsResult.Set(
+      "predicted_per_token_ms",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        timings_token.t_eval_ms / timings_token.n_eval));
+  timingsResult.Set(
+      "predicted_per_second",
+      Napi::Number::New(Napi::AsyncWorker::Env(),
+                        1e3 / timings_token.t_eval_ms * timings_token.n_eval));
 
   result.Set("timings", timingsResult);
-
+
   Napi::Promise::Deferred::Resolve(result);
 }
 
package/src/LlamaCompletionWorker.h

@@ -20,19 +20,16 @@ public:
                         Napi::Function callback, common_params params,
                         std::vector<std::string> stop_words,
                         int32_t chat_format,
-                        std::vector<std::string> image_paths = {});
+                        const std::vector<std::string> &media_paths = {},
+                        const std::vector<llama_token> &guide_tokens = {});
 
   ~LlamaCompletionWorker();
 
   Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }
 
-  void OnComplete(std::function<void()> cb) {
-    _onComplete = cb;
-  }
+  void OnComplete(std::function<void()> cb) { _onComplete = cb; }
 
-  void SetStop() {
-    _stop = true;
-  }
+  void SetStop() { _stop = true; }
 
 protected:
   void Execute() override;
@@ -44,11 +41,13 @@ private:
   common_params _params;
   std::vector<std::string> _stop_words;
   int32_t _chat_format;
-  std::vector<std::string> _image_paths;
+  std::vector<std::string> _media_paths;
+  std::vector<llama_token> _guide_tokens;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _stop = false;
   Napi::ThreadSafeFunction _tsfn;
+  bool _next_token_uses_guide_token = true;
   struct {
     size_t tokens_evaluated = 0;
     size_t tokens_predicted = 0;
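
For reference, a minimal standalone sketch of the guide-token pattern introduced in the Execute() hunk above: after each sampled token, the next pending guide token may override the sample, and the override re-arms whenever a newline token is emitted (token id 198 in the vocabulary this code targets). The GuidedSampler type, the is_control/is_eog stand-ins, and the concrete token ids are illustrative assumptions, not part of the package; in the package itself the guide tokens arrive through the new guide_tokens constructor parameter and the _guide_tokens / _next_token_uses_guide_token members shown in the diff.

// Illustrative sketch only, not the package's actual code.
#include <cstdio>
#include <vector>

using token_id = int;

// Stand-ins for the llama_vocab_is_control / llama_vocab_is_eog checks.
static bool is_control(token_id t) { return t < 0; }
static bool is_eog(token_id t) { return t == 2; }

struct GuidedSampler {
  std::vector<token_id> guide_tokens; // tokens to force, in order
  bool next_uses_guide = true;        // armed at the start of generation

  // Optionally replace the sampled token with the next pending guide token,
  // then re-arm the override if the emitted token is a newline.
  token_id apply(token_id sampled) {
    if (next_uses_guide && !guide_tokens.empty() && !is_control(sampled) &&
        !is_eog(sampled)) {
      sampled = guide_tokens.front();
      guide_tokens.erase(guide_tokens.begin());
    }
    next_uses_guide = (sampled == 198); // 198 = '\n' in the assumed vocab
    return sampled;
  }
};

int main() {
  GuidedSampler g{{1001, 1002}};
  // Pretend the model sampled 500, then 198 (newline), then 600.
  for (token_id t : {500, 198, 600}) {
    std::printf("emit %d\n", g.apply(t));
  }
  // Prints: emit 1001, emit 198, emit 1002
  return 0;
}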