@fugood/llama.node 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -102,6 +102,10 @@ file(
  "src/LoadSessionWorker.h"
  "src/SaveSessionWorker.cpp"
  "src/SaveSessionWorker.h"
+ "src/DecodeAudioTokenWorker.cpp"
+ "src/DecodeAudioTokenWorker.h"
+ "src/tts_utils.cpp"
+ "src/tts_utils.h"
  )

  add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
5 binary files changed (contents not shown)
package/lib/binding.ts CHANGED
@@ -114,6 +114,11 @@ export type LlamaCompletionOptions = {
  * Supports both file paths and base64 data URLs.
  */
  media_paths?: string | string[]
+ /**
+ * Guide tokens to use for audio completion.
+ * Help prevent hallucinations by forcing the TTS to use the correct words.
+ */
+ guide_tokens?: Int32Array
  }

  export type LlamaCompletionResult = {
@@ -208,6 +213,47 @@ export interface LlamaContext {
  */
  releaseMultimodal(): Promise<void>

+ /**
+ * Load a vocoder model
+ * @param path Path to the vocoder model
+ * @returns Promise resolving to true if loading was successful
+ */
+ initVocoder(path: string): Promise<boolean>
+
+ /**
+ * Unload the vocoder model
+ * @returns Promise resolving to true if unloading was successful
+ */
+ releaseVocoder(): Promise<void>
+
+ /**
+ * Check if the vocoder model is enabled
+ * @returns Promise resolving to true if the vocoder model is enabled
+ */
+ isVocoderEnabled(): boolean
+
+ /**
+ * Get the formatted prompt for audio completion
+ * @param speaker Speaker name or null
+ * @param text Text to complete
+ * @returns Formatted audio completion
+ */
+ getFormattedAudioCompletion(speaker: string|null, text: string): string
+
+ /**
+ * Get guide tokens for audio completion
+ * @param text Text to complete
+ * @returns Guide tokens
+ */
+ getAudioCompletionGuideTokens(text: string): Int32Array
+
+ /**
+ * Decode audio tokens to audio data
+ * @param tokens Tokens to decode
+ * @returns Decoded audio tokens
+ */
+ decodeAudioTokens(tokens: Int32Array): Promise<Float32Array>
+
  // static
  loadModelInfo(path: string, skip: string[]): Promise<Object>
  toggleNativeLog(
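
Taken together, these typings describe a text-to-speech pipeline on LlamaContext: load a vocoder, format a TTS prompt, generate with guide tokens, then decode the produced audio tokens to PCM. A minimal usage sketch in TypeScript, based only on the typings above; the model and vocoder file names, the completion option values, and the audio-token extraction step are illustrative assumptions, not anything this diff specifies:

    import { loadModel } from '@fugood/llama.node'

    async function speak(text: string) {
      // Assumed files: an OuteTTS-style TTS model plus a matching vocoder (e.g. WavTokenizer).
      const ctx = await loadModel({ model: './OuteTTS-0.2-500M-Q8_0.gguf' })

      // Attach the vocoder that decodeAudioTokens() will use.
      await ctx.initVocoder('./WavTokenizer-Large-75-Q8_0.gguf')

      // Build the TTS prompt and the guide tokens for the same text.
      const prompt = ctx.getFormattedAudioCompletion(null, text)
      const guide_tokens = ctx.getAudioCompletionGuideTokens(text)

      // guide_tokens steers sampling toward the requested words
      // (see the LlamaCompletionWorker changes further down in this diff).
      const result = await ctx.completion({ prompt, guide_tokens, n_predict: 4096 })

      // Extracting the generated audio token ids from `result` is not covered by this diff;
      // `audioTokens` stands in for that step.
      const audioTokens = new Int32Array([/* audio token ids parsed from the completion output */])
      const pcm = await ctx.decodeAudioTokens(audioTokens) // Float32Array of PCM samples

      await ctx.releaseVocoder()
      return pcm
    }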
package/lib/index.js CHANGED
@@ -204,6 +204,24 @@ class LlamaContextWrapper {
  getMultimodalSupport() {
  return this.ctx.getMultimodalSupport();
  }
+ initVocoder(path) {
+ return this.ctx.initVocoder(path);
+ }
+ releaseVocoder() {
+ return this.ctx.releaseVocoder();
+ }
+ isVocoderEnabled() {
+ return this.ctx.isVocoderEnabled();
+ }
+ getFormattedAudioCompletion(speaker, text) {
+ return this.ctx.getFormattedAudioCompletion(speaker, text);
+ }
+ getAudioCompletionGuideTokens(text) {
+ return this.ctx.getAudioCompletionGuideTokens(text);
+ }
+ decodeAudioTokens(tokens) {
+ return this.ctx.decodeAudioTokens(tokens);
+ }
  }
  const loadModel = (options) => __awaiter(void 0, void 0, void 0, function* () {
  var _a, _b;
package/lib/index.ts CHANGED
@@ -269,6 +269,30 @@ class LlamaContextWrapper {
  }> {
  return this.ctx.getMultimodalSupport()
  }
+
+ initVocoder(path: string): Promise<boolean> {
+ return this.ctx.initVocoder(path)
+ }
+
+ releaseVocoder(): Promise<void> {
+ return this.ctx.releaseVocoder()
+ }
+
+ isVocoderEnabled(): boolean {
+ return this.ctx.isVocoderEnabled()
+ }
+
+ getFormattedAudioCompletion(speaker: string|null, text: string): string {
+ return this.ctx.getFormattedAudioCompletion(speaker, text)
+ }
+
+ getAudioCompletionGuideTokens(text: string): Int32Array {
+ return this.ctx.getAudioCompletionGuideTokens(text)
+ }
+
+ decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array> {
+ return this.ctx.decodeAudioTokens(tokens)
+ }
  }

  export const loadModel = async (
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.5.0",
+ "version": "0.6.0",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
package/src/DecodeAudioTokenWorker.cpp ADDED
@@ -0,0 +1,40 @@
+ #include "DecodeAudioTokenWorker.h"
+ #include "tts_utils.h"
+ #include <vector>
+
+ DecodeAudioTokenWorker::DecodeAudioTokenWorker(
+ const Napi::CallbackInfo &info, llama_model *model, llama_context *ctx,
+ int n_threads, const std::vector<llama_token> &tokens)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _model(model), _ctx(ctx),
+ _n_threads(n_threads), _tokens(tokens) {}
+
+ void DecodeAudioTokenWorker::Execute() {
+ const int n_codes = _tokens.size();
+ llama_batch batch = llama_batch_init(n_codes, 0, 1);
+ for (size_t i = 0; i < _tokens.size(); ++i) {
+ common_batch_add(batch, _tokens[i], i, {0}, true);
+ }
+ if (batch.n_tokens != n_codes) {
+ SetError("batch.n_tokens != n_codes");
+ return;
+ }
+ if (llama_encode(_ctx, batch) != 0) {
+ SetError("llama_encode() failed");
+ return;
+ }
+ llama_synchronize(_ctx);
+ const int n_embd = llama_model_n_embd(_model);
+ const float *embd = llama_get_embeddings(_ctx);
+ _result = embd_to_audio(embd, n_codes, n_embd, _n_threads);
+ }
+
+ void DecodeAudioTokenWorker::OnOK() {
+ auto result =
+ Napi::Float32Array::New(Napi::AsyncWorker::Env(), _result.size());
+ memcpy(result.Data(), _result.data(), _result.size() * sizeof(float));
+ Napi::Promise::Deferred::Resolve(result);
+ }
+
+ void DecodeAudioTokenWorker::OnError(const Napi::Error &err) {
+ Napi::Promise::Deferred::Reject(err.Value());
+ }
package/src/DecodeAudioTokenWorker.h ADDED
@@ -0,0 +1,22 @@
+ #include "common.hpp"
+ #include <vector>
+
+ class DecodeAudioTokenWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ DecodeAudioTokenWorker(const Napi::CallbackInfo &info, llama_model *model,
+ llama_context *ctx, int n_threads,
+ const std::vector<llama_token> &tokens);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ llama_model *_model;
+ llama_context *_ctx;
+ int _n_threads;
+ std::vector<llama_token> _tokens;
+ std::vector<float> _result;
+ };
package/src/EmbeddingWorker.cpp CHANGED
@@ -2,8 +2,10 @@
  #include "LlamaContext.h"

  EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
- LlamaSessionPtr &sess, std::string text, common_params &params)
- : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _params(params) {}
+ LlamaSessionPtr &sess, std::string text,
+ common_params &params)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text),
+ _params(params) {}

  void EmbeddingWorker::Execute() {
  llama_kv_self_clear(_sess->context());
@@ -17,8 +19,7 @@ void EmbeddingWorker::Execute() {
  do {
  auto ctx = _sess->context();
  int ret =
- llama_decode(ctx,
- llama_batch_get_one(tokens.data(), tokens.size()));
+ llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
  if (ret < 0) {
  SetError("Failed to inference, code: " + std::to_string(ret));
  break;
@@ -37,7 +38,8 @@ void EmbeddingWorker::Execute() {
  }
  _result.embedding.resize(n_embd);
  std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
- common_embd_normalize(embedding.data(), out.data(), n_embd, _params.embd_normalize);
+ common_embd_normalize(embedding.data(), out.data(), n_embd,
+ _params.embd_normalize);
  memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
  } while (false);
  }
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -1,7 +1,6 @@
  #include "LlamaCompletionWorker.h"
  #include "LlamaContext.h"

-
  size_t findStoppingStrings(const std::string &text,
  const size_t last_token_size,
  const std::vector<std::string> &stop_words) {
@@ -27,12 +26,12 @@ size_t findStoppingStrings(const std::string &text,
  LlamaCompletionWorker::LlamaCompletionWorker(
  const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
  Napi::Function callback, common_params params,
- std::vector<std::string> stop_words,
- int32_t chat_format,
- std::vector<std::string> media_paths)
+ std::vector<std::string> stop_words, int32_t chat_format,
+ const std::vector<std::string> &media_paths,
+ const std::vector<llama_token> &guide_tokens)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
- _media_paths(media_paths) {
+ _media_paths(media_paths), _guide_tokens(guide_tokens) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
  "LlamaCompletionCallback", 0, 1);
@@ -66,32 +65,27 @@ void LlamaCompletionWorker::Execute() {

  // Process media if any are provided
  if (!_media_paths.empty()) {
- const auto* mtmd_ctx = _sess->get_mtmd_ctx();
-
+ const auto *mtmd_ctx = _sess->get_mtmd_ctx();
+
  if (mtmd_ctx != nullptr) {
  // Process the media and get the tokens
  try {
- n_cur = processMediaPrompt(
- ctx,
- mtmd_ctx,
- _sess,
- _params,
- _media_paths
- );
- } catch (const std::exception& e) {
+ n_cur = processMediaPrompt(ctx, mtmd_ctx, _sess, _params, _media_paths);
+ } catch (const std::exception &e) {
  SetError(e.what());
  _sess->get_mutex().unlock();
  return;
  }
-
+
  if (n_cur <= 0) {
  SetError("Failed to process media");
  _sess->get_mutex().unlock();
  return;
  }

- fprintf(stdout, "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
- n_cur, _sess->tokens_ptr()->size());
+ fprintf(stdout,
+ "[DEBUG] Media processing successful, n_cur=%zu, tokens=%zu\n",
+ n_cur, _sess->tokens_ptr()->size());

  n_input = _sess->tokens_ptr()->size();
  if (n_cur == n_input) {
@@ -105,9 +99,10 @@ void LlamaCompletionWorker::Execute() {
  }
  } else {
  // Text-only path
- std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, _params.prompt, add_bos);
+ std::vector<llama_token> prompt_tokens =
+ ::common_tokenize(ctx, _params.prompt, add_bos);
  n_input = prompt_tokens.size();
-
+
  if (_sess->tokens_ptr()->size() > 0) {
  n_cur = common_tokens_part(*(_sess->tokens_ptr()), prompt_tokens);
  if (n_cur == n_input) {
@@ -132,7 +127,7 @@ void LlamaCompletionWorker::Execute() {
  _result.context_full = true;
  break;
  }
-
+
  const int n_left = n_cur - n_keep - 1;
  const int n_discard = n_left / 2;

@@ -147,21 +142,27 @@ void LlamaCompletionWorker::Execute() {
  n_cur -= n_discard;
  _result.truncated = true;
  }
-
+
  // For multimodal input, n_past might already be set
  // Only decode text tokens if we have any input left
  if (n_input > 0) {
- int ret = llama_decode(
- ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
+ int ret =
+ llama_decode(ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
  if (ret < 0) {
  SetError("Failed to decode token, code: " + std::to_string(ret));
  break;
  }
  }
-
+
  // sample the next token
- const llama_token new_token_id =
- common_sampler_sample(sampling.get(), ctx, -1);
+ llama_token new_token_id = common_sampler_sample(sampling.get(), ctx, -1);
+ if (_next_token_uses_guide_token && !_guide_tokens.empty() &&
+ !llama_vocab_is_control(vocab, new_token_id) &&
+ !llama_vocab_is_eog(vocab, new_token_id)) {
+ new_token_id = _guide_tokens[0];
+ _guide_tokens.erase(_guide_tokens.begin());
+ }
+ _next_token_uses_guide_token = (new_token_id == 198);
  common_sampler_accept(sampling.get(), new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
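
The hunk above is the core of guide-token support: right after a newline is emitted, the sampler's pick is replaced with the next guide token, unless the sampled token is a control or end-of-generation token, so the generated audio codes stay aligned with the words the caller asked for. Token id 198 appears to be the newline token of the OuteTTS-style vocabulary; that reading and the helper names below are assumptions in this conceptual TypeScript restatement of the same rule (it is not code from the package):

    // Conceptual model of the override in LlamaCompletionWorker::Execute().
    const NEWLINE_TOKEN_ID = 198 // value hard-coded in the C++ above

    function pickNextToken(
      sampled: number,                        // token chosen by the regular sampler
      guideTokens: number[],                  // remaining guide tokens, consumed front to back
      useGuideToken: boolean,                 // true at the start and right after a newline
      isControlOrEog: (t: number) => boolean, // stand-in for llama_vocab_is_control / is_eog
    ): { token: number; useGuideTokenNext: boolean } {
      let token = sampled
      if (useGuideToken && guideTokens.length > 0 && !isControlOrEog(sampled)) {
        token = guideTokens.shift()!          // force the next expected word token
      }
      return { token, useGuideTokenNext: token === NEWLINE_TOKEN_ID }
    }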
@@ -214,20 +215,15 @@ void LlamaCompletionWorker::Execute() {
  void LlamaCompletionWorker::OnOK() {
  auto env = Napi::AsyncWorker::Env();
  auto result = Napi::Object::New(env);
- result.Set("tokens_evaluated", Napi::Number::New(env,
- _result.tokens_evaluated));
+ result.Set("tokens_evaluated",
+ Napi::Number::New(env, _result.tokens_evaluated));
  result.Set("tokens_predicted", Napi::Number::New(Napi::AsyncWorker::Env(),
  _result.tokens_predicted));
- result.Set("truncated",
- Napi::Boolean::New(env, _result.truncated));
- result.Set("context_full",
- Napi::Boolean::New(env, _result.context_full));
- result.Set("text",
- Napi::String::New(env, _result.text.c_str()));
- result.Set("stopped_eos",
- Napi::Boolean::New(env, _result.stopped_eos));
- result.Set("stopped_words",
- Napi::Boolean::New(env, _result.stopped_words));
+ result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
+ result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+ result.Set("text", Napi::String::New(env, _result.text.c_str()));
+ result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
+ result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
  result.Set("stopping_word",
  Napi::String::New(env, _result.stopping_word.c_str()));
  result.Set("stopped_limited",
@@ -238,7 +234,8 @@ void LlamaCompletionWorker::OnOK() {
  std::string content;
  if (!_stop) {
  try {
- common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+ common_chat_msg message = common_chat_parse(
+ _result.text, static_cast<common_chat_format>(_chat_format));
  if (!message.reasoning_content.empty()) {
  reasoning_content = message.reasoning_content;
  }
@@ -266,7 +263,8 @@ void LlamaCompletionWorker::OnOK() {
  result.Set("tool_calls", tool_calls);
  }
  if (!reasoning_content.empty()) {
- result.Set("reasoning_content", Napi::String::New(env, reasoning_content.c_str()));
+ result.Set("reasoning_content",
+ Napi::String::New(env, reasoning_content.c_str()));
  }
  if (!content.empty()) {
  result.Set("content", Napi::String::New(env, content.c_str()));
@@ -276,17 +274,33 @@ void LlamaCompletionWorker::OnOK() {
  const auto timings_token = llama_perf_context(ctx);

  auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
- timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
- timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
- timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
- timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
- timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
- timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
- timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
- timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+ timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.n_p_eval));
+ timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_p_eval_ms));
+ timingsResult.Set(
+ "prompt_per_token_ms",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_p_eval_ms / timings_token.n_p_eval));
+ timingsResult.Set("prompt_per_second",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ 1e3 / timings_token.t_p_eval_ms *
+ timings_token.n_p_eval));
+ timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.n_eval));
+ timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_eval_ms));
+ timingsResult.Set(
+ "predicted_per_token_ms",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ timings_token.t_eval_ms / timings_token.n_eval));
+ timingsResult.Set(
+ "predicted_per_second",
+ Napi::Number::New(Napi::AsyncWorker::Env(),
+ 1e3 / timings_token.t_eval_ms * timings_token.n_eval));

  result.Set("timings", timingsResult);
-
+
  Napi::Promise::Deferred::Resolve(result);
  }

package/src/LlamaCompletionWorker.h CHANGED
@@ -20,19 +20,16 @@ public:
  Napi::Function callback, common_params params,
  std::vector<std::string> stop_words,
  int32_t chat_format,
- std::vector<std::string> media_paths = {});
+ const std::vector<std::string> &media_paths = {},
+ const std::vector<llama_token> &guide_tokens = {});

  ~LlamaCompletionWorker();

  Napi::Promise GetPromise() { return Napi::Promise::Deferred::Promise(); }

- void OnComplete(std::function<void()> cb) {
- _onComplete = cb;
- }
+ void OnComplete(std::function<void()> cb) { _onComplete = cb; }

- void SetStop() {
- _stop = true;
- }
+ void SetStop() { _stop = true; }

  protected:
  void Execute() override;
@@ -45,10 +42,12 @@ private:
  std::vector<std::string> _stop_words;
  int32_t _chat_format;
  std::vector<std::string> _media_paths;
+ std::vector<llama_token> _guide_tokens;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _stop = false;
  Napi::ThreadSafeFunction _tsfn;
+ bool _next_token_uses_guide_token = true;
  struct {
  size_t tokens_evaluated = 0;
  size_t tokens_predicted = 0;