@fugood/llama.node 1.1.2 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -114,6 +114,9 @@ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
  set(LLAMA_CURL OFF CACHE BOOL "Build curl")

  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
+
+ add_definitions(-DGGML_MAX_NAME=80)
+
  add_subdirectory("src/llama.cpp")
  add_subdirectory("src/llama.cpp/tools/mtmd")

package/lib/binding.ts CHANGED
@@ -55,6 +55,10 @@ export type LlamaModelOptions = {
   * Try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix.
   */
  kv_unified?: boolean
+ /**
+  * Use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+  */
+ swa_full?: boolean
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -102,6 +106,8 @@ export type LlamaCompletionOptions = {
  dry_base?: number
  dry_allowed_length?: number
  dry_penalty_last_n?: number
+ dry_sequence_breakers?: string[]
+ top_n_sigma?: number
  n_predict?: number
  max_length?: number
  max_tokens?: number
@@ -111,6 +117,9 @@ export type LlamaCompletionOptions = {
  grammar_lazy?: boolean
  grammar_triggers?: { type: number; value: string; token?: number }[]
  preserved_tokens?: string[]
+ json_schema?: string
+ logit_bias?: number[][]
+ ignore_eos?: boolean
  /**
   * Path(s) to media file(s) to process before generating text.
   * When provided, the media will be processed and added to the context.
@@ -134,6 +143,7 @@ export type LlamaCompletionResult = {
  tokens_evaluated: number
  truncated: boolean
  context_full: boolean
+ interrupted: boolean
  audio_tokens?: Array<number>
  timings: {
    prompt_n: number
@@ -265,6 +275,9 @@ export interface LlamaContext {
    parallel_tool_calls?: boolean
    tool_choice?: string
    enable_thinking?: boolean
+   add_generation_prompt?: boolean
+   now?: string | number
+   chat_template_kwargs?: Record<string, string>
  },
  ): JinjaFormattedChatResult | string
  completion(
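Taken together, the typing changes add a full-size SWA cache toggle at the model level, extra sampling controls at the completion level, and an interrupted flag on the result. A minimal usage sketch in TypeScript follows; the loadModel entry point, model path, prompt, and option values are assumptions for illustration (only the option names themselves come from the diff above), and logit_bias is read here as [token_id, bias] pairs:

import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: './models/example.gguf', // illustrative path
  n_ctx: 4096,
  swa_full: true, // new: use full-size SWA cache
})

const result = await context.completion({
  prompt: 'Hello',
  n_predict: 64,
  dry_sequence_breakers: ['\n', ':'], // new: DRY sampling sequence breakers
  top_n_sigma: 1.5,                   // new: top-n-sigma sampling
  ignore_eos: false,                  // new
  logit_bias: [[13, -1.0]],           // new: assumed [token_id, bias] pairs
})

if (result.interrupted) {
  // new result flag: generation was stopped before it finished
  console.log('completion interrupted')
}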
package/lib/index.js CHANGED
@@ -145,6 +145,9 @@ class LlamaContextWrapper {
      parallel_tool_calls: params === null || params === void 0 ? void 0 : params.parallel_tool_calls,
      tool_choice: params === null || params === void 0 ? void 0 : params.tool_choice,
      enable_thinking: (_a = params === null || params === void 0 ? void 0 : params.enable_thinking) !== null && _a !== void 0 ? _a : true,
+     add_generation_prompt: params === null || params === void 0 ? void 0 : params.add_generation_prompt,
+     now: params === null || params === void 0 ? void 0 : params.now,
+     chat_template_kwargs: params === null || params === void 0 ? void 0 : params.chat_template_kwargs,
    });
    if (!useJinja) {
      return {
package/lib/index.ts CHANGED
@@ -166,6 +166,9 @@ class LlamaContextWrapper {
      parallel_tool_calls?: boolean
      tool_choice?: string,
      enable_thinking?: boolean,
+     add_generation_prompt?: boolean,
+     now?: string | number,
+     chat_template_kwargs?: Record<string, string>,
    },
  ): FormattedChatResult {
    const {
@@ -186,6 +189,9 @@ class LlamaContextWrapper {
      parallel_tool_calls: params?.parallel_tool_calls,
      tool_choice: params?.tool_choice,
      enable_thinking: params?.enable_thinking ?? true,
+     add_generation_prompt: params?.add_generation_prompt,
+     now: params?.now,
+     chat_template_kwargs: params?.chat_template_kwargs,
    })

    if (!useJinja) {
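The TypeScript wrapper now forwards three additional chat-templating inputs to the native layer. A hedged sketch of how a caller might pass them; the method signature and all values shown are assumptions for illustration, only the option names come from the diff above:

// Sketch only: signature and values are illustrative, not taken from the package docs.
const formatted = context.getFormattedChat(
  [{ role: 'user', content: 'What day is it today?' }],
  null, // use the model's built-in chat template
  {
    add_generation_prompt: true,        // new: control the trailing assistant prompt
    now: Math.floor(Date.now() / 1000), // new: timestamp exposed to date-aware templates
    chat_template_kwargs: { custom_flag: 'true' }, // new: extra string kwargs (key name is hypothetical)
  },
)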
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@fugood/llama.node",
    "access": "public",
-   "version": "1.1.2",
+   "version": "1.1.4",
    "description": "An another Node binding of llama.cpp",
    "main": "lib/index.js",
    "scripts": {
@@ -71,19 +71,19 @@
    "CMakeLists.txt"
  ],
  "optionalDependencies": {
-   "@fugood/node-llama-linux-x64": "1.1.2",
-   "@fugood/node-llama-linux-x64-vulkan": "1.1.2",
-   "@fugood/node-llama-linux-x64-cuda": "1.1.2",
-   "@fugood/node-llama-linux-arm64": "1.1.2",
-   "@fugood/node-llama-linux-arm64-vulkan": "1.1.2",
-   "@fugood/node-llama-linux-arm64-cuda": "1.1.2",
-   "@fugood/node-llama-win32-x64": "1.1.2",
-   "@fugood/node-llama-win32-x64-vulkan": "1.1.2",
-   "@fugood/node-llama-win32-x64-cuda": "1.1.2",
-   "@fugood/node-llama-win32-arm64": "1.1.2",
-   "@fugood/node-llama-win32-arm64-vulkan": "1.1.2",
-   "@fugood/node-llama-darwin-x64": "1.1.2",
-   "@fugood/node-llama-darwin-arm64": "1.1.2"
+   "@fugood/node-llama-linux-x64": "1.1.4",
+   "@fugood/node-llama-linux-x64-vulkan": "1.1.4",
+   "@fugood/node-llama-linux-x64-cuda": "1.1.4",
+   "@fugood/node-llama-linux-arm64": "1.1.4",
+   "@fugood/node-llama-linux-arm64-vulkan": "1.1.4",
+   "@fugood/node-llama-linux-arm64-cuda": "1.1.4",
+   "@fugood/node-llama-win32-x64": "1.1.4",
+   "@fugood/node-llama-win32-x64-vulkan": "1.1.4",
+   "@fugood/node-llama-win32-x64-cuda": "1.1.4",
+   "@fugood/node-llama-win32-arm64": "1.1.4",
+   "@fugood/node-llama-win32-arm64-vulkan": "1.1.4",
+   "@fugood/node-llama-darwin-x64": "1.1.4",
+   "@fugood/node-llama-darwin-arm64": "1.1.4"
  },
  "devDependencies": {
    "@babel/preset-env": "^7.24.4",
@@ -64,6 +64,7 @@ void LlamaCompletionWorker::Execute() {
  size_t n_input = 0;
  const auto model = _sess->model();
  auto vocab = llama_model_get_vocab(model);
+ const bool is_enc_dec = llama_model_has_encoder(model);

  const bool add_bos = llama_vocab_get_add_bos(vocab);
  auto ctx = _sess->context();
@@ -110,7 +111,7 @@ void LlamaCompletionWorker::Execute() {
  } else {
    // Text-only path
    std::vector<llama_token> prompt_tokens =
-       ::common_tokenize(ctx, _params.prompt, add_bos, true);
+       ::common_tokenize(ctx, _params.prompt, add_bos || is_enc_dec, true);
    n_input = prompt_tokens.size();

    if (_sess->tokens_ptr()->size() > 0) {
@@ -126,10 +127,48 @@ void LlamaCompletionWorker::Execute() {
  }

  const int max_len = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
- _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
-
  auto embd = _sess->tokens_ptr();
- for (int i = 0; (i < max_len || _stop) && !_params.vocab_only; i++) {
+ embd->reserve(embd->size() + max_len);
+
+ if (is_enc_dec) {
+   if (n_input > 0) {
+     // Decode tokens in batches using n_batch as chunk size
+     int n_past_batch = n_cur;
+     int n_remaining = n_input;
+
+     while (n_remaining > 0) {
+       int n_eval = n_remaining;
+       if (n_eval > _params.n_batch) {
+         n_eval = _params.n_batch;
+       }
+
+       int ret = llama_encode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+       if (ret < 0) {
+         SetError("Failed to encode token batch, code: " + std::to_string(ret) +
+                  ", n_eval: " + std::to_string(n_eval) +
+                  ", n_past_batch: " + std::to_string(n_past_batch));
+         _sess->get_mutex().unlock();
+         return;
+       }
+
+       n_past_batch += n_eval;
+       n_remaining -= n_eval;
+       n_cur += n_eval;
+     }
+   }
+   _result.tokens_evaluated += n_input;
+
+   llama_token decode_bos = llama_model_decoder_start_token(model);
+   if (decode_bos == LLAMA_TOKEN_NULL) {
+     decode_bos = llama_vocab_bos(vocab);
+   }
+
+   embd->emplace_back(decode_bos);
+   common_sampler_accept(sampling.get(), decode_bos, false);
+   n_input = 1;
+ }
+
+ for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
    // check if we need to remove some tokens
    if (embd->size() >= _params.n_ctx) {
      if (!_params.ctx_shift) {
@@ -166,13 +205,14 @@ void LlamaCompletionWorker::Execute() {
      if (n_eval > _params.n_batch) {
        n_eval = _params.n_batch;
      }
-
+
      int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
      if (ret < 0) {
        SetError("Failed to decode token batch, code: " + std::to_string(ret) +
                 ", n_eval: " + std::to_string(n_eval) +
                 ", n_past_batch: " + std::to_string(n_past_batch));
-       break;
+       _sess->get_mutex().unlock();
+       return;
      }

      n_past_batch += n_eval;
@@ -256,6 +296,7 @@ void LlamaCompletionWorker::OnOK() {
                                      _result.tokens_predicted));
  result.Set("truncated", Napi::Boolean::New(env, _result.truncated));
  result.Set("context_full", Napi::Boolean::New(env, _result.context_full));
+ result.Set("interrupted", Napi::Boolean::New(env, _interrupted));
  result.Set("text", Napi::String::New(env, _result.text.c_str()));
  result.Set("stopped_eos", Napi::Boolean::New(env, _result.stopped_eos));
  result.Set("stopped_words", Napi::Boolean::New(env, _result.stopped_words));
@@ -267,7 +308,7 @@ void LlamaCompletionWorker::OnOK() {
  Napi::Array tool_calls = Napi::Array::New(Napi::AsyncWorker::Env());
  std::string reasoning_content = "";
  std::string content;
- if (!_stop) {
+ if (!_interrupted) {
    try {
      common_chat_syntax chat_syntax;
      chat_syntax.format = static_cast<common_chat_format>(_chat_format);
@@ -34,7 +34,7 @@ public:

  void OnComplete(std::function<void()> cb) { _onComplete = cb; }

- void SetStop() { _stop = true; }
+ void SetStop() { _interrupted = true; }

protected:
  void Execute() override;
@@ -52,7 +52,7 @@ private:
  std::vector<llama_token> _guide_tokens;
  std::function<void()> _onComplete;
  bool _has_callback = false;
- bool _stop = false;
+ bool _interrupted = false;
  Napi::ThreadSafeFunction _tsfn;
  bool _next_token_uses_guide_token = true;
  bool _has_vocoder;
@@ -248,6 +248,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
      get_option<std::string>(options, "cache_type_v", "f16").c_str());
  params.ctx_shift = get_option<bool>(options, "ctx_shift", true);
  params.kv_unified = get_option<bool>(options, "kv_unified", false);
+ params.swa_full = get_option<bool>(options, "swa_full", false);

  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
@@ -504,7 +505,10 @@ common_chat_params getFormattedChatWithJinja(
    const std::string &chat_template, const std::string &json_schema,
    const std::string &tools, const bool &parallel_tool_calls,
    const std::string &tool_choice,
-   const bool &enable_thinking
+   const bool &enable_thinking,
+   const bool &add_generation_prompt,
+   const std::string &now_str,
+   const std::map<std::string, std::string> &chat_template_kwargs
) {
  common_chat_templates_inputs inputs;
  inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
@@ -520,6 +524,21 @@ common_chat_params getFormattedChatWithJinja(
    inputs.json_schema = json::parse(json_schema);
  }
  inputs.enable_thinking = enable_thinking;
+ inputs.add_generation_prompt = add_generation_prompt;
+
+ // Handle now parameter - parse timestamp or use current time
+ if (!now_str.empty()) {
+   try {
+     // Try to parse as timestamp (seconds since epoch)
+     auto timestamp = std::stoll(now_str);
+     inputs.now = std::chrono::system_clock::from_time_t(timestamp);
+   } catch (...) {
+     // If parsing fails, use current time
+     inputs.now = std::chrono::system_clock::now();
+   }
+ }
+
+ inputs.chat_template_kwargs = chat_template_kwargs;

  // If chat_template is provided, create new one and use it (probably slow)
  if (!chat_template.empty()) {
@@ -594,12 +613,26 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
      get_option<bool>(params, "parallel_tool_calls", false);
  auto tool_choice = get_option<std::string>(params, "tool_choice", "");
  auto enable_thinking = get_option<bool>(params, "enable_thinking", false);
+ auto add_generation_prompt = get_option<bool>(params, "add_generation_prompt", true);
+ auto now_str = get_option<std::string>(params, "now", "");
+
+ std::map<std::string, std::string> chat_template_kwargs;
+ if (params.Has("chat_template_kwargs") && params.Get("chat_template_kwargs").IsObject()) {
+   auto kwargs_obj = params.Get("chat_template_kwargs").As<Napi::Object>();
+   auto props = kwargs_obj.GetPropertyNames();
+   for (uint32_t i = 0; i < props.Length(); i++) {
+     auto key = props.Get(i).ToString().Utf8Value();
+     auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+     chat_template_kwargs[key] = val;
+   }
+ }

  common_chat_params chatParams;
  try {
    chatParams = getFormattedChatWithJinja(
        _sess, _templates, messages, chat_template, json_schema_str, tools_str,
-       parallel_tool_calls, tool_choice, enable_thinking);
+       parallel_tool_calls, tool_choice, enable_thinking,
+       add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const std::exception &e) {
    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
    return env.Undefined();
@@ -808,13 +841,27 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto tool_choice =
      get_option<std::string>(options, "tool_choice", "none");
  auto enable_thinking = get_option<bool>(options, "enable_thinking", true);
+ auto add_generation_prompt = get_option<bool>(options, "add_generation_prompt", true);
+ auto now_str = get_option<std::string>(options, "now", "");
+
+ std::map<std::string, std::string> chat_template_kwargs;
+ if (options.Has("chat_template_kwargs") && options.Get("chat_template_kwargs").IsObject()) {
+   auto kwargs_obj = options.Get("chat_template_kwargs").As<Napi::Object>();
+   auto props = kwargs_obj.GetPropertyNames();
+   for (uint32_t i = 0; i < props.Length(); i++) {
+     auto key = props.Get(i).ToString().Utf8Value();
+     auto val = kwargs_obj.Get(key).ToString().Utf8Value();
+     chat_template_kwargs[key] = val;
+   }
+ }

  common_chat_params chatParams;

  try {
    chatParams = getFormattedChatWithJinja(
        _sess, _templates, json_stringify(messages), chat_template,
-       json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking);
+       json_schema_str, tools_str, parallel_tool_calls, tool_choice, enable_thinking,
+       add_generation_prompt, now_str, chat_template_kwargs);
  } catch (const std::exception &e) {
    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
    return env.Undefined();
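On the native side, LlamaContext::Completion now reads the same three templating options and hands them to getFormattedChatWithJinja, converting chat_template_kwargs to a string-to-string map and parsing now with std::stoll (non-numeric values fall back to the current time). A hedged sketch of a chat completion that exercises this path; the message content and the kwargs key are illustrative, only the option and result names come from the diff:

const res = await context.completion({
  messages: [{ role: 'user', content: 'Summarize the changes in this release.' }],
  enable_thinking: true,
  add_generation_prompt: true,
  now: '1735689600',                             // forwarded as a seconds-since-epoch string
  chat_template_kwargs: { custom_flag: 'true' }, // values are read back as strings per key (key is hypothetical)
  n_predict: 128,
})
console.log(res.text, res.interrupted)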
package/src/tts_utils.h CHANGED
@@ -68,7 +68,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
  static const char *OUTETTS_V1_GRAMMAR = R"(
  root ::= NL? wordAudioBlock+ audioEnd NL eos?
  wordAudioBlock ::= WORD codeBlock NL
- codeBlock ::= TIME CODE{1,144}
+ codeBlock ::= TIME CODE*
  eos ::= "<|im_end|>"
  codeStart ::= "<|code_start|>"
  codeEnd ::= "<|code_end|>"
@@ -85,7 +85,7 @@ static const char *OUTETTS_V2_GRAMMAR = R"(
  root ::= NL? content+ audioEnd NL eos?
  content ::= wordAudioBlock | emotionBlock
  wordAudioBlock ::= WORD punch* codeBlock space NL
- codeBlock ::= TIME CODE{1,144}
+ codeBlock ::= TIME CODE*
  emotionBlock ::= emotionStart TEXT emotionEnd space NL
  TEXT ::= [A-Za-z0-9 .,?!]+
  eos ::= "<|im_end|>"
@@ -94,7 +94,7 @@ emotionEnd ::= "<|emotion_end|>"
  audioEnd ::= "<|audio_end|>"
  space ::= "<|space|>"
  WORD ::= [A-Za-z]+
- NL ::= "\n"
+ NL ::= [\n]
  TIME ::= "<|t_" DECIMAL "|>"
  CODE ::= "<|" DIGITS "|>"
  DIGITS ::= [0-9]+