@fugood/llama.node 1.1.3 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -114,6 +114,9 @@ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
  set(LLAMA_CURL OFF CACHE BOOL "Build curl")
 
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
+
+ add_definitions(-DGGML_MAX_NAME=80)
+
  add_subdirectory("src/llama.cpp")
  add_subdirectory("src/llama.cpp/tools/mtmd")
 
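The new add_definitions line passes -DGGML_MAX_NAME=80 to every translation unit in the build, raising ggml's compile-time cap on tensor-name length. A minimal sketch of what the flag controls, assuming the upstream ggml.h default of 64; the struct below is illustrative only and not part of the package:

// GGML_MAX_NAME sizes the fixed name buffer carried by each ggml tensor.
// Defining the macro on the compiler command line overrides the header's
// fallback default, so longer tensor names fit without truncation.
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64   // assumed upstream default when the flag is absent
#endif

struct demo_named_tensor {
    char name[GGML_MAX_NAME];  // 80 bytes when built with -DGGML_MAX_NAME=80
};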
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.3",
+ "version": "1.1.4",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.3",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.3",
- "@fugood/node-llama-linux-x64-cuda": "1.1.3",
- "@fugood/node-llama-linux-arm64": "1.1.3",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.3",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.3",
- "@fugood/node-llama-win32-x64": "1.1.3",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.3",
- "@fugood/node-llama-win32-x64-cuda": "1.1.3",
- "@fugood/node-llama-win32-arm64": "1.1.3",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.3",
- "@fugood/node-llama-darwin-x64": "1.1.3",
- "@fugood/node-llama-darwin-arm64": "1.1.3"
+ "@fugood/node-llama-linux-x64": "1.1.4",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.4",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.4",
+ "@fugood/node-llama-linux-arm64": "1.1.4",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.4",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.4",
+ "@fugood/node-llama-win32-x64": "1.1.4",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.4",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.4",
+ "@fugood/node-llama-win32-arm64": "1.1.4",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.4",
+ "@fugood/node-llama-darwin-x64": "1.1.4",
+ "@fugood/node-llama-darwin-arm64": "1.1.4"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -64,6 +64,7 @@ void LlamaCompletionWorker::Execute() {
  size_t n_input = 0;
  const auto model = _sess->model();
  auto vocab = llama_model_get_vocab(model);
+ const bool is_enc_dec = llama_model_has_encoder(model);
 
  const bool add_bos = llama_vocab_get_add_bos(vocab);
  auto ctx = _sess->context();
@@ -110,7 +111,7 @@ void LlamaCompletionWorker::Execute() {
  } else {
  // Text-only path
  std::vector<llama_token> prompt_tokens =
- ::common_tokenize(ctx, _params.prompt, add_bos, true);
+ ::common_tokenize(ctx, _params.prompt, add_bos || is_enc_dec, true);
  n_input = prompt_tokens.size();
 
  if (_sess->tokens_ptr()->size() > 0) {
@@ -126,9 +127,47 @@ void LlamaCompletionWorker::Execute() {
  }
 
  const int max_len = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
- _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
-
  auto embd = _sess->tokens_ptr();
+ embd->reserve(embd->size() + max_len);
+
+ if (is_enc_dec) {
+ if (n_input > 0) {
+ // Decode tokens in batches using n_batch as chunk size
+ int n_past_batch = n_cur;
+ int n_remaining = n_input;
+
+ while (n_remaining > 0) {
+ int n_eval = n_remaining;
+ if (n_eval > _params.n_batch) {
+ n_eval = _params.n_batch;
+ }
+
+ int ret = llama_encode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+ if (ret < 0) {
+ SetError("Failed to encode token batch, code: " + std::to_string(ret) +
+ ", n_eval: " + std::to_string(n_eval) +
+ ", n_past_batch: " + std::to_string(n_past_batch));
+ _sess->get_mutex().unlock();
+ return;
+ }
+
+ n_past_batch += n_eval;
+ n_remaining -= n_eval;
+ n_cur += n_eval;
+ }
+ }
+ _result.tokens_evaluated += n_input;
+
+ llama_token decode_bos = llama_model_decoder_start_token(model);
+ if (decode_bos == LLAMA_TOKEN_NULL) {
+ decode_bos = llama_vocab_bos(vocab);
+ }
+
+ embd->emplace_back(decode_bos);
+ common_sampler_accept(sampling.get(), decode_bos, false);
+ n_input = 1;
+ }
+
  for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
  // check if we need to remove some tokens
  if (embd->size() >= _params.n_ctx) {
@@ -166,13 +205,14 @@ void LlamaCompletionWorker::Execute() {
  if (n_eval > _params.n_batch) {
  n_eval = _params.n_batch;
  }
-
+
  int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
  if (ret < 0) {
  SetError("Failed to decode token batch, code: " + std::to_string(ret) +
  ", n_eval: " + std::to_string(n_eval) +
  ", n_past_batch: " + std::to_string(n_past_batch));
- break;
+ _sess->get_mutex().unlock();
+ return;
  }
 
  n_past_batch += n_eval;
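
The worker changes above add an encoder-decoder path: when llama_model_has_encoder() reports true, the prompt is fed through llama_encode() in n_batch-sized chunks, and generation then starts from the model's decoder start token, falling back to BOS. A minimal sketch of that flow against the llama.cpp C API, with sampling and the worker's session state elided; encode_prompt is an illustrative helper, not part of the package:

#include <algorithm>
#include <cstddef>
#include <vector>
#include "llama.h"

// Feeds the prompt through the encoder and returns the token the decoder
// should start from, or LLAMA_TOKEN_NULL if the model has no encoder or
// encoding fails.
static llama_token encode_prompt(llama_context * ctx, const llama_model * model,
                                 std::vector<llama_token> & prompt, int n_batch) {
    if (!llama_model_has_encoder(model)) {
        return LLAMA_TOKEN_NULL;  // text-only models go straight to llama_decode
    }
    // Encode the prompt in n_batch-sized chunks, mirroring the worker's loop.
    for (size_t i = 0; i < prompt.size(); i += (size_t) n_batch) {
        const int n_eval = (int) std::min<size_t>((size_t) n_batch, prompt.size() - i);
        if (llama_encode(ctx, llama_batch_get_one(prompt.data() + i, n_eval)) < 0) {
            return LLAMA_TOKEN_NULL;  // caller reports the error, as the worker does
        }
    }
    // Generation resumes from the model's dedicated decoder start token,
    // falling back to BOS when the model does not define one.
    llama_token start = llama_model_decoder_start_token(model);
    if (start == LLAMA_TOKEN_NULL) {
        start = llama_vocab_bos(llama_model_get_vocab(model));
    }
    return start;
}

The early returns that replace the old break also account for the paired _sess->get_mutex().unlock() calls: on a failed encode or decode, Execute() now releases the session mutex itself before bailing out instead of falling through the rest of the loop.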