@fugood/llama.node 1.4.3 → 1.4.4

package/lib/binding.ts CHANGED
@@ -565,6 +565,14 @@ export interface LlamaContext {
    */
   cancelRequest(requestId: number): void
 
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
package/lib/index.js CHANGED
@@ -195,6 +195,15 @@ class LlamaContextWrapper {
   decodeAudioTokens(tokens) {
     return this.ctx.decodeAudioTokens(tokens);
   }
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData) {
+    this.ctx.clearCache(clearData);
+  }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
     var _a, _b;
package/lib/index.ts CHANGED
@@ -299,6 +299,16 @@ class LlamaContextWrapper {
   decodeAudioTokens(tokens: number[] | Int32Array): Promise<Float32Array> {
     return this.ctx.decodeAudioTokens(tokens)
   }
+
+  /**
+   * Clear the KV and recurrent caches.
+   * This is faster than recreating the context and useful for preventing
+   * cache contamination between chat sessions.
+   * @param clearData If true, also clears the cache data (default: false)
+   */
+  clearCache(clearData?: boolean): void {
+    this.ctx.clearCache(clearData)
+  }
 }
 
 export const loadModel = async (
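
A minimal sketch of how the new clearCache method might be used to reset state between unrelated chat sessions. loadModel is part of this package, while the completion call, its options, and the model path are assumptions for illustration rather than part of this diff:

    import { loadModel } from '@fugood/llama.node'

    const ctx = await loadModel({ model: './model.gguf' }) // hypothetical options

    // First chat session (completion API assumed from the existing wrapper)
    await ctx.completion({ messages: [{ role: 'user', content: 'Hello!' }] })

    // Drop KV / recurrent cache state without recreating the context;
    // pass true to also clear the cache data (default: false).
    ctx.clearCache()

    // A second, unrelated session now starts from a clean cache
    await ctx.completion({ messages: [{ role: 'user', content: 'New topic.' }] })
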
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.3",
+  "version": "1.4.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.3",
-    "@fugood/node-llama-darwin-x64": "1.4.3",
-    "@fugood/node-llama-linux-arm64": "1.4.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.3",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.3",
-    "@fugood/node-llama-linux-x64": "1.4.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.3",
-    "@fugood/node-llama-win32-arm64": "1.4.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.3",
-    "@fugood/node-llama-win32-x64": "1.4.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.3"
+    "@fugood/node-llama-darwin-arm64": "1.4.4",
+    "@fugood/node-llama-darwin-x64": "1.4.4",
+    "@fugood/node-llama-linux-arm64": "1.4.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-linux-x64": "1.4.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-arm64": "1.4.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
+    "@fugood/node-llama-win32-x64": "1.4.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -200,6 +200,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::CancelRequest>(
           "cancelRequest",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::ClearCache>(
+          "clearCache",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -1505,3 +1508,24 @@ Napi::Value LlamaContext::DecodeAudioTokens(const Napi::CallbackInfo &info) {
   worker->Queue();
   return worker->Promise();
 }
+
+// clearCache(clearData?: boolean): void
+void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return;
+  }
+  if (_rn_ctx->completion != nullptr && _rn_ctx->completion->is_predicting) {
+    Napi::TypeError::New(env, "Cannot clear cache while completion is in progress")
+        .ThrowAsJavaScriptException();
+    return;
+  }
+
+  bool clear_data = false;
+  if (info.Length() >= 1 && info[0].IsBoolean()) {
+    clear_data = info[0].ToBoolean().Value();
+  }
+
+  _rn_ctx->clearCache(clear_data);
+}
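
Because the native ClearCache above throws while a completion is still predicting (or after the context is disposed), a caller would let any in-flight completion settle before clearing. A hypothetical sketch of handling that error on the JS side:

    try {
      ctx.clearCache(true) // true also clears the cache data
    } catch (err) {
      // Thrown when a completion is still in progress or the context is
      // disposed, mirroring the TypeError raised by the native checks above.
      console.warn('clearCache rejected:', err)
    }
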
@@ -69,6 +69,9 @@ private:
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);
 
+  // Cache management
+  void ClearCache(const Napi::CallbackInfo &info);
+
   std::string _info;
   std::vector<std::string> _used_devices;
   Napi::Object _meta;
@@ -427,7 +427,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // model is required (except for server)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -708,6 +708,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         params.use_jinja = true;
     }
 
+    params.use_color = tty_can_use_colors();
+
     // load dynamic backends
     ggml_backend_load_all();
 
@@ -790,10 +792,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
-        {"-co", "--color"},
-        string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
-        [](common_params & params) {
-            params.use_color = true;
+        {"-co", "--color"}, "[on|off|auto]",
+        "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
+        "'auto' enables colors when output is to a terminal",
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.use_color = true;
+            } else if (is_falsey(value)) {
+                params.use_color = false;
+            } else if (is_autoy(value)) {
+                params.use_color = tty_can_use_colors();
+            } else {
+                throw std::invalid_argument(
+                    string_format("error: unknown value for --color: '%s'\n", value.c_str()));
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
@@ -1022,7 +1034,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
             } else {
                 throw std::runtime_error(
-                    string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --flash-attn: '%s'\n", value.c_str()));
             }
         }).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -2696,7 +2708,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
             } else {
                 throw std::invalid_argument(
-                    string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
             }
         }
     ).set_env("LLAMA_LOG_COLORS"));
@@ -786,11 +786,29 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
 #include <iostream>
 
 
+#ifdef _WIN32
+static std::wstring utf8_to_wstring(const std::string & str) {
+    if (str.empty()) {
+        return std::wstring();
+    }
+
+    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
+
+    if (size <= 0) {
+        return std::wstring();
+    }
+
+    std::wstring wstr(size, 0);
+    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
+
+    return wstr;
+}
+#endif
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    std::wstring wpath = converter.from_bytes(path);
+    std::wstring wpath = utf8_to_wstring(path);
 
     // if the path already exists, check whether it's a directory
     const DWORD attributes = GetFileAttributesW(wpath.c_str());
@@ -964,6 +982,32 @@ std::vector<common_file_info> fs_list(const std::string & path, bool include_dir
     return files;
 }
 
+//
+// TTY utils
+//
+
+bool tty_can_use_colors() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
 
 //
 // Model utils
@@ -656,6 +656,13 @@ struct common_file_info {
 };
 std::vector<common_file_info> fs_list(const std::string & path, bool include_directories);
 
+//
+// TTY utils
+//
+
+// Auto-detect if colors can be enabled based on terminal and environment
+bool tty_can_use_colors();
+
 //
 // Model utils
 //
@@ -1,3 +1,4 @@
+#include "common.h"
 #include "log.h"
 
 #include <chrono>
@@ -26,30 +27,6 @@ void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
-// Auto-detect if colors should be enabled based on terminal and environment
-static bool common_log_should_use_colors_auto() {
-    // Check NO_COLOR environment variable (https://no-color.org/)
-    if (const char * no_color = std::getenv("NO_COLOR")) {
-        if (no_color[0] != '\0') {
-            return false;
-        }
-    }
-
-    // Check TERM environment variable
-    if (const char * term = std::getenv("TERM")) {
-        if (std::strcmp(term, "dumb") == 0) {
-            return false;
-        }
-    }
-
-    // Check if stdout and stderr are connected to a terminal
-    // We check both because log messages can go to either
-    bool stdout_is_tty = isatty(fileno(stdout));
-    bool stderr_is_tty = isatty(fileno(stderr));
-
-    return stdout_is_tty || stderr_is_tty;
-}
-
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -391,7 +368,7 @@ struct common_log * common_log_main() {
     static std::once_flag init_flag;
     std::call_once(init_flag, [&]() {
         // Set default to auto-detect colors
-        log.set_colors(common_log_should_use_colors_auto());
+        log.set_colors(tty_can_use_colors());
     });
 
     return &log;
@@ -422,7 +399,7 @@ void common_log_set_file(struct common_log * log, const char * file) {
 
 void common_log_set_colors(struct common_log * log, log_colors colors) {
     if (colors == LOG_COLORS_AUTO) {
-        log->set_colors(common_log_should_use_colors_auto());
+        log->set_colors(tty_can_use_colors());
         return;
     }
 
@@ -253,6 +253,9 @@ option(GGML_HEXAGON "ggml: enable Hexagon backend"
 # toolchain for vulkan-shaders-gen
 set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
 
+option(GGML_ZENDNN "ggml: use ZenDNN" OFF)
+option(ZENDNN_ROOT "ggml: path to ZenDNN installation" "")
+
 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
 option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
@@ -314,6 +317,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-sycl.h
     include/ggml-vulkan.h
     include/ggml-webgpu.h
+    include/ggml-zendnn.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -1,6 +1,5 @@
 #pragma once
 
-#include "ggml.h"
 #include "ggml-backend.h"
 
 #ifdef __cplusplus
@@ -8,7 +7,7 @@ extern "C" {
 #endif
 
 #define RPC_PROTO_MAJOR_VERSION 3
-#define RPC_PROTO_MINOR_VERSION 5
+#define RPC_PROTO_MINOR_VERSION 6
 #define RPC_PROTO_PATCH_VERSION 0
 #define GGML_RPC_MAX_SERVERS 16
 
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend);
+
+// number of threads used for zendnn operations
+GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
@@ -2196,6 +2196,15 @@ extern "C" {
             int p2,
             int p3);
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int p0,
+            int p1,
+            int p2,
+            int p3);
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -2209,6 +2218,19 @@ extern "C" {
             int rp3
             );
 
+    // pad each dimension with values on the other side of the torus (looping around)
+    GGML_API struct ggml_tensor * ggml_pad_ext_circular(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int lp0,
+            int rp0,
+            int lp1,
+            int rp1,
+            int lp2,
+            int rp2,
+            int lp3,
+            int rp3);
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
@@ -440,6 +440,7 @@ ggml_add_backend(WebGPU)
 ggml_add_backend(zDNN)
 ggml_add_backend(OpenCL)
 ggml_add_backend(Hexagon)
+ggml_add_backend(ZenDNN)
 
 foreach (target ggml-base ggml)
     target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>)
@@ -505,7 +505,6 @@ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     constexpr int blocklen = 8;
 
     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);
 
     UNUSED(nb);
@@ -645,7 +644,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
     constexpr int blocklen = 8;
 
     assert(n % qk == 0);
-    assert(nr % 4 == 0);
     assert(nc % ncols_interleaved == 0);
 
     UNUSED(nb);