@fugood/llama.node 1.4.8 → 1.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +43 -0
- package/lib/parallel.js +26 -0
- package/lib/parallel.ts +33 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +12 -14
- package/src/LlamaCompletionWorker.cpp +3 -1
- package/src/LlamaCompletionWorker.h +2 -0
- package/src/LlamaContext.cpp +16 -1
- package/src/LlamaContext.h +3 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -4
- package/src/llama.cpp/common/arg.cpp +159 -42
- package/src/llama.cpp/common/arg.h +10 -1
- package/src/llama.cpp/common/common.cpp +1 -1
- package/src/llama.cpp/common/common.h +6 -2
- package/src/llama.cpp/common/preset.cpp +197 -5
- package/src/llama.cpp/common/preset.h +45 -3
- package/src/llama.cpp/common/sampling.cpp +51 -37
- package/src/llama.cpp/common/sampling.h +6 -3
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
- package/src/llama.cpp/src/llama-arch.cpp +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +123 -28
- package/src/llama.cpp/src/llama-mmap.h +5 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
- package/src/llama.cpp/src/llama-model.cpp +7 -5
- package/src/llama.cpp/src/llama-sampling.cpp +16 -0
- package/src/llama.cpp/src/llama.cpp +22 -32
package/lib/binding.ts
CHANGED
```diff
@@ -120,6 +120,8 @@ export type LlamaCompletionOptions = {
   tool_choice?: string
   enable_thinking?: boolean
   thinking_forced_open?: boolean
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser?: string
   prompt?: string
   /**
    * Text to prefill the response with.
@@ -415,6 +417,8 @@ export type JinjaFormattedChatResult = {
   thinking_forced_open: boolean
   preserved_tokens: string[]
   additional_stops: string[]
+  /** Serialized PEG parser for chat output parsing (required for PEG format types) */
+  chat_parser: string
 }
 
 export type Tool = {
@@ -435,6 +439,24 @@ export type ToolCall = {
   id?: string
 }
 
+export type ParallelRequestStatus = {
+  request_id: number
+  type: 'completion' | 'embedding' | 'rerank'
+  state: 'queued' | 'processing_prompt' | 'generating' | 'done'
+  prompt_length: number
+  tokens_generated: number
+  prompt_ms: number
+  generation_ms: number
+  tokens_per_second: number
+}
+
+export type ParallelStatus = {
+  n_parallel: number
+  active_slots: number
+  queued_requests: number
+  requests: ParallelRequestStatus[]
+}
+
 export interface LlamaContext {
   new (
     options: LlamaModelOptions,
@@ -612,6 +634,27 @@ export interface LlamaContext {
    */
   cancelRequest(requestId: number): void
 
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getParallelStatus(): ParallelStatus
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Subscriber ID that can be used to unsubscribe
+   */
+  subscribeParallelStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { subscriberId: number }
+
+  /**
+   * Unsubscribe from parallel processing status changes
+   * @param subscriberId Subscriber ID returned from subscribeParallelStatus
+   */
+  unsubscribeParallelStatus(subscriberId: number): void
+
   /**
    * Clear the KV and recurrent caches.
    * This is faster than recreating the context and useful for preventing
```

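Taken together, the binding.ts additions expose a one-time snapshot call plus a subscribe/unsubscribe pair keyed by `subscriberId`. Below is a minimal TypeScript sketch of how a caller might drive them; the relative import path and the way the `LlamaContext` instance is obtained are assumptions for illustration and are not part of this diff.

```ts
import type { LlamaContext, ParallelStatus } from './binding'

// Log a one-time snapshot, then watch for changes until the caller invokes
// the returned stop() function. The LlamaContext instance is assumed to be
// created elsewhere (its construction is not shown in this diff).
export function watchParallelStatus(ctx: LlamaContext): () => void {
  const snapshot: ParallelStatus = ctx.getParallelStatus()
  console.log(
    `slots ${snapshot.active_slots}/${snapshot.n_parallel}, queued ${snapshot.queued_requests}`,
  )

  const { subscriberId } = ctx.subscribeParallelStatus((status) => {
    for (const req of status.requests) {
      console.log(
        `#${req.request_id} [${req.type}] ${req.state} ` +
          `${req.tokens_generated} tok, ${req.tokens_per_second.toFixed(1)} tok/s`,
      )
    }
  })

  // Stop receiving updates.
  return () => ctx.unsubscribeParallelStatus(subscriberId)
}
```
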
package/lib/parallel.js
CHANGED
```diff
@@ -212,5 +212,31 @@ class LlamaParallelAPI {
   isEnabled() {
     return this.enabled;
   }
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus() {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    return this.context.getParallelStatus();
+  }
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(callback) {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.');
+    }
+    const { subscriberId } = this.context.subscribeParallelStatus(callback);
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId);
+      },
+    };
+  }
 }
 exports.LlamaParallelAPI = LlamaParallelAPI;
```

package/lib/parallel.ts
CHANGED
```diff
@@ -4,6 +4,7 @@ import type {
   LlamaCompletionOptions,
   LlamaCompletionToken,
   RerankParams,
+  ParallelStatus,
 } from './binding'
 import { formatMediaChat } from './utils'
 
@@ -278,4 +279,36 @@ export class LlamaParallelAPI {
   isEnabled(): boolean {
     return this.enabled
   }
+
+  /**
+   * Get current parallel processing status (one-time snapshot)
+   * @returns Current parallel status
+   */
+  getStatus(): ParallelStatus {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+    return this.context.getParallelStatus()
+  }
+
+  /**
+   * Subscribe to parallel processing status changes
+   * @param callback Called whenever parallel status changes
+   * @returns Object with remove() method to unsubscribe
+   */
+  subscribeToStatus(
+    callback: (status: ParallelStatus) => void,
+  ): { remove: () => void } {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const { subscriberId } = this.context.subscribeParallelStatus(callback)
+
+    return {
+      remove: () => {
+        this.context.unsubscribeParallelStatus(subscriberId)
+      },
+    }
+  }
 }
```

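The parallel.ts wrapper (and its compiled parallel.js counterpart) narrows the same API to `getStatus()` and `subscribeToStatus()`, which returns a handle with `remove()`. A short illustrative sketch follows; the import paths and the way the `LlamaParallelAPI` instance is constructed and enabled are assumptions outside this diff.

```ts
import type { ParallelStatus } from './binding'
import { LlamaParallelAPI } from './parallel'

// Both helpers throw if enable() has not been called on the API yet,
// matching the guard added in this release.
export function logQueueDepth(parallel: LlamaParallelAPI): void {
  const status: ParallelStatus = parallel.getStatus()
  console.log(`queued requests: ${status.queued_requests}`)
}

export function watchSlots(parallel: LlamaParallelAPI): () => void {
  const sub = parallel.subscribeToStatus((s) => {
    console.log(`active slots: ${s.active_slots}/${s.n_parallel}`)
  })
  // remove() unsubscribes via the binding's unsubscribeParallelStatus.
  return () => sub.remove()
}
```
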
package/package.json
CHANGED
```diff
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.8",
+  "version": "1.4.10",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.8",
-    "@fugood/node-llama-darwin-x64": "1.4.8",
-    "@fugood/node-llama-linux-arm64": "1.4.8",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.8",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.8",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.8",
-    "@fugood/node-llama-linux-x64": "1.4.8",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.8",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.8",
-    "@fugood/node-llama-win32-arm64": "1.4.8",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.8",
-    "@fugood/node-llama-win32-x64": "1.4.8",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.8",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.8"
+    "@fugood/node-llama-darwin-arm64": "1.4.10",
+    "@fugood/node-llama-darwin-x64": "1.4.10",
+    "@fugood/node-llama-linux-arm64": "1.4.10",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.10",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-linux-x64": "1.4.10",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.10",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-arm64": "1.4.10",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.10",
+    "@fugood/node-llama-win32-x64": "1.4.10",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.10",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.10"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
```

package/scripts/llama.cpp.patch
CHANGED
```diff
@@ -1,25 +1,23 @@
 diff --git a/src/llama.cpp/common/CMakeLists.txt b/src/llama.cpp/common/CMakeLists.txt
-index
+index f7b99159e..fa37fed19 100644
 --- a/src/llama.cpp/common/CMakeLists.txt
 +++ b/src/llama.cpp/common/CMakeLists.txt
-@@ -
+@@ -154,8 +154,14 @@ if (LLAMA_LLGUIDANCE)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
+-target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
 +# Add Windows socket libraries unconditionally on Windows
 +if (WIN32)
 +    set(LLAMA_COMMON_WIN_LIBS ws2_32)
 +else()
 +    set(LLAMA_COMMON_WIN_LIBS "")
 +endif()
-+
-target_include_directories(${TARGET} PUBLIC . ../vendor)
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
--target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
-+target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
++target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
 #
+# copy the license files
 diff --git a/src/llama.cpp/common/chat-peg-parser.cpp b/src/llama.cpp/common/chat-peg-parser.cpp
 index 1bcba9cd8..b7cd68734 100644
 --- a/src/llama.cpp/common/chat-peg-parser.cpp
@@ -98,7 +96,7 @@ index 6085510a4..263076ce2 100644
 struct common_chat_tool_call {
     std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index d4e8c7405..af3dec813 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
 @@ -1343,6 +1343,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
@@ -110,7 +108,7 @@ index 5a8cf5248..8010a990e 100644
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 334372073..e912b593a 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
 @@ -307,6 +307,7 @@ struct lr_opt {
@@ -122,7 +120,7 @@ index d70744840..dea8c4546 100644
     int32_t n_ctx = 0; // context size, 0 == context the model was trained with
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
 diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
-index
+index 28fb7612e..63f7e1ca1 100644
 --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
 @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -135,10 +133,10 @@ index fc31089f3..aa9befe4c 100644
     check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
     if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
 diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-index
+index 6a00abacc..9e12459b6 100644
 --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
 +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
-@@ -
+@@ -3226,11 +3226,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
     GGML_UNUSED(dev);
 }
 
@@ -168,7 +166,7 @@ index 514f086f6..792abaa58 100644
     GGML_UNUSED(dev);
 }
 
-@@ -
+@@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
     }
 }
 
@@ -187,7 +185,7 @@ index 514f086f6..792abaa58 100644
 
     GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
 
-@@ -
+@@ -3429,6 +3451,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
     } catch (std::exception const &exc) {
         GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
         devices[i].context = nullptr;
```

package/src/LlamaCompletionWorker.cpp
CHANGED
```diff
@@ -37,6 +37,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     int32_t chat_format,
     bool thinking_forced_open,
     std::string reasoning_format,
+    const std::string &chat_parser,
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens,
     bool has_vocoder,
@@ -46,6 +47,7 @@ LlamaCompletionWorker::LlamaCompletionWorker(
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
+      _chat_parser(chat_parser),
       _media_paths(media_paths), _guide_tokens(guide_tokens),
       _prefill_text(prefill_text),
       _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
@@ -121,7 +123,7 @@ void LlamaCompletionWorker::Execute() {
   }
 
   // Begin completion with chat format and reasoning settings
-  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open);
+  completion->beginCompletion(_chat_format, common_reasoning_format_from_name(_reasoning_format), _thinking_forced_open, _chat_parser);
 
   // Main completion loop
   int token_count = 0;
```

package/src/LlamaCompletionWorker.h
CHANGED
```diff
@@ -23,6 +23,7 @@ public:
       int32_t chat_format,
       bool thinking_forced_open,
       std::string reasoning_format,
+      const std::string &chat_parser = "",
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {},
       bool has_vocoder = false,
@@ -50,6 +51,7 @@ private:
   int32_t _chat_format;
   bool _thinking_forced_open;
   std::string _reasoning_format;
+  std::string _chat_parser;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
   std::string _prefill_text;
```

package/src/LlamaContext.cpp
CHANGED
```diff
@@ -201,6 +201,15 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           InstanceMethod<&LlamaContext::CancelRequest>(
               "cancelRequest",
               static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::GetParallelStatus>(
+              "getParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::SubscribeParallelStatus>(
+              "subscribeParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
+          InstanceMethod<&LlamaContext::UnsubscribeParallelStatus>(
+              "unsubscribeParallelStatus",
+              static_cast<napi_property_attributes>(napi_enumerable)),
           InstanceMethod<&LlamaContext::ClearCache>(
               "clearCache",
               static_cast<napi_property_attributes>(napi_enumerable)),
@@ -250,6 +259,8 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   }
 
   common_params params;
+  params.fit_params = false;
+
   params.model.path = get_option<std::string>(options, "model", "");
   if (params.model.path.empty()) {
     Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
@@ -760,6 +771,8 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
           i, Napi::String::New(env, chatParams.additional_stops[i].c_str()));
     }
     result.Set("additional_stops", additional_stops);
+    // chat_parser: string (serialized PEG parser for chat output parsing)
+    result.Set("chat_parser", chatParams.parser);
 
     return result;
   } else {
@@ -821,6 +834,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   int32_t chat_format = get_option<int32_t>(options, "chat_format", 0);
   bool thinking_forced_open = get_option<bool>(options, "thinking_forced_open", false);
   std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
+  std::string chat_parser = get_option<std::string>(options, "chat_parser", "");
 
   common_params params = _rn_ctx->params;
   auto grammar_from_params = get_option<std::string>(options, "grammar", "");
@@ -959,6 +973,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 
     chat_format = chatParams.format;
     thinking_forced_open = chatParams.thinking_forced_open;
+    chat_parser = chatParams.parser;
 
     for (const auto &token : chatParams.preserved_tokens) {
       auto ids =
@@ -1074,7 +1089,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
 
   auto *worker =
       new LlamaCompletionWorker(info, _rn_ctx, callback, params, stop_words,
-                                chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
+                                chat_format, thinking_forced_open, reasoning_format, chat_parser, media_paths, guide_tokens,
                                 _rn_ctx->has_vocoder, _rn_ctx->tts_wrapper ? _rn_ctx->tts_wrapper->type : rnllama::UNKNOWN, prefill_text);
   worker->Queue();
   _wip = worker;
```

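These native changes thread an optional `chat_parser` option from JS through LlamaCompletionWorker into `beginCompletion`, and surface `chatParams.parser` as `chat_parser` on formatted-chat results. The sketch below shows one way a caller might forward it; the field names match the binding.ts types above, while the prompt handling and helper name are assumptions for illustration.

```ts
import type { JinjaFormattedChatResult, LlamaCompletionOptions } from './binding'

// Forward the serialized PEG parser from a formatted-chat result into the
// completion options. The rendered prompt is passed in separately here,
// since the prompt field of JinjaFormattedChatResult is not part of this diff.
export function toCompletionOptions(
  formatted: JinjaFormattedChatResult,
  prompt: string,
): LlamaCompletionOptions {
  return {
    prompt,
    thinking_forced_open: formatted.thinking_forced_open,
    // Required for PEG format types; if omitted, the native side falls back
    // to an empty string (the "chat_parser" default in LlamaContext.cpp).
    chat_parser: formatted.chat_parser,
  }
}
```
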
package/src/LlamaContext.h
CHANGED
```diff
@@ -68,6 +68,9 @@ private:
   Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
   Napi::Value QueueRerank(const Napi::CallbackInfo &info);
   void CancelRequest(const Napi::CallbackInfo &info);
+  Napi::Value GetParallelStatus(const Napi::CallbackInfo &info);
+  Napi::Value SubscribeParallelStatus(const Napi::CallbackInfo &info);
+  void UnsubscribeParallelStatus(const Napi::CallbackInfo &info);
 
   // Cache management
   void ClearCache(const Napi::CallbackInfo &info);
```

package/src/llama.cpp/common/CMakeLists.txt
CHANGED
```diff
@@ -85,6 +85,9 @@ add_library(${TARGET} STATIC
     unicode.h
     )
 
+target_include_directories(${TARGET} PUBLIC . ../vendor)
+target_compile_features (${TARGET} PUBLIC cxx_std_17)
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -158,10 +161,7 @@ else()
     set(LLAMA_COMMON_WIN_LIBS "")
 endif()
 
-
-target_compile_features (${TARGET} PUBLIC cxx_std_17)
-target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
-
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} ${LLAMA_COMMON_WIN_LIBS} PUBLIC llama Threads::Threads)
 
 #
 # copy the license files
```