@fugood/llama.node 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/src/LlamaCompletionWorker.cpp +45 -5
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +60 -7
- package/src/llama.cpp/common/chat.cpp +6 -6
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +14 -5
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/include/llama.h +8 -4
- package/src/llama.cpp/src/llama-arch.cpp +40 -0
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +20 -1
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +11 -2
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -139
- package/src/llama.cpp/src/llama-graph.h +31 -32
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +400 -21
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-vocab.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/CMakeLists.txt
CHANGED
@@ -114,6 +114,9 @@ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
 set(LLAMA_CURL OFF CACHE BOOL "Build curl")
 
 set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
+
+add_definitions(-DGGML_MAX_NAME=80)
+
 add_subdirectory("src/llama.cpp")
 add_subdirectory("src/llama.cpp/tools/mtmd")
 
package/lib/binding.ts
CHANGED
@@ -65,6 +65,14 @@ export type LlamaModelOptions = {
   lora?: string
   lora_scaled?: number
   lora_list?: { path: string; scaled: number }[]
+  /**
+   * RoPE base frequency, use 0 to use model default (recommended)
+   */
+  rope_freq_base?: number
+  /**
+   * RoPE frequency scaling factor, use 0 to use model default (recommended)
+   */
+  rope_freq_scale?: number
 }
 
 export type CompletionResponseFormat = {
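These two options are forwarded to the native context (see the LlamaContext.cpp change below). A minimal usage sketch, assuming the package's loadModel entry point and a placeholder model path (both assumptions, not part of this diff):

    import { loadModel } from '@fugood/llama.node' // entry point assumed

    // Both options default to 0, which keeps the model's own RoPE configuration.
    const context = await loadModel({
      model: './model.gguf',  // placeholder model path
      rope_freq_base: 10000,  // override RoPE base frequency
      rope_freq_scale: 0.5,   // override RoPE frequency scaling factor
    })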
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.3",
+  "version": "1.1.5",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.3",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.3",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.3",
-    "@fugood/node-llama-linux-arm64": "1.1.3",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.3",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.3",
-    "@fugood/node-llama-win32-x64": "1.1.3",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.3",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.3",
-    "@fugood/node-llama-win32-arm64": "1.1.3",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.3",
-    "@fugood/node-llama-darwin-x64": "1.1.3",
-    "@fugood/node-llama-darwin-arm64": "1.1.3"
+    "@fugood/node-llama-linux-x64": "1.1.5",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.5",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.5",
+    "@fugood/node-llama-linux-arm64": "1.1.5",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.5",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.5",
+    "@fugood/node-llama-win32-x64": "1.1.5",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.5",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.5",
+    "@fugood/node-llama-win32-arm64": "1.1.5",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.5",
+    "@fugood/node-llama-darwin-x64": "1.1.5",
+    "@fugood/node-llama-darwin-arm64": "1.1.5"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -64,6 +64,7 @@ void LlamaCompletionWorker::Execute() {
   size_t n_input = 0;
   const auto model = _sess->model();
   auto vocab = llama_model_get_vocab(model);
+  const bool is_enc_dec = llama_model_has_encoder(model);
 
   const bool add_bos = llama_vocab_get_add_bos(vocab);
   auto ctx = _sess->context();
@@ -110,7 +111,7 @@ void LlamaCompletionWorker::Execute() {
   } else {
     // Text-only path
     std::vector<llama_token> prompt_tokens =
-        ::common_tokenize(ctx, _params.prompt, add_bos, true);
+        ::common_tokenize(ctx, _params.prompt, add_bos || is_enc_dec, true);
     n_input = prompt_tokens.size();
 
     if (_sess->tokens_ptr()->size() > 0) {
@@ -126,9 +127,47 @@ void LlamaCompletionWorker::Execute() {
   }
 
   const int max_len = _params.n_predict < 0 ? std::numeric_limits<int>::max() : _params.n_predict;
-  _sess->tokens_ptr()->reserve(_sess->tokens_ptr()->size() + max_len);
-
   auto embd = _sess->tokens_ptr();
+  embd->reserve(embd->size() + max_len);
+
+  if (is_enc_dec) {
+    if (n_input > 0) {
+      // Decode tokens in batches using n_batch as chunk size
+      int n_past_batch = n_cur;
+      int n_remaining = n_input;
+
+      while (n_remaining > 0) {
+        int n_eval = n_remaining;
+        if (n_eval > _params.n_batch) {
+          n_eval = _params.n_batch;
+        }
+
+        int ret = llama_encode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
+        if (ret < 0) {
+          SetError("Failed to encode token batch, code: " + std::to_string(ret) +
+                   ", n_eval: " + std::to_string(n_eval) +
+                   ", n_past_batch: " + std::to_string(n_past_batch));
+          _sess->get_mutex().unlock();
+          return;
+        }
+
+        n_past_batch += n_eval;
+        n_remaining -= n_eval;
+        n_cur += n_eval;
+      }
+    }
+    _result.tokens_evaluated += n_input;
+
+    llama_token decode_bos = llama_model_decoder_start_token(model);
+    if (decode_bos == LLAMA_TOKEN_NULL) {
+      decode_bos = llama_vocab_bos(vocab);
+    }
+
+    embd->emplace_back(decode_bos);
+    common_sampler_accept(sampling.get(), decode_bos, false);
+    n_input = 1;
+  }
+
   for (int i = 0; (i < max_len || _interrupted) && !_params.vocab_only; i++) {
     // check if we need to remove some tokens
     if (embd->size() >= _params.n_ctx) {
@@ -166,13 +205,14 @@ void LlamaCompletionWorker::Execute() {
       if (n_eval > _params.n_batch) {
         n_eval = _params.n_batch;
       }
-
+
       int ret = llama_decode(ctx, llama_batch_get_one(embd->data() + n_past_batch, n_eval));
       if (ret < 0) {
         SetError("Failed to decode token batch, code: " + std::to_string(ret) +
                  ", n_eval: " + std::to_string(n_eval) +
                  ", n_past_batch: " + std::to_string(n_past_batch));
-
+        _sess->get_mutex().unlock();
+        return;
       }
 
       n_past_batch += n_eval;
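The change above adds encoder-decoder support to the completion worker: when llama_model_has_encoder() reports true, the prompt tokens are first passed through llama_encode in n_batch-sized chunks, and generation is then seeded with the model's decoder start token (falling back to BOS when none is defined). No new options are involved on the JavaScript side; a hedged sketch of what this enables, assuming the loadModel/completion API names, the result field name, and an encoder-decoder GGUF file (all assumptions, not part of this diff):

    import { loadModel } from '@fugood/llama.node' // entry point assumed

    // An encoder-decoder model (e.g. a T5-style GGUF) is used like any other model;
    // the worker now handles the encode pass and the decoder start token internally.
    const context = await loadModel({ model: './t5-model.gguf' }) // placeholder path
    const result = await context.completion({
      prompt: 'translate English to German: The house is wonderful.',
      n_predict: 64,
    })
    console.log(result.text) // result field name assumed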
package/src/LlamaContext.cpp
CHANGED
@@ -250,6 +250,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.kv_unified = get_option<bool>(options, "kv_unified", false);
   params.swa_full = get_option<bool>(options, "swa_full", false);
 
+  params.rope_freq_base = get_option<float>(options, "rope_freq_base", 0.0f);
+  params.rope_freq_scale = get_option<float>(options, "rope_freq_scale", 0.0f);
+
   params.use_mlock = get_option<bool>(options, "use_mlock", false);
   params.use_mmap = get_option<bool>(options, "use_mmap", true);
   params.numa =
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -977,6 +977,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
+        for (auto & pair : params.speculative.replacements) {
+            string_process_escapes(pair.first);
+            string_process_escapes(pair.second);
+        }
     }
 
     if (!params.kv_overrides.empty()) {
@@ -2091,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
@@ -2369,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -2627,6 +2647,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_out_freq = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--output-format"}, "{gguf,dat}",
+        string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "gguf") { params.imat_dat = false; }
+            else if (value == "dat") { params.imat_dat = true; }
+            else { throw std::invalid_argument("invalid output format"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
         {"--save-frequency"}, "N",
         string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -3249,6 +3278,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.model.path = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    add_opt(common_arg(
+        {"--spec-replace"}, "TARGET", "DRAFT",
+        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+        [](common_params & params, const std::string & tgt, const std::string & dft) {
+            params.speculative.replacements.push_back({ tgt, dft });
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3438,12 +3474,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
-    // diffusion parameters
     add_opt(common_arg(
         { "--diffusion-steps" }, "N",
         string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
         [](common_params & params, int value) { params.diffusion.steps = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
         { "--diffusion-eps" }, "F",
         string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
@@ -3451,21 +3493,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-algorithm" }, "N",
-        string_format("diffusion algorithm: 0=ORIGIN, 1=
+        string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
                       params.diffusion.algorithm),
         [](common_params & params, int value) { params.diffusion.algorithm = value; }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
         [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
     add_opt(common_arg(
-        { "--diffusion-
-        string_format("
-
-
+        { "--diffusion-block-length" }, "N",
+        string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+        [](common_params & params, int value) { params.diffusion.block_length = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-cfg-scale" }, "F",
+        string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+        [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
     ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-add-gumbel-noise" }, "F",
+        string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+        [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
 
     return ctx_arg;
 }
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -1635,7 +1635,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
         "|<function name=\"([^\"]+)\">" // match 5 (function name again)
     );
 
-    if (auto res = builder.try_find_regex(open_regex)) {
+    while (auto res = builder.try_find_regex(open_regex)) {
         const auto & block_start = res->groups[1];
         std::string block_end = block_start.empty() ? "" : "```";
 
@@ -1657,7 +1657,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                 builder.consume_literal(block_end);
                 builder.consume_spaces();
             }
-            builder.add_content(builder.consume_rest());
         } else {
             throw common_chat_msg_partial_exception("failed to parse tool call");
         }
@@ -1682,11 +1681,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
                 builder.consume_spaces();
             }
         }
-        builder.add_content(builder.consume_rest());
         }
-    } else {
-        builder.add_content(builder.consume_rest());
     }
+
+    builder.add_content(builder.consume_rest());
 }
 
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1933,6 +1931,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
         }
     }
     auto msg = builder.result();
-
+    if (!is_partial) {
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+    }
     return msg;
 }
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -1123,6 +1123,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
package/src/llama.cpp/common/common.h
CHANGED
@@ -201,6 +201,7 @@ struct common_params_speculative {
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
 
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,11 +221,17 @@ struct common_params_vocoder {
 };
 
 struct common_params_diffusion {
-    int32_t steps
-
-
-    float
-
+    int32_t steps = 128;
+    bool visual_mode = false;
+
+    float eps = 0; // epsilon for timesteps
+    int32_t block_length = 0; // block length for generation
+
+    int32_t algorithm = 4; // default algorithm: low-confidence
+    float alg_temp = 0.0f; // algorithm temperature
+
+    float cfg_scale = 0; // classifier-free guidance scale
+    bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };
 
 enum common_reasoning_format {
@@ -353,6 +360,7 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
     bool no_op_offload = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
@@ -432,6 +440,7 @@ struct common_params {
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
+    bool imat_dat = false; // whether the legacy imatrix.dat format should be output
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity