@fugood/llama.node 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CMakeLists.txt +7 -3
  2. package/lib/binding.js +1 -1
  3. package/lib/binding.ts +40 -14
  4. package/lib/index.js +4 -1
  5. package/lib/index.ts +13 -9
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +10 -10
  8. package/src/LlamaCompletionWorker.cpp +33 -33
  9. package/src/LlamaContext.cpp +53 -16
  10. package/src/LlamaContext.h +2 -0
  11. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  12. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  13. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  14. package/src/llama.cpp/common/chat-parser.h +10 -0
  15. package/src/llama.cpp/common/chat.cpp +461 -87
  16. package/src/llama.cpp/common/chat.h +6 -0
  17. package/src/llama.cpp/common/common.cpp +8 -1
  18. package/src/llama.cpp/common/common.h +12 -5
  19. package/src/llama.cpp/common/json-partial.cpp +19 -2
  20. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
  21. package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
  22. package/src/llama.cpp/common/sampling.cpp +60 -6
  23. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
  29. package/src/llama.cpp/src/llama-grammar.cpp +17 -9
  30. package/src/llama.cpp/src/llama-impl.cpp +3 -3
  31. package/src/llama.cpp/src/llama-sampling.cpp +3 -6
  32. package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/src/llama.cpp/src/llama-grammar.cpp
@@ -6,8 +6,10 @@
 
 #include <cmath>
 #include <algorithm>
+#include <cstdint>
 #include <stdexcept>
 
+#define MAX_REPETITION_THRESHOLD 2000
 //
 // helpers
 //
@@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;
 
-    auto handle_repetitions = [&](int min_times, int max_times) {
-
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
+    // (though it's technically the same as -1 now)
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }
@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (int i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }
 
         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;
 
         llama_grammar_rule rec_rule(prev_rule);
-        for (int i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times < 0) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            int min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
 
-            int max_times = -1;
+            uint64_t max_times = UINT64_MAX; // default: no max limit
 
             if (*pos == '}') {
                 max_times = min_times;
@@ -502,6 +506,10 @@
             } else {
                 throw std::runtime_error(std::string("expecting ',' at ") + pos);
             }
+            bool has_max = max_times != UINT64_MAX;
+            if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
+                throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
+            }
             handle_repetitions(min_times, max_times);
         } else {
             break;
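
For context only (not part of the published diff): a minimal sketch of how the new MAX_REPETITION_THRESHOLD cap could surface through the public llama.h sampler API. The model path "model.gguf" and the exact failure mode for a rejected grammar are assumptions for illustration; the cap value (2000) and the error message come from the hunks above.

    // sketch.cpp — assumes any GGUF model is available at "model.gguf"
    #include "llama.h"

    int main() {
        llama_backend_init();
        llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
        const llama_vocab * vocab = llama_model_get_vocab(model);

        // Bounded repetition within the new threshold still parses as before.
        llama_sampler * ok = llama_sampler_init_grammar(vocab, "root ::= [0-9]{1,64}", "root");

        // {1,5000} exceeds MAX_REPETITION_THRESHOLD (2000): the parser now throws
        // "number of repetitions exceeds sane defaults ..." instead of expanding a huge
        // rule set; the returned sampler is expected to be unusable/NULL (assumption).
        llama_sampler * too_big = llama_sampler_init_grammar(vocab, "root ::= [0-9]{1,5000}", "root");

        if (ok)      llama_sampler_free(ok);
        if (too_big) llama_sampler_free(too_big);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
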
package/src/llama.cpp/src/llama-impl.cpp
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
472
472
  for (auto * smpl : chain->samplers) {
473
473
  llama_sampler_reset(smpl);
474
474
  }
475
-
476
- chain->t_sample_us = 0;
477
- chain->n_sample = 0;
478
475
  }
479
476
 
480
477
  static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }
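
For context only (not part of the published diff): a small sketch of the public perf API touched by these hunks. With this change, resetting a sampler chain no longer clears its perf counters; only llama_perf_sampler_reset does, and the printed line drops the derived per-token and tokens-per-second figures. The wrapper function name here is illustrative.

    // sketch: report and reset sampler perf for an existing sampler chain
    #include "llama.h"

    void report_sampler_perf(llama_sampler * chain) {
        llama_perf_sampler_print(chain);   // now logs "samplers time = ... ms / ... runs"
        llama_sampler_reset(chain);        // leaves t_sample_us / n_sample untouched
        llama_perf_sampler_reset(chain);   // explicitly zeroes both counters
    }
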
package/src/llama.cpp/src/llama-vocab.cpp
@@ -1281,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {
 
         // Build suffix list in lexicographical order of reversed strings
         std::vector<std::string> suffixes;
+        suffixes.reserve(suffix_to_score.size() + 1);
         for (const auto & pair : suffix_to_score) {
             suffixes.push_back(pair.first);
         }