@fugood/llama.node 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +7 -3
- package/lib/binding.js +1 -1
- package/lib/binding.ts +40 -14
- package/lib/index.js +4 -1
- package/lib/index.ts +13 -9
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +10 -10
- package/src/LlamaCompletionWorker.cpp +33 -33
- package/src/LlamaContext.cpp +53 -16
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/src/llama.cpp/common/chat-parser.h +10 -0
- package/src/llama.cpp/common/chat.cpp +461 -87
- package/src/llama.cpp/common/chat.h +6 -0
- package/src/llama.cpp/common/common.cpp +8 -1
- package/src/llama.cpp/common/common.h +12 -5
- package/src/llama.cpp/common/json-partial.cpp +19 -2
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/src/llama.cpp/common/sampling.cpp +60 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
- package/src/llama.cpp/src/llama-grammar.cpp +17 -9
- package/src/llama.cpp/src/llama-impl.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +3 -6
- package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/src/llama.cpp/src/llama-grammar.cpp

@@ -6,8 +6,10 @@
 
 #include <cmath>
 #include <algorithm>
+#include <cstdint>
 #include <stdexcept>
 
+#define MAX_REPETITION_THRESHOLD 2000
 //
 // helpers
 //
@@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;
 
-    auto handle_repetitions = [&](int min_times, int max_times) {
-
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
+    // (though it's technically the same as -1 now)
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }
@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (int i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }
 
         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;
 
         llama_grammar_rule rec_rule(prev_rule);
-        for (int i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times < 0) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});
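Net effect of this hunk: the repetition rewrite itself is unchanged, S{m,n} still expands to m copies of S followed by a chain of (n - m) optional rules, and S{m,} to m copies plus one self-recursive rule; but the bookkeeping is now unsigned, so the explicit no_max flag replaces the old max_times < 0 test.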
@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            int min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
 
-            int max_times = -1;
+            uint64_t max_times = UINT64_MAX; // default: no max limit
 
             if (*pos == '}') {
                 max_times = min_times;
@@ -502,6 +506,10 @@ const char * llama_grammar_parser::parse_sequence(
             } else {
                 throw std::runtime_error(std::string("expecting ',' at ") + pos);
             }
+            bool has_max = max_times != UINT64_MAX;
+            if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
+                throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
+            }
             handle_repetitions(min_times, max_times);
         } else {
             break;
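Taken together, the llama-grammar.cpp hunks widen repetition bounds from int (with -1 meaning "no maximum") to uint64_t (with UINT64_MAX as the sentinel) and reject counts above 2000. A minimal sketch of the new validation, where check_repetition_bounds is a hypothetical standalone mirror of the check added inside parse_sequence:

    #include <cstdint>
    #include <stdexcept>

    #define MAX_REPETITION_THRESHOLD 2000 // same cap the diff introduces

    // hypothetical helper mirroring the check added to parse_sequence
    static void check_repetition_bounds(uint64_t min_times, uint64_t max_times) {
        const bool has_max = max_times != UINT64_MAX; // UINT64_MAX == unbounded
        if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
            throw std::runtime_error("number of repetitions exceeds sane defaults");
        }
    }

    int main() {
        check_repetition_bounds(3, UINT64_MAX);  // like "x{3,}": unbounded max is fine
        try {
            check_repetition_bounds(5000, 6000); // like "x{5000,6000}": now rejected
        } catch (const std::runtime_error &) {
            // expected: both bounds exceed the 2000-repetition threshold
        }
        return 0;
    }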
package/src/llama.cpp/src/llama-impl.cpp

@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
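This time_meas hunk appears to be an indentation-only cleanup (the destructor body and its closing braces are re-aligned); the accumulated-time logic itself is unchanged.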
package/src/llama.cpp/src/llama-sampling.cpp

@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
    }
-
-    chain->t_sample_us = 0;
-    chain->n_sample = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) {
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%10.2f ms per token, %10.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }
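The three llama-sampling.cpp hunks relocate the perf counters: chain reset no longer zeroes t_sample_us/n_sample, llama_perf_sampler_reset now does (as two separate assignments), and the log line drops the per-token rate, which was meaningless when n_sample was 0. A hedged usage sketch against the public llama.h API, chain construction elided:

    #include "llama.h"

    // After this change, resetting a sampler chain keeps its perf counters;
    // clearing them is solely llama_perf_sampler_reset()'s job.
    void reset_between_requests(struct llama_sampler * chain) {
        llama_sampler_reset(chain);      // sampler state only; counters survive
        llama_perf_sampler_print(chain); // "samplers time = ... ms / ... runs"
        llama_perf_sampler_reset(chain); // zeroes t_sample_us and n_sample
    }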
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1281,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {
 
     // Build suffix list in lexicographical order of reversed strings
     std::vector<std::string> suffixes;
+    suffixes.reserve(suffix_to_score.size() + 1);
     for (const auto & pair : suffix_to_score) {
         suffixes.push_back(pair.first);
     }
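The llama-vocab.cpp hunk is a small allocation optimization: sizing the suffix vector up front gives one allocation instead of repeated growth reallocations, and the +1 presumably leaves room for one extra entry appended after the loop. A generic sketch of the pattern, with hypothetical contents standing in for the tokenizer's suffix_to_score map:

    #include <map>
    #include <string>
    #include <vector>

    int main() {
        // stand-in for suffix_to_score from the plamo2 tokenizer (contents hypothetical)
        std::map<std::string, float> suffix_to_score = {{"ab", 1.0f}, {"b", 0.5f}};

        std::vector<std::string> suffixes;
        // one upfront allocation; without this, push_back may reallocate several times as it grows
        suffixes.reserve(suffix_to_score.size() + 1);
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        return 0;
    }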