@fugood/llama.node 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CMakeLists.txt +7 -3
  2. package/lib/binding.js +1 -1
  3. package/lib/binding.ts +40 -14
  4. package/lib/index.js +4 -1
  5. package/lib/index.ts +13 -9
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +10 -10
  8. package/src/LlamaCompletionWorker.cpp +33 -33
  9. package/src/LlamaContext.cpp +53 -16
  10. package/src/LlamaContext.h +2 -0
  11. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  12. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
  13. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
  14. package/src/llama.cpp/common/chat-parser.h +10 -0
  15. package/src/llama.cpp/common/chat.cpp +461 -87
  16. package/src/llama.cpp/common/chat.h +6 -0
  17. package/src/llama.cpp/common/common.cpp +8 -1
  18. package/src/llama.cpp/common/common.h +12 -5
  19. package/src/llama.cpp/common/json-partial.cpp +19 -2
  20. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -0
  21. package/src/llama.cpp/common/json-schema-to-grammar.h +2 -0
  22. package/src/llama.cpp/common/sampling.cpp +60 -6
  23. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -38
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +15 -5
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -3
  27. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +16 -14
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +49 -48
  29. package/src/llama.cpp/src/llama-grammar.cpp +17 -9
  30. package/src/llama.cpp/src/llama-impl.cpp +3 -3
  31. package/src/llama.cpp/src/llama-sampling.cpp +3 -6
  32. package/src/llama.cpp/src/llama-vocab.cpp +1 -0
package/src/llama.cpp/src/llama-grammar.cpp
@@ -6,8 +6,10 @@
 
 #include <cmath>
 #include <algorithm>
+#include <cstdint>
 #include <stdexcept>
 
+#define MAX_REPETITION_THRESHOLD 2000
 //
 // helpers
 //
@@ -345,8 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;
 
-    auto handle_repetitions = [&](int min_times, int max_times) {
-
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
+    // (though it's technically the same as -1 now)
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }
@@ -373,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (int i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }
 
         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times < 0 ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;
 
         llama_grammar_rule rec_rule(prev_rule);
-        for (int i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times < 0) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -478,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            int min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
 
-            int max_times = -1;
+            uint64_t max_times = UINT64_MAX; // default: no max limit
 
             if (*pos == '}') {
                 max_times = min_times;
@@ -502,6 +506,10 @@
             } else {
                 throw std::runtime_error(std::string("expecting ',' at ") + pos);
             }
+            bool has_max = max_times != UINT64_MAX;
+            if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
+                throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
+            }
             handle_repetitions(min_times, max_times);
         } else {
             break;
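
For context only (not part of the published diff): a minimal sketch of how the new MAX_REPETITION_THRESHOLD cap could surface through the public llama.h sampler API. The model path "model.gguf" and the exact failure mode for a rejected grammar are assumptions for illustration; the cap value (2000) and the error message come from the hunks above.

    // sketch.cpp — assumes any GGUF model is available at "model.gguf"
    #include "llama.h"

    int main() {
        llama_backend_init();
        llama_model * model = llama_model_load_from_file("model.gguf", llama_model_default_params());
        const llama_vocab * vocab = llama_model_get_vocab(model);

        // Bounded repetition within the new threshold still parses as before.
        llama_sampler * ok = llama_sampler_init_grammar(vocab, "root ::= [0-9]{1,64}", "root");

        // {1,5000} exceeds MAX_REPETITION_THRESHOLD (2000): the parser now throws
        // "number of repetitions exceeds sane defaults ..." instead of expanding a huge
        // rule set; the returned sampler is expected to be unusable/NULL (assumption).
        llama_sampler * too_big = llama_sampler_init_grammar(vocab, "root ::= [0-9]{1,5000}", "root");

        if (ok)      llama_sampler_free(ok);
        if (too_big) llama_sampler_free(too_big);
        llama_model_free(model);
        llama_backend_free();
        return 0;
    }
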
package/src/llama.cpp/src/llama-impl.cpp
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
472
472
  for (auto * smpl : chain->samplers) {
473
473
  llama_sampler_reset(smpl);
474
474
  }
475
-
476
- chain->t_sample_us = 0;
477
- chain->n_sample = 0;
478
475
  }
479
476
 
480
477
  static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }
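
For context only (not part of the published diff): a small sketch of the public perf API touched by these hunks. With this change, resetting a sampler chain no longer clears its perf counters; only llama_perf_sampler_reset does, and the printed line drops the derived per-token and tokens-per-second figures. The wrapper function name here is illustrative.

    // sketch: report and reset sampler perf for an existing sampler chain
    #include "llama.h"

    void report_sampler_perf(llama_sampler * chain) {
        llama_perf_sampler_print(chain);   // now logs "samplers time = ... ms / ... runs"
        llama_sampler_reset(chain);        // leaves t_sample_us / n_sample untouched
        llama_perf_sampler_reset(chain);   // explicitly zeroes both counters
    }
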
package/src/llama.cpp/src/llama-vocab.cpp
@@ -1281,6 +1281,7 @@ struct llm_tokenizer_plamo2 : llm_tokenizer {
 
         // Build suffix list in lexicographical order of reversed strings
         std::vector<std::string> suffixes;
+        suffixes.reserve(suffix_to_score.size() + 1);
         for (const auto & pair : suffix_to_score) {
             suffixes.push_back(pair.first);
         }