@fugood/llama.node 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/lib/index.js +26 -20
  19. package/lib/index.ts +32 -28
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +14 -0
  22. package/src/LlamaContext.cpp +13 -4
  23. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  27. package/src/llama.cpp/common/arg.cpp +180 -3
  28. package/src/llama.cpp/common/chat-template.hpp +21 -7
  29. package/src/llama.cpp/common/chat.cpp +220 -101
  30. package/src/llama.cpp/common/chat.hpp +3 -0
  31. package/src/llama.cpp/common/common.h +15 -7
  32. package/src/llama.cpp/common/llguidance.cpp +3 -3
  33. package/src/llama.cpp/common/log.cpp +1 -0
  34. package/src/llama.cpp/common/log.h +2 -1
  35. package/src/llama.cpp/common/minja.hpp +24 -9
  36. package/src/llama.cpp/common/sampling.cpp +52 -46
  37. package/src/llama.cpp/common/speculative.h +1 -1
  38. package/src/llama.cpp/docs/build.md +2 -2
  39. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  40. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  43. package/src/llama.cpp/examples/run/run.cpp +5 -12
  44. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/server/httplib.h +381 -292
  46. package/src/llama.cpp/examples/server/server.cpp +58 -47
  47. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  48. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  49. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  50. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  51. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  52. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  55. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  58. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  59. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  60. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  61. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  62. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  63. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  64. package/src/llama.cpp/include/llama.h +14 -10
  65. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  66. package/src/llama.cpp/src/llama-grammar.h +1 -1
  67. package/src/llama.cpp/src/llama-impl.h +6 -6
  68. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  69. package/src/llama.cpp/src/llama-mmap.h +1 -0
  70. package/src/llama.cpp/src/llama-model.cpp +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  72. package/src/llama.cpp/src/llama.cpp +7 -5
  73. package/src/llama.cpp/src/unicode.cpp +9 -2
  74. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  75. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  76. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  77. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
@@ -365,6 +365,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
@@ -426,6 +532,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             }
             exit(0);
         }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -494,6 +604,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +791,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
@@ -946,6 +1063,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -1445,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1975,6 +2099,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
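For context, the 'deepseek' mode described in the help text above moves thought tags out of the response body and into message.reasoning_content; the actual parsing is done in common/chat.cpp, which also changes in this release. The sketch below is only a simplified illustration of that split for a plain <think>...</think> block; the struct and function names are hypothetical.

#include <cstdio>
#include <string>

struct parsed_msg {
    std::string reasoning_content;
    std::string content;
};

// Hypothetical helper: split a single <think>...</think> block out of the
// model output, roughly what --reasoning-format deepseek does for the
// OpenAI-compatible chat response.
static parsed_msg split_reasoning(const std::string & output) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const size_t start = output.find(open);
    const size_t end   = output.find(close);
    if (start == std::string::npos || end == std::string::npos || end < start) {
        return { "", output }; // no thought block found: leave everything in content
    }
    return {
        output.substr(start + open.size(), end - start - open.size()),
        output.substr(end + close.size()),
    };
}

int main() {
    const parsed_msg msg = split_reasoning("<think>check the units</think>The answer is 42.");
    printf("reasoning_content: %s\ncontent: %s\n", msg.reasoning_content.c_str(), msg.content.c_str());
}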
@@ -2112,7 +2247,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
-        "Enable prefx in log messages",
+        "Enable prefix in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
@@ -2324,5 +2459,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
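All three embedding presets above pin an hf_repo/hf_file pair, force a 512-token context, and set embd_normalize = 2, which we understand to select Euclidean (L2) normalization of the returned embeddings; treat that mapping as an assumption rather than documented behaviour. A minimal reference sketch of L2 normalization, not llama.cpp code:

#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative L2 (Euclidean) normalization of an embedding vector, i.e.
// what embd_normalize = 2 is assumed to select in the presets above.
static void l2_normalize(std::vector<float> & embd) {
    double sum = 0.0;
    for (float v : embd) sum += (double) v * v;
    const double norm = std::sqrt(sum);
    if (norm == 0.0) return; // leave an all-zero vector untouched
    for (float & v : embd) v = (float) (v / norm);
}

int main() {
    std::vector<float> embd = { 3.0f, 4.0f };
    l2_normalize(embd);
    printf("%.2f %.2f\n", embd[0], embd[1]); // 0.60 0.80, unit length
}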
@@ -249,16 +249,30 @@ class chat_template {
                     inputs.add_generation_prompt = false;
                     full = apply(inputs);
                 }
-
-                if (full.find(prefix) != 0) {
-                    if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
-                        prefix = prefix.substr(0, prefix.size() - eos_token_.size());
+                auto eos_pos_last = full.rfind(eos_token_);
+                if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                    (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                    full = full.substr(0, eos_pos_last);
+                }
+                size_t common_prefix_length = 0;
+                for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                    if (prefix[i] != full[i]) {
+                        break;
                     }
+                    if (prefix[i] == '<') {
+                        // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                        // but it removes thinking tags for past messages.
+                        // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                        continue;
+                    }
+                    common_prefix_length = i + 1;
                 }
-                if (full.find(prefix) != 0) {
+                auto example = full.substr(common_prefix_length);
+                if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
                     fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+                } else {
+                    tool_call_example_ = example;
                 }
-                tool_call_example_ = full.substr(prefix.size());
             }
         } catch (const std::exception & e) {
             fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
         if (polyfill_tools) {
             adjusted_messages = add_system(inputs.messages,
                 "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
         } else {
             adjusted_messages = inputs.messages;
         }