@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/common/arg.cpp

@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -365,6 +366,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
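
For readability, the bash that these printf calls emit looks roughly like the sketch below. It is abridged: in the real output, $opts lists every registered flag (the few shown here are placeholders) and one `complete` line is printed per executable in the set above.

    _llama_completions() {
        local cur prev opts
        COMPREPLY=()
        cur="${COMP_WORDS[COMP_CWORD]}"
        prev="${COMP_WORDS[COMP_CWORD-1]}"

        opts="-m --model -p --prompt --temp --top-k"   # abridged placeholder list

        case "$prev" in
            --model)
                COMPREPLY=( $(compgen -f -X '!*.gguf' -- "$cur") $(compgen -d -- "$cur") )
                return 0
                ;;
            --grammar-file)
                COMPREPLY=( $(compgen -f -X '!*.gbnf' -- "$cur") $(compgen -d -- "$cur") )
                return 0
                ;;
            --chat-template-file)
                COMPREPLY=( $(compgen -f -X '!*.jinja' -- "$cur") $(compgen -d -- "$cur") )
                return 0
                ;;
            *)
                COMPREPLY=( $(compgen -W "${opts}" -- "$cur") )
                return 0
                ;;
        esac
    }

    complete -F _llama_completions llama-cli
    complete -F _llama_completions llama-server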
@@ -426,6 +533,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             }
             exit(0);
         }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -494,6 +605,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
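
A minimal usage sketch for the new flag, assuming the llama.cpp binaries are on PATH; the output and rc file paths are placeholders:

    llama-cli --completion-bash > ~/.llama-completion.bash
    echo 'source ~/.llama-completion.bash' >> ~/.bashrc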
@@ -674,7 +792,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
@@ -695,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -826,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
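
Taken together, the reworked -p and the new -sys and -st flags (all registered for LLAMA_EXAMPLE_MAIN, i.e. llama-cli) allow a scripted one-shot chat turn. A hedged sketch; the model path and prompts are placeholders:

    llama-cli -m model.gguf \
        -sys "You are a concise assistant." \
        -p "Explain what a GGUF file is." \
        -st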
@@ -946,6 +1078,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
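
A sketch of enabling the new sampler parameter from the CLI; the model path, prompt, and the chosen N are illustrative, and per the handler above -1.0 leaves it disabled:

    llama-cli -m model.gguf -p "Once upon a time" --top-nsigma 1.5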
@@ -1445,7 +1584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1728,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
@@ -1975,6 +2107,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
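
An illustrative, non-authoritative sketch of exercising the new flag against llama-server's OpenAI-compatible chat endpoint. The model path is a placeholder for a DeepSeek R1 style model and port 8080 is assumed as the server default; per the description above, the extracted thoughts appear only for supported models and non-streamed responses:

    llama-server -m model.gguf --jinja --reasoning-format deepseek &

    curl http://localhost:8080/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{"messages": [{"role": "user", "content": "What is 7 * 6?"}]}'
    # with --reasoning-format deepseek the thoughts land in choices[0].message.reasoning_content;
    # with --reasoning-format none they stay inline in choices[0].message.content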
@@ -2112,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
-        "Enable prefx in log messages",
+        "Enable prefix in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
@@ -2311,6 +2454,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
@@ -2324,5 +2474,133 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
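
As a quick check of the new presets, a hedged example: per the handler above, --fim-qwen-1.5b-default points the server at ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF and configures port 8012, full GPU offload, flash attention, and cache reuse, so launching it is a single command (weights may be downloaded from the internet, as the help text notes):

    llama-server --fim-qwen-1.5b-default
    # the server then listens on port 8012, per params.port in the preset above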