@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/common/arg.cpp
@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -763,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
@@ -812,13 +817,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
        [](common_params & params, const std::string & value) {
            params.prompt = value;
        }
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -843,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -943,6 +967,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
@@ -1852,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2446,6 +2472,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
@@ -2501,5 +2534,91 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
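The five --fim-qwen-* presets above differ only in the Hugging Face repo/file pair (plus a draft model for the -spec variants); every other server field is set identically. A hypothetical helper showing that shared portion (apply_fim_server_defaults is not in the diff, which keeps each preset inline; it assumes the common_params struct from common/common.h):

// the settings every --fim-qwen-* preset above repeats verbatim
static void apply_fim_server_defaults(common_params & params,
                                      const std::string & repo,
                                      const std::string & file) {
    params.hf_repo       = repo;  // e.g. "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"
    params.hf_file       = file;
    params.port          = 8012;  // all presets pin the server to this port
    params.n_gpu_layers  = 99;    // offload effectively all layers
    params.flash_attn    = true;
    params.n_ubatch      = 1024;
    params.n_batch       = 1024;
    params.n_ctx         = 0;     // 0 = take the context size from the model
    params.n_cache_reuse = 256;
}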