@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/common/arg.cpp

@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -763,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_CTX_SIZE"));
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
-        string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        string_format(
+            ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL
+                ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
+                : "number of tokens to predict (default: %d, -1 = infinity)",
+            params.n_predict),
         [](common_params & params, int value) {
             params.n_predict = value;
         }
@@ -812,13 +817,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
            params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
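The `-p` flag no longer doubles as a system prompt in conversation mode; the new `-sys`/`--system-prompt` option stores the system message in its own `params.system_prompt` field. Below is a minimal sketch of this flag-registration pattern, using hypothetical, trimmed-down stand-ins for `common_arg` and `common_params` (the real llama.cpp types also carry env-var bindings, example filters, and more):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

// Hypothetical, simplified stand-ins for llama.cpp's common_params / common_arg.
struct params_t {
    std::string prompt;
    std::string system_prompt;
};

struct arg_t {
    std::vector<std::string> flags;  // e.g. {"-sys", "--system-prompt"}
    std::string help;
    std::function<void(params_t &, const std::string &)> handler;
};

int main(int argc, char ** argv) {
    params_t params;
    const std::vector<arg_t> opts = {
        {{"-p", "--prompt"}, "prompt to start generation with; for system message, use -sys",
         [](params_t & p, const std::string & v) { p.prompt = v; }},
        {{"-sys", "--system-prompt"}, "system prompt to use with model",
         [](params_t & p, const std::string & v) { p.system_prompt = v; }},
    };
    // Each recognized flag consumes the next argv token as its value.
    for (int i = 1; i + 1 < argc; i++) {
        const std::string flag = argv[i];
        for (const auto & opt : opts) {
            if (std::find(opt.flags.begin(), opt.flags.end(), flag) != opt.flags.end()) {
                opt.handler(params, argv[++i]);
                break;
            }
        }
    }
    std::printf("prompt='%s' system='%s'\n", params.prompt.c_str(), params.system_prompt.c_str());
    return 0;
}

With this shape, adding an option like `-sys` is a single table entry, which is what the hunk above does through `add_opt`.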
@@ -843,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sysf", "--system-prompt-file"}, "FNAME",
+        "a file containing the system prompt (default: none)",
+        [](common_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.system_prompt));
+            if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') {
+                params.system_prompt.pop_back();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
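The `-sysf` handler reads the entire file through `std::istreambuf_iterator` and strips a single trailing newline. A self-contained sketch of the same idiom follows; the `read_prompt_file` helper name is illustrative, not part of llama.cpp:

#include <cstdio>
#include <fstream>
#include <iterator>
#include <stdexcept>
#include <string>

// Slurp a whole file into a string, then drop one trailing '\n' so the
// prompt does not end with a newline -- the same idiom as the -sysf handler.
static std::string read_prompt_file(const std::string & path) {
    std::ifstream file(path);
    if (!file) {
        throw std::runtime_error("failed to open file '" + path + "'");
    }
    std::string text{std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>{}};
    if (!text.empty() && text.back() == '\n') {
        text.pop_back();
    }
    return text;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s FNAME\n", argv[0]);
        return 1;
    }
    std::printf("system prompt: %s\n", read_prompt_file(argv[1]).c_str());
    return 0;
}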
@@ -943,6 +967,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
@@ -1852,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                ? params.cvector_outfile.c_str()
-                : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS}));
     add_opt(common_arg(
         {"-ofreq", "--output-frequency"}, "N",
         string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
@@ -2446,6 +2472,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
@@ -2501,5 +2534,91 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
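Each `--fim-qwen-*` option is a preset: a value-less flag whose handler bulk-assigns a Hugging Face repo plus server and batching defaults tuned for fill-in-the-middle serving. A minimal sketch of the pattern, assuming a simplified `server_params` struct whose field defaults are placeholders rather than llama.cpp's actual defaults:

#include <cstdio>
#include <string>

// Hypothetical, trimmed-down version of the --fim-qwen-* preset pattern.
// Field defaults here are placeholders, not llama.cpp's real defaults.
struct server_params {
    std::string hf_repo;
    std::string hf_file;
    int  port          = 8080;
    int  n_gpu_layers  = 0;
    bool flash_attn    = false;
    int  n_ubatch      = 512;
    int  n_batch       = 2048;
    int  n_ctx         = 4096;
    int  n_cache_reuse = 0;
};

static void apply_fim_qwen_7b_default(server_params & params) {
    params.hf_repo       = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
    params.hf_file       = "qwen2.5-coder-7b-q8_0.gguf";
    params.port          = 8012;
    params.n_gpu_layers  = 99;   // offload as many layers as the device holds
    params.flash_attn    = true;
    params.n_ubatch      = 1024;
    params.n_batch       = 1024;
    params.n_ctx         = 0;    // in llama.cpp, 0 means use the model's context length
    params.n_cache_reuse = 256;  // minimum chunk size to reuse from the KV cache
}

int main() {
    server_params params;
    apply_fim_qwen_7b_default(params);
    std::printf("%s -> port %d\n", params.hf_file.c_str(), params.port);
    return 0;
}

Since the parser applies flags in command-line order, a preset given first can still be overridden by any flag that follows it.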