@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
--- package/src/llama.cpp/common/arg.cpp
+++ package/src/llama.cpp/common/arg.cpp
@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -365,6 +366,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
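The function above prints a bash completion script and registers it for every listed executable via `complete -F`. A minimal usage sketch of the `--completion-bash` flag wired up further below (the output path is hypothetical; any writable location works):

    # generate the completion script once, then source it
    llama-cli --completion-bash > ~/.llama-completion.bash
    source ~/.llama-completion.bash
    # tab completion now suggests flags, and `--model <TAB>` completes *.gguf paths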
@@ -426,6 +533,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
             }
             exit(0);
         }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
         ctx_arg.params = params_org;
@@ -494,6 +605,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +792,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
@@ -695,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -826,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
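Combined with `-sys`/`--system-prompt` added above, this allows fully scripted one-shot runs. A sketch (the model path is hypothetical):

    # answer a single predefined turn, then exit instead of staying interactive
    llama-cli -m model.gguf -sys "You are a terse assistant." -p "Summarize GGUF in one sentence." -st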
@@ -946,6 +1078,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
         string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
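For orientation: top-n-sigma sampling, as we read the upstream sampler this flag maps to, keeps only candidates whose logits lie within N standard deviations of the top logit, and -1.0 disables the filter (a summary of the technique, not text from this diff). A usage sketch with a hypothetical model path:

    llama-cli -m model.gguf --top-nsigma 1.5 -p "Once upon a time"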
@@ -1445,7 +1584,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1728,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
@@ -1975,6 +2107,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
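With `deepseek`, thought tags (e.g. DeepSeek R1's `<think>...</think>`) are moved out of `message.content` into `message.reasoning_content`, per the option text above. A server sketch (model path hypothetical; pairing with the adjacent `--jinja` option is our assumption, since both target the jinja chat path):

    llama-server -m deepseek-r1-distill.gguf --jinja --reasoning-format deepseek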
@@ -2112,7 +2255,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
         {"--log-prefix"},
-        "Enable prefx in log messages",
+        "Enable prefix in log messages",
         [](common_params &) {
             common_log_set_prefix(common_log_main(), true);
         }
@@ -2311,6 +2454,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
@@ -2324,5 +2474,133 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }