@fugood/llama.node 0.3.13 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
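
The hunks below are from package/src/llama.cpp/common/arg.cpp (+112 -11), inside common_params_parser_init: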
@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -812,13 +813,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
-        ex == LLAMA_EXAMPLE_MAIN
-            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
-            : "prompt to start generation with",
+        "prompt to start generation with; for system message, use -sys",
         [](common_params & params, const std::string & value) {
             params.prompt = value;
         }
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-sys", "--system-prompt"}, "PROMPT",
+        "system prompt to use with model (if applicable, depending on chat template)",
+        [](common_params & params, const std::string & value) {
+            params.system_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
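
The -p/--prompt help text loses its example-specific ternary (including the old behavior where -cnv turned the prompt into a system prompt), and the new -sys/--system-prompt option stores the system message in params.system_prompt for the main example only. To illustrate the option-registration pattern these hunks extend, here is a minimal self-contained sketch; my_params, my_arg, and the dispatch loop are hypothetical stand-ins, not llama.cpp's actual common_params/common_arg types:

```cpp
// Simplified sketch of the self-registering argument pattern seen above:
// each option bundles its flag spellings, help text, and a handler lambda
// that writes into a shared params struct. Types here are hypothetical.
#include <cstdio>
#include <functional>
#include <string>
#include <vector>

struct my_params {                  // stand-in for common_params
    std::string prompt;
    std::string system_prompt;
};

struct my_arg {                     // stand-in for common_arg
    std::vector<const char *> flags;
    const char *              help;
    std::function<void(my_params &, const std::string &)> handler;
};

int main() {
    my_params           params;
    std::vector<my_arg> options;

    // mirrors: add_opt(common_arg({"-sys", "--system-prompt"}, "PROMPT", ...))
    options.push_back({
        {"-sys", "--system-prompt"},
        "system prompt to use with model (if applicable, depending on chat template)",
        [](my_params & p, const std::string & v) { p.system_prompt = v; }
    });

    // dispatch a hard-coded fake command line: match a flag, run its handler
    const char * argv_flag  = "-sys";
    const char * argv_value = "You are a helpful assistant.";
    for (auto & opt : options) {
        for (auto * f : opt.flags) {
            if (std::string(f) == argv_flag) {
                opt.handler(params, argv_value);
            }
        }
    }
    std::printf("system_prompt = %s\n", params.system_prompt.c_str());
}
```

The real common_arg additionally carries a value hint (the "PROMPT" string) and per-example scoping via set_examples/set_excludes/set_env, as the hunks above show.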
@@ -943,6 +949,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-st", "--single-turn"},
+        "run conversation for a single turn only, then exit when done\n"
+        "will not be interactive if first turn is predefined with --prompt\n"
+        "(default: false)",
+        [](common_params & params) {
+            params.single_turn = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
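
-st/--single-turn makes the main example run a single conversation turn and then exit; as the help text notes, that turn is non-interactive when it is predefined with --prompt.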
@@ -1852,16 +1867,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-o", "--output", "--output-file"}, "FNAME",
-        string_format("output file (default: '%s')",
-            ex == LLAMA_EXAMPLE_EXPORT_LORA
-                ? params.lora_outfile.c_str()
-                : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR
-                    ? params.cvector_outfile.c_str()
-                    : params.out_file.c_str()),
+        string_format("output file (default: '%s')", params.out_file.c_str()),
         [](common_params & params, const std::string & value) {
             params.out_file = value;
-            params.cvector_outfile = value;
-            params.lora_outfile = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
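
The -o/--output handler now writes a single params.out_file field; the example-specific defaults and the duplicate assignments to params.cvector_outfile and params.lora_outfile are gone, which lines up with the small companion edits to export-lora.cpp, cvector-generator.cpp, and imatrix.cpp in the file list above.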
@@ -2446,6 +2454,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.vocoder.use_guide_tokens = true;
         }
     ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--tts-speaker-file"}, "FNAME",
+        "speaker file path for audio generation",
+        [](common_params & params, const std::string & value) {
+            params.vocoder.speaker_file = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
     // model-specific
     add_opt(common_arg(
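
--tts-speaker-file is registered for the TTS example only, unlike the guide-tokens option directly above it, which is also exposed to the server.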
@@ -2501,5 +2516,91 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-spec"},
+        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-14b-spec"},
+        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
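
Each --fim-qwen-* preset points the server at a ggml-org GGUF build on Hugging Face and applies the same fill-in-the-middle serving profile: port 8012, full offload (n_gpu_layers = 99), flash attention, 1024-token batches, the model's default context length (n_ctx = 0), and 256-token cache reuse. The -spec variants additionally attach a Qwen 2.5 Coder 0.5B draft model for speculative decoding, so an invocation such as llama-server --fim-qwen-7b-spec brings up a 7B FIM server with a 0.5B draft from a single flag.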