@fugood/llama.node 0.3.12 → 0.3.13
This diff shows the changes between publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +13 -4
- package/src/llama.cpp/.github/workflows/build.yml +35 -3
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/common/CMakeLists.txt +20 -3
- package/src/llama.cpp/common/arg.cpp +180 -3
- package/src/llama.cpp/common/chat-template.hpp +21 -7
- package/src/llama.cpp/common/chat.cpp +220 -101
- package/src/llama.cpp/common/chat.hpp +3 -0
- package/src/llama.cpp/common/common.h +15 -7
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/minja.hpp +24 -9
- package/src/llama.cpp/common/sampling.cpp +52 -46
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/run/run.cpp +5 -12
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +58 -47
- package/src/llama.cpp/examples/server/utils.hpp +7 -5
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
- package/src/llama.cpp/ggml/src/ggml.c +1 -1
- package/src/llama.cpp/include/llama.h +14 -10
- package/src/llama.cpp/src/llama-grammar.cpp +1 -1
- package/src/llama.cpp/src/llama-grammar.h +1 -1
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +131 -57
- package/src/llama.cpp/src/llama.cpp +7 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
- package/src/llama.cpp/tests/test-chat.cpp +237 -69
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/bin/darwin/arm64/llama-node.node
CHANGED
Binary file

package/bin/darwin/x64/llama-node.node
CHANGED
Binary file

package/bin/linux/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux/x64/llama-node.node
CHANGED
Binary file

package/bin/linux-cuda/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux-cuda/x64/llama-node.node
CHANGED
Binary file

package/bin/linux-vulkan/arm64/llama-node.node
CHANGED
Binary file

package/bin/linux-vulkan/x64/llama-node.node
CHANGED
Binary file

package/bin/win32/arm64/llama-node.node
CHANGED
Binary file

package/bin/win32/arm64/node.lib
CHANGED
Binary file

package/bin/win32/x64/llama-node.node
CHANGED
Binary file

package/bin/win32/x64/node.lib
CHANGED
Binary file

package/bin/win32-vulkan/arm64/llama-node.node
CHANGED
Binary file

package/bin/win32-vulkan/arm64/node.lib
CHANGED
Binary file

package/bin/win32-vulkan/x64/llama-node.node
CHANGED
Binary file

package/bin/win32-vulkan/x64/node.lib
CHANGED
Binary file
package/lib/binding.ts
CHANGED
package/package.json
CHANGED
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -165,9 +165,17 @@ void LlamaCompletionWorker::OnOK() {
              Napi::String::New(env, _result.text.c_str()));

   Napi::Array tool_calls = Napi::Array::New(Napi::AsyncWorker::Env());
+  std::string * reasoning_content = nullptr;
+  std::string * content = nullptr;
   if (!_stop) {
     try {
       common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+      if (!message.reasoning_content.empty()) {
+        reasoning_content = &message.reasoning_content;
+      }
+      if (!message.content.empty()) {
+        content = &message.content;
+      }
       for (size_t i = 0; i < message.tool_calls.size(); i++) {
         const auto &tc = message.tool_calls[i];
         Napi::Object tool_call = Napi::Object::New(env);
@@ -188,6 +196,12 @@ void LlamaCompletionWorker::OnOK() {
   if (tool_calls.Length() > 0) {
     result.Set("tool_calls", tool_calls);
   }
+  if (reasoning_content) {
+    result.Set("reasoning_content", Napi::String::New(env, reasoning_content->c_str()));
+  }
+  if (content) {
+    result.Set("content", Napi::String::New(env, content->c_str()));
+  }

   auto ctx = _sess->context();
   const auto timings_token = llama_perf_context(ctx);
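Note: the two hunks above surface the parsed chat message on the completion result, adding `reasoning_content` and `content` next to the existing `tool_calls`. Below is a minimal consumer-side sketch in TypeScript; the `loadModel` entry point, the option keys, and the exact result shape are assumptions inferred from this diff rather than a verbatim copy of the package's documented API.

```ts
// Hedged sketch: reads the fields added in LlamaCompletionWorker::OnOK.
// `loadModel`, the option names, and the result shape are assumed, not verified.
import { loadModel } from '@fugood/llama.node'

async function run(): Promise<void> {
  // Hypothetical model path; reasoning_format enables <think> extraction (see LlamaContext.cpp below).
  const context = await loadModel({ model: './model.gguf', reasoning_format: 'deepseek' })

  const result = await context.completion({
    messages: [{ role: 'user', content: 'Why is the sky blue?' }],
  })

  // New in 0.3.13: populated only when common_chat_parse yields non-empty values.
  if (result.reasoning_content) console.log('reasoning:', result.reasoning_content)
  if (result.content) console.log('content:', result.content)
  if (result.tool_calls?.length) console.log('tool calls:', result.tool_calls)
}

run().catch(console.error)
```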
package/src/LlamaContext.cpp
CHANGED
@@ -185,6 +185,13 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)

   params.chat_template = get_option<std::string>(options, "chat_template", "");

+  std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
+  if (reasoning_format == "deepseek") {
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+  } else {
+    params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+  }
+
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
   params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
@@ -377,7 +384,7 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
 }

 common_chat_params getFormattedChatWithJinja(
-    const
+    const std::shared_ptr<LlamaSession> &sess,
     const common_chat_templates &templates,
     const std::string &messages,
     const std::string &chat_template,
@@ -399,11 +406,12 @@ common_chat_params getFormattedChatWithJinja(
   if (!json_schema.empty()) {
     inputs.json_schema = json::parse(json_schema);
   }
+  inputs.extract_reasoning = sess->params().reasoning_format != COMMON_REASONING_FORMAT_NONE;
   inputs.stream = true;

   // If chat_template is provided, create new one and use it (probably slow)
   if (!chat_template.empty()) {
-    auto tmp = common_chat_templates_from_model(model, chat_template);
+    auto tmp = common_chat_templates_from_model(sess->model(), chat_template);
     const common_chat_template* template_ptr = useTools && tmp.template_tool_use ? tmp.template_tool_use.get() : tmp.template_default.get();
     if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
       inputs.parallel_tool_calls = false;
@@ -493,7 +501,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   auto parallel_tool_calls = get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");

-  auto chatParams = getFormattedChatWithJinja(_sess
+  auto chatParams = getFormattedChatWithJinja(_sess, _templates, messages, chat_template, json_schema_str, tools_str, parallel_tool_calls, tool_choice);

   Napi::Object result = Napi::Object::New(env);
   result.Set("prompt", chatParams.prompt.get<std::string>());
@@ -598,7 +606,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto tool_choice = get_option<std::string>(options, "tool_choice", "none");

   auto chatParams = getFormattedChatWithJinja(
-      _sess
+      _sess,
       _templates,
       json_stringify(messages),
       chat_template,
@@ -685,6 +693,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   params.sampling.dry_base = get_option<float>(options, "dry_base", 2);
   params.sampling.dry_allowed_length = get_option<float>(options, "dry_allowed_length", -1);
   params.sampling.dry_penalty_last_n = get_option<float>(options, "dry_penalty_last_n", 0);
+  params.sampling.top_n_sigma = get_option<float>(options, "top_n_sigma", -1.0f);
   params.sampling.ignore_eos = get_option<bool>(options, "ignore_eos", false);
   params.n_keep = get_option<int32_t>(options, "n_keep", 0);
   params.sampling.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
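Note: the LlamaContext changes above wire two new options through to the native layer: a context-level `reasoning_format` (anything other than "deepseek" falls back to `COMMON_REASONING_FORMAT_NONE`, and the setting is forwarded to the chat templates as `extract_reasoning`) and a per-completion `top_n_sigma` sampling value that defaults to -1 (disabled). A hedged usage sketch in TypeScript, assuming the same `loadModel`/`completion` API shape as in the previous example; only the option keys come directly from the `get_option` calls in the diff.

```ts
// Hedged sketch: option keys mirror the get_option() reads above; API shape is assumed.
import { loadModel } from '@fugood/llama.node'

async function run(): Promise<void> {
  const context = await loadModel({
    model: './model.gguf',        // hypothetical local GGUF path
    reasoning_format: 'deepseek', // any other value behaves like 'none'
    n_ctx: 4096,
  })

  const result = await context.completion({
    messages: [{ role: 'user', content: 'Summarize the change log.' }],
    top_n_sigma: 2.0, // new sampling option; omit or pass -1 to keep it disabled
  })

  console.log(result.reasoning_content ?? '(no reasoning extracted)')
  console.log(result.content ?? result.text)
}

run().catch(console.error)
```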
package/src/llama.cpp/.github/workflows/build.yml
CHANGED
@@ -129,7 +129,7 @@ jobs:
        run: |
          sysctl -a
          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
          cmake -B build \
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
@@ -374,6 +374,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2.16
@@ -401,7 +403,35 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout
+          ctest -L main --verbose --timeout 2700
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
@@ -443,7 +473,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.
+    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04

    steps:
      - name: Clone
@@ -1345,8 +1375,10 @@ jobs:

    needs:
      - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
      - windows-latest-cmake
      - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
      - windows-latest-cmake-hip-release
      - macOS-latest-cmake-arm64
      - macOS-latest-cmake-x64
package/src/llama.cpp/common/CMakeLists.txt
CHANGED
@@ -96,6 +96,22 @@ if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
    set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
    set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+
+    # Set the correct library file extension based on platform
+    if (WIN32)
+        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
+        # Add Windows-specific libraries
+        set(LLGUIDANCE_PLATFORM_LIBS
+            ws2_32  # Windows Sockets API
+            userenv # For GetUserProfileDirectoryW
+            ntdll   # For NT functions
+            bcrypt  # For BCryptGenRandom
+        )
+    else()
+        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
+        set(LLGUIDANCE_PLATFORM_LIBS "")
+    endif()
+
    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
        # v0.6.12:
@@ -106,17 +122,18 @@ if (LLAMA_LLGUIDANCE)
        CONFIGURE_COMMAND ""
        BUILD_COMMAND cargo build --release
        INSTALL_COMMAND ""
-        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
        UPDATE_COMMAND ""
    )
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)

    add_library(llguidance STATIC IMPORTED)
-    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
    add_dependencies(llguidance llguidance_ext)

    target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-
+    # Add platform libraries to the main target
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
endif ()

target_include_directories(${TARGET} PUBLIC .)
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -365,6 +365,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
    print_options(specific_options);
}

+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
    std::vector<ggml_backend_dev_t> devices;
    auto dev_names = string_split<std::string>(value, ',');
@@ -426,6 +532,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
            }
            exit(0);
        }
+        if (ctx_arg.params.completion) {
+            common_params_print_completion(ctx_arg);
+            exit(0);
+        }
    } catch (const std::invalid_argument & ex) {
        fprintf(stderr, "%s\n", ex.what());
        ctx_arg.params = params_org;
@@ -494,6 +604,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
    add_opt(common_arg(
        {"--verbose-prompt"},
        string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +791,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ));
    add_opt(common_arg(
        {"--no-context-shift"},
-        string_format("disables context shift on
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
        [](common_params & params) {
            params.ctx_shift = false;
        }
@@ -946,6 +1063,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.min_p = std::stof(value);
        }
    ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -1445,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
        "- numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1975,6 +2099,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -2112,7 +2247,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
-        "Enable
+        "Enable prefix in log messages",
        [](common_params &) {
            common_log_set_prefix(common_log_main(), true);
        }
@@ -2324,5 +2459,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));

+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
    return ctx_arg;
}
package/src/llama.cpp/common/chat-template.hpp
CHANGED
@@ -249,16 +249,30 @@ class chat_template {
                inputs.add_generation_prompt = false;
                full = apply(inputs);
            }
-
-            if (
-
-
+            auto eos_pos_last = full.rfind(eos_token_);
+            if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                full = full.substr(0, eos_pos_last);
+            }
+            size_t common_prefix_length = 0;
+            for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                if (prefix[i] != full[i]) {
+                    break;
                }
+                if (prefix[i] == '<') {
+                    // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                    // but it removes thinking tags for past messages.
+                    // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                    continue;
+                }
+                common_prefix_length = i + 1;
            }
-
+            auto example = full.substr(common_prefix_length);
+            if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
                fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+            } else {
+                tool_call_example_ = example;
            }
-            tool_call_example_ = full.substr(prefix.size());
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
        if (polyfill_tools) {
            adjusted_messages = add_system(inputs.messages,
                "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
        } else {
            adjusted_messages = inputs.messages;
        }