@fugood/llama.node 0.3.12 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +13 -4
  21. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  22. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  23. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  24. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  25. package/src/llama.cpp/common/arg.cpp +180 -3
  26. package/src/llama.cpp/common/chat-template.hpp +21 -7
  27. package/src/llama.cpp/common/chat.cpp +220 -101
  28. package/src/llama.cpp/common/chat.hpp +3 -0
  29. package/src/llama.cpp/common/common.h +15 -7
  30. package/src/llama.cpp/common/llguidance.cpp +3 -3
  31. package/src/llama.cpp/common/log.cpp +1 -0
  32. package/src/llama.cpp/common/log.h +2 -1
  33. package/src/llama.cpp/common/minja.hpp +24 -9
  34. package/src/llama.cpp/common/sampling.cpp +52 -46
  35. package/src/llama.cpp/common/speculative.h +1 -1
  36. package/src/llama.cpp/docs/build.md +2 -2
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  39. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  40. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  41. package/src/llama.cpp/examples/run/run.cpp +5 -12
  42. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  43. package/src/llama.cpp/examples/server/httplib.h +381 -292
  44. package/src/llama.cpp/examples/server/server.cpp +58 -47
  45. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  46. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  47. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  48. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  49. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  50. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  51. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  52. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  56. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  57. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  58. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  59. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  60. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  61. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  62. package/src/llama.cpp/include/llama.h +14 -10
  63. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  64. package/src/llama.cpp/src/llama-grammar.h +1 -1
  65. package/src/llama.cpp/src/llama-impl.h +6 -6
  66. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  67. package/src/llama.cpp/src/llama-mmap.h +1 -0
  68. package/src/llama.cpp/src/llama-model.cpp +1 -1
  69. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  70. package/src/llama.cpp/src/llama.cpp +7 -5
  71. package/src/llama.cpp/src/unicode.cpp +9 -2
  72. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  73. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  74. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  75. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/lib/binding.ts CHANGED
@@ -8,6 +8,7 @@ export type ChatMessage = {
 export type LlamaModelOptions = {
   model: string
   chat_template?: string
+  reasoning_format?: string
   embedding?: boolean
   embd_normalize?: number
   pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
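
Note: the new `reasoning_format` field above is a plain optional string on `LlamaModelOptions`. A minimal usage sketch in TypeScript follows; the `loadModel` entry point and the model path are assumptions based on the package's README, not part of this diff:

// Hedged sketch: assumes `loadModel(options: LlamaModelOptions)` is the package's entry point.
import { loadModel } from '@fugood/llama.node'

const context = await loadModel({
  model: './DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf', // hypothetical local model path
  reasoning_format: 'deepseek', // new in 0.3.13; any other value falls back to 'none'
})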
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.3.12",
+  "version": "0.3.13",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -165,9 +165,17 @@ void LlamaCompletionWorker::OnOK() {
              Napi::String::New(env, _result.text.c_str()));
 
   Napi::Array tool_calls = Napi::Array::New(Napi::AsyncWorker::Env());
+  std::string * reasoning_content = nullptr;
+  std::string * content = nullptr;
   if (!_stop) {
     try {
       common_chat_msg message = common_chat_parse(_result.text, static_cast<common_chat_format>(_chat_format));
+      if (!message.reasoning_content.empty()) {
+        reasoning_content = &message.reasoning_content;
+      }
+      if (!message.content.empty()) {
+        content = &message.content;
+      }
       for (size_t i = 0; i < message.tool_calls.size(); i++) {
         const auto &tc = message.tool_calls[i];
         Napi::Object tool_call = Napi::Object::New(env);
@@ -188,6 +196,12 @@ void LlamaCompletionWorker::OnOK() {
     if (tool_calls.Length() > 0) {
       result.Set("tool_calls", tool_calls);
     }
+    if (reasoning_content) {
+      result.Set("reasoning_content", Napi::String::New(env, reasoning_content->c_str()));
+    }
+    if (content) {
+      result.Set("content", Napi::String::New(env, content->c_str()));
+    }
 
   auto ctx = _sess->context();
   const auto timings_token = llama_perf_context(ctx);
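
Note: with the change above, a non-streamed completion result can now carry `reasoning_content` (the extracted thought block) and `content` (the parsed answer) alongside `tool_calls`. A hedged TypeScript sketch of reading these fields; the `completion` method name and message shape are assumed from the package's README:

// Hedged sketch: assumes `context.completion(...)` resolves with the fields set in OnOK().
const result = await context.completion({
  messages: [{ role: 'user', content: 'Why is the sky blue?' }],
})
console.log(result.reasoning_content) // present only when common_chat_parse extracted reasoning
console.log(result.content)           // parsed message content, set only when non-empty
console.log(result.tool_calls)        // parsed tool calls, set only when non-empty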
package/src/LlamaContext.cpp CHANGED
@@ -185,6 +185,13 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 
   params.chat_template = get_option<std::string>(options, "chat_template", "");
 
+  std::string reasoning_format = get_option<std::string>(options, "reasoning_format", "none");
+  if (reasoning_format == "deepseek") {
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+  } else {
+    params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+  }
+
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
   params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
@@ -377,7 +384,7 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
 }
 
 common_chat_params getFormattedChatWithJinja(
-    const struct llama_model * model,
+    const std::shared_ptr<LlamaSession> &sess,
     const common_chat_templates &templates,
     const std::string &messages,
     const std::string &chat_template,
@@ -399,11 +406,12 @@ common_chat_params getFormattedChatWithJinja(
   if (!json_schema.empty()) {
     inputs.json_schema = json::parse(json_schema);
   }
+  inputs.extract_reasoning = sess->params().reasoning_format != COMMON_REASONING_FORMAT_NONE;
   inputs.stream = true;
 
   // If chat_template is provided, create new one and use it (probably slow)
   if (!chat_template.empty()) {
-    auto tmp = common_chat_templates_from_model(model, chat_template);
+    auto tmp = common_chat_templates_from_model(sess->model(), chat_template);
     const common_chat_template* template_ptr = useTools && tmp.template_tool_use ? tmp.template_tool_use.get() : tmp.template_default.get();
     if (inputs.parallel_tool_calls && !template_ptr->original_caps().supports_parallel_tool_calls) {
       inputs.parallel_tool_calls = false;
@@ -493,7 +501,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   auto parallel_tool_calls = get_option<bool>(params, "parallel_tool_calls", false);
   auto tool_choice = get_option<std::string>(params, "tool_choice", "");
 
-  auto chatParams = getFormattedChatWithJinja(_sess->model(), _templates, messages, chat_template, json_schema_str, tools_str, parallel_tool_calls, tool_choice);
+  auto chatParams = getFormattedChatWithJinja(_sess, _templates, messages, chat_template, json_schema_str, tools_str, parallel_tool_calls, tool_choice);
 
   Napi::Object result = Napi::Object::New(env);
   result.Set("prompt", chatParams.prompt.get<std::string>());
@@ -598,7 +606,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto tool_choice = get_option<std::string>(options, "tool_choice", "none");
 
   auto chatParams = getFormattedChatWithJinja(
-      _sess->model(),
+      _sess,
       _templates,
       json_stringify(messages),
       chat_template,
@@ -685,6 +693,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   params.sampling.dry_base = get_option<float>(options, "dry_base", 2);
   params.sampling.dry_allowed_length = get_option<float>(options, "dry_allowed_length", -1);
   params.sampling.dry_penalty_last_n = get_option<float>(options, "dry_penalty_last_n", 0);
+  params.sampling.top_n_sigma = get_option<float>(options, "top_n_sigma", -1.0f);
   params.sampling.ignore_eos = get_option<bool>(options, "ignore_eos", false);
   params.n_keep = get_option<int32_t>(options, "n_keep", 0);
   params.sampling.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
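
Note: the hunk above also plumbs the new `top_n_sigma` sampler option into the completion parameters, defaulting to -1.0 (disabled). A hedged TypeScript sketch of passing it per request; the `prompt` and `temperature` option names are assumed from the existing completion options rather than this diff:

// Hedged sketch: `top_n_sigma` maps to params.sampling.top_n_sigma via get_option above.
const out = await context.completion({
  prompt: 'Q: Name a prime number. A:',
  temperature: 0.8,
  top_n_sigma: 2.0, // new in 0.3.13; set to -1 to disable top-n-sigma sampling
})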
package/src/llama.cpp/.github/workflows/build.yml CHANGED
@@ -129,7 +129,7 @@ jobs:
       run: |
         sysctl -a
         # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-        # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+        # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
         cmake -B build \
           -DCMAKE_BUILD_RPATH="@loader_path" \
           -DLLAMA_FATAL_WARNINGS=ON \
@@ -374,6 +374,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: ccache
         uses: hendrikmuhs/ccache-action@v1.2.16
@@ -401,7 +403,35 @@ jobs:
         run: |
           cd build
           # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 1800
+          ctest -L main --verbose --timeout 2700
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip
+          name: llama-bin-ubuntu-vulkan-x64.zip
 
   ubuntu-22-cmake-hip:
     runs-on: ubuntu-22.04
@@ -443,7 +473,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+    container: mthreads/musa:rc3.1.1-devel-ubuntu22.04
 
     steps:
       - name: Clone
@@ -1345,8 +1375,10 @@ jobs:
 
     needs:
       - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
       - windows-latest-cmake
       - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64
package/src/llama.cpp/.github/workflows/docker.yml CHANGED
@@ -51,6 +51,8 @@ jobs:
 
       - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
+        with:
+          image: tonistiigi/binfmt:qemu-v7.0.0-28
 
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
package/src/llama.cpp/.github/workflows/labeler.yml CHANGED
@@ -11,7 +11,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
         with:
-          repository: "ggerganov/llama.cpp"
+          repository: "ggml-org/llama.cpp"
       - uses: actions/labeler@v5
         with:
           configuration-path: '.github/labeler.yml'
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -96,6 +96,22 @@ if (LLAMA_LLGUIDANCE)
     include(ExternalProject)
     set(LLGUIDANCE_SRC ${CMAKE_BINARY_DIR}/llguidance/source)
     set(LLGUIDANCE_PATH ${LLGUIDANCE_SRC}/target/release)
+
+    # Set the correct library file extension based on platform
+    if (WIN32)
+        set(LLGUIDANCE_LIB_NAME "llguidance.lib")
+        # Add Windows-specific libraries
+        set(LLGUIDANCE_PLATFORM_LIBS
+            ws2_32  # Windows Sockets API
+            userenv # For GetUserProfileDirectoryW
+            ntdll   # For NT functions
+            bcrypt  # For BCryptGenRandom
+        )
+    else()
+        set(LLGUIDANCE_LIB_NAME "libllguidance.a")
+        set(LLGUIDANCE_PLATFORM_LIBS "")
+    endif()
+
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
         # v0.6.12:
@@ -106,17 +122,18 @@ if (LLAMA_LLGUIDANCE)
         CONFIGURE_COMMAND ""
         BUILD_COMMAND cargo build --release
         INSTALL_COMMAND ""
-        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/libllguidance.a ${LLGUIDANCE_PATH}/llguidance.h
+        BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
     )
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_LLGUIDANCE)
 
     add_library(llguidance STATIC IMPORTED)
-    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/libllguidance.a)
+    set_target_properties(llguidance PROPERTIES IMPORTED_LOCATION ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME})
     add_dependencies(llguidance llguidance_ext)
 
     target_include_directories(${TARGET} PRIVATE ${LLGUIDANCE_PATH})
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance)
+    # Add platform libraries to the main target
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} llguidance ${LLGUIDANCE_PLATFORM_LIBS})
 endif ()
 
 target_include_directories(${TARGET} PUBLIC .)
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -365,6 +365,112 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }
 
+static void common_params_print_completion(common_params_context & ctx_arg) {
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
+
+    for (auto & opt : ctx_arg.options) {
+        if (opt.is_sparam) {
+            sparam_options.push_back(&opt);
+        } else if (opt.in_example(ctx_arg.ex)) {
+            specific_options.push_back(&opt);
+        } else {
+            common_options.push_back(&opt);
+        }
+    }
+
+    printf("_llama_completions() {\n");
+    printf("    local cur prev opts\n");
+    printf("    COMPREPLY=()\n");
+    printf("    cur=\"${COMP_WORDS[COMP_CWORD]}\"\n");
+    printf("    prev=\"${COMP_WORDS[COMP_CWORD-1]}\"\n\n");
+
+    printf("    opts=\"");
+    auto print_options = [](const std::vector<common_arg *> & options) {
+        for (const common_arg * opt : options) {
+            for (const char * arg : opt->args) {
+                printf("%s ", arg);
+            }
+        }
+    };
+
+    print_options(common_options);
+    print_options(sparam_options);
+    print_options(specific_options);
+    printf("\"\n\n");
+
+    printf("    case \"$prev\" in\n");
+    printf("        --model)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --grammar-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.gbnf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        --chat-template-file)\n");
+    printf("            COMPREPLY=( $(compgen -f -X '!*.jinja' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("        *)\n");
+    printf("            COMPREPLY=( $(compgen -W \"${opts}\" -- \"$cur\") )\n");
+    printf("            return 0\n");
+    printf("            ;;\n");
+    printf("    esac\n");
+    printf("}\n\n");
+
+    std::set<std::string> executables = {
+        "llama-batched",
+        "llama-batched-bench",
+        "llama-bench",
+        "llama-cli",
+        "llama-convert-llama2c-to-ggml",
+        "llama-cvector-generator",
+        "llama-embedding",
+        "llama-eval-callback",
+        "llama-export-lora",
+        "llama-gbnf-validator",
+        "llama-gen-docs",
+        "llama-gguf",
+        "llama-gguf-hash",
+        "llama-gguf-split",
+        "llama-gritlm",
+        "llama-imatrix",
+        "llama-infill",
+        "llama-llava-cli",
+        "llama-llava-clip-quantize-cli",
+        "llama-lookahead",
+        "llama-lookup",
+        "llama-lookup-create",
+        "llama-lookup-merge",
+        "llama-lookup-stats",
+        "llama-minicpmv-cli",
+        "llama-parallel",
+        "llama-passkey",
+        "llama-perplexity",
+        "llama-q8dot",
+        "llama-quantize",
+        "llama-quantize-stats",
+        "llama-qwen2vl-cli",
+        "llama-retrieval",
+        "llama-run",
+        "llama-save-load-state",
+        "llama-server",
+        "llama-simple",
+        "llama-simple-chat",
+        "llama-speculative",
+        "llama-speculative-simple",
+        "llama-tokenize",
+        "llama-tts",
+        "llama-vdot"
+    };
+
+    for (const auto& exe : executables) {
+        printf("complete -F _llama_completions %s\n", exe.c_str());
+    }
+}
+
 static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
     std::vector<ggml_backend_dev_t> devices;
     auto dev_names = string_split<std::string>(value, ',');
@@ -426,6 +532,10 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e
         }
         exit(0);
     }
+    if (ctx_arg.params.completion) {
+        common_params_print_completion(ctx_arg);
+        exit(0);
+    }
 } catch (const std::invalid_argument & ex) {
     fprintf(stderr, "%s\n", ex.what());
     ctx_arg.params = params_org;
@@ -494,6 +604,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--completion-bash"},
+        "print source-able bash completion script for llama.cpp",
+        [](common_params & params) {
+            params.completion = true;
+        }
+    ));
     add_opt(common_arg(
         {"--verbose-prompt"},
         string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"),
@@ -674,7 +791,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--no-context-shift"},
-        string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
+        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
         [](common_params & params) {
             params.ctx_shift = false;
         }
@@ -946,6 +1063,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.min_p = std::stof(value);
        }
    ).set_sparam());
+    add_opt(common_arg(
+        {"--top-nsigma"}, "N",
+        string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma),
+        [](common_params & params, const std::string & value) {
+            params.sampling.top_n_sigma = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}).set_sparam());
    add_opt(common_arg(
        {"--xtc-probability"}, "N",
        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
@@ -1445,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
        "- numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
        [](common_params & params, const std::string & value) {
            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -1975,6 +2099,17 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.use_jinja = true;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
+    add_opt(common_arg(
+        {"--reasoning-format"}, "FORMAT",
+        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
+        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
+        "only supported for non-streamed responses",
+        [](common_params & params, const std::string & value) {
+            /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+            else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+            else { std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
    add_opt(common_arg(
        {"--chat-template"}, "JINJA_TEMPLATE",
        string_format(
@@ -2112,7 +2247,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
-        "Enable prefx in log messages",
+        "Enable prefix in log messages",
        [](common_params &) {
            common_log_set_prefix(common_log_main(), true);
        }
@@ -2324,5 +2459,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));
 
+    add_opt(common_arg(
+        {"--embd-bge-small-en-default"},
+        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-e5-small-en-default"},
+        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--embd-gte-small-default"},
+        string_format("use default gte-small model (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
+
    return ctx_arg;
 }
package/src/llama.cpp/common/chat-template.hpp CHANGED
@@ -249,16 +249,30 @@ class chat_template {
                inputs.add_generation_prompt = false;
                full = apply(inputs);
            }
-
-            if (full.find(prefix) != 0) {
-                if (prefix.rfind(eos_token_) == prefix.size() - eos_token_.size()) {
-                    prefix = prefix.substr(0, prefix.size() - eos_token_.size());
+            auto eos_pos_last = full.rfind(eos_token_);
+            if (eos_pos_last == prefix.size() - eos_token_.size() ||
+                (full[full.size() - 1] == '\n' && (eos_pos_last == full.size() - eos_token_.size() - 1))) {
+                full = full.substr(0, eos_pos_last);
+            }
+            size_t common_prefix_length = 0;
+            for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+                if (prefix[i] != full[i]) {
+                    break;
                }
+                if (prefix[i] == '<') {
+                    // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+                    // but it removes thinking tags for past messages.
+                    // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+                    continue;
+                }
+                common_prefix_length = i + 1;
            }
-            if (full.find(prefix) != 0) {
+            auto example = full.substr(common_prefix_length);
+            if (example.find("tool_name") == std::string::npos && example.find("some_value") == std::string::npos) {
                fprintf(stderr, "Failed to infer a tool call example (possible template bug)\n");
+            } else {
+                tool_call_example_ = example;
            }
-            tool_call_example_ = full.substr(prefix.size());
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "Failed to generate tool call example: %s\n", e.what());
@@ -363,7 +377,7 @@ class chat_template {
        if (polyfill_tools) {
            adjusted_messages = add_system(inputs.messages,
                "You can call any of the following tools to satisfy the user's requests: " + minja::Value(inputs.tools).dump(2, /* to_json= */ true) +
-                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_));
+                (!polyfill_tool_call_example || tool_call_example_.empty() ? "" : "\n\nExample tool call syntax:\n\n" + tool_call_example_ + "\n\n"));
        } else {
            adjusted_messages = inputs.messages;
        }