@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0
package/lib/binding.ts CHANGED
@@ -65,6 +65,14 @@ export type LlamaModelOptions = {
  lora?: string
  lora_scaled?: number
  lora_list?: { path: string; scaled: number }[]
+ /**
+  * RoPE base frequency, use 0 to use model default (recommended)
+  */
+ rope_freq_base?: number
+ /**
+  * RoPE frequency scaling factor, use 0 to use model default (recommended)
+  */
+ rope_freq_scale?: number
  }

  export type CompletionResponseFormat = {
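For orientation, a minimal sketch of how the new RoPE options could be passed from JavaScript. The `loadModel` entry point and option plumbing are assumptions made for illustration from the `LlamaModelOptions` type above, not a verified excerpt of the package's public API:

    import { loadModel } from '@fugood/llama.node' // hypothetical entry point

    // Assumption: LlamaModelOptions is accepted directly by the loader.
    const context = await loadModel({
      model: './models/example-model.Q4_K_M.gguf',
      // Added in this release: 0 keeps the model's own RoPE configuration (recommended).
      rope_freq_base: 0,
      rope_freq_scale: 0,
    })

Both values default to 0 on the native side (see the LlamaContext.cpp change below), so omitting them preserves the previous behavior.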
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.4",
+ "version": "1.1.6",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.4",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.4",
- "@fugood/node-llama-linux-x64-cuda": "1.1.4",
- "@fugood/node-llama-linux-arm64": "1.1.4",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.4",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.4",
- "@fugood/node-llama-win32-x64": "1.1.4",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.4",
- "@fugood/node-llama-win32-x64-cuda": "1.1.4",
- "@fugood/node-llama-win32-arm64": "1.1.4",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.4",
- "@fugood/node-llama-darwin-x64": "1.1.4",
- "@fugood/node-llama-darwin-arm64": "1.1.4"
+ "@fugood/node-llama-linux-x64": "1.1.6",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.6",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.6",
+ "@fugood/node-llama-linux-arm64": "1.1.6",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.6",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.6",
+ "@fugood/node-llama-win32-x64": "1.1.6",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.6",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.6",
+ "@fugood/node-llama-win32-arm64": "1.1.6",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.6",
+ "@fugood/node-llama-darwin-x64": "1.1.6",
+ "@fugood/node-llama-darwin-arm64": "1.1.6"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 114dbfcc..6771bd43 100644
+ index 60805ab3..71b4236a 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -12,13 +12,15 @@ index 114dbfcc..6771bd43 100644
  #include <cstdio>
  #include <exception>
  #include <iostream>
- @@ -123,14 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
+ @@ -123,16 +120,6 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
  return diffs;
  }

  -typedef minja::chat_template common_chat_template;
  -
  -struct common_chat_templates {
+ - bool add_bos;
+ - bool add_eos;
  - bool has_explicit_template; // Model had builtin template or template overridde was specified.
  - std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  - std::unique_ptr<common_chat_template> template_tool_use;
@@ -27,21 +29,23 @@ index 114dbfcc..6771bd43 100644
  struct templates_params {
  json messages;
  json tools;
- diff --git a/common/chat.h b/common/chat.h
- index ca807c14..56649863 100644
+ diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
+ index b014f9f0..3a868797 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
- @@ -9,7 +9,16 @@
+ @@ -9,7 +9,18 @@
  #include <vector>
  #include <map>

  -struct common_chat_templates;
- +#include <minja/chat-template.hpp>
- +#include <minja/minja.hpp>
+ +#include "minja/chat-template.hpp"
+ +#include "minja/minja.hpp"
  +
  +typedef minja::chat_template common_chat_template;
  +
  +struct common_chat_templates {
+ + bool add_bos;
+ + bool add_eos;
  + bool has_explicit_template; // Model had builtin template or template overridde was specified.
  + std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  + std::unique_ptr<common_chat_template> template_tool_use;
@@ -50,10 +54,10 @@ index ca807c14..56649863 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index e4e71ad1..091ddda4 100644
+ index c6962d1d..ba5a4786 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1101,6 +1101,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1116,6 +1116,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -62,10 +66,10 @@ index e4e71ad1..091ddda4 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 8922090e..3c2d1a6a 100644
+ index 6c1c7ee2..c3eb0552 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -224,6 +224,7 @@ enum common_reasoning_format {
+ @@ -242,6 +242,7 @@ enum common_reasoning_format {
  };

  struct common_params {
@@ -74,10 +78,10 @@ index 8922090e..3c2d1a6a 100644
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 671fad4d..93fc3cd7 100644
+ index f188d163..0c33acad 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- @@ -104,7 +104,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
+ @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  )

  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -320,6 +320,8 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
  } else if (_reasoning_format == "deepseek-legacy") {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+ } else if (_reasoning_format == "auto") {
+ chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
  } else {
  chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_NONE;
  }
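A hedged sketch of how the new `auto` value might surface in JavaScript; the `completion` method name and result fields are assumptions made for illustration from the worker code above, which maps a `reasoning_format` string onto the native chat syntax:

    // Assumption: completion params accept a reasoning_format string,
    // mirroring the values handled in LlamaCompletionWorker::OnOK().
    const result = await context.completion({
      messages: [{ role: 'user', content: 'Explain RoPE scaling briefly.' }],
      reasoning_format: 'auto', // 'auto' | 'deepseek' | 'deepseek-legacy' | 'none'
    })

    // With 'auto' or 'deepseek', extracted thoughts are expected in a separate
    // reasoning field rather than inline in the content (field name assumed).
    console.log(result.content)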
package/src/LlamaContext.cpp CHANGED
@@ -250,6 +250,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.kv_unified = get_option<bool>(options, "kv_unified", false);
  params.swa_full = get_option<bool>(options, "swa_full", false);

+ params.rope_freq_base = get_option<float>(options, "rope_freq_base", 0.0f);
+ params.rope_freq_scale = get_option<float>(options, "rope_freq_scale", 0.0f);
+
  params.use_mlock = get_option<bool>(options, "use_mlock", false);
  params.use_mmap = get_option<bool>(options, "use_mmap", true);
  params.numa =
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -24,6 +24,7 @@
  #include <cstdarg>
  #include <filesystem>
  #include <fstream>
+ #include <list>
  #include <regex>
  #include <set>
  #include <string>
@@ -977,6 +978,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
  for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
  string_process_escapes(seq_breaker);
  }
+ for (auto & pair : params.speculative.replacements) {
+ string_process_escapes(pair.first);
+ string_process_escapes(pair.second);
+ }
  }

  if (!params.kv_overrides.empty()) {
@@ -2091,6 +2096,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.no_kv_offload = true;
  }
  ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ add_opt(common_arg(
+ {"-nr", "--no-repack"},
+ "disable weight repacking",
+ [](common_params & params) {
+ params.no_extra_bufts = true;
+ }
+ ).set_env("LLAMA_ARG_NO_REPACK"));
  add_opt(common_arg(
  {"-ctk", "--cache-type-k"}, "TYPE",
  string_format(
@@ -2364,11 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  throw std::invalid_argument("unknown buffer type");
  }
- // FIXME: this leaks memory
- params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(tensor_name);
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
  }
  }
  ));
+ add_opt(common_arg(
+ {"--cpu-moe", "-cmoe"},
+ "keep all Mixture of Experts (MoE) weights in the CPU",
+ [](common_params & params) {
+ params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
+ }
+ ).set_env("LLAMA_ARG_CPU_MOE"));
+ add_opt(common_arg(
+ {"--n-cpu-moe", "-ncmoe"}, "N",
+ "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+ [](common_params & params, int value) {
+ if (value < 0) {
+ throw std::invalid_argument("invalid value");
+ }
+ for (int i = 0; i < value; ++i) {
+ // keep strings alive and avoid leaking memory by storing them in a static vector
+ static std::list<std::string> buft_overrides;
+ buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+ params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+ }
+ }
+ ).set_env("LLAMA_ARG_N_CPU_MOE"));
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
  "number of layers to store in VRAM",
@@ -2627,6 +2663,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.n_out_freq = value;
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+ add_opt(common_arg(
+ {"--output-format"}, "{gguf,dat}",
+ string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
+ [](common_params & params, const std::string & value) {
+ /**/ if (value == "gguf") { params.imat_dat = -1; }
+ else if (value == "dat") { params.imat_dat = 1; }
+ else { throw std::invalid_argument("invalid output format"); }
+ }
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
  add_opt(common_arg(
  {"--save-frequency"}, "N",
  string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
@@ -2902,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
  "- none: leaves thoughts unparsed in `message.content`\n"
  "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
- "(default: deepseek)",
+ "(default: auto)",
  [](common_params & params, const std::string & value) {
  /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
  else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
  else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+ else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
  else { throw std::invalid_argument("invalid value"); }
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
@@ -3249,6 +3295,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.speculative.model.path = value;
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+ add_opt(common_arg(
+ {"--spec-replace"}, "TARGET", "DRAFT",
+ "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
+ [](common_params & params, const std::string & tgt, const std::string & dft) {
+ params.speculative.replacements.push_back({ tgt, dft });
+ }
+ ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
  add_opt(common_arg(
  {"-ctkd", "--cache-type-k-draft"}, "TYPE",
  string_format(
@@ -3438,12 +3491,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_SERVER}));

- // diffusion parameters
  add_opt(common_arg(
  { "--diffusion-steps" }, "N",
  string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
  [](common_params & params, int value) { params.diffusion.steps = value; }
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-visual" },
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+ params.diffusion.visual_mode ? "true" : "false"),
+ [](common_params & params) { params.diffusion.visual_mode = true; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
  add_opt(common_arg(
  { "--diffusion-eps" }, "F",
  string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
@@ -3451,21 +3510,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
  add_opt(common_arg(
  { "--diffusion-algorithm" }, "N",
- string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+ string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)",
  params.diffusion.algorithm),
  [](common_params & params, int value) { params.diffusion.algorithm = value; }
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
  add_opt(common_arg(
  { "--diffusion-alg-temp" }, "F",
- string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+ string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
  [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
+ add_opt(common_arg(
+ { "--diffusion-block-length" }, "N",
+ string_format("llada block length for generation (default: %d)", params.diffusion.block_length),
+ [](common_params & params, int value) { params.diffusion.block_length = value; }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
  add_opt(common_arg(
- { "--diffusion-visual" },
- string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
- params.diffusion.visual_mode ? "true" : "false"),
- [](common_params & params) { params.diffusion.visual_mode = true; }
+ { "--diffusion-cfg-scale" }, "F",
+ string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale),
+ [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); }
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+ add_opt(common_arg(
+ { "--diffusion-add-gumbel-noise" }, "F",
+ string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
+ [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
  ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));

+
  return ctx_arg;
  }
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -132,6 +132,8 @@ struct templates_params {
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  json extra_context;
+ bool add_bos;
+ bool add_eos;
  };

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -434,6 +436,8 @@ std::string common_chat_format_single(

  common_chat_templates_inputs inputs;
  inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;

  std::string fmt_past_msg;
  if (!past_msg.empty()) {
@@ -458,6 +462,8 @@ std::string common_chat_format_single(
  std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
  common_chat_templates_inputs inputs;
  inputs.use_jinja = use_jinja;
+ inputs.add_bos = tmpls->add_bos;
+ inputs.add_eos = tmpls->add_eos;
  auto add_simple_msg = [&](auto role, auto content) {
  common_chat_msg msg;
  msg.role = role;
@@ -535,6 +541,8 @@ common_chat_templates_ptr common_chat_templates_init(
  }
  std::string token_bos = bos_token_override;
  std::string token_eos = eos_token_override;
+ bool add_bos = false;
+ bool add_eos = false;
  if (model) {
  const auto * vocab = llama_model_get_vocab(model);
  const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +557,13 @@ common_chat_templates_ptr common_chat_templates_init(
  };
  token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
  token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+ add_bos = llama_vocab_get_add_bos(vocab);
+ add_eos = llama_vocab_get_add_eos(vocab);
  }
  common_chat_templates_ptr tmpls(new common_chat_templates());
  tmpls->has_explicit_template = has_explicit_template;
+ tmpls->add_bos = add_bos;
+ tmpls->add_eos = add_eos;
  try {
  tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
  } catch (const std::exception & e) {
@@ -581,6 +593,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+ case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
  default:
  throw std::runtime_error("Unknown chat format");
  }
@@ -589,6 +602,7 @@ const char * common_chat_format_name(common_chat_format format) {
  const char * common_reasoning_format_name(common_reasoning_format format) {
  switch (format) {
  case COMMON_REASONING_FORMAT_NONE: return "none";
+ case COMMON_REASONING_FORMAT_AUTO: return "auto";
  case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
  case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
  default:
@@ -737,10 +751,10 @@ static std::string apply(
  // instead of using `chat_template_options.use_bos_token = false`, since these tokens
  // may be needed inside the template / between messages too.
  auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
- if (string_starts_with(result, tmpl.bos_token())) {
+ if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
  result = result.substr(tmpl.bos_token().size());
  }
- if (string_ends_with(result, tmpl.eos_token())) {
+ if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
  result = result.substr(0, result.size() - tmpl.eos_token().size());
  }
  return result;
@@ -1278,6 +1292,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
  tool_calls_end);
  }

+ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+ auto prompt = apply(tmpl, inputs);
+
+ data.prompt = prompt;
+ data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+ // TODO: support tool calls in GPT-OSS?
+
+ return data;
+ }
+ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+ // TODO @ngxson : this won't work with --special enabled, we should fix that
+ builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+ }
+
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
  LOG_DBG("%s\n", __func__);
  common_chat_params data;
@@ -1635,7 +1669,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
  "|<function name=\"([^\"]+)\">" // match 5 (function name again)
  );

- if (auto res = builder.try_find_regex(open_regex)) {
+ while (auto res = builder.try_find_regex(open_regex)) {
  const auto & block_start = res->groups[1];
  std::string block_end = block_start.empty() ? "" : "```";

@@ -1657,7 +1691,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
  builder.consume_literal(block_end);
  builder.consume_spaces();
  }
- builder.add_content(builder.consume_rest());
  } else {
  throw common_chat_msg_partial_exception("failed to parse tool call");
  }
@@ -1682,11 +1715,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
  builder.consume_spaces();
  }
  }
- builder.add_content(builder.consume_rest());
  }
- } else {
- builder.add_content(builder.consume_rest());
  }
+
+ builder.add_content(builder.consume_rest());
  }

  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
@@ -1722,6 +1754,8 @@ static common_chat_params common_chat_templates_apply_jinja(
  params.enable_thinking = inputs.enable_thinking;
  params.grammar = inputs.grammar;
  params.now = inputs.now;
+ params.add_bos = inputs.add_bos;
+ params.add_eos = inputs.add_eos;

  params.extra_context = json::object();
  for (auto el : inputs.chat_template_kwargs) {
@@ -1763,6 +1797,11 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_hermes_2_pro(tmpl, params);
  }

+ // GPT-OSS
+ if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+ return common_chat_params_init_gpt_oss(tmpl, params);
+ }
+
  // Use generic handler when mixing tools + JSON schema.
  // TODO: support that mix in handlers below.
  if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1914,6 +1953,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_COMMAND_R7B:
  common_chat_parse_command_r7b(builder);
  break;
+ case COMMON_CHAT_FORMAT_GPT_OSS:
+ common_chat_parse_gpt_oss(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
@@ -1933,6 +1975,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
  }
  }
  auto msg = builder.result();
- LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ if (!is_partial) {
+ LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+ }
  return msg;
  }
package/src/llama.cpp/common/chat.h CHANGED
@@ -9,12 +9,14 @@
  #include <vector>
  #include <map>

- #include <minja/chat-template.hpp>
- #include <minja/minja.hpp>
+ #include "minja/chat-template.hpp"
+ #include "minja/minja.hpp"

  typedef minja::chat_template common_chat_template;

  struct common_chat_templates {
+ bool add_bos;
+ bool add_eos;
  bool has_explicit_template; // Model had builtin template or template overridde was specified.
  std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
  std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
  COMMON_CHAT_FORMAT_COMMAND_R7B,
+ COMMON_CHAT_FORMAT_GPT_OSS,

  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
  };
@@ -136,6 +139,8 @@ struct common_chat_templates_inputs {
  bool enable_thinking = true;
  std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
  std::map<std::string, std::string> chat_template_kwargs;
+ bool add_bos = false;
+ bool add_eos = false;
  };

  struct common_chat_params {
package/src/llama.cpp/common/common.cpp CHANGED
@@ -1123,6 +1123,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.use_mmap = params.use_mmap;
  mparams.use_mlock = params.use_mlock;
  mparams.check_tensors = params.check_tensors;
+ mparams.use_extra_bufts = !params.no_extra_bufts;

  if (params.kv_overrides.empty()) {
  mparams.kv_overrides = NULL;
package/src/llama.cpp/common/common.h CHANGED
@@ -201,6 +201,7 @@ struct common_params_speculative {
  int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
  float p_split = 0.1f; // speculative decoding split probability
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
+ std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements

  ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
  ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -220,15 +221,22 @@ struct common_params_vocoder {
  };

  struct common_params_diffusion {
- int32_t steps = 64; // number of diffusion steps
- float eps = 1e-3f; // epsilon for timesteps
- int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
- float alg_temp = 0.0f; // algorithm temperature
- bool visual_mode = false; // show progressive diffusion on screen
+ int32_t steps = 128;
+ bool visual_mode = false;
+
+ float eps = 0; // epsilon for timesteps
+ int32_t block_length = 0; // block length for generation
+
+ int32_t algorithm = 4; // default algorithm: low-confidence
+ float alg_temp = 0.0f; // algorithm temperature
+
+ float cfg_scale = 0; // classifier-free guidance scale
+ bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
  };

  enum common_reasoning_format {
  COMMON_REASONING_FORMAT_NONE,
+ COMMON_REASONING_FORMAT_AUTO,
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
  COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
  };
@@ -353,6 +361,7 @@ struct common_params {
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data
  bool no_op_offload = false; // globally disable offload host tensor operations to device
+ bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)

  bool single_turn = false; // single turn chat conversation

@@ -387,7 +396,7 @@ struct common_params {
  std::string chat_template = ""; // NOLINT
  bool use_jinja = false; // NOLINT
  bool enable_chat_template = true;
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
  int reasoning_budget = -1;
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

@@ -432,6 +441,7 @@ struct common_params {
  int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
  int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
  int32_t i_chunk = 0; // start processing from this chunk
+ int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

  bool process_output = false; // collect data for the output tensor
  bool compute_ppl = true; // whether to compute perplexity