@fugood/llama.node 1.4.11 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +31 -31
  3. package/src/llama.cpp/common/arg.cpp +128 -59
  4. package/src/llama.cpp/common/arg.h +1 -0
  5. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  6. package/src/llama.cpp/common/chat.cpp +36 -7
  7. package/src/llama.cpp/common/chat.h +1 -0
  8. package/src/llama.cpp/common/common.cpp +42 -23
  9. package/src/llama.cpp/common/common.h +11 -1
  10. package/src/llama.cpp/common/llguidance.cpp +10 -6
  11. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  12. package/src/llama.cpp/common/sampling.cpp +58 -14
  13. package/src/llama.cpp/common/sampling.h +3 -1
  14. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  15. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  16. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  17. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  18. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  20. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  21. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  23. package/src/llama.cpp/include/llama.h +100 -12
  24. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  26. package/src/llama.cpp/src/llama-adapter.h +7 -1
  27. package/src/llama.cpp/src/llama-arch.cpp +78 -0
  28. package/src/llama.cpp/src/llama-arch.h +8 -0
  29. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  30. package/src/llama.cpp/src/llama-chat.h +1 -0
  31. package/src/llama.cpp/src/llama-context.cpp +637 -49
  32. package/src/llama.cpp/src/llama-context.h +43 -1
  33. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  34. package/src/llama.cpp/src/llama-grammar.h +2 -0
  35. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  36. package/src/llama.cpp/src/llama-graph.h +71 -6
  37. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  38. package/src/llama.cpp/src/llama-hparams.h +12 -5
  39. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  40. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  41. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  42. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  43. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  44. package/src/llama.cpp/src/llama-model.cpp +337 -26
  45. package/src/llama.cpp/src/llama-model.h +13 -2
  46. package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
  47. package/src/llama.cpp/src/llama-sampling.h +19 -7
  48. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  49. package/src/llama.cpp/src/llama-vocab.h +2 -0
  50. package/src/llama.cpp/src/llama.cpp +87 -64
  51. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  52. package/src/llama.cpp/src/models/bert.cpp +4 -2
  53. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  54. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  55. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  56. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  57. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  58. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  59. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  60. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  61. package/src/llama.cpp/src/models/llama.cpp +19 -6
  62. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  63. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  64. package/src/llama.cpp/src/models/models.h +18 -0
  65. package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
  66. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  67. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  68. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
  69. package/src/llama.cpp/src/unicode.cpp +23 -14
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@fugood/llama.node",
3
3
  "access": "public",
4
- "version": "1.4.11",
4
+ "version": "1.4.13",
5
5
  "description": "An another Node binding of llama.cpp",
6
6
  "main": "lib/index.js",
7
7
  "scripts": {
@@ -72,20 +72,20 @@
72
72
  "CMakeLists.txt"
73
73
  ],
74
74
  "optionalDependencies": {
75
- "@fugood/node-llama-darwin-arm64": "1.4.11",
76
- "@fugood/node-llama-darwin-x64": "1.4.11",
77
- "@fugood/node-llama-linux-arm64": "1.4.11",
78
- "@fugood/node-llama-linux-arm64-cuda": "1.4.11",
79
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.11",
80
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.11",
81
- "@fugood/node-llama-linux-x64": "1.4.11",
82
- "@fugood/node-llama-linux-x64-cuda": "1.4.11",
83
- "@fugood/node-llama-linux-x64-vulkan": "1.4.11",
84
- "@fugood/node-llama-win32-arm64": "1.4.11",
85
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.11",
86
- "@fugood/node-llama-win32-x64": "1.4.11",
87
- "@fugood/node-llama-win32-x64-cuda": "1.4.11",
88
- "@fugood/node-llama-win32-x64-vulkan": "1.4.11"
75
+ "@fugood/node-llama-darwin-arm64": "1.4.13",
76
+ "@fugood/node-llama-darwin-x64": "1.4.13",
77
+ "@fugood/node-llama-linux-arm64": "1.4.13",
78
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.13",
79
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.13",
80
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.13",
81
+ "@fugood/node-llama-linux-x64": "1.4.13",
82
+ "@fugood/node-llama-linux-x64-cuda": "1.4.13",
83
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.13",
84
+ "@fugood/node-llama-win32-arm64": "1.4.13",
85
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.13",
86
+ "@fugood/node-llama-win32-x64": "1.4.13",
87
+ "@fugood/node-llama-win32-x64-cuda": "1.4.13",
88
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.13"
89
89
  },
90
90
  "devDependencies": {
91
91
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
32
32
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
33
33
  int count = 0;
34
34
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
35
- index 0a426f447..ab02be247 100644
35
+ index 22e527bab..c3d0affca 100644
36
36
  --- a/src/llama.cpp/common/chat.cpp
37
37
  +++ b/src/llama.cpp/common/chat.cpp
38
38
  @@ -7,9 +7,6 @@
@@ -62,7 +62,7 @@ index 0a426f447..ab02be247 100644
62
62
  struct templates_params {
63
63
  json messages;
64
64
  json tools;
65
- @@ -751,7 +738,7 @@ static std::string apply(
65
+ @@ -752,7 +739,7 @@ static std::string apply(
66
66
  tmpl_inputs.extra_context.merge_patch(*additional_context);
67
67
  }
68
68
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +72,7 @@ index 0a426f447..ab02be247 100644
72
72
  minja::chat_template_options tmpl_opts;
73
73
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
74
74
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
75
- index 6085510a4..263076ce2 100644
75
+ index 8bd4a325f..333b3301f 100644
76
76
  --- a/src/llama.cpp/common/chat.h
77
77
  +++ b/src/llama.cpp/common/chat.h
78
78
  @@ -10,7 +10,18 @@
@@ -96,22 +96,22 @@ index 6085510a4..263076ce2 100644
96
96
  struct common_chat_tool_call {
97
97
  std::string name;
98
98
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
99
- index d4e8c7405..af3dec813 100644
99
+ index 41b2b6833..fe9ba05aa 100644
100
100
  --- a/src/llama.cpp/common/common.cpp
101
101
  +++ b/src/llama.cpp/common/common.cpp
102
- @@ -1343,6 +1343,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
103
- mparams.n_gpu_layers = params.n_gpu_layers;
102
+ @@ -1361,6 +1361,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
103
+ mparams.devices = params.devices.data();
104
104
  }
105
105
 
106
106
  + mparams.vocab_only = params.vocab_only;
107
+ mparams.n_gpu_layers = params.n_gpu_layers;
107
108
  mparams.main_gpu = params.main_gpu;
108
109
  mparams.split_mode = params.split_mode;
109
- mparams.tensor_split = params.tensor_split;
110
110
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
111
- index 334372073..e912b593a 100644
111
+ index d6fd0d37a..477209ce5 100644
112
112
  --- a/src/llama.cpp/common/common.h
113
113
  +++ b/src/llama.cpp/common/common.h
114
- @@ -307,6 +307,7 @@ struct lr_opt {
114
+ @@ -310,6 +310,7 @@ struct lr_opt {
115
115
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
116
116
 
117
117
  struct common_params {
@@ -120,7 +120,7 @@ index 334372073..e912b593a 100644
120
120
  int32_t n_ctx = 0; // context size, 0 == context the model was trained with
121
121
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
122
122
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
123
- index 28fb7612e..63f7e1ca1 100644
123
+ index 7622d0bf4..d2edcfddb 100644
124
124
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
125
125
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
126
126
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -133,14 +133,13 @@ index 28fb7612e..63f7e1ca1 100644
133
133
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
134
134
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
135
135
  diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
136
- index 6a00abacc..9e12459b6 100644
136
+ index 365a24b49..83bf4ee62 100644
137
137
  --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
138
138
  +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
139
- @@ -3226,11 +3226,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
139
+ @@ -2798,9 +2798,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
140
140
  GGML_UNUSED(dev);
141
141
  }
142
142
 
143
- +
144
143
  +// ~2GB per session for now
145
144
  +#define GGML_HEXAGON_SESSION_MEMORY_DEFAULT (2ULL * 1024 * 1024 * 1024)
146
145
  +// Max to 3.5GB
@@ -149,7 +148,6 @@ index 6a00abacc..9e12459b6 100644
149
148
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
150
149
  - // ~2GB per session for now
151
150
  - *free = 2ULL * 1024 * 1024 * 1024;
152
- - *total = *free;
153
151
  + const char * str_mem = getenv("GGML_HEXAGON_SESSION_MEMORY");
154
152
  + if (str_mem) {
155
153
  + *free = std::stoull(str_mem);
@@ -161,32 +159,34 @@ index 6a00abacc..9e12459b6 100644
161
159
  + } else {
162
160
  + *free = GGML_HEXAGON_SESSION_MEMORY_DEFAULT;
163
161
  + }
162
+ +
163
+ *total = *free;
164
164
 
165
- + *total = *free;
166
165
  GGML_UNUSED(dev);
167
- }
168
-
169
- @@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
166
+ @@ -3010,10 +3025,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
170
167
  }
171
168
  }
172
169
 
173
- +#if defined(__ANDROID__)
174
- if(opt_arch < 75) {
175
- opt_ndev = 1;
170
+ - if (opt_arch < 75) {
171
+ - opt_ndev = 1;
176
172
  - GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
177
- + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
178
- + }
179
- +#else
180
- + if(opt_arch < 73) {
181
- + opt_ndev = 1;
182
- + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
183
- }
184
- +#endif
173
+ - }
174
+ + #if defined(__ANDROID__)
175
+ + if(opt_arch < 75) {
176
+ + opt_ndev = 1;
177
+ + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
178
+ + }
179
+ + #else
180
+ + if(opt_arch < 73) {
181
+ + opt_ndev = 1;
182
+ + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
183
+ + }
184
+ + #endif
185
185
 
186
186
  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);
187
187
 
188
- @@ -3429,6 +3451,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
189
- } catch (std::exception const &exc) {
188
+ @@ -3026,6 +3048,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
189
+ } catch (const std::exception & exc) {
190
190
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
191
191
  devices[i].context = nullptr;
192
192
  + opt_ndev = i;
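The patch hunks above change how the Hexagon backend reports per-session memory: instead of a hard-coded ~2 GB, the value can now be overridden with a GGML_HEXAGON_SESSION_MEMORY environment variable, with a cap noted in the comment as 3.5 GB. The same patch also makes the ndev=1 fallback platform-dependent (below v75 on Android, below v73 on Linux and Windows) and lowers opt_ndev when a device/session fails to initialize. A minimal standalone sketch of the memory behaviour follows; the helper function, the exact cap constant, and the clamping are assumptions for illustration, not the patched code itself:

    #include <cstdint>
    #include <cstdlib>
    #include <string>

    // ~2GB default, as in the patch; 3.5GB cap inferred from the "Max to 3.5GB" comment.
    static constexpr uint64_t kHexSessionMemDefault = 2ULL * 1024 * 1024 * 1024;
    static constexpr uint64_t kHexSessionMemMax     = 3584ULL * 1024 * 1024;

    // Hypothetical helper: session memory to report, honouring the env override.
    static uint64_t hexagon_session_memory() {
        if (const char * s = std::getenv("GGML_HEXAGON_SESSION_MEMORY")) {
            const uint64_t v = std::stoull(s);
            return v > kHexSessionMemMax ? kHexSessionMemMax : v;
        }
        return kHexSessionMemDefault;
    }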
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -679,7 +679,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
679
679
  "llama-quantize",
680
680
  "llama-qwen2vl-cli",
681
681
  "llama-retrieval",
682
- "llama-run",
683
682
  "llama-save-load-state",
684
683
  "llama-server",
685
684
  "llama-simple",
@@ -854,6 +853,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
854
853
  return value == "auto" || value == "-1";
855
854
  }
856
855
 
856
+ // Simple CSV parser that handles quoted fields and escaped quotes
857
+ // example:
858
+ // input: value1,"value, with, commas","value with ""escaped"" quotes",value4
859
+ // output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
860
+ static std::vector<std::string> parse_csv_row(const std::string& input) {
861
+ std::vector<std::string> fields;
862
+ std::string field;
863
+ bool in_quotes = false;
864
+
865
+ for (size_t i = 0; i < input.length(); ++i) {
866
+ char ch = input[i];
867
+
868
+ if (ch == '"') {
869
+ if (!in_quotes) {
870
+ // start of quoted field (only valid if at beginning of field)
871
+ if (!field.empty()) {
872
+ // quote appeared in middle of unquoted field, treat as literal
873
+ field += '"';
874
+ } else {
875
+ in_quotes = true; // start
876
+ }
877
+ } else {
878
+ if (i + 1 < input.length() && input[i + 1] == '"') {
879
+ // escaped quote: ""
880
+ field += '"';
881
+ ++i; // skip the next quote
882
+ } else {
883
+ in_quotes = false; // end
884
+ }
885
+ }
886
+ } else if (ch == ',') {
887
+ if (in_quotes) {
888
+ field += ',';
889
+ } else {
890
+ fields.push_back(std::move(field));
891
+ field.clear();
892
+ }
893
+ } else {
894
+ field += ch;
895
+ }
896
+ }
897
+
898
+ // Add the last field
899
+ fields.push_back(std::move(field));
900
+
901
+ return fields;
902
+ }
903
+
857
904
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
858
905
  // per-example default params
859
906
  // we define here to make sure it's included in llama-gen-docs
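The comment in the new parse_csv_row helper already documents the quoting rules (quoted fields may contain commas, and a doubled quote inside a quoted field is a literal quote). As a quick self-check, here is a hedged, standalone re-implementation of those same rules purely for illustration; it is not the parse_csv_row code itself:

    #include <cassert>
    #include <string>
    #include <vector>

    // Split one CSV row: commas separate fields unless inside double quotes,
    // and "" inside a quoted field decodes to a single literal quote.
    static std::vector<std::string> split_csv(const std::string & s) {
        std::vector<std::string> out;
        std::string cur;
        bool quoted = false;
        for (size_t i = 0; i < s.size(); ++i) {
            const char c = s[i];
            if (c == '"') {
                if (quoted && i + 1 < s.size() && s[i + 1] == '"') { cur += '"'; ++i; } // escaped quote
                else if (quoted)      { quoted = false; }   // closing quote
                else if (cur.empty()) { quoted = true;  }   // opening quote
                else                  { cur += '"';     }   // stray quote mid-field, keep literally
            } else if (c == ',' && !quoted) {
                out.push_back(cur);
                cur.clear();
            } else {
                cur += c;
            }
        }
        out.push_back(cur);
        return out;
    }

    int main() {
        const auto f = split_csv("value1,\"value, with, commas\",\"value with \"\"escaped\"\" quotes\",value4");
        assert(f.size() == 4);
        assert(f[1] == "value, with, commas");
        assert(f[2] == "value with \"escaped\" quotes");
    }

In the hunks that follow, this helper replaces the plain string_split on ',' for --in-file, --image, --lora, --lora-scaled, --control-vector(-scaled), --override-kv, --context-file, and --api-key.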
@@ -1250,7 +1297,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1250
1297
  {"--in-file"}, "FNAME",
1251
1298
  "an input file (use comma-separated values to specify multiple files)",
1252
1299
  [](common_params & params, const std::string & value) {
1253
- for (const auto & item : string_split<std::string>(value, ',')) {
1300
+ for (const auto & item : parse_csv_row(value)) {
1254
1301
  std::ifstream file(item);
1255
1302
  if (!file) {
1256
1303
  throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1397,7 +1444,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1397
1444
  [](common_params & params, bool value) {
1398
1445
  params.warmup = value;
1399
1446
  }
1400
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
1447
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
1401
1448
  add_opt(common_arg(
1402
1449
  {"--spm-infill"},
1403
1450
  string_format(
@@ -1695,6 +1742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1695
1742
  params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
1696
1743
  }
1697
1744
  ).set_sparam());
1745
+ add_opt(common_arg(
1746
+ {"-bs", "--backend-sampling"},
1747
+ "enable backend sampling (experimental) (default: disabled)",
1748
+ [](common_params & params) {
1749
+ params.sampling.backend_sampling = true;
1750
+ }
1751
+ ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
1698
1752
  add_opt(common_arg(
1699
1753
  {"--pooling"}, "{none,mean,cls,last,rank}",
1700
1754
  "pooling type for embeddings, use model default if unspecified",
@@ -1706,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1706
1760
  else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
1707
1761
  else { throw std::invalid_argument("invalid value"); }
1708
1762
  }
1709
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
1763
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
1710
1764
  add_opt(common_arg(
1711
1765
  {"--attention"}, "{causal,non-causal}",
1712
1766
  "attention type for embeddings, use model default if unspecified",
@@ -1995,7 +2049,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1995
2049
  {"--image", "--audio"}, "FILE",
1996
2050
  "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
1997
2051
  [](common_params & params, const std::string & value) {
1998
- for (const auto & item : string_split<std::string>(value, ',')) {
2052
+ for (const auto & item : parse_csv_row(value)) {
1999
2053
  params.image.emplace_back(item);
2000
2054
  }
2001
2055
  }
@@ -2017,7 +2071,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2017
2071
  if (llama_supports_rpc()) {
2018
2072
  add_opt(common_arg(
2019
2073
  {"--rpc"}, "SERVERS",
2020
- "comma separated list of RPC servers",
2074
+ "comma separated list of RPC servers (host:port)",
2021
2075
  [](common_params & params, const std::string & value) {
2022
2076
  add_rpc_devices(value);
2023
2077
  GGML_UNUSED(params);
@@ -2087,7 +2141,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2087
2141
  "override tensor buffer type", [](common_params & params, const std::string & value) {
2088
2142
  parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
2089
2143
  }
2090
- ));
2144
+ ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
2091
2145
  add_opt(common_arg(
2092
2146
  {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
2093
2147
  "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
@@ -2137,11 +2191,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2137
2191
  }
2138
2192
  }
2139
2193
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
2194
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
2140
2195
  add_opt(common_arg(
2141
2196
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
2142
- string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
2143
- [](common_params & params, int value) {
2144
- params.n_gpu_layers = value;
2197
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
2198
+ [](common_params & params, const std::string & value) {
2199
+ if (value == "auto") {
2200
+ params.n_gpu_layers = -1;
2201
+ } else if (value == "all") {
2202
+ params.n_gpu_layers = -2;
2203
+ } else {
2204
+ params.n_gpu_layers = std::stoi(value);
2205
+ }
2145
2206
  if (!llama_supports_gpu_offload()) {
2146
2207
  fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
2147
2208
  fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -2245,37 +2306,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2245
2306
  ));
2246
2307
  add_opt(common_arg(
2247
2308
  {"--override-kv"}, "KEY=TYPE:VALUE,...",
2248
- "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
2309
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
2249
2310
  "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
2250
2311
  [](common_params & params, const std::string & value) {
2251
- std::vector<std::string> kv_overrides;
2252
-
2253
- std::string current;
2254
- bool escaping = false;
2255
-
2256
- for (const char c : value) {
2257
- if (escaping) {
2258
- current.push_back(c);
2259
- escaping = false;
2260
- } else if (c == '\\') {
2261
- escaping = true;
2262
- } else if (c == ',') {
2263
- kv_overrides.push_back(current);
2264
- current.clear();
2265
- } else {
2266
- current.push_back(c);
2267
- }
2268
- }
2269
-
2270
- if (escaping) {
2271
- current.push_back('\\');
2272
- }
2273
-
2274
- kv_overrides.push_back(current);
2275
-
2276
- for (const auto & kv_override : kv_overrides) {
2277
- if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
2278
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
2312
+ for (const auto & item : parse_csv_row(value)) {
2313
+ if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
2314
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
2279
2315
  }
2280
2316
  }
2281
2317
  }
@@ -2292,7 +2328,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2292
2328
  {"--lora"}, "FNAME",
2293
2329
  "path to LoRA adapter (use comma-separated values to load multiple adapters)",
2294
2330
  [](common_params & params, const std::string & value) {
2295
- for (const auto & item : string_split<std::string>(value, ',')) {
2331
+ for (const auto & item : parse_csv_row(value)) {
2296
2332
  params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
2297
2333
  }
2298
2334
  }
@@ -2303,7 +2339,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2303
2339
  "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
2304
2340
  "note: use comma-separated values",
2305
2341
  [](common_params & params, const std::string & value) {
2306
- for (const auto & item : string_split<std::string>(value, ',')) {
2342
+ for (const auto & item : parse_csv_row(value)) {
2307
2343
  auto parts = string_split<std::string>(item, ':');
2308
2344
  if (parts.size() != 2) {
2309
2345
  throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2317,7 +2353,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2317
2353
  {"--control-vector"}, "FNAME",
2318
2354
  "add a control vector\nnote: use comma-separated values to add multiple control vectors",
2319
2355
  [](common_params & params, const std::string & value) {
2320
- for (const auto & item : string_split<std::string>(value, ',')) {
2356
+ for (const auto & item : parse_csv_row(value)) {
2321
2357
  params.control_vectors.push_back({ 1.0f, item, });
2322
2358
  }
2323
2359
  }
@@ -2327,7 +2363,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2327
2363
  "add a control vector with user defined scaling SCALE\n"
2328
2364
  "note: use comma-separated values (format: FNAME:SCALE,...)",
2329
2365
  [](common_params & params, const std::string & value) {
2330
- for (const auto & item : string_split<std::string>(value, ',')) {
2366
+ for (const auto & item : parse_csv_row(value)) {
2331
2367
  auto parts = string_split<std::string>(item, ':');
2332
2368
  if (parts.size() != 2) {
2333
2369
  throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2425,7 +2461,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2425
2461
  {"--context-file"}, "FNAME",
2426
2462
  "file to load context from (use comma-separated values to specify multiple files)",
2427
2463
  [](common_params & params, const std::string & value) {
2428
- for (const auto & item : string_split<std::string>(value, ',')) {
2464
+ for (const auto & item : parse_csv_row(value)) {
2429
2465
  std::ifstream file(item, std::ios::binary);
2430
2466
  if (!file) {
2431
2467
  throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2572,7 +2608,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2572
2608
  [](common_params & params, int value) {
2573
2609
  params.embd_normalize = value;
2574
2610
  }
2575
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2611
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
2576
2612
  add_opt(common_arg(
2577
2613
  {"--embd-output-format"}, "FORMAT",
2578
2614
  "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2650,7 +2686,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2650
2686
  [](common_params & params) {
2651
2687
  params.embedding = true;
2652
2688
  }
2653
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2689
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
2654
2690
  add_opt(common_arg(
2655
2691
  {"--rerank", "--reranking"},
2656
2692
  string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2661,9 +2697,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2661
2697
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2662
2698
  add_opt(common_arg(
2663
2699
  {"--api-key"}, "KEY",
2664
- "API key to use for authentication (default: none)",
2700
+ "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
2665
2701
  [](common_params & params, const std::string & value) {
2666
- params.api_keys.push_back(value);
2702
+ for (const auto & key : parse_csv_row(value)) {
2703
+ if (!key.empty()) {
2704
+ params.api_keys.push_back(key);
2705
+ }
2706
+ }
2667
2707
  }
2668
2708
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
2669
2709
  add_opt(common_arg(
@@ -2677,7 +2717,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2677
2717
  std::string key;
2678
2718
  while (std::getline(key_file, key)) {
2679
2719
  if (!key.empty()) {
2680
- params.api_keys.push_back(key);
2720
+ params.api_keys.push_back(key);
2681
2721
  }
2682
2722
  }
2683
2723
  key_file.close();
@@ -2699,7 +2739,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2699
2739
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2700
2740
  add_opt(common_arg(
2701
2741
  {"--chat-template-kwargs"}, "STRING",
2702
- string_format("sets additional params for the json template parser"),
2742
+ "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
2703
2743
  [](common_params & params, const std::string & value) {
2704
2744
  auto parsed = json::parse(value);
2705
2745
  for (const auto & item : parsed.items()) {
@@ -3175,11 +3215,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3175
3215
  params.speculative.devices = parse_device_list(value);
3176
3216
  }
3177
3217
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
3218
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
3178
3219
  add_opt(common_arg(
3179
3220
  {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
3180
- "number of layers to store in VRAM for the draft model",
3181
- [](common_params & params, int value) {
3182
- params.speculative.n_gpu_layers = value;
3221
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
3222
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
3223
+ [](common_params & params, const std::string & value) {
3224
+ if (value == "auto") {
3225
+ params.speculative.n_gpu_layers = -1;
3226
+ } else if (value == "all") {
3227
+ params.speculative.n_gpu_layers = -2;
3228
+ } else {
3229
+ params.speculative.n_gpu_layers = std::stoi(value);
3230
+ }
3183
3231
  if (!llama_supports_gpu_offload()) {
3184
3232
  fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
3185
3233
  fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3329,6 +3377,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3329
3377
  }
3330
3378
  }
3331
3379
  ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3380
+ add_opt(common_arg(
3381
+ {"--save-logits"},
3382
+ string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
3383
+ [](common_params & params) {
3384
+ params.save_logits = true;
3385
+ }
3386
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3387
+ add_opt(common_arg(
3388
+ {"--logits-output-dir"}, "PATH",
3389
+ string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
3390
+ [](common_params & params, const std::string & value) {
3391
+ params.logits_output_dir = value;
3392
+ }
3393
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3394
+ add_opt(common_arg(
3395
+ {"--tensor-filter"}, "REGEX",
3396
+ "filter tensor names for debug output (regex pattern, can be specified multiple times)",
3397
+ [](common_params & params, const std::string & value) {
3398
+ params.tensor_filter.push_back(value);
3399
+ }
3400
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3332
3401
 
3333
3402
  // presets
3334
3403
  add_opt(common_arg(
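The new debug-example options add --save-logits, --logits-output-dir, and a repeatable --tensor-filter REGEX flag that collects patterns into params.tensor_filter. How those patterns are applied is not shown in this diff; one plausible sketch, assuming "keep a tensor if any pattern matches, keep everything when the list is empty" semantics (the helper is hypothetical):

    #include <regex>
    #include <string>
    #include <vector>

    // Return true if the tensor name passes the user-supplied filter list.
    static bool tensor_matches(const std::string & name, const std::vector<std::string> & filters) {
        if (filters.empty()) {
            return true; // no filter -> keep everything
        }
        for (const auto & pattern : filters) {
            if (std::regex_search(name, std::regex(pattern))) {
                return true;
            }
        }
        return false;
    }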
@@ -3518,15 +3587,15 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
3518
3587
  [](common_params &, const std::string &) { /* unused */ }
3519
3588
  ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
3520
3589
 
3590
+ args.push_back(common_arg(
3591
+ {"stop-timeout"}, "SECONDS",
3592
+ "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
3593
+ [](common_params &, int) { /* unused */ }
3594
+ ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
3595
+
3521
3596
  // args.push_back(common_arg(
3522
3597
  // {"pin"},
3523
3598
  // "in server router mode, do not unload this model if models_max is exceeded",
3524
3599
  // [](common_params &) { /* unused */ }
3525
3600
  // ).set_preset_only());
3526
-
3527
- // args.push_back(common_arg(
3528
- // {"unload-idle-seconds"}, "SECONDS",
3529
- // "in server router mode, unload models idle for more than this many seconds",
3530
- // [](common_params &, int) { /* unused */ }
3531
- // ).set_preset_only());
3532
3601
  }
package/src/llama.cpp/common/arg.h CHANGED
@@ -10,6 +10,7 @@
10
10
 
11
11
  // pseudo-env variable to identify preset-only arguments
12
12
  #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
13
+ #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
13
14
 
14
15
  //
15
16
  // CLI argument parsing
@@ -1395,6 +1395,14 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
1395
1395
  builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
1396
1396
  }
1397
1397
 
1398
+ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
1399
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
1400
+
1401
+ // TODO: Tool calling
1402
+
1403
+ builder.add_content(builder.consume_rest());
1404
+ }
1405
+
1398
1406
  static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
1399
1407
  builder.try_parse_reasoning("<think>", "</think>");
1400
1408
  builder.add_content(builder.consume_rest());
@@ -1479,6 +1487,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
1479
1487
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
1480
1488
  common_chat_parse_xiaomi_mimo(builder);
1481
1489
  break;
1490
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN:
1491
+ common_chat_parse_solar_open(builder);
1492
+ break;
1482
1493
  default:
1483
1494
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
1484
1495
  }
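The new COMMON_CHAT_FORMAT_SOLAR_OPEN path extracts reasoning between the <|think|> marker and the <|end|><|begin|>assistant<|content|> sequence, then treats everything after it as plain content (tool calling is still a TODO per the comment). A rough, standalone illustration of the intended split, using plain string searches rather than the common_chat_msg_parser API, with made-up sample text:

    #include <cassert>
    #include <string>

    int main() {
        const std::string raw =
            "<|think|>consider the question<|end|><|begin|>assistant<|content|>The answer is 42.";
        const std::string open  = "<|think|>";
        const std::string close = "<|end|><|begin|>assistant<|content|>";

        std::string reasoning;
        std::string content = raw;
        const size_t b = raw.find(open);
        const size_t e = raw.find(close);
        if (b != std::string::npos && e != std::string::npos) {
            reasoning = raw.substr(b + open.size(), e - (b + open.size()));
            content   = raw.substr(e + close.size());
        }
        assert(reasoning == "consider the question");
        assert(content   == "The answer is 42.");
    }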