@fugood/llama.node 1.4.8 → 1.4.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/lib/binding.ts +43 -0
  2. package/lib/parallel.js +26 -0
  3. package/lib/parallel.ts +33 -0
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +12 -14
  6. package/src/LlamaCompletionWorker.cpp +3 -1
  7. package/src/LlamaCompletionWorker.h +2 -0
  8. package/src/LlamaContext.cpp +16 -1
  9. package/src/LlamaContext.h +3 -0
  10. package/src/llama.cpp/common/CMakeLists.txt +4 -4
  11. package/src/llama.cpp/common/arg.cpp +159 -42
  12. package/src/llama.cpp/common/arg.h +10 -1
  13. package/src/llama.cpp/common/common.cpp +1 -1
  14. package/src/llama.cpp/common/common.h +6 -2
  15. package/src/llama.cpp/common/preset.cpp +197 -5
  16. package/src/llama.cpp/common/preset.h +45 -3
  17. package/src/llama.cpp/common/sampling.cpp +51 -37
  18. package/src/llama.cpp/common/sampling.h +6 -3
  19. package/src/llama.cpp/common/speculative.cpp +1 -1
  20. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +283 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +51 -6
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +286 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -1
  30. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  31. package/src/llama.cpp/src/llama-mmap.h +5 -1
  32. package/src/llama.cpp/src/llama-model-loader.cpp +56 -13
  33. package/src/llama.cpp/src/llama-model.cpp +7 -5
  34. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  35. package/src/llama.cpp/src/llama.cpp +22 -32

package/src/llama.cpp/common/arg.cpp

@@ -96,6 +96,11 @@ common_arg & common_arg::set_sparam() {
     return *this;
 }
 
+common_arg & common_arg::set_preset_only() {
+    is_preset_only = true;
+    return *this;
+}
+
 bool common_arg::in_example(enum llama_example ex) {
     return examples.find(ex) != examples.end();
 }
@@ -420,6 +425,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";
 
@@ -430,6 +437,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto & tmp = arg_to_options[arg];
         auto opt = *tmp.first;
         bool is_positive = tmp.second;
@@ -750,6 +760,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         }
     };
 
+    std::set<std::string> seen_args;
+
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";
 
@@ -760,8 +772,16 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
+        if (!seen_args.insert(arg).second) {
+            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+        }
         auto opt = *arg_to_options[arg];
         std::string val;
+        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
+            // bool arg (need to reverse the meaning for negative args)
+            bool is_neg = std::find(opt.args_neg.begin(), opt.args_neg.end(), arg) != opt.args_neg.end();
+            val = is_neg ? "0" : "1";
+        }
         if (opt.value_hint != nullptr) {
             // arg with single value
             check_arg(i);
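
For flags that take no value (value_hint and value_hint_2 are both null), the map-building path above now records "1" for a positive alias and "0" for a negative alias. Below is a minimal, self-contained sketch of that rule written outside of common_arg; the flag names are examples reused from elsewhere in this diff.

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // Sketch only: mirrors the bool-arg handling in common_params_to_map.
    static std::string flag_to_value(const std::string & arg,
                                     const std::vector<std::string> & args_neg) {
        const bool is_neg = std::find(args_neg.begin(), args_neg.end(), arg) != args_neg.end();
        return is_neg ? "0" : "1";
    }

    int main() {
        const std::vector<std::string> neg = { "--no-webui" };
        std::cout << flag_to_value("--webui",    neg) << "\n"; // prints "1"
        std::cout << flag_to_value("--no-webui", neg) << "\n"; // prints "0"
    }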
@@ -863,7 +883,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
-    sampler_type_names.pop_back();
+    if (!sampler_type_names.empty()) {
+        sampler_type_names.pop_back(); // remove last semicolon
+    }
 
 
     /**
@@ -1127,7 +1149,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cache-ram", "-cram"}, "N",
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1135,7 +1157,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--kv-unified", "-kvu"},
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1184,7 +1206,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_MTMD}));
     add_opt(common_arg(
         {"--perf"},
         {"--no-perf"},
@@ -1226,13 +1248,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
-        "an input file (repeat to specify multiple files)",
+        "an input file (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.in_files.push_back(item);
             }
-            params.in_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
@@ -1401,7 +1425,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -1969,9 +1993,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
-        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
+        "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
         [](common_params & params, const std::string & value) {
-            params.image.emplace_back(value);
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.image.emplace_back(item);
+            }
         }
     ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
@@ -2057,26 +2083,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cpu-moe", "-cmoe"},
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe", "-ncmoe"}, "N",
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2091,14 +2117,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2218,12 +2244,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-kv"}, "KEY=TYPE:VALUE",
-        "advanced option to override model metadata by key. may be specified multiple times.\n"
-        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false",
+        {"--override-kv"}, "KEY=TYPE:VALUE,...",
+        "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
+        "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
         [](common_params & params, const std::string & value) {
-            if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) {
-                throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str()));
+            std::vector<std::string> kv_overrides;
+
+            std::string current;
+            bool escaping = false;
+
+            for (const char c : value) {
+                if (escaping) {
+                    current.push_back(c);
+                    escaping = false;
+                } else if (c == '\\') {
+                    escaping = true;
+                } else if (c == ',') {
+                    kv_overrides.push_back(current);
+                    current.clear();
+                } else {
+                    current.push_back(c);
+                }
+            }
+
+            if (escaping) {
+                current.push_back('\\');
+            }
+
+            kv_overrides.push_back(current);
+
+            for (const auto & kv_override : kv_overrides) {
+                if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
+                    throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
+                }
             }
         }
     ));
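
The new --override-kv handler splits its value on commas and treats backslash as an escape, so a literal comma can still appear inside an override value. Below is a self-contained sketch of the same splitting behavior; the sample input in main() is made up for illustration.

    #include <iostream>
    #include <string>
    #include <vector>

    // Sketch only: comma splitting with backslash escapes, as in the --override-kv handler above.
    static std::vector<std::string> split_overrides(const std::string & value) {
        std::vector<std::string> out;
        std::string current;
        bool escaping = false;
        for (const char c : value) {
            if (escaping) {
                current.push_back(c);   // escaped character is taken literally
                escaping = false;
            } else if (c == '\\') {
                escaping = true;        // next character is escaped
            } else if (c == ',') {
                out.push_back(current); // unescaped comma separates overrides
                current.clear();
            } else {
                current.push_back(c);
            }
        }
        if (escaping) {
            current.push_back('\\');    // keep a dangling trailing backslash
        }
        out.push_back(current);
        return out;
    }

    int main() {
        // made-up input: "a=str:x\,y,b=int:1" -> {"a=str:x,y", "b=int:1"}
        for (const auto & s : split_overrides("a=str:x\\,y,b=int:1")) {
            std::cout << s << "\n";
        }
    }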
@@ -2237,33 +2290,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
-        "path to LoRA adapter (can be repeated to use multiple adapters)",
+        "path to LoRA adapter (use comma-separated values to load multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
+            }
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
-        {"--lora-scaled"}, "FNAME", "SCALE",
-        "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
-        [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
+        {"--lora-scaled"}, "FNAME:SCALE,...",
+        "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
+        "note: use comma-separated values",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
+                }
+                params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr });
+            }
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--control-vector"}, "FNAME",
-        "add a control vector\nnote: this argument can be repeated to add multiple control vectors",
+        "add a control vector\nnote: use comma-separated values to add multiple control vectors",
         [](common_params & params, const std::string & value) {
-            params.control_vectors.push_back({ 1.0f, value, });
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                params.control_vectors.push_back({ 1.0f, item, });
+            }
         }
     ));
     add_opt(common_arg(
-        {"--control-vector-scaled"}, "FNAME", "SCALE",
+        {"--control-vector-scaled"}, "FNAME:SCALE,...",
         "add a control vector with user defined scaling SCALE\n"
-        "note: this argument can be repeated to add multiple scaled control vectors",
-        [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.control_vectors.push_back({ std::stof(scale), fname });
+        "note: use comma-separated values (format: FNAME:SCALE,...)",
+        [](common_params & params, const std::string & value) {
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                auto parts = string_split<std::string>(item, ':');
+                if (parts.size() != 2) {
+                    throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
+                }
+                params.control_vectors.push_back({ std::stof(parts[1]), parts[0] });
+            }
        }
     ));
     add_opt(common_arg(
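
--lora-scaled and --control-vector-scaled now take one comma-separated list of FNAME:SCALE pairs instead of two positional values. The sketch below parses that format in a standalone way; it uses std::getline rather than the package's string_split helper, and the file names are hypothetical.

    #include <iostream>
    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    // Sketch only: split a string on a single separator character.
    static std::vector<std::string> split_on(const std::string & s, char sep) {
        std::vector<std::string> parts;
        std::stringstream ss(s);
        std::string part;
        while (std::getline(ss, part, sep)) {
            parts.push_back(part);
        }
        return parts;
    }

    // Sketch only: "FNAME:SCALE,..." -> (path, scale) pairs, mirroring the handlers above.
    static std::vector<std::pair<std::string, float>> parse_scaled_list(const std::string & value) {
        std::vector<std::pair<std::string, float>> out;
        for (const auto & item : split_on(value, ',')) {
            const auto parts = split_on(item, ':');
            if (parts.size() != 2) {
                throw std::invalid_argument("expected FNAME:SCALE");
            }
            out.push_back({ parts[0], std::stof(parts[1]) });
        }
        return out;
    }

    int main() {
        // hypothetical adapter files
        for (const auto & [fname, scale] : parse_scaled_list("lora-a.gguf:0.5,lora-b.gguf:1.0")) {
            std::cout << fname << " -> " << scale << "\n";
        }
    }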
@@ -2353,13 +2423,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("HF_TOKEN"));
     add_opt(common_arg(
         {"--context-file"}, "FNAME",
-        "file to load context from (repeat to specify multiple files)",
+        "file to load context from (use comma-separated values to specify multiple files)",
         [](common_params & params, const std::string & value) {
-            std::ifstream file(value, std::ios::binary);
-            if (!file) {
-                throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
+            for (const auto & item : string_split<std::string>(value, ',')) {
+                std::ifstream file(item, std::ios::binary);
+                if (!file) {
+                    throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
+                }
+                params.context_files.push_back(item);
             }
-            params.context_files.push_back(value);
         }
     ).set_examples({LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
@@ -2550,6 +2622,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.api_prefix = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    add_opt(common_arg(
+        {"--webui-config"}, "JSON",
+        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+    add_opt(common_arg(
+        {"--webui-config-file"}, "PATH",
+        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.webui_config_json = read_file(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
     add_opt(common_arg(
         {"--webui"},
         {"--no-webui"},
@@ -2566,7 +2652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--reranking", "--rerank"},
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -2801,6 +2887,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.lora_init_without_apply = true;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--sleep-idle-seconds"}, "SECONDS",
+        string_format("number of seconds of idleness after which the server will sleep (default: %d; -1 = disabled)", params.sleep_idle_seconds),
+        [](common_params & params, int value) {
+            if (value == 0 || value < -1) {
+                throw std::invalid_argument("invalid value: cannot be 0 or less than -1");
+            }
+            params.sleep_idle_seconds = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--simple-io"},
         "use basic IO for better compatibility in subprocesses and limited consoles",
@@ -3037,7 +3133,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft-max", "--draft", "--draft-n"}, "N",
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
@@ -3413,3 +3509,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 
     return ctx_arg;
 }
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+    // arguments below won't be treated as CLI args, only preset options
+    args.push_back(common_arg(
+        {"load-on-startup"}, "NAME",
+        "in server router mode, autoload this model on startup",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"pin"},
+    //     "in server router mode, do not unload this model if models_max is exceeded",
+    //     [](common_params &) { /* unused */ }
+    // ).set_preset_only());
+
+    // args.push_back(common_arg(
+    //     {"unload-idle-seconds"}, "SECONDS",
+    //     "in server router mode, unload models idle for more than this many seconds",
+    //     [](common_params &, int) { /* unused */ }
+    // ).set_preset_only());
+}
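
How this new hook is consumed is not shown in this diff. As a rough, hypothetical illustration based only on the declaration added to arg.h, a caller could collect the preset-only options separately from the regular parser context; the include path and surrounding code here are assumptions.

    #include <vector>

    #include "arg.h" // common_arg, common_params_add_preset_options (assumed include path)

    // Hypothetical usage sketch, not from the package: collect the preset-only
    // options so they can be handled separately from regular CLI arguments.
    static std::vector<common_arg> collect_preset_options() {
        std::vector<common_arg> preset_args;
        common_params_add_preset_options(preset_args);
        // each entry was marked via set_preset_only(), e.g. "load-on-startup"
        // with the pseudo-env key COMMON_ARG_PRESET_LOAD_ON_STARTUP
        return preset_args;
    }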

package/src/llama.cpp/common/arg.h

@@ -8,6 +8,9 @@
 #include <vector>
 #include <cstring>
 
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
 //
 // CLI argument parsing
 //
@@ -22,6 +25,7 @@ struct common_arg {
     const char * env = nullptr;
     std::string help;
     bool is_sparam = false; // is current arg a sampling param?
+    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
     void (*handler_void)   (common_params & params) = nullptr;
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
@@ -70,6 +74,7 @@ struct common_arg {
     common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
     common_arg & set_env(const char * env);
     common_arg & set_sparam();
+    common_arg & set_preset_only();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
     bool get_value_from_env(std::string & output) const;
@@ -114,9 +119,13 @@ struct common_params_context {
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 // parse input arguments from CLI into a map
-// TODO: support repeated args in the future
 bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
 
+// populate preset-only arguments
+// these arguments are not treated as command line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 

package/src/llama.cpp/common/common.cpp

@@ -1092,7 +1092,7 @@ common_init_result::common_init_result(common_params & params) :
     auto cparams = common_context_params_to_llama(params);
 
     if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__);
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);

package/src/llama.cpp/common/common.h

@@ -476,7 +476,8 @@ struct common_params {
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
-    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
+    bool prefill_assistant = true;  // if true, any trailing assistant message will be prefilled into the response
+    int sleep_idle_seconds = -1;    // if >0, server will sleep after this many seconds of idle time
 
     std::vector<std::string> api_keys;
 
@@ -485,8 +486,11 @@ struct common_params {
 
     std::map<std::string, std::string> default_template_kwargs;
 
+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;