@fugood/llama.node 1.4.12 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +11 -1
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +2 -0
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -9
  7. package/src/LlamaContext.cpp +5 -2
  8. package/src/llama.cpp/common/arg.cpp +249 -101
  9. package/src/llama.cpp/common/arg.h +0 -8
  10. package/src/llama.cpp/common/chat.cpp +4 -4
  11. package/src/llama.cpp/common/common.cpp +21 -1
  12. package/src/llama.cpp/common/common.h +20 -7
  13. package/src/llama.cpp/common/download.cpp +104 -55
  14. package/src/llama.cpp/common/download.h +26 -5
  15. package/src/llama.cpp/common/llguidance.cpp +10 -6
  16. package/src/llama.cpp/common/preset.cpp +76 -1
  17. package/src/llama.cpp/common/preset.h +10 -1
  18. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  19. package/src/llama.cpp/common/sampling.cpp +58 -14
  20. package/src/llama.cpp/common/sampling.h +3 -1
  21. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  22. package/src/llama.cpp/include/llama.h +92 -10
  23. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  24. package/src/llama.cpp/src/llama-arch.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +615 -28
  26. package/src/llama.cpp/src/llama-context.h +43 -1
  27. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  28. package/src/llama.cpp/src/llama-grammar.h +2 -0
  29. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  30. package/src/llama.cpp/src/llama-graph.h +71 -6
  31. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  32. package/src/llama.cpp/src/llama-hparams.h +8 -2
  33. package/src/llama.cpp/src/llama-mmap.cpp +70 -37
  34. package/src/llama.cpp/src/llama-mmap.h +5 -4
  35. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  36. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  37. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  38. package/src/llama.cpp/src/llama-model.cpp +66 -16
  39. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  40. package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
  41. package/src/llama.cpp/src/llama-sampling.h +16 -7
  42. package/src/llama.cpp/src/llama.cpp +101 -57
  43. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  44. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  45. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  46. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  47. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  48. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  49. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
@@ -6,6 +6,7 @@
6
6
  #include "log.h"
7
7
  #include "sampling.h"
8
8
  #include "download.h"
9
+ #include "preset.h"
9
10
 
10
11
  // fix problem with std::min and std::max
11
12
  #if defined(_WIN32)
@@ -268,6 +269,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
268
269
  }
269
270
  }
270
271
 
272
+ static std::string clean_file_name(const std::string & fname) {
273
+ std::string clean_fname = fname;
274
+ string_replace_all(clean_fname, "\\", "_");
275
+ string_replace_all(clean_fname, "/", "_");
276
+ return clean_fname;
277
+ }
278
+
279
+ static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
280
+ GGML_ASSERT(!params.model.hf_repo.empty());
281
+
282
+ const bool offline = params.offline;
283
+ std::string model_endpoint = get_model_endpoint();
284
+ auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
285
+
286
+ // prepare local path for caching
287
+ auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
288
+ auto preset_path = fs_get_cache_file(preset_fname);
289
+ const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
290
+ const bool has_preset = status >= 200 && status < 400;
291
+
292
+ // remote preset is optional, so we don't error out if not found
293
+ if (has_preset) {
294
+ LOG_INF("applying remote preset from %s\n", preset_url.c_str());
295
+ common_preset_context ctx(ex, /* only_remote_allowed */ true);
296
+ common_preset global; // unused for now
297
+ auto remote_presets = ctx.load_from_ini(preset_path, global);
298
+ if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
299
+ common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
300
+ LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
301
+ preset.apply_to_params(params);
302
+ } else {
303
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
304
+ }
305
+ } else {
306
+ LOG_INF("%s", "no remote preset found, skipping\n");
307
+ }
308
+
309
+ return has_preset;
310
+ }
311
+
271
312
  struct handle_model_result {
272
313
  bool found_mmproj = false;
273
314
  common_params_model mmproj;
@@ -309,9 +350,7 @@ static handle_model_result common_params_handle_model(
309
350
  // make sure model path is present (for caching purposes)
310
351
  if (model.path.empty()) {
311
352
  // this is to avoid different repo having same file name, or same file name in different subdirs
312
- std::string filename = model.hf_repo + "_" + model.hf_file;
313
- // to make sure we don't have any slashes in the filename
314
- string_replace_all(filename, "/", "_");
353
+ std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
315
354
  model.path = fs_get_cache_file(filename);
316
355
  }
317
356
 
@@ -425,61 +464,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
425
464
  }
426
465
  };
427
466
 
428
- std::set<std::string> seen_args;
467
+ auto parse_cli_args = [&]() {
468
+ std::set<std::string> seen_args;
429
469
 
430
- for (int i = 1; i < argc; i++) {
431
- const std::string arg_prefix = "--";
470
+ for (int i = 1; i < argc; i++) {
471
+ const std::string arg_prefix = "--";
432
472
 
433
- std::string arg = argv[i];
434
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
435
- std::replace(arg.begin(), arg.end(), '_', '-');
436
- }
437
- if (arg_to_options.find(arg) == arg_to_options.end()) {
438
- throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
439
- }
440
- if (!seen_args.insert(arg).second) {
441
- LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
442
- }
443
- auto & tmp = arg_to_options[arg];
444
- auto opt = *tmp.first;
445
- bool is_positive = tmp.second;
446
- if (opt.has_value_from_env()) {
447
- fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
448
- }
449
- try {
450
- if (opt.handler_void) {
451
- opt.handler_void(params);
452
- continue;
473
+ std::string arg = argv[i];
474
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
475
+ std::replace(arg.begin(), arg.end(), '_', '-');
453
476
  }
454
- if (opt.handler_bool) {
455
- opt.handler_bool(params, is_positive);
456
- continue;
477
+ if (arg_to_options.find(arg) == arg_to_options.end()) {
478
+ throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
457
479
  }
458
-
459
- // arg with single value
460
- check_arg(i);
461
- std::string val = argv[++i];
462
- if (opt.handler_int) {
463
- opt.handler_int(params, std::stoi(val));
464
- continue;
480
+ if (!seen_args.insert(arg).second) {
481
+ LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
465
482
  }
466
- if (opt.handler_string) {
467
- opt.handler_string(params, val);
468
- continue;
483
+ auto & tmp = arg_to_options[arg];
484
+ auto opt = *tmp.first;
485
+ bool is_positive = tmp.second;
486
+ if (opt.has_value_from_env()) {
487
+ fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
469
488
  }
489
+ try {
490
+ if (opt.handler_void) {
491
+ opt.handler_void(params);
492
+ continue;
493
+ }
494
+ if (opt.handler_bool) {
495
+ opt.handler_bool(params, is_positive);
496
+ continue;
497
+ }
470
498
 
471
- // arg with 2 values
472
- check_arg(i);
473
- std::string val2 = argv[++i];
474
- if (opt.handler_str_str) {
475
- opt.handler_str_str(params, val, val2);
476
- continue;
499
+ // arg with single value
500
+ check_arg(i);
501
+ std::string val = argv[++i];
502
+ if (opt.handler_int) {
503
+ opt.handler_int(params, std::stoi(val));
504
+ continue;
505
+ }
506
+ if (opt.handler_string) {
507
+ opt.handler_string(params, val);
508
+ continue;
509
+ }
510
+
511
+ // arg with 2 values
512
+ check_arg(i);
513
+ std::string val2 = argv[++i];
514
+ if (opt.handler_str_str) {
515
+ opt.handler_str_str(params, val, val2);
516
+ continue;
517
+ }
518
+ } catch (std::exception & e) {
519
+ throw std::invalid_argument(string_format(
520
+ "error while handling argument \"%s\": %s\n\n"
521
+ "usage:\n%s\n\nto show complete usage, run with -h",
522
+ arg.c_str(), e.what(), opt.to_string().c_str()));
477
523
  }
478
- } catch (std::exception & e) {
479
- throw std::invalid_argument(string_format(
480
- "error while handling argument \"%s\": %s\n\n"
481
- "usage:\n%s\n\nto show complete usage, run with -h",
482
- arg.c_str(), e.what(), opt.to_string().c_str()));
524
+ }
525
+ };
526
+
527
+ // parse the first time to get -hf option (used for remote preset)
528
+ parse_cli_args();
529
+
530
+ // maybe handle remote preset
531
+ if (!params.model.hf_repo.empty()) {
532
+ std::string cli_hf_repo = params.model.hf_repo;
533
+ bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
534
+
535
+ // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
536
+ // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
537
+ std::string preset_hf_repo = params.model.hf_repo;
538
+ bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
539
+
540
+ if (has_preset) {
541
+ // re-parse CLI args to override preset values
542
+ parse_cli_args();
543
+ }
544
+
545
+ // preserve hf_repo from preset if needed
546
+ if (preset_has_hf_repo) {
547
+ params.model.hf_repo = preset_hf_repo;
483
548
  }
484
549
  }
485
550
 
@@ -679,7 +744,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
679
744
  "llama-quantize",
680
745
  "llama-qwen2vl-cli",
681
746
  "llama-retrieval",
682
- "llama-run",
683
747
  "llama-save-load-state",
684
748
  "llama-server",
685
749
  "llama-simple",
@@ -854,6 +918,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
854
918
  return value == "auto" || value == "-1";
855
919
  }
856
920
 
921
+ // Simple CSV parser that handles quoted fields and escaped quotes
922
+ // example:
923
+ // input: value1,"value, with, commas","value with ""escaped"" quotes",value4
924
+ // output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
925
+ static std::vector<std::string> parse_csv_row(const std::string& input) {
926
+ std::vector<std::string> fields;
927
+ std::string field;
928
+ bool in_quotes = false;
929
+
930
+ for (size_t i = 0; i < input.length(); ++i) {
931
+ char ch = input[i];
932
+
933
+ if (ch == '"') {
934
+ if (!in_quotes) {
935
+ // start of quoted field (only valid if at beginning of field)
936
+ if (!field.empty()) {
937
+ // quote appeared in middle of unquoted field, treat as literal
938
+ field += '"';
939
+ } else {
940
+ in_quotes = true; // start
941
+ }
942
+ } else {
943
+ if (i + 1 < input.length() && input[i + 1] == '"') {
944
+ // escaped quote: ""
945
+ field += '"';
946
+ ++i; // skip the next quote
947
+ } else {
948
+ in_quotes = false; // end
949
+ }
950
+ }
951
+ } else if (ch == ',') {
952
+ if (in_quotes) {
953
+ field += ',';
954
+ } else {
955
+ fields.push_back(std::move(field));
956
+ field.clear();
957
+ }
958
+ } else {
959
+ field += ch;
960
+ }
961
+ }
962
+
963
+ // Add the last field
964
+ fields.push_back(std::move(field));
965
+
966
+ return fields;
967
+ }
968
+
857
969
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
858
970
  // per-example default params
859
971
  // we define here to make sure it's included in llama-gen-docs
@@ -1250,7 +1362,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1250
1362
  {"--in-file"}, "FNAME",
1251
1363
  "an input file (use comma-separated values to specify multiple files)",
1252
1364
  [](common_params & params, const std::string & value) {
1253
- for (const auto & item : string_split<std::string>(value, ',')) {
1365
+ for (const auto & item : parse_csv_row(value)) {
1254
1366
  std::ifstream file(item);
1255
1367
  if (!file) {
1256
1368
  throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -1397,7 +1509,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1397
1509
  [](common_params & params, bool value) {
1398
1510
  params.warmup = value;
1399
1511
  }
1400
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
1512
+ ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
1401
1513
  add_opt(common_arg(
1402
1514
  {"--spm-infill"},
1403
1515
  string_format(
@@ -1695,6 +1807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1695
1807
  params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
1696
1808
  }
1697
1809
  ).set_sparam());
1810
+ add_opt(common_arg(
1811
+ {"-bs", "--backend-sampling"},
1812
+ "enable backend sampling (experimental) (default: disabled)",
1813
+ [](common_params & params) {
1814
+ params.sampling.backend_sampling = true;
1815
+ }
1816
+ ).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
1698
1817
  add_opt(common_arg(
1699
1818
  {"--pooling"}, "{none,mean,cls,last,rank}",
1700
1819
  "pooling type for embeddings, use model default if unspecified",
@@ -1706,7 +1825,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1706
1825
  else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
1707
1826
  else { throw std::invalid_argument("invalid value"); }
1708
1827
  }
1709
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
1828
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
1710
1829
  add_opt(common_arg(
1711
1830
  {"--attention"}, "{causal,non-causal}",
1712
1831
  "attention type for embeddings, use model default if unspecified",
@@ -1995,7 +2114,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1995
2114
  {"--image", "--audio"}, "FILE",
1996
2115
  "path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
1997
2116
  [](common_params & params, const std::string & value) {
1998
- for (const auto & item : string_split<std::string>(value, ',')) {
2117
+ for (const auto & item : parse_csv_row(value)) {
1999
2118
  params.image.emplace_back(item);
2000
2119
  }
2001
2120
  }
@@ -2034,11 +2153,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2034
2153
  add_opt(common_arg(
2035
2154
  {"--mmap"},
2036
2155
  {"--no-mmap"},
2037
- string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
2156
+ string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
2038
2157
  [](common_params & params, bool value) {
2039
2158
  params.use_mmap = value;
2159
+ if (value) {
2160
+ params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
2161
+ }
2040
2162
  }
2041
2163
  ).set_env("LLAMA_ARG_MMAP"));
2164
+ add_opt(common_arg(
2165
+ {"-dio", "--direct-io"},
2166
+ {"-ndio", "--no-direct-io"},
2167
+ string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
2168
+ [](common_params & params, bool value) {
2169
+ params.use_direct_io = value;
2170
+ }
2171
+ ).set_env("LLAMA_ARG_DIO"));
2042
2172
  add_opt(common_arg(
2043
2173
  {"--numa"}, "TYPE",
2044
2174
  "attempt optimizations that help on some NUMA systems\n"
@@ -2190,7 +2320,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2190
2320
  std::vector<std::string> split_arg{ it, {} };
2191
2321
  if (split_arg.size() >= llama_max_devices()) {
2192
2322
  throw std::invalid_argument(
2193
- string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
2323
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
2194
2324
  );
2195
2325
  }
2196
2326
  for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2230,10 +2360,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2230
2360
  }
2231
2361
  ).set_env("LLAMA_ARG_FIT"));
2232
2362
  add_opt(common_arg(
2233
- { "-fitt", "--fit-target" }, "MiB",
2234
- string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
2235
- [](common_params & params, int value) {
2236
- params.fit_params_target = value * size_t(1024*1024);
2363
+ { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
2364
+ string_format("target margin per device for --fit, comma-separated list of values, "
2365
+ "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
2366
+ [](common_params & params, const std::string & value) {
2367
+ std::string arg_next = value;
2368
+
2369
+ // split string by , and /
2370
+ const std::regex regex{ R"([,/]+)" };
2371
+ std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
2372
+ std::vector<std::string> split_arg{ it, {} };
2373
+ if (split_arg.size() >= llama_max_devices()) {
2374
+ throw std::invalid_argument(
2375
+ string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
2376
+ );
2377
+ }
2378
+ if (split_arg.size() == 1) {
2379
+ std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
2380
+ return;
2381
+ }
2382
+ for (size_t i = 0; i < split_arg.size(); i++) {
2383
+ params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
2384
+ }
2237
2385
  }
2238
2386
  ).set_env("LLAMA_ARG_FIT_TARGET"));
2239
2387
  add_opt(common_arg(
@@ -2252,37 +2400,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2252
2400
  ));
2253
2401
  add_opt(common_arg(
2254
2402
  {"--override-kv"}, "KEY=TYPE:VALUE,...",
2255
- "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n"
2403
+ "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
2256
2404
  "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
2257
2405
  [](common_params & params, const std::string & value) {
2258
- std::vector<std::string> kv_overrides;
2259
-
2260
- std::string current;
2261
- bool escaping = false;
2262
-
2263
- for (const char c : value) {
2264
- if (escaping) {
2265
- current.push_back(c);
2266
- escaping = false;
2267
- } else if (c == '\\') {
2268
- escaping = true;
2269
- } else if (c == ',') {
2270
- kv_overrides.push_back(current);
2271
- current.clear();
2272
- } else {
2273
- current.push_back(c);
2274
- }
2275
- }
2276
-
2277
- if (escaping) {
2278
- current.push_back('\\');
2279
- }
2280
-
2281
- kv_overrides.push_back(current);
2282
-
2283
- for (const auto & kv_override : kv_overrides) {
2284
- if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
2285
- throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
2406
+ for (const auto & item : parse_csv_row(value)) {
2407
+ if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
2408
+ throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
2286
2409
  }
2287
2410
  }
2288
2411
  }
@@ -2299,7 +2422,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2299
2422
  {"--lora"}, "FNAME",
2300
2423
  "path to LoRA adapter (use comma-separated values to load multiple adapters)",
2301
2424
  [](common_params & params, const std::string & value) {
2302
- for (const auto & item : string_split<std::string>(value, ',')) {
2425
+ for (const auto & item : parse_csv_row(value)) {
2303
2426
  params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
2304
2427
  }
2305
2428
  }
@@ -2310,7 +2433,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2310
2433
  "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
2311
2434
  "note: use comma-separated values",
2312
2435
  [](common_params & params, const std::string & value) {
2313
- for (const auto & item : string_split<std::string>(value, ',')) {
2436
+ for (const auto & item : parse_csv_row(value)) {
2314
2437
  auto parts = string_split<std::string>(item, ':');
2315
2438
  if (parts.size() != 2) {
2316
2439
  throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
@@ -2324,7 +2447,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2324
2447
  {"--control-vector"}, "FNAME",
2325
2448
  "add a control vector\nnote: use comma-separated values to add multiple control vectors",
2326
2449
  [](common_params & params, const std::string & value) {
2327
- for (const auto & item : string_split<std::string>(value, ',')) {
2450
+ for (const auto & item : parse_csv_row(value)) {
2328
2451
  params.control_vectors.push_back({ 1.0f, item, });
2329
2452
  }
2330
2453
  }
@@ -2334,7 +2457,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2334
2457
  "add a control vector with user defined scaling SCALE\n"
2335
2458
  "note: use comma-separated values (format: FNAME:SCALE,...)",
2336
2459
  [](common_params & params, const std::string & value) {
2337
- for (const auto & item : string_split<std::string>(value, ',')) {
2460
+ for (const auto & item : parse_csv_row(value)) {
2338
2461
  auto parts = string_split<std::string>(item, ':');
2339
2462
  if (parts.size() != 2) {
2340
2463
  throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
@@ -2432,7 +2555,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2432
2555
  {"--context-file"}, "FNAME",
2433
2556
  "file to load context from (use comma-separated values to specify multiple files)",
2434
2557
  [](common_params & params, const std::string & value) {
2435
- for (const auto & item : string_split<std::string>(value, ',')) {
2558
+ for (const auto & item : parse_csv_row(value)) {
2436
2559
  std::ifstream file(item, std::ios::binary);
2437
2560
  if (!file) {
2438
2561
  throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
@@ -2579,7 +2702,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2579
2702
  [](common_params & params, int value) {
2580
2703
  params.embd_normalize = value;
2581
2704
  }
2582
- ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2705
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
2583
2706
  add_opt(common_arg(
2584
2707
  {"--embd-output-format"}, "FORMAT",
2585
2708
  "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2657,7 +2780,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2657
2780
  [](common_params & params) {
2658
2781
  params.embedding = true;
2659
2782
  }
2660
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
2783
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
2661
2784
  add_opt(common_arg(
2662
2785
  {"--rerank", "--reranking"},
2663
2786
  string_format("enable reranking endpoint on server (default: %s)", "disabled"),
@@ -2668,9 +2791,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2668
2791
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
2669
2792
  add_opt(common_arg(
2670
2793
  {"--api-key"}, "KEY",
2671
- "API key to use for authentication (default: none)",
2794
+ "API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
2672
2795
  [](common_params & params, const std::string & value) {
2673
- params.api_keys.push_back(value);
2796
+ for (const auto & key : parse_csv_row(value)) {
2797
+ if (!key.empty()) {
2798
+ params.api_keys.push_back(key);
2799
+ }
2800
+ }
2674
2801
  }
2675
2802
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
2676
2803
  add_opt(common_arg(
@@ -2684,7 +2811,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2684
2811
  std::string key;
2685
2812
  while (std::getline(key_file, key)) {
2686
2813
  if (!key.empty()) {
2687
- params.api_keys.push_back(key);
2814
+ params.api_keys.push_back(key);
2688
2815
  }
2689
2816
  }
2690
2817
  key_file.close();
@@ -2706,7 +2833,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2706
2833
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
2707
2834
  add_opt(common_arg(
2708
2835
  {"--chat-template-kwargs"}, "STRING",
2709
- string_format("sets additional params for the json template parser"),
2836
+ "sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
2710
2837
  [](common_params & params, const std::string & value) {
2711
2838
  auto parsed = json::parse(value);
2712
2839
  for (const auto & item : parsed.items()) {
@@ -3344,6 +3471,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3344
3471
  }
3345
3472
  }
3346
3473
  ).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
3474
+ add_opt(common_arg(
3475
+ {"--save-logits"},
3476
+ string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
3477
+ [](common_params & params) {
3478
+ params.save_logits = true;
3479
+ }
3480
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3481
+ add_opt(common_arg(
3482
+ {"--logits-output-dir"}, "PATH",
3483
+ string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
3484
+ [](common_params & params, const std::string & value) {
3485
+ params.logits_output_dir = value;
3486
+ }
3487
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3488
+ add_opt(common_arg(
3489
+ {"--tensor-filter"}, "REGEX",
3490
+ "filter tensor names for debug output (regex pattern, can be specified multiple times)",
3491
+ [](common_params & params, const std::string & value) {
3492
+ params.tensor_filter.push_back(value);
3493
+ }
3494
+ ).set_examples({LLAMA_EXAMPLE_DEBUG}));
3347
3495
 
3348
3496
  // presets
3349
3497
  add_opt(common_arg(
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
129
129
 
130
130
  // initialize argument parser context - used by test-arg-parser and preset
131
131
  common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
132
-
133
- struct common_remote_params {
134
- std::vector<std::string> headers;
135
- long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
136
- long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
137
- };
138
- // get remote file content, returns <http_code, raw_response_body>
139
- std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
@@ -2052,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
2052
2052
  // Trigger on tool calls that appear in the commentary channel
2053
2053
  data.grammar_triggers.push_back({
2054
2054
  COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
2055
- "<\\|channel\\|>(commentary|analysis) to"
2055
+ "<\\|channel\\|>(?:commentary|analysis) to"
2056
2056
  });
2057
2057
 
2058
2058
  // Trigger tool calls that appear in the role section, either at the
@@ -2385,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
2385
2385
  (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
2386
2386
  // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
2387
2387
  data.grammar_triggers.push_back({
2388
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
2388
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
2389
2389
  // If thinking_forced_open, then we capture the </think> tag in the grammar,
2390
2390
  // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
2391
- std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
2391
+ std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
2392
2392
  "\\s*("
2393
2393
  "(?:<tool_call>"
2394
2394
  "|<function"
2395
2395
  "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
2396
2396
  "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
2397
2397
  ")"
2398
- ")[\\s\\S]*"
2398
+ ")"
2399
2399
  ),
2400
2400
  });
2401
2401
  data.preserved_tokens = {
@@ -1086,6 +1086,7 @@ struct common_init_result::impl {
1086
1086
  std::vector<llama_adapter_lora_ptr> lora;
1087
1087
 
1088
1088
  std::vector<common_sampler_ptr> samplers;
1089
+ std::vector<llama_sampler_seq_config> samplers_seq_config;
1089
1090
  };
1090
1091
 
1091
1092
  common_init_result::common_init_result(common_params & params) :
@@ -1096,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
1096
1097
  if (params.fit_params) {
1097
1098
  LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
1098
1099
  llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
1099
- params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
1100
+ params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
1100
1101
  params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
1101
1102
  }
1102
1103
 
@@ -1162,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
1162
1163
  // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
1163
1164
  //}
1164
1165
 
1166
+ // init the backend samplers as part of the context creation
1165
1167
  pimpl->samplers.resize(cparams.n_seq_max);
1168
+ pimpl->samplers_seq_config.resize(cparams.n_seq_max);
1166
1169
 
1167
1170
  for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
1168
1171
  pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
1172
+ pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
1173
+ }
1174
+
1175
+ // TODO: temporarily gated behind a flag
1176
+ if (params.sampling.backend_sampling) {
1177
+ cparams.samplers = pimpl->samplers_seq_config.data();
1178
+ cparams.n_samplers = pimpl->samplers_seq_config.size();
1169
1179
  }
1170
1180
 
1171
1181
  llama_context * lctx = llama_init_from_model(model, cparams);
@@ -1189,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
1189
1199
  return pimpl->samplers[seq_id].get();
1190
1200
  }
1191
1201
 
1202
+ void common_init_result::reset_samplers() {
1203
+ for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
1204
+ llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
1205
+ }
1206
+ }
1207
+
1192
1208
  std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
1193
1209
  return pimpl->lora;
1194
1210
  }
@@ -1304,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
1304
1320
  llama_synchronize(lctx);
1305
1321
  llama_perf_context_reset(lctx);
1306
1322
  llama_set_warmup(lctx, false);
1323
+
1324
+ // reset samplers to reset RNG state after warmup to the seeded state
1325
+ res->reset_samplers();
1307
1326
  }
1308
1327
 
1309
1328
  return res;
@@ -1348,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
1348
1367
  mparams.split_mode = params.split_mode;
1349
1368
  mparams.tensor_split = params.tensor_split;
1350
1369
  mparams.use_mmap = params.use_mmap;
1370
+ mparams.use_direct_io = params.use_direct_io;
1351
1371
  mparams.use_mlock = params.use_mlock;
1352
1372
  mparams.check_tensors = params.check_tensors;
1353
1373
  mparams.use_extra_bufts = !params.no_extra_bufts;