@fugood/llama.node 1.4.12 → 1.4.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +11 -1
- package/lib/index.js +2 -1
- package/lib/index.ts +2 -0
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaContext.cpp +5 -2
- package/src/llama.cpp/common/arg.cpp +249 -101
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +21 -1
- package/src/llama.cpp/common/common.h +20 -7
- package/src/llama.cpp/common/download.cpp +104 -55
- package/src/llama.cpp/common/download.h +26 -5
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/preset.cpp +76 -1
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +92 -10
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-mmap.cpp +70 -37
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +66 -16
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1233 -171
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +101 -57
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
#include "log.h"
|
|
7
7
|
#include "sampling.h"
|
|
8
8
|
#include "download.h"
|
|
9
|
+
#include "preset.h"
|
|
9
10
|
|
|
10
11
|
// fix problem with std::min and std::max
|
|
11
12
|
#if defined(_WIN32)
|
|
@@ -268,6 +269,46 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
|
|
|
268
269
|
}
|
|
269
270
|
}
|
|
270
271
|
|
|
272
|
+
static std::string clean_file_name(const std::string & fname) {
|
|
273
|
+
std::string clean_fname = fname;
|
|
274
|
+
string_replace_all(clean_fname, "\\", "_");
|
|
275
|
+
string_replace_all(clean_fname, "/", "_");
|
|
276
|
+
return clean_fname;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
|
|
280
|
+
GGML_ASSERT(!params.model.hf_repo.empty());
|
|
281
|
+
|
|
282
|
+
const bool offline = params.offline;
|
|
283
|
+
std::string model_endpoint = get_model_endpoint();
|
|
284
|
+
auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
|
|
285
|
+
|
|
286
|
+
// prepare local path for caching
|
|
287
|
+
auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
|
|
288
|
+
auto preset_path = fs_get_cache_file(preset_fname);
|
|
289
|
+
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
|
|
290
|
+
const bool has_preset = status >= 200 && status < 400;
|
|
291
|
+
|
|
292
|
+
// remote preset is optional, so we don't error out if not found
|
|
293
|
+
if (has_preset) {
|
|
294
|
+
LOG_INF("applying remote preset from %s\n", preset_url.c_str());
|
|
295
|
+
common_preset_context ctx(ex, /* only_remote_allowed */ true);
|
|
296
|
+
common_preset global; // unused for now
|
|
297
|
+
auto remote_presets = ctx.load_from_ini(preset_path, global);
|
|
298
|
+
if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
|
|
299
|
+
common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
|
|
300
|
+
LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
|
|
301
|
+
preset.apply_to_params(params);
|
|
302
|
+
} else {
|
|
303
|
+
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
|
|
304
|
+
}
|
|
305
|
+
} else {
|
|
306
|
+
LOG_INF("%s", "no remote preset found, skipping\n");
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
return has_preset;
|
|
310
|
+
}
|
|
311
|
+
|
|
271
312
|
struct handle_model_result {
|
|
272
313
|
bool found_mmproj = false;
|
|
273
314
|
common_params_model mmproj;
|
|
@@ -309,9 +350,7 @@ static handle_model_result common_params_handle_model(
|
|
|
309
350
|
// make sure model path is present (for caching purposes)
|
|
310
351
|
if (model.path.empty()) {
|
|
311
352
|
// this is to avoid different repo having same file name, or same file name in different subdirs
|
|
312
|
-
std::string filename = model.hf_repo + "_" + model.hf_file;
|
|
313
|
-
// to make sure we don't have any slashes in the filename
|
|
314
|
-
string_replace_all(filename, "/", "_");
|
|
353
|
+
std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
|
|
315
354
|
model.path = fs_get_cache_file(filename);
|
|
316
355
|
}
|
|
317
356
|
|
|
@@ -425,61 +464,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
|
|
425
464
|
}
|
|
426
465
|
};
|
|
427
466
|
|
|
428
|
-
|
|
467
|
+
auto parse_cli_args = [&]() {
|
|
468
|
+
std::set<std::string> seen_args;
|
|
429
469
|
|
|
430
|
-
|
|
431
|
-
|
|
470
|
+
for (int i = 1; i < argc; i++) {
|
|
471
|
+
const std::string arg_prefix = "--";
|
|
432
472
|
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
}
|
|
437
|
-
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
438
|
-
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
439
|
-
}
|
|
440
|
-
if (!seen_args.insert(arg).second) {
|
|
441
|
-
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
|
442
|
-
}
|
|
443
|
-
auto & tmp = arg_to_options[arg];
|
|
444
|
-
auto opt = *tmp.first;
|
|
445
|
-
bool is_positive = tmp.second;
|
|
446
|
-
if (opt.has_value_from_env()) {
|
|
447
|
-
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
|
448
|
-
}
|
|
449
|
-
try {
|
|
450
|
-
if (opt.handler_void) {
|
|
451
|
-
opt.handler_void(params);
|
|
452
|
-
continue;
|
|
473
|
+
std::string arg = argv[i];
|
|
474
|
+
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
|
475
|
+
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
453
476
|
}
|
|
454
|
-
if (
|
|
455
|
-
|
|
456
|
-
continue;
|
|
477
|
+
if (arg_to_options.find(arg) == arg_to_options.end()) {
|
|
478
|
+
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
|
|
457
479
|
}
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
check_arg(i);
|
|
461
|
-
std::string val = argv[++i];
|
|
462
|
-
if (opt.handler_int) {
|
|
463
|
-
opt.handler_int(params, std::stoi(val));
|
|
464
|
-
continue;
|
|
480
|
+
if (!seen_args.insert(arg).second) {
|
|
481
|
+
LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
|
|
465
482
|
}
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
483
|
+
auto & tmp = arg_to_options[arg];
|
|
484
|
+
auto opt = *tmp.first;
|
|
485
|
+
bool is_positive = tmp.second;
|
|
486
|
+
if (opt.has_value_from_env()) {
|
|
487
|
+
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
|
|
469
488
|
}
|
|
489
|
+
try {
|
|
490
|
+
if (opt.handler_void) {
|
|
491
|
+
opt.handler_void(params);
|
|
492
|
+
continue;
|
|
493
|
+
}
|
|
494
|
+
if (opt.handler_bool) {
|
|
495
|
+
opt.handler_bool(params, is_positive);
|
|
496
|
+
continue;
|
|
497
|
+
}
|
|
470
498
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
499
|
+
// arg with single value
|
|
500
|
+
check_arg(i);
|
|
501
|
+
std::string val = argv[++i];
|
|
502
|
+
if (opt.handler_int) {
|
|
503
|
+
opt.handler_int(params, std::stoi(val));
|
|
504
|
+
continue;
|
|
505
|
+
}
|
|
506
|
+
if (opt.handler_string) {
|
|
507
|
+
opt.handler_string(params, val);
|
|
508
|
+
continue;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
// arg with 2 values
|
|
512
|
+
check_arg(i);
|
|
513
|
+
std::string val2 = argv[++i];
|
|
514
|
+
if (opt.handler_str_str) {
|
|
515
|
+
opt.handler_str_str(params, val, val2);
|
|
516
|
+
continue;
|
|
517
|
+
}
|
|
518
|
+
} catch (std::exception & e) {
|
|
519
|
+
throw std::invalid_argument(string_format(
|
|
520
|
+
"error while handling argument \"%s\": %s\n\n"
|
|
521
|
+
"usage:\n%s\n\nto show complete usage, run with -h",
|
|
522
|
+
arg.c_str(), e.what(), opt.to_string().c_str()));
|
|
477
523
|
}
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
524
|
+
}
|
|
525
|
+
};
|
|
526
|
+
|
|
527
|
+
// parse the first time to get -hf option (used for remote preset)
|
|
528
|
+
parse_cli_args();
|
|
529
|
+
|
|
530
|
+
// maybe handle remote preset
|
|
531
|
+
if (!params.model.hf_repo.empty()) {
|
|
532
|
+
std::string cli_hf_repo = params.model.hf_repo;
|
|
533
|
+
bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
|
|
534
|
+
|
|
535
|
+
// special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
|
|
536
|
+
// this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
|
|
537
|
+
std::string preset_hf_repo = params.model.hf_repo;
|
|
538
|
+
bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
|
|
539
|
+
|
|
540
|
+
if (has_preset) {
|
|
541
|
+
// re-parse CLI args to override preset values
|
|
542
|
+
parse_cli_args();
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
// preserve hf_repo from preset if needed
|
|
546
|
+
if (preset_has_hf_repo) {
|
|
547
|
+
params.model.hf_repo = preset_hf_repo;
|
|
483
548
|
}
|
|
484
549
|
}
|
|
485
550
|
|
|
@@ -679,7 +744,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
|
|
|
679
744
|
"llama-quantize",
|
|
680
745
|
"llama-qwen2vl-cli",
|
|
681
746
|
"llama-retrieval",
|
|
682
|
-
"llama-run",
|
|
683
747
|
"llama-save-load-state",
|
|
684
748
|
"llama-server",
|
|
685
749
|
"llama-simple",
|
|
@@ -854,6 +918,54 @@ bool common_arg_utils::is_autoy(const std::string & value) {
|
|
|
854
918
|
return value == "auto" || value == "-1";
|
|
855
919
|
}
|
|
856
920
|
|
|
921
|
+
// Simple CSV parser that handles quoted fields and escaped quotes
|
|
922
|
+
// example:
|
|
923
|
+
// input: value1,"value, with, commas","value with ""escaped"" quotes",value4
|
|
924
|
+
// output: [value1] [value, with, commas] [value with "escaped" quotes] [value4]
|
|
925
|
+
static std::vector<std::string> parse_csv_row(const std::string& input) {
|
|
926
|
+
std::vector<std::string> fields;
|
|
927
|
+
std::string field;
|
|
928
|
+
bool in_quotes = false;
|
|
929
|
+
|
|
930
|
+
for (size_t i = 0; i < input.length(); ++i) {
|
|
931
|
+
char ch = input[i];
|
|
932
|
+
|
|
933
|
+
if (ch == '"') {
|
|
934
|
+
if (!in_quotes) {
|
|
935
|
+
// start of quoted field (only valid if at beginning of field)
|
|
936
|
+
if (!field.empty()) {
|
|
937
|
+
// quote appeared in middle of unquoted field, treat as literal
|
|
938
|
+
field += '"';
|
|
939
|
+
} else {
|
|
940
|
+
in_quotes = true; // start
|
|
941
|
+
}
|
|
942
|
+
} else {
|
|
943
|
+
if (i + 1 < input.length() && input[i + 1] == '"') {
|
|
944
|
+
// escaped quote: ""
|
|
945
|
+
field += '"';
|
|
946
|
+
++i; // skip the next quote
|
|
947
|
+
} else {
|
|
948
|
+
in_quotes = false; // end
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
} else if (ch == ',') {
|
|
952
|
+
if (in_quotes) {
|
|
953
|
+
field += ',';
|
|
954
|
+
} else {
|
|
955
|
+
fields.push_back(std::move(field));
|
|
956
|
+
field.clear();
|
|
957
|
+
}
|
|
958
|
+
} else {
|
|
959
|
+
field += ch;
|
|
960
|
+
}
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
// Add the last field
|
|
964
|
+
fields.push_back(std::move(field));
|
|
965
|
+
|
|
966
|
+
return fields;
|
|
967
|
+
}
|
|
968
|
+
|
|
857
969
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
|
|
858
970
|
// per-example default params
|
|
859
971
|
// we define here to make sure it's included in llama-gen-docs
|
|
@@ -1250,7 +1362,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1250
1362
|
{"--in-file"}, "FNAME",
|
|
1251
1363
|
"an input file (use comma-separated values to specify multiple files)",
|
|
1252
1364
|
[](common_params & params, const std::string & value) {
|
|
1253
|
-
for (const auto & item :
|
|
1365
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
1254
1366
|
std::ifstream file(item);
|
|
1255
1367
|
if (!file) {
|
|
1256
1368
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
|
@@ -1397,7 +1509,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1397
1509
|
[](common_params & params, bool value) {
|
|
1398
1510
|
params.warmup = value;
|
|
1399
1511
|
}
|
|
1400
|
-
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1512
|
+
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_DEBUG}));
|
|
1401
1513
|
add_opt(common_arg(
|
|
1402
1514
|
{"--spm-infill"},
|
|
1403
1515
|
string_format(
|
|
@@ -1695,6 +1807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1695
1807
|
params.sampling.grammar = json_schema_to_grammar(json::parse(schema));
|
|
1696
1808
|
}
|
|
1697
1809
|
).set_sparam());
|
|
1810
|
+
add_opt(common_arg(
|
|
1811
|
+
{"-bs", "--backend-sampling"},
|
|
1812
|
+
"enable backend sampling (experimental) (default: disabled)",
|
|
1813
|
+
[](common_params & params) {
|
|
1814
|
+
params.sampling.backend_sampling = true;
|
|
1815
|
+
}
|
|
1816
|
+
).set_sparam().set_env("LLAMA_ARG_BACKEND_SAMPLING"));
|
|
1698
1817
|
add_opt(common_arg(
|
|
1699
1818
|
{"--pooling"}, "{none,mean,cls,last,rank}",
|
|
1700
1819
|
"pooling type for embeddings, use model default if unspecified",
|
|
@@ -1706,7 +1825,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1706
1825
|
else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; }
|
|
1707
1826
|
else { throw std::invalid_argument("invalid value"); }
|
|
1708
1827
|
}
|
|
1709
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING"));
|
|
1828
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_POOLING"));
|
|
1710
1829
|
add_opt(common_arg(
|
|
1711
1830
|
{"--attention"}, "{causal,non-causal}",
|
|
1712
1831
|
"attention type for embeddings, use model default if unspecified",
|
|
@@ -1995,7 +2114,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1995
2114
|
{"--image", "--audio"}, "FILE",
|
|
1996
2115
|
"path to an image or audio file. use with multimodal models, use comma-separated values for multiple files\n",
|
|
1997
2116
|
[](common_params & params, const std::string & value) {
|
|
1998
|
-
for (const auto & item :
|
|
2117
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
1999
2118
|
params.image.emplace_back(item);
|
|
2000
2119
|
}
|
|
2001
2120
|
}
|
|
@@ -2034,11 +2153,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2034
2153
|
add_opt(common_arg(
|
|
2035
2154
|
{"--mmap"},
|
|
2036
2155
|
{"--no-mmap"},
|
|
2037
|
-
string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
|
2156
|
+
string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
|
2038
2157
|
[](common_params & params, bool value) {
|
|
2039
2158
|
params.use_mmap = value;
|
|
2159
|
+
if (value) {
|
|
2160
|
+
params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
|
|
2161
|
+
}
|
|
2040
2162
|
}
|
|
2041
2163
|
).set_env("LLAMA_ARG_MMAP"));
|
|
2164
|
+
add_opt(common_arg(
|
|
2165
|
+
{"-dio", "--direct-io"},
|
|
2166
|
+
{"-ndio", "--no-direct-io"},
|
|
2167
|
+
string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
|
|
2168
|
+
[](common_params & params, bool value) {
|
|
2169
|
+
params.use_direct_io = value;
|
|
2170
|
+
}
|
|
2171
|
+
).set_env("LLAMA_ARG_DIO"));
|
|
2042
2172
|
add_opt(common_arg(
|
|
2043
2173
|
{"--numa"}, "TYPE",
|
|
2044
2174
|
"attempt optimizations that help on some NUMA systems\n"
|
|
@@ -2190,7 +2320,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2190
2320
|
std::vector<std::string> split_arg{ it, {} };
|
|
2191
2321
|
if (split_arg.size() >= llama_max_devices()) {
|
|
2192
2322
|
throw std::invalid_argument(
|
|
2193
|
-
string_format("got %
|
|
2323
|
+
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
|
2194
2324
|
);
|
|
2195
2325
|
}
|
|
2196
2326
|
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
@@ -2230,10 +2360,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2230
2360
|
}
|
|
2231
2361
|
).set_env("LLAMA_ARG_FIT"));
|
|
2232
2362
|
add_opt(common_arg(
|
|
2233
|
-
{ "-fitt", "--fit-target" }, "
|
|
2234
|
-
string_format("target margin per device for --fit
|
|
2235
|
-
|
|
2236
|
-
|
|
2363
|
+
{ "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
|
|
2364
|
+
string_format("target margin per device for --fit, comma-separated list of values, "
|
|
2365
|
+
"single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
|
|
2366
|
+
[](common_params & params, const std::string & value) {
|
|
2367
|
+
std::string arg_next = value;
|
|
2368
|
+
|
|
2369
|
+
// split string by , and /
|
|
2370
|
+
const std::regex regex{ R"([,/]+)" };
|
|
2371
|
+
std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
|
|
2372
|
+
std::vector<std::string> split_arg{ it, {} };
|
|
2373
|
+
if (split_arg.size() >= llama_max_devices()) {
|
|
2374
|
+
throw std::invalid_argument(
|
|
2375
|
+
string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
|
|
2376
|
+
);
|
|
2377
|
+
}
|
|
2378
|
+
if (split_arg.size() == 1) {
|
|
2379
|
+
std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
|
|
2380
|
+
return;
|
|
2381
|
+
}
|
|
2382
|
+
for (size_t i = 0; i < split_arg.size(); i++) {
|
|
2383
|
+
params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
|
|
2384
|
+
}
|
|
2237
2385
|
}
|
|
2238
2386
|
).set_env("LLAMA_ARG_FIT_TARGET"));
|
|
2239
2387
|
add_opt(common_arg(
|
|
@@ -2252,37 +2400,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2252
2400
|
));
|
|
2253
2401
|
add_opt(common_arg(
|
|
2254
2402
|
{"--override-kv"}, "KEY=TYPE:VALUE,...",
|
|
2255
|
-
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated
|
|
2403
|
+
"advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated values.\n"
|
|
2256
2404
|
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false",
|
|
2257
2405
|
[](common_params & params, const std::string & value) {
|
|
2258
|
-
|
|
2259
|
-
|
|
2260
|
-
|
|
2261
|
-
bool escaping = false;
|
|
2262
|
-
|
|
2263
|
-
for (const char c : value) {
|
|
2264
|
-
if (escaping) {
|
|
2265
|
-
current.push_back(c);
|
|
2266
|
-
escaping = false;
|
|
2267
|
-
} else if (c == '\\') {
|
|
2268
|
-
escaping = true;
|
|
2269
|
-
} else if (c == ',') {
|
|
2270
|
-
kv_overrides.push_back(current);
|
|
2271
|
-
current.clear();
|
|
2272
|
-
} else {
|
|
2273
|
-
current.push_back(c);
|
|
2274
|
-
}
|
|
2275
|
-
}
|
|
2276
|
-
|
|
2277
|
-
if (escaping) {
|
|
2278
|
-
current.push_back('\\');
|
|
2279
|
-
}
|
|
2280
|
-
|
|
2281
|
-
kv_overrides.push_back(current);
|
|
2282
|
-
|
|
2283
|
-
for (const auto & kv_override : kv_overrides) {
|
|
2284
|
-
if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) {
|
|
2285
|
-
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str()));
|
|
2406
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2407
|
+
if (!string_parse_kv_override(item.c_str(), params.kv_overrides)) {
|
|
2408
|
+
throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", item.c_str()));
|
|
2286
2409
|
}
|
|
2287
2410
|
}
|
|
2288
2411
|
}
|
|
@@ -2299,7 +2422,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2299
2422
|
{"--lora"}, "FNAME",
|
|
2300
2423
|
"path to LoRA adapter (use comma-separated values to load multiple adapters)",
|
|
2301
2424
|
[](common_params & params, const std::string & value) {
|
|
2302
|
-
for (const auto & item :
|
|
2425
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2303
2426
|
params.lora_adapters.push_back({ item, 1.0, "", "", nullptr });
|
|
2304
2427
|
}
|
|
2305
2428
|
}
|
|
@@ -2310,7 +2433,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2310
2433
|
"path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n"
|
|
2311
2434
|
"note: use comma-separated values",
|
|
2312
2435
|
[](common_params & params, const std::string & value) {
|
|
2313
|
-
for (const auto & item :
|
|
2436
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2314
2437
|
auto parts = string_split<std::string>(item, ':');
|
|
2315
2438
|
if (parts.size() != 2) {
|
|
2316
2439
|
throw std::invalid_argument("lora-scaled format: FNAME:SCALE");
|
|
@@ -2324,7 +2447,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2324
2447
|
{"--control-vector"}, "FNAME",
|
|
2325
2448
|
"add a control vector\nnote: use comma-separated values to add multiple control vectors",
|
|
2326
2449
|
[](common_params & params, const std::string & value) {
|
|
2327
|
-
for (const auto & item :
|
|
2450
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2328
2451
|
params.control_vectors.push_back({ 1.0f, item, });
|
|
2329
2452
|
}
|
|
2330
2453
|
}
|
|
@@ -2334,7 +2457,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2334
2457
|
"add a control vector with user defined scaling SCALE\n"
|
|
2335
2458
|
"note: use comma-separated values (format: FNAME:SCALE,...)",
|
|
2336
2459
|
[](common_params & params, const std::string & value) {
|
|
2337
|
-
for (const auto & item :
|
|
2460
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2338
2461
|
auto parts = string_split<std::string>(item, ':');
|
|
2339
2462
|
if (parts.size() != 2) {
|
|
2340
2463
|
throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE");
|
|
@@ -2432,7 +2555,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2432
2555
|
{"--context-file"}, "FNAME",
|
|
2433
2556
|
"file to load context from (use comma-separated values to specify multiple files)",
|
|
2434
2557
|
[](common_params & params, const std::string & value) {
|
|
2435
|
-
for (const auto & item :
|
|
2558
|
+
for (const auto & item : parse_csv_row(value)) {
|
|
2436
2559
|
std::ifstream file(item, std::ios::binary);
|
|
2437
2560
|
if (!file) {
|
|
2438
2561
|
throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str()));
|
|
@@ -2579,7 +2702,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2579
2702
|
[](common_params & params, int value) {
|
|
2580
2703
|
params.embd_normalize = value;
|
|
2581
2704
|
}
|
|
2582
|
-
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
|
2705
|
+
).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
|
|
2583
2706
|
add_opt(common_arg(
|
|
2584
2707
|
{"--embd-output-format"}, "FORMAT",
|
|
2585
2708
|
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
|
|
@@ -2657,7 +2780,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2657
2780
|
[](common_params & params) {
|
|
2658
2781
|
params.embedding = true;
|
|
2659
2782
|
}
|
|
2660
|
-
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
2783
|
+
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}).set_env("LLAMA_ARG_EMBEDDINGS"));
|
|
2661
2784
|
add_opt(common_arg(
|
|
2662
2785
|
{"--rerank", "--reranking"},
|
|
2663
2786
|
string_format("enable reranking endpoint on server (default: %s)", "disabled"),
|
|
@@ -2668,9 +2791,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2668
2791
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING"));
|
|
2669
2792
|
add_opt(common_arg(
|
|
2670
2793
|
{"--api-key"}, "KEY",
|
|
2671
|
-
"API key to use for authentication (default: none)",
|
|
2794
|
+
"API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)",
|
|
2672
2795
|
[](common_params & params, const std::string & value) {
|
|
2673
|
-
|
|
2796
|
+
for (const auto & key : parse_csv_row(value)) {
|
|
2797
|
+
if (!key.empty()) {
|
|
2798
|
+
params.api_keys.push_back(key);
|
|
2799
|
+
}
|
|
2800
|
+
}
|
|
2674
2801
|
}
|
|
2675
2802
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
|
|
2676
2803
|
add_opt(common_arg(
|
|
@@ -2684,7 +2811,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2684
2811
|
std::string key;
|
|
2685
2812
|
while (std::getline(key_file, key)) {
|
|
2686
2813
|
if (!key.empty()) {
|
|
2687
|
-
|
|
2814
|
+
params.api_keys.push_back(key);
|
|
2688
2815
|
}
|
|
2689
2816
|
}
|
|
2690
2817
|
key_file.close();
|
|
@@ -2706,7 +2833,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2706
2833
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
|
|
2707
2834
|
add_opt(common_arg(
|
|
2708
2835
|
{"--chat-template-kwargs"}, "STRING",
|
|
2709
|
-
|
|
2836
|
+
"sets additional params for the json template parser, must be a valid json object string, e.g. '{\"key1\":\"value1\",\"key2\":\"value2\"}'",
|
|
2710
2837
|
[](common_params & params, const std::string & value) {
|
|
2711
2838
|
auto parsed = json::parse(value);
|
|
2712
2839
|
for (const auto & item : parsed.items()) {
|
|
@@ -3344,6 +3471,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3344
3471
|
}
|
|
3345
3472
|
}
|
|
3346
3473
|
).set_examples({ LLAMA_EXAMPLE_FINETUNE }));
|
|
3474
|
+
add_opt(common_arg(
|
|
3475
|
+
{"--save-logits"},
|
|
3476
|
+
string_format("save final logits to files for verification (default: %s)", params.save_logits ? "true" : "false"),
|
|
3477
|
+
[](common_params & params) {
|
|
3478
|
+
params.save_logits = true;
|
|
3479
|
+
}
|
|
3480
|
+
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
|
3481
|
+
add_opt(common_arg(
|
|
3482
|
+
{"--logits-output-dir"}, "PATH",
|
|
3483
|
+
string_format("directory for saving logits output files (default: %s)", params.logits_output_dir.c_str()),
|
|
3484
|
+
[](common_params & params, const std::string & value) {
|
|
3485
|
+
params.logits_output_dir = value;
|
|
3486
|
+
}
|
|
3487
|
+
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
|
3488
|
+
add_opt(common_arg(
|
|
3489
|
+
{"--tensor-filter"}, "REGEX",
|
|
3490
|
+
"filter tensor names for debug output (regex pattern, can be specified multiple times)",
|
|
3491
|
+
[](common_params & params, const std::string & value) {
|
|
3492
|
+
params.tensor_filter.push_back(value);
|
|
3493
|
+
}
|
|
3494
|
+
).set_examples({LLAMA_EXAMPLE_DEBUG}));
|
|
3347
3495
|
|
|
3348
3496
|
// presets
|
|
3349
3497
|
add_opt(common_arg(
|
|
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
|
|
|
129
129
|
|
|
130
130
|
// initialize argument parser context - used by test-arg-parser and preset
|
|
131
131
|
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
|
|
132
|
-
|
|
133
|
-
struct common_remote_params {
|
|
134
|
-
std::vector<std::string> headers;
|
|
135
|
-
long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
|
|
136
|
-
long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
|
|
137
|
-
};
|
|
138
|
-
// get remote file content, returns <http_code, raw_response_body>
|
|
139
|
-
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
|
|
@@ -2052,7 +2052,7 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
|
|
2052
2052
|
// Trigger on tool calls that appear in the commentary channel
|
|
2053
2053
|
data.grammar_triggers.push_back({
|
|
2054
2054
|
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
|
2055
|
-
"<\\|channel\\|>(commentary|analysis) to"
|
|
2055
|
+
"<\\|channel\\|>(?:commentary|analysis) to"
|
|
2056
2056
|
});
|
|
2057
2057
|
|
|
2058
2058
|
// Trigger tool calls that appear in the role section, either at the
|
|
@@ -2385,17 +2385,17 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
|
|
|
2385
2385
|
(inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
|
|
2386
2386
|
// Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
|
|
2387
2387
|
data.grammar_triggers.push_back({
|
|
2388
|
-
|
|
2388
|
+
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
|
2389
2389
|
// If thinking_forced_open, then we capture the </think> tag in the grammar,
|
|
2390
2390
|
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
|
|
2391
|
-
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
|
|
2391
|
+
std::string(data.thinking_forced_open ? "(</think>\\s*)" : "") + (
|
|
2392
2392
|
"\\s*("
|
|
2393
2393
|
"(?:<tool_call>"
|
|
2394
2394
|
"|<function"
|
|
2395
2395
|
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
|
|
2396
2396
|
"\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
|
|
2397
2397
|
")"
|
|
2398
|
-
")[\\s\\S]*"
|
|
2398
|
+
")"
|
|
2399
2399
|
),
|
|
2400
2400
|
});
|
|
2401
2401
|
data.preserved_tokens = {
|
|
@@ -1086,6 +1086,7 @@ struct common_init_result::impl {
|
|
|
1086
1086
|
std::vector<llama_adapter_lora_ptr> lora;
|
|
1087
1087
|
|
|
1088
1088
|
std::vector<common_sampler_ptr> samplers;
|
|
1089
|
+
std::vector<llama_sampler_seq_config> samplers_seq_config;
|
|
1089
1090
|
};
|
|
1090
1091
|
|
|
1091
1092
|
common_init_result::common_init_result(common_params & params) :
|
|
@@ -1096,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
|
|
|
1096
1097
|
if (params.fit_params) {
|
|
1097
1098
|
LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
|
|
1098
1099
|
llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
|
|
1099
|
-
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
|
|
1100
|
+
params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
|
|
1100
1101
|
params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
|
|
1101
1102
|
}
|
|
1102
1103
|
|
|
@@ -1162,10 +1163,19 @@ common_init_result::common_init_result(common_params & params) :
|
|
|
1162
1163
|
// params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
|
1163
1164
|
//}
|
|
1164
1165
|
|
|
1166
|
+
// init the backend samplers as part of the context creation
|
|
1165
1167
|
pimpl->samplers.resize(cparams.n_seq_max);
|
|
1168
|
+
pimpl->samplers_seq_config.resize(cparams.n_seq_max);
|
|
1166
1169
|
|
|
1167
1170
|
for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
|
|
1168
1171
|
pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
|
|
1172
|
+
pimpl->samplers_seq_config[i] = { i, common_sampler_get(pimpl->samplers[i].get()) };
|
|
1173
|
+
}
|
|
1174
|
+
|
|
1175
|
+
// TODO: temporarily gated behind a flag
|
|
1176
|
+
if (params.sampling.backend_sampling) {
|
|
1177
|
+
cparams.samplers = pimpl->samplers_seq_config.data();
|
|
1178
|
+
cparams.n_samplers = pimpl->samplers_seq_config.size();
|
|
1169
1179
|
}
|
|
1170
1180
|
|
|
1171
1181
|
llama_context * lctx = llama_init_from_model(model, cparams);
|
|
@@ -1189,6 +1199,12 @@ common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
|
|
|
1189
1199
|
return pimpl->samplers[seq_id].get();
|
|
1190
1200
|
}
|
|
1191
1201
|
|
|
1202
|
+
void common_init_result::reset_samplers() {
|
|
1203
|
+
for (int i = 0; i < (int) pimpl->samplers.size(); ++i) {
|
|
1204
|
+
llama_sampler_reset(common_sampler_get(pimpl->samplers[i].get()));
|
|
1205
|
+
}
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1192
1208
|
std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
|
|
1193
1209
|
return pimpl->lora;
|
|
1194
1210
|
}
|
|
@@ -1304,6 +1320,9 @@ common_init_result_ptr common_init_from_params(common_params & params) {
|
|
|
1304
1320
|
llama_synchronize(lctx);
|
|
1305
1321
|
llama_perf_context_reset(lctx);
|
|
1306
1322
|
llama_set_warmup(lctx, false);
|
|
1323
|
+
|
|
1324
|
+
// reset samplers to reset RNG state after warmup to the seeded state
|
|
1325
|
+
res->reset_samplers();
|
|
1307
1326
|
}
|
|
1308
1327
|
|
|
1309
1328
|
return res;
|
|
@@ -1348,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|
|
1348
1367
|
mparams.split_mode = params.split_mode;
|
|
1349
1368
|
mparams.tensor_split = params.tensor_split;
|
|
1350
1369
|
mparams.use_mmap = params.use_mmap;
|
|
1370
|
+
mparams.use_direct_io = params.use_direct_io;
|
|
1351
1371
|
mparams.use_mlock = params.use_mlock;
|
|
1352
1372
|
mparams.check_tensors = params.check_tensors;
|
|
1353
1373
|
mparams.use_extra_bufts = !params.no_extra_bufts;
|