@fugood/llama.node 1.4.13 → 1.4.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +23 -2
- package/lib/index.js +2 -1
- package/lib/index.ts +8 -1
- package/lib/parallel.ts +2 -2
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -12
- package/src/LlamaContext.cpp +16 -4
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +3 -34
- package/src/llama.cpp/common/arg.cpp +183 -60
- package/src/llama.cpp/common/arg.h +0 -8
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +67 -0
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +2 -1
- package/src/llama.cpp/common/common.h +12 -7
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +88 -369
- package/src/llama.cpp/common/download.h +32 -5
- package/src/llama.cpp/common/preset.cpp +87 -2
- package/src/llama.cpp/common/preset.h +10 -1
- package/src/llama.cpp/ggml/include/ggml.h +5 -0
- package/src/llama.cpp/include/llama.h +5 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-mmap.cpp +78 -42
- package/src/llama.cpp/src/llama-mmap.h +5 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +225 -101
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/llama.cpp +63 -27
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
package/src/llama.cpp/common/arg.cpp:

```diff
@@ -2,10 +2,11 @@
 
 #include "chat.h"
 #include "common.h"
+#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "
+#include "preset.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -47,6 +48,8 @@
 
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
+extern const char * LICENSES[];
+
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
 
```
```diff
@@ -268,6 +271,55 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
     }
 }
 
+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global;
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        remote_presets = ctx.cascade(global, remote_presets);
+        if (remote_presets.find(hf_tag) != remote_presets.end()) {
+            common_preset preset = remote_presets.at(hf_tag);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+        }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
```
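For reference, a standalone sketch (not part of the package) of the file-name flattening that `clean_file_name()` performs before the preset is cached; the repo id `org/model` is invented for illustration:

```cpp
#include <iostream>
#include <string>

// mirrors string_replace_all() as used by clean_file_name() above
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string fname = std::string("org/model") + "_preset.ini"; // hypothetical repo
    replace_all(fname, "\\", "_");
    replace_all(fname, "/", "_");
    std::cout << fname << "\n"; // prints: org_model_preset.ini
}
```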
```diff
@@ -289,7 +341,7 @@ static handle_model_result common_params_handle_model(
     if (model.path.empty()) {
         auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
         if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-            exit(1); //
+            exit(1); // error message already printed
         }
         model.name = model.hf_repo; // repo name with tag
         model.hf_repo = auto_detected.repo; // repo name without tag
@@ -309,9 +361,7 @@ static handle_model_result common_params_handle_model(
     // make sure model path is present (for caching purposes)
     if (model.path.empty()) {
         // this is to avoid different repo having same file name, or same file name in different subdirs
-        std::string filename = model.hf_repo + "_" + model.hf_file;
-        // to make sure we don't have any slashes in the filename
-        string_replace_all(filename, "/", "_");
+        std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
         model.path = fs_get_cache_file(filename);
     }
 
```
```diff
@@ -425,61 +475,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
-    std::set<std::string> seen_args;
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
 
-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";
 
-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
             }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
             }
-
-            // arg with single value
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
             }
-            if (opt.handler_string) {
-                opt.handler_string(params, val);
-                continue;
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
             }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }
 
-            // arg with 2 values
-            check_arg(i);
-            std::string val2 = argv[++i];
-            if (opt.handler_str_str) {
-                opt.handler_str_str(params, val, val2);
-                continue;
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }
+
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
             }
-        } catch (std::exception & e) {
-            throw std::invalid_argument(string_format(
-                "error while handling argument \"%s\": %s\n\n"
-                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
         }
     }
 
```
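The net effect is a three-level precedence: explicit CLI flags beat remote-preset values, which beat defaults, because the preset is applied between the two CLI passes. A toy sketch of that ordering (names and values invented; this is not the real parser):

```cpp
#include <cassert>
#include <map>
#include <string>

struct toy_params { int n_ctx = 0; };

// stand-in for parse_cli_args(): re-applies whatever was on the command line
static void apply_cli(toy_params & p, const std::map<std::string, int> & cli) {
    if (auto it = cli.find("--ctx-size"); it != cli.end()) {
        p.n_ctx = it->second;
    }
}

int main() {
    toy_params p;
    const std::map<std::string, int> cli = {{"--ctx-size", 8192}};

    apply_cli(p, cli); // pass 1: discover -hf so the remote preset can be fetched
    p.n_ctx = 2048;    // remote preset applies its values
    apply_cli(p, cli); // pass 2: explicit CLI flags override the preset again

    assert(p.n_ctx == 8192); // CLI wins over preset
}
```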
```diff
@@ -965,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--license"},
+        "show source code license and dependencies",
+        [](common_params &) {
+            for (int i = 0; LICENSES[i]; ++i) {
+                printf("%s\n", LICENSES[i]);
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
```
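The loop in the `--license` handler stops at the first falsy entry, which implies that `LICENSES` (declared `extern` earlier in the file) is a null-terminated array. A hypothetical definition shaped to match; the real strings are produced elsewhere in the build:

```cpp
// Hypothetical contents, compatible with `for (int i = 0; LICENSES[i]; ++i)`.
const char * LICENSES[] = {
    "llama.cpp -- MIT License",
    "nlohmann/json -- MIT License",
    nullptr, // sentinel that terminates the loop above
};
```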
```diff
@@ -1209,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.kv_unified = true;
         }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
     add_opt(common_arg(
         {"--context-shift"},
         {"--no-context-shift"},
@@ -2088,11 +2174,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--mmap"},
         {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
         [](common_params & params, bool value) {
             params.use_mmap = value;
+            if (value) {
+                params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+            }
         }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"
```
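Taken together, the two options encode a small state machine: `use_direct_io` now defaults to on, an explicit `--mmap` switches it off, and `-dio`/`-ndio` set it directly. A compile-only sketch of just that interaction, using only the logic visible in the handlers above:

```cpp
struct io_flags {
    bool use_mmap      = true;
    bool use_direct_io = true; // new default, per the common.h hunk below
};

// mirrors the --mmap / --no-mmap handler
void on_mmap(io_flags & f, bool value) {
    f.use_mmap = value;
    if (value) {
        f.use_direct_io = false; // explicit --mmap disables direct I/O
    }
}

// mirrors the -dio/--direct-io / -ndio/--no-direct-io handler
void on_direct_io(io_flags & f, bool value) {
    f.use_direct_io = value;
}
```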
```diff
@@ -2244,7 +2341,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             std::vector<std::string> split_arg{ it, {} };
             if (split_arg.size() >= llama_max_devices()) {
                 throw std::invalid_argument(
-                    string_format("got %
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                 );
             }
             for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2284,10 +2381,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_FIT"));
     add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "
-        string_format("target margin per device for --fit
-
-
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+                      "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
         }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
     add_opt(common_arg(
```
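The `--fit-target` parser broadcasts a single value to every device and otherwise assigns per-device margins: for example, `--fit-target 2048` gives every device a 2 GiB margin, while `--fit-target 1024,2048` sets device 0 to 1 GiB and device 1 to 2 GiB. A standalone sketch of the same MiB-to-bytes logic (helper name invented):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

// values are MiB strings; a single value is broadcast to all devices
void set_fit_targets(std::vector<size_t> & targets, const std::vector<std::string> & vals) {
    if (vals.size() == 1) {
        std::fill(targets.begin(), targets.end(), std::stoul(vals[0]) * 1024 * 1024);
        return;
    }
    for (size_t i = 0; i < vals.size() && i < targets.size(); i++) {
        targets[i] = std::stoul(vals[i]) * 1024 * 1024;
    }
}
```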
```diff
@@ -2762,10 +2877,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_threads_http = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-prompt"},
+        {"--no-cache-prompt"},
+        string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cache_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
     add_opt(common_arg(
         {"--cache-reuse"}, "N",
         string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
             "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
         ),
         [](common_params & params, int value) {
```
package/src/llama.cpp/common/arg.h:

```diff
@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
 
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);
```
package/src/llama.cpp/common/chat-parser.cpp:

```diff
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
+    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
+    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    // Find all <tool_call></tool_call> blocks
+    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(first->groups[0].end);
+        builder.consume_spaces();
+
+        builder.try_consume_literal("```json");
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        // Consume JSON object
+        auto data = builder.consume_json();
+
+        builder.consume_spaces();
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        if (!builder.try_consume_literal("</tool_call>")) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+        builder.consume_spaces();
+
+        // Extract name and arguments
+        std::string name;
+        std::string id;
+        nlohmann::ordered_json arguments;
+
+        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+            if (!obj.contains("name") || !obj.contains("arguments")) {
+                return false;
+            }
+            name = obj.at("name").get<std::string>();
+            arguments = obj.at("arguments");
+            if (obj.contains("id") && obj.at("id").is_string()) {
+                id = obj.at("id").get<std::string>();
+            }
+            return true;
+        };
+
+        if (!extract_args(data.json)) {
+            if (data.json.contains("function") && data.json.at("function").is_object()) {
+                auto fn = data.json.at("function");
+                extract_args(fn);
+                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+                    id = data.json.at("id").get<std::string>();
+                }
+            }
+        }
+
+        // If name is empty, treat the JSON object as content
+        if (name.empty()) {
+            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+            builder.add_content(data.json.dump());
+            continue;
+        }
+
+        std::string args_str = arguments.dump();
+        if (!builder.add_tool_call(name, id, args_str)) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+    }
+
+    builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+    LOG_DBG("%s: parsing exaone_moe\n", __func__);
+    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+        }
+    }
+}
+
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
```
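For orientation, two illustrative payloads matching the shapes described in the parser's header comment; the tool name, arguments, and id are invented:

```cpp
// Shape 1: flat {"name", "arguments"} object.
const char * flat_form =
    R"(<tool_call>{"name": "get_weather", "arguments": {"city": "Seoul"}}</tool_call>)";

// Shape 2: OpenAI-style wrapper; the nested "function" object and outer "id"
// are unwrapped by the extract_args fallback above.
const char * wrapped_form =
    R"(<tool_call>{"id": "call_1", "type": "function",
                   "function": {"name": "get_weather", "arguments": {"city": "Seoul"}}}</tool_call>)";
// Both yield a tool call named "get_weather"; the second also carries an explicit id.
// A ```json fenced payload inside <tool_call> is tolerated via try_consume_literal.
```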
```diff
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SOLAR_OPEN:
             common_chat_parse_solar_open(builder);
             break;
+        case COMMON_CHAT_FORMAT_EXAONE_MOE:
+            common_chat_parse_exaone_moe(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```
package/src/llama.cpp/common/chat.cpp:

```diff
@@ -657,6 +657,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
         case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+        case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
        case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
        case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
```
```diff
@@ -2526,6 +2527,65 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
     return data;
 }
 
+static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>\n\n";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
+                tool_rules.push_back(builder.add_rule(
+                    name + "-call",
+                    "\"<tool_call>\" space " +
+                    builder.add_schema(name + "-obj", json{
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", json{{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    }) +
+                    " space \"</tool_call>\" space"));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
+                "(<tool_call>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<tool_call>",
+                "</tool_call>",
+            };
+        });
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
```
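For intuition, a hypothetical sketch of the grammar shape this builder emits for a single tool named `lookup` (as comments; the name is invented and the `-obj` rule body really comes from `add_schema`):

```cpp
// Hypothetical GBNF shape for one tool:
//
//   lookup-call ::= "<tool_call>" space lookup-obj space "</tool_call>" space
//   tool_call   ::= lookup-call                          // alternatives joined with " | "
//   root        ::= ( "</think>" space )? tool_call      // prefix only when thinking_forced_open
//   root        ::= ( "</think>" space )? ( tool_call )+ // variant with parallel_tool_calls
//
// The PATTERN_FULL trigger above keeps the (lazy) grammar disarmed until
// "<tool_call>" actually appears in the model output.
```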
```diff
@@ -2696,6 +2756,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_xiaomi_mimo(tmpl, params);
     }
 
+    // EXAONE MoE format detection
+    if (src.find("<tool_call>") != std::string::npos &&
+        src.find("<tool_result>") != std::string::npos &&
+        src.find("<|tool_declare|>") != std::string::npos) {
+        return common_chat_params_init_exaone_moe(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
```
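A small sketch of the routing rule above: all three markers must be present for the EXAONE MoE branch to win; a template with just `<tool_call>` still falls through to the Hermes 2/3 Pro branch below it (function name invented):

```cpp
#include <string>

bool looks_like_exaone_moe(const std::string & src) {
    return src.find("<tool_call>")      != std::string::npos &&
           src.find("<tool_result>")    != std::string::npos &&
           src.find("<|tool_declare|>") != std::string::npos;
}
```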
package/src/llama.cpp/common/chat.h:

```diff
@@ -136,6 +136,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
     COMMON_CHAT_FORMAT_SOLAR_OPEN,
+    COMMON_CHAT_FORMAT_EXAONE_MOE,
 
     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,
```
package/src/llama.cpp/common/common.cpp:

```diff
@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
```
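The new `.data()` here follows from `fit_params_target` changing from a raw array to a `std::vector<size_t>` (see the common.h hunk below): the vector no longer decays to a pointer by itself. A minimal sketch of the call-site pattern, with an invented stand-in for the C API:

```cpp
#include <cstddef>
#include <vector>

// invented stand-in for a C-style API taking a raw pointer, like llama_params_fit
static void fit_targets_api(const size_t * targets, size_t n) {
    (void) targets; (void) n;
}

int main() {
    // was: size_t fit_params_target[N] -- the array decayed to a pointer implicitly
    std::vector<size_t> fit_params_target(4, 1024u * 1024 * 1024);
    fit_targets_api(fit_params_target.data(), fit_params_target.size());
}
```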
```diff
@@ -1367,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
```
package/src/llama.cpp/common/common.h:

```diff
@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //
 
 enum llama_example {
+    LLAMA_EXAMPLE_BATCHED,
     LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
@@ -333,12 +334,14 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1;
-    int32_t main_gpu = 0;
-    float tensor_split[128] = {0};
-    bool fit_params = true;
-
-
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
@@ -429,7 +432,8 @@ struct common_params {
     bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap = true; //
+    bool use_mmap = true; // enable mmap to use filesystem cache
+    bool use_direct_io = true; // read from disk without buffering for faster model loading
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
@@ -473,6 +477,7 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    bool cache_prompt = true; // whether to enable prompt caching
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
     int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
```