@fugood/llama.node 1.4.13 → 1.4.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/lib/binding.ts +23 -2
  2. package/lib/index.js +2 -1
  3. package/lib/index.ts +8 -1
  4. package/lib/parallel.ts +2 -2
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +9 -12
  7. package/src/LlamaContext.cpp +16 -4
  8. package/src/llama.cpp/CMakeLists.txt +24 -8
  9. package/src/llama.cpp/common/CMakeLists.txt +3 -34
  10. package/src/llama.cpp/common/arg.cpp +183 -60
  11. package/src/llama.cpp/common/arg.h +0 -8
  12. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  13. package/src/llama.cpp/common/chat.cpp +67 -0
  14. package/src/llama.cpp/common/chat.h +1 -0
  15. package/src/llama.cpp/common/common.cpp +2 -1
  16. package/src/llama.cpp/common/common.h +12 -7
  17. package/src/llama.cpp/common/debug.cpp +165 -0
  18. package/src/llama.cpp/common/debug.h +43 -0
  19. package/src/llama.cpp/common/download.cpp +88 -369
  20. package/src/llama.cpp/common/download.h +32 -5
  21. package/src/llama.cpp/common/preset.cpp +87 -2
  22. package/src/llama.cpp/common/preset.h +10 -1
  23. package/src/llama.cpp/ggml/include/ggml.h +5 -0
  24. package/src/llama.cpp/include/llama.h +5 -2
  25. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  27. package/src/llama.cpp/src/llama-arch.h +1 -0
  28. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  29. package/src/llama.cpp/src/llama-chat.h +1 -0
  30. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  31. package/src/llama.cpp/src/llama-mmap.cpp +78 -42
  32. package/src/llama.cpp/src/llama-mmap.h +5 -4
  33. package/src/llama.cpp/src/llama-model-loader.cpp +17 -5
  34. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  35. package/src/llama.cpp/src/llama-model.cpp +225 -101
  36. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  37. package/src/llama.cpp/src/llama-sampling.cpp +1 -1
  38. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  39. package/src/llama.cpp/src/llama-vocab.h +1 -0
  40. package/src/llama.cpp/src/llama.cpp +63 -27
  41. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  42. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  43. package/src/llama.cpp/src/models/models.h +13 -2
  44. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
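Among the hunks below, common/arg.cpp changes --fit-target from a single MiB value into a comma-separated per-device list. The following standalone C++ sketch illustrates that parsing behavior for reference only; parse_fit_target, the main() driver, and the four-device count are illustrative assumptions, while the regex split, single-value broadcast, and device-count check mirror the hunk in this diff.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <regex>
#include <stdexcept>
#include <string>
#include <vector>

// Illustrative sketch (not part of the package): mirrors the --fit-target
// parsing added to common/arg.cpp. The value is split on ',' or '/', a single
// entry is broadcast to every device, and listing as many or more entries
// than there are devices is rejected.
static std::vector<size_t> parse_fit_target(const std::string & value, size_t max_devices) {
    std::vector<size_t> targets(max_devices, 1024u * 1024u * 1024u); // default margin: 1 GiB per device

    const std::regex regex{ R"([,/]+)" };
    std::sregex_token_iterator it{ value.begin(), value.end(), regex, -1 };
    std::vector<std::string> split_arg{ it, {} };

    if (split_arg.size() >= max_devices) {
        throw std::invalid_argument("got more input configs than available devices");
    }
    if (split_arg.size() == 1) {
        std::fill(targets.begin(), targets.end(), std::stoul(split_arg[0]) * 1024 * 1024);
        return targets;
    }
    for (size_t i = 0; i < split_arg.size(); i++) {
        targets[i] = std::stoul(split_arg[i]) * 1024 * 1024;
    }
    return targets;
}

int main() {
    // "512,2048" on a hypothetical 4-device system: the first two devices get
    // explicit margins, the remaining ones keep the 1 GiB default.
    for (size_t t : parse_fit_target("512,2048", 4)) {
        std::cout << (t / (1024 * 1024)) << " MiB\n";
    }
}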

package/src/llama.cpp/common/arg.cpp

@@ -2,10 +2,11 @@
 
 #include "chat.h"
 #include "common.h"
+#include "download.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
-#include "download.h"
+#include "preset.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -47,6 +48,8 @@
 
 #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
 
+extern const char * LICENSES[];
+
 using json = nlohmann::ordered_json;
 using namespace common_arg_utils;
 
@@ -268,6 +271,55 @@ static void parse_tensor_buffer_overrides(const std::string & value, std::vector
     }
 }
 
+static std::string clean_file_name(const std::string & fname) {
+    std::string clean_fname = fname;
+    string_replace_all(clean_fname, "\\", "_");
+    string_replace_all(clean_fname, "/", "_");
+    return clean_fname;
+}
+
+static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
+    GGML_ASSERT(!params.model.hf_repo.empty());
+
+    // the returned hf_repo is without tag
+    auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
+
+    // "latest" tag (default if not specified) is translated to "default" preset
+    if (hf_tag == "latest") {
+        hf_tag = "default";
+    }
+
+    const bool offline = params.offline;
+    std::string model_endpoint = get_model_endpoint();
+    auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
+
+    // prepare local path for caching
+    auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
+    auto preset_path = fs_get_cache_file(preset_fname);
+    const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
+    const bool has_preset = status >= 200 && status < 400;
+
+    // remote preset is optional, so we don't error out if not found
+    if (has_preset) {
+        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        common_preset_context ctx(ex, /* only_remote_allowed */ true);
+        common_preset global;
+        auto remote_presets = ctx.load_from_ini(preset_path, global);
+        remote_presets = ctx.cascade(global, remote_presets);
+        if (remote_presets.find(hf_tag) != remote_presets.end()) {
+            common_preset preset = remote_presets.at(hf_tag);
+            LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
+            preset.apply_to_params(params);
+        } else {
+            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
+        }
+    } else {
+        LOG_INF("%s", "no remote preset found, skipping\n");
+    }
+
+    return has_preset;
+}
+
 struct handle_model_result {
     bool found_mmproj = false;
     common_params_model mmproj;
@@ -289,7 +341,7 @@ static handle_model_result common_params_handle_model(
     if (model.path.empty()) {
         auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
         if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
-            exit(1); // built without CURL, error message already printed
+            exit(1); // error message already printed
         }
         model.name = model.hf_repo; // repo name with tag
         model.hf_repo = auto_detected.repo; // repo name without tag
@@ -309,9 +361,7 @@
     // make sure model path is present (for caching purposes)
     if (model.path.empty()) {
         // this is to avoid different repo having same file name, or same file name in different subdirs
-        std::string filename = model.hf_repo + "_" + model.hf_file;
-        // to make sure we don't have any slashes in the filename
-        string_replace_all(filename, "/", "_");
+        std::string filename = clean_file_name(model.hf_repo + "_" + model.hf_file);
         model.path = fs_get_cache_file(filename);
     }
 
@@ -425,61 +475,87 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
-    std::set<std::string> seen_args;
+    auto parse_cli_args = [&]() {
+        std::set<std::string> seen_args;
 
-    for (int i = 1; i < argc; i++) {
-        const std::string arg_prefix = "--";
+        for (int i = 1; i < argc; i++) {
+            const std::string arg_prefix = "--";
 
-        std::string arg = argv[i];
-        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
-            std::replace(arg.begin(), arg.end(), '_', '-');
-        }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
-        }
-        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
-        }
-        auto & tmp = arg_to_options[arg];
-        auto opt = *tmp.first;
-        bool is_positive = tmp.second;
-        if (opt.has_value_from_env()) {
-            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
-        }
-        try {
-            if (opt.handler_void) {
-                opt.handler_void(params);
-                continue;
+            std::string arg = argv[i];
+            if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+                std::replace(arg.begin(), arg.end(), '_', '-');
             }
-            if (opt.handler_bool) {
-                opt.handler_bool(params, is_positive);
-                continue;
+            if (arg_to_options.find(arg) == arg_to_options.end()) {
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
             }
-
-            // arg with single value
-            check_arg(i);
-            std::string val = argv[++i];
-            if (opt.handler_int) {
-                opt.handler_int(params, std::stoi(val));
-                continue;
+            if (!seen_args.insert(arg).second) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
            }
-            if (opt.handler_string) {
-                opt.handler_string(params, val);
-                continue;
+            auto & tmp = arg_to_options[arg];
+            auto opt = *tmp.first;
+            bool is_positive = tmp.second;
+            if (opt.has_value_from_env()) {
+                fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
            }
+            try {
+                if (opt.handler_void) {
+                    opt.handler_void(params);
+                    continue;
+                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, is_positive);
+                    continue;
+                }
 
-            // arg with 2 values
-            check_arg(i);
-            std::string val2 = argv[++i];
-            if (opt.handler_str_str) {
-                opt.handler_str_str(params, val, val2);
-                continue;
+                // arg with single value
+                check_arg(i);
+                std::string val = argv[++i];
+                if (opt.handler_int) {
+                    opt.handler_int(params, std::stoi(val));
+                    continue;
+                }
+                if (opt.handler_string) {
+                    opt.handler_string(params, val);
+                    continue;
+                }
+
+                // arg with 2 values
+                check_arg(i);
+                std::string val2 = argv[++i];
+                if (opt.handler_str_str) {
+                    opt.handler_str_str(params, val, val2);
+                    continue;
+                }
+            } catch (std::exception & e) {
+                throw std::invalid_argument(string_format(
+                    "error while handling argument \"%s\": %s\n\n"
+                    "usage:\n%s\n\nto show complete usage, run with -h",
+                    arg.c_str(), e.what(), opt.to_string().c_str()));
            }
-        } catch (std::exception & e) {
-            throw std::invalid_argument(string_format(
-                "error while handling argument \"%s\": %s\n\n"
-                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), opt.to_string().c_str()));
+        }
+    };
+
+    // parse the first time to get -hf option (used for remote preset)
+    parse_cli_args();
+
+    // maybe handle remote preset
+    if (!params.model.hf_repo.empty()) {
+        std::string cli_hf_repo = params.model.hf_repo;
+        bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex);
+
+        // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value)
+        // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs)
+        std::string preset_hf_repo = params.model.hf_repo;
+        bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo;
+
+        if (has_preset) {
+            // re-parse CLI args to override preset values
+            parse_cli_args();
+        }
+
+        // preserve hf_repo from preset if needed
+        if (preset_has_hf_repo) {
+            params.model.hf_repo = preset_hf_repo;
         }
     }
 
@@ -965,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"--license"},
+        "show source code license and dependencies",
+        [](common_params &) {
+            for (int i = 0; LICENSES[i]; ++i) {
+                printf("%s\n", LICENSES[i]);
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-cl", "--cache-list"},
         "show list of models in cache",
@@ -1209,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.kv_unified = true;
        }
-    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
     add_opt(common_arg(
        {"--context-shift"},
        {"--no-context-shift"},
@@ -2088,11 +2174,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
        {"--mmap"},
        {"--no-mmap"},
-        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        string_format("whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            params.use_mmap = value;
+            if (value) {
+                params.use_direct_io = false; // disable direct io when mmap is explicitly enabled
+            }
        }
     ).set_env("LLAMA_ARG_MMAP"));
+    add_opt(common_arg(
+        {"-dio", "--direct-io"},
+        {"-ndio", "--no-direct-io"},
+        string_format("use DirectIO if available. Takes precedence over --mmap (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_direct_io = value;
+        }
+    ).set_env("LLAMA_ARG_DIO"));
     add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
@@ -2244,7 +2341,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            std::vector<std::string> split_arg{ it, {} };
            if (split_arg.size() >= llama_max_devices()) {
                throw std::invalid_argument(
-                    string_format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
                );
            }
            for (size_t i = 0; i < llama_max_devices(); ++i) {
@@ -2284,10 +2381,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
     ).set_env("LLAMA_ARG_FIT"));
     add_opt(common_arg(
-        { "-fitt", "--fit-target" }, "MiB",
-        string_format("target margin per device for --fit option, default: %zu", params.fit_params_target/(1024*1024)),
-        [](common_params & params, int value) {
-            params.fit_params_target = value * size_t(1024*1024);
+        { "-fitt", "--fit-target" }, "MiB0,MiB1,MiB2,...",
+        string_format("target margin per device for --fit, comma-separated list of values, "
+            "single value is broadcast across all devices, default: %zu", params.fit_params_target[0]/(1024*1024)),
+        [](common_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    string_format("got %zu input configs, but system only has %zu devices", split_arg.size(), llama_max_devices())
+                );
+            }
+            if (split_arg.size() == 1) {
+                std::fill(params.fit_params_target.begin(), params.fit_params_target.end(), std::stoul(split_arg[0]) * 1024*1024);
+                return;
+            }
+            for (size_t i = 0; i < split_arg.size(); i++) {
+                params.fit_params_target[i] = std::stoul(split_arg[i]) * 1024*1024;
+            }
        }
     ).set_env("LLAMA_ARG_FIT_TARGET"));
     add_opt(common_arg(
@@ -2762,10 +2877,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_threads_http = value;
        }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(common_arg(
+        {"--cache-prompt"},
+        {"--no-cache-prompt"},
+        string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cache_prompt = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
     add_opt(common_arg(
        {"--cache-reuse"}, "N",
        string_format(
-            "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
+            "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
            "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
        ),
        [](common_params & params, int value) {

package/src/llama.cpp/common/arg.h

@@ -129,11 +129,3 @@ void common_params_add_preset_options(std::vector<common_arg> & args);
 
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-
-struct common_remote_params {
-    std::vector<std::string> headers;
-    long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout
-    long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB
-};
-// get remote file content, returns <http_code, raw_response_body>
-std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

package/src/llama.cpp/common/chat-parser.cpp

@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
+    // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
+    // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
+    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    // Find all <tool_call></tool_call> blocks
+    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
+        builder.move_to(first->groups[0].end);
+        builder.consume_spaces();
+
+        builder.try_consume_literal("```json");
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        // Consume JSON object
+        auto data = builder.consume_json();
+
+        builder.consume_spaces();
+        builder.try_consume_literal("```");
+        builder.consume_spaces();
+
+        if (!builder.try_consume_literal("</tool_call>")) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+        builder.consume_spaces();
+
+        // Extract name and arguments
+        std::string name;
+        std::string id;
+        nlohmann::ordered_json arguments;
+
+        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
+            if (!obj.contains("name") || !obj.contains("arguments")) {
+                return false;
+            }
+            name = obj.at("name").get<std::string>();
+            arguments = obj.at("arguments");
+            if (obj.contains("id") && obj.at("id").is_string()) {
+                id = obj.at("id").get<std::string>();
+            }
+            return true;
+        };
+
+        if (!extract_args(data.json)) {
+            if (data.json.contains("function") && data.json.at("function").is_object()) {
+                auto fn = data.json.at("function");
+                extract_args(fn);
+                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
+                    id = data.json.at("id").get<std::string>();
+                }
+            }
+        }
+
+        // If name is empty, treat the JSON object as content
+        if (name.empty()) {
+            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
+            builder.add_content(data.json.dump());
+            continue;
+        }
+
+        std::string args_str = arguments.dump();
+        if (!builder.add_tool_call(name, id, args_str)) {
+            throw common_chat_msg_partial_exception("incomplete tool call");
+        }
+    }
+
+    builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
+    LOG_DBG("%s: parsing exaone_moe\n", __func__);
+    // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        common_chat_parse_exaone_moe_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            common_chat_parse_exaone_moe_content(builder);
+        }
+    }
+}
+
 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     builder.add_content(builder.consume_rest());
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SOLAR_OPEN:
             common_chat_parse_solar_open(builder);
             break;
+        case COMMON_CHAT_FORMAT_EXAONE_MOE:
+            common_chat_parse_exaone_moe(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }

package/src/llama.cpp/common/chat.cpp

@@ -657,6 +657,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
         case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
         case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
+        case COMMON_CHAT_FORMAT_EXAONE_MOE: return "EXAONE MoE";
         case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
         case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
         case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2526,6 +2527,65 @@ static common_chat_params common_chat_params_init_solar_open(const common_chat_t
     return data;
 }
 
+static common_chat_params common_chat_params_init_exaone_moe(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_EXAONE_MOE;
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>\n\n";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                // Expect: <tool_call>{"name": "<name>", "arguments": {...}}</tool_call>
+                tool_rules.push_back(builder.add_rule(
+                    name + "-call",
+                    "\"<tool_call>\" space " +
+                    builder.add_schema(name + "-obj", json{
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", json{{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    }) +
+                    " space \"</tool_call>\" space"));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)?" : "") +
+                "(<tool_call>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<tool_call>",
+                "</tool_call>",
+            };
+        });
+    }
+
+    return data;
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2696,6 +2756,13 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_xiaomi_mimo(tmpl, params);
     }
 
+    // EXAONE MoE format detection
+    if (src.find("<tool_call>") != std::string::npos &&
+        src.find("<tool_result>") != std::string::npos &&
+        src.find("<|tool_declare|>") != std::string::npos) {
+        return common_chat_params_init_exaone_moe(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);

package/src/llama.cpp/common/chat.h

@@ -136,6 +136,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_APRIEL_1_5,
     COMMON_CHAT_FORMAT_XIAOMI_MIMO,
     COMMON_CHAT_FORMAT_SOLAR_OPEN,
+    COMMON_CHAT_FORMAT_EXAONE_MOE,
 
     // These are intended to be parsed by the PEG parser
     COMMON_CHAT_FORMAT_PEG_SIMPLE,

package/src/llama.cpp/common/common.cpp

@@ -1097,7 +1097,7 @@ common_init_result::common_init_result(common_params & params) :
     if (params.fit_params) {
         LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
         llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
-            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_target.data(), params.fit_params_min_ctx,
             params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
     }
 
@@ -1367,6 +1367,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
     mparams.use_mmap = params.use_mmap;
+    mparams.use_direct_io = params.use_direct_io;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;

package/src/llama.cpp/common/common.h

@@ -80,6 +80,7 @@ int32_t cpu_get_num_math();
 //
 
 enum llama_example {
+    LLAMA_EXAMPLE_BATCHED,
     LLAMA_EXAMPLE_DEBUG,
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
@@ -333,12 +334,14 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
-    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
-    size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+
+    // margin per device in bytes for fitting parameters to free memory:
+    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
@@ -429,7 +432,8 @@ struct common_params {
     bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    bool use_mmap = true; // use mmap for faster loads
+    bool use_mmap = true; // enable mmap to use filesystem cache
+    bool use_direct_io = true; // read from disk without buffering for faster model loading
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
@@ -473,6 +477,7 @@ struct common_params {
     int32_t timeout_write = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    bool cache_prompt = true; // whether to enable prompt caching
     int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
     int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.