@fugood/llama.node 1.4.14 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/lib/binding.ts +13 -6
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +8 -3
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +77 -65
  6. package/src/LlamaContext.cpp +31 -34
  7. package/src/llama.cpp/CMakeLists.txt +24 -8
  8. package/src/llama.cpp/common/CMakeLists.txt +15 -34
  9. package/src/llama.cpp/common/arg.cpp +59 -10
  10. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  11. package/src/llama.cpp/common/chat.cpp +356 -34
  12. package/src/llama.cpp/common/chat.h +17 -13
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +30 -25
  15. package/src/llama.cpp/common/debug.cpp +165 -0
  16. package/src/llama.cpp/common/debug.h +43 -0
  17. package/src/llama.cpp/common/download.cpp +12 -342
  18. package/src/llama.cpp/common/download.h +6 -0
  19. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  20. package/src/llama.cpp/common/jinja/caps.h +24 -0
  21. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  22. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  23. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  24. package/src/llama.cpp/common/jinja/parser.h +21 -0
  25. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  26. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  27. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  28. package/src/llama.cpp/common/jinja/string.h +58 -0
  29. package/src/llama.cpp/common/jinja/utils.h +49 -0
  30. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  31. package/src/llama.cpp/common/jinja/value.h +464 -0
  32. package/src/llama.cpp/common/preset.cpp +12 -2
  33. package/src/llama.cpp/common/sampling.cpp +52 -19
  34. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  35. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  39. package/src/llama.cpp/include/llama-cpp.h +3 -1
  40. package/src/llama.cpp/include/llama.h +29 -2
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  43. package/src/llama.cpp/src/llama-adapter.h +1 -3
  44. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  45. package/src/llama.cpp/src/llama-arch.h +1 -0
  46. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  47. package/src/llama.cpp/src/llama-chat.h +1 -0
  48. package/src/llama.cpp/src/llama-context.cpp +232 -144
  49. package/src/llama.cpp/src/llama-context.h +10 -0
  50. package/src/llama.cpp/src/llama-cparams.h +2 -0
  51. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  52. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  53. package/src/llama.cpp/src/llama-hparams.h +38 -1
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  55. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  56. package/src/llama.cpp/src/llama-mmap.cpp +13 -6
  57. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  58. package/src/llama.cpp/src/llama-model.cpp +215 -97
  59. package/src/llama.cpp/src/llama-model.h +3 -2
  60. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
  61. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  62. package/src/llama.cpp/src/llama-vocab.h +1 -0
  63. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  64. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  65. package/src/llama.cpp/src/models/models.h +13 -2
  66. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -2,10 +2,10 @@
2
2
 
3
3
  #include "chat.h"
4
4
  #include "common.h"
5
+ #include "download.h"
5
6
  #include "json-schema-to-grammar.h"
6
7
  #include "log.h"
7
8
  #include "sampling.h"
8
- #include "download.h"
9
9
  #include "preset.h"
10
10
 
11
11
  // fix problem with std::min and std::max
@@ -48,6 +48,8 @@
48
48
 
49
49
  #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
50
50
 
51
+ extern const char * LICENSES[];
52
+
51
53
  using json = nlohmann::ordered_json;
52
54
  using namespace common_arg_utils;
53
55
 
@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
279
281
  static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
280
282
  GGML_ASSERT(!params.model.hf_repo.empty());
281
283
 
284
+ // the returned hf_repo is without tag
285
+ auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
286
+
287
+ // "latest" tag (default if not specified) is translated to "default" preset
288
+ if (hf_tag == "latest") {
289
+ hf_tag = "default";
290
+ }
291
+
282
292
  const bool offline = params.offline;
283
293
  std::string model_endpoint = get_model_endpoint();
284
- auto preset_url = model_endpoint + params.model.hf_repo + "/resolve/main/preset.ini";
294
+ auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
285
295
 
286
296
  // prepare local path for caching
287
- auto preset_fname = clean_file_name(params.model.hf_repo + "_preset.ini");
297
+ auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
288
298
  auto preset_path = fs_get_cache_file(preset_fname);
289
299
  const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
290
300
  const bool has_preset = status >= 200 && status < 400;
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
293
303
  if (has_preset) {
294
304
  LOG_INF("applying remote preset from %s\n", preset_url.c_str());
295
305
  common_preset_context ctx(ex, /* only_remote_allowed */ true);
296
- common_preset global; // unused for now
306
+ common_preset global;
297
307
  auto remote_presets = ctx.load_from_ini(preset_path, global);
298
- if (remote_presets.find(COMMON_PRESET_DEFAULT_NAME) != remote_presets.end()) {
299
- common_preset & preset = remote_presets.at(COMMON_PRESET_DEFAULT_NAME);
308
+ remote_presets = ctx.cascade(global, remote_presets);
309
+ if (remote_presets.find(hf_tag) != remote_presets.end()) {
310
+ common_preset preset = remote_presets.at(hf_tag);
300
311
  LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
301
312
  preset.apply_to_params(params);
302
313
  } else {
303
- throw std::runtime_error("Remote preset.ini does not contain [" + std::string(COMMON_PRESET_DEFAULT_NAME) + "] section");
314
+ throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
304
315
  }
305
316
  } else {
306
317
  LOG_INF("%s", "no remote preset found, skipping\n");
@@ -330,7 +341,7 @@ static handle_model_result common_params_handle_model(
330
341
  if (model.path.empty()) {
331
342
  auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
332
343
  if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
333
- exit(1); // built without CURL, error message already printed
344
+ exit(1); // error message already printed
334
345
  }
335
346
  model.name = model.hf_repo; // repo name with tag
336
347
  model.hf_repo = auto_detected.repo; // repo name without tag
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1030
1041
  exit(0);
1031
1042
  }
1032
1043
  ));
1044
+ add_opt(common_arg(
1045
+ {"--license"},
1046
+ "show source code license and dependencies",
1047
+ [](common_params &) {
1048
+ for (int i = 0; LICENSES[i]; ++i) {
1049
+ printf("%s\n", LICENSES[i]);
1050
+ }
1051
+ exit(0);
1052
+ }
1053
+ ));
1033
1054
  add_opt(common_arg(
1034
1055
  {"-cl", "--cache-list"},
1035
1056
  "show list of models in cache",
@@ -1274,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1274
1295
  [](common_params & params) {
1275
1296
  params.kv_unified = true;
1276
1297
  }
1277
- ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
1298
+ ).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
1278
1299
  add_opt(common_arg(
1279
1300
  {"--context-shift"},
1280
1301
  {"--no-context-shift"},
@@ -1708,6 +1729,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1708
1729
  }
1709
1730
  }
1710
1731
  ).set_sparam());
1732
+ add_opt(common_arg(
1733
+ {"--adaptive-target"}, "N",
1734
+ string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
1735
+ "to 1.0; negative = disabled) (default: %.2f)\n"
1736
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
1737
+ (double)params.sampling.adaptive_target),
1738
+ [](common_params & params, const std::string & value) {
1739
+ params.sampling.adaptive_target = std::stof(value);
1740
+ }
1741
+ ).set_sparam());
1742
+ add_opt(common_arg(
1743
+ {"--adaptive-decay"}, "N",
1744
+ string_format("adaptive-p: decay rate for target adaptation over time. lower values "
1745
+ "are more reactive, higher values are more stable.\n"
1746
+ "(valid range 0.0 to 0.99) (default: %.2f)",
1747
+ (double)params.sampling.adaptive_decay),
1748
+ [](common_params & params, const std::string & value) {
1749
+ params.sampling.adaptive_decay = std::stof(value);
1750
+ }
1751
+ ).set_sparam());
1711
1752
  add_opt(common_arg(
1712
1753
  {"--dynatemp-range"}, "N",
1713
1754
  string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
@@ -2856,10 +2897,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2856
2897
  params.n_threads_http = value;
2857
2898
  }
2858
2899
  ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
2900
+ add_opt(common_arg(
2901
+ {"--cache-prompt"},
2902
+ {"--no-cache-prompt"},
2903
+ string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
2904
+ [](common_params & params, bool value) {
2905
+ params.cache_prompt = value;
2906
+ }
2907
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
2859
2908
  add_opt(common_arg(
2860
2909
  {"--cache-reuse"}, "N",
2861
2910
  string_format(
2862
- "min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
2911
+ "min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
2863
2912
  "[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
2864
2913
  ),
2865
2914
  [](common_params & params, int value) {
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
1403
1403
  builder.add_content(builder.consume_rest());
1404
1404
  }
1405
1405
 
1406
+ static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
1407
+ // 1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
1408
+ // 2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
1409
+ static const common_regex tool_call_open(R"(<tool_call[^>]*>)");
1410
+
1411
+ if (!builder.syntax().parse_tool_calls) {
1412
+ LOG_DBG("%s: not parse_tool_calls\n", __func__);
1413
+ builder.add_content(builder.consume_rest());
1414
+ return;
1415
+ }
1416
+
1417
+ LOG_DBG("%s: parse_tool_calls\n", __func__);
1418
+
1419
+ // Find all <tool_call></tool_call> blocks
1420
+ while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
1421
+ builder.move_to(first->groups[0].end);
1422
+ builder.consume_spaces();
1423
+
1424
+ builder.try_consume_literal("```json");
1425
+ builder.try_consume_literal("```");
1426
+ builder.consume_spaces();
1427
+
1428
+ // Consume JSON object
1429
+ auto data = builder.consume_json();
1430
+
1431
+ builder.consume_spaces();
1432
+ builder.try_consume_literal("```");
1433
+ builder.consume_spaces();
1434
+
1435
+ if (!builder.try_consume_literal("</tool_call>")) {
1436
+ throw common_chat_msg_partial_exception("incomplete tool call");
1437
+ }
1438
+ builder.consume_spaces();
1439
+
1440
+ // Extract name and arguments
1441
+ std::string name;
1442
+ std::string id;
1443
+ nlohmann::ordered_json arguments;
1444
+
1445
+ const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
1446
+ if (!obj.contains("name") || !obj.contains("arguments")) {
1447
+ return false;
1448
+ }
1449
+ name = obj.at("name").get<std::string>();
1450
+ arguments = obj.at("arguments");
1451
+ if (obj.contains("id") && obj.at("id").is_string()) {
1452
+ id = obj.at("id").get<std::string>();
1453
+ }
1454
+ return true;
1455
+ };
1456
+
1457
+ if (!extract_args(data.json)) {
1458
+ if (data.json.contains("function") && data.json.at("function").is_object()) {
1459
+ auto fn = data.json.at("function");
1460
+ extract_args(fn);
1461
+ if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
1462
+ id = data.json.at("id").get<std::string>();
1463
+ }
1464
+ }
1465
+ }
1466
+
1467
+ // If name is empty, treat the JSON object as content
1468
+ if (name.empty()) {
1469
+ LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
1470
+ builder.add_content(data.json.dump());
1471
+ continue;
1472
+ }
1473
+
1474
+ std::string args_str = arguments.dump();
1475
+ if (!builder.add_tool_call(name, id, args_str)) {
1476
+ throw common_chat_msg_partial_exception("incomplete tool call");
1477
+ }
1478
+ }
1479
+
1480
+ builder.add_content(builder.consume_rest());
1481
+ }
1482
+
1483
+ static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
1484
+ LOG_DBG("%s: parsing exaone_moe\n", __func__);
1485
+ // EXAONE MoE outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
1486
+ // First try to parse using the standard reasoning parsing method
1487
+ LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
1488
+
1489
+ auto start_pos = builder.pos();
1490
+ auto found_end_think = builder.try_find_literal("</think>");
1491
+ builder.move_to(start_pos);
1492
+
1493
+ if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
1494
+ LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
1495
+ common_chat_parse_exaone_moe_content(builder);
1496
+ } else if (builder.try_parse_reasoning("<think>", "</think>")) {
1497
+ // If reasoning was parsed successfully, the remaining content is regular content
1498
+ LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
1499
+ common_chat_parse_exaone_moe_content(builder);
1500
+ } else {
1501
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
1502
+ LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
1503
+ common_chat_parse_exaone_moe_content(builder);
1504
+ return;
1505
+ }
1506
+ // If no reasoning tags found, check if we should treat everything as reasoning
1507
+ if (builder.syntax().thinking_forced_open) {
1508
+ // If thinking is forced open but no tags found, treat everything as reasoning
1509
+ LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
1510
+ builder.add_reasoning_content(builder.consume_rest());
1511
+ } else {
1512
+ LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
1513
+ common_chat_parse_exaone_moe_content(builder);
1514
+ }
1515
+ }
1516
+ }
1517
+
1406
1518
  static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
1407
1519
  builder.try_parse_reasoning("<think>", "</think>");
1408
1520
  builder.add_content(builder.consume_rest());
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
1490
1602
  case COMMON_CHAT_FORMAT_SOLAR_OPEN:
1491
1603
  common_chat_parse_solar_open(builder);
1492
1604
  break;
1605
+ case COMMON_CHAT_FORMAT_EXAONE_MOE:
1606
+ common_chat_parse_exaone_moe(builder);
1607
+ break;
1493
1608
  default:
1494
1609
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
1495
1610
  }