@fugood/llama.node 1.4.14 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -6
- package/lib/index.js +2 -2
- package/lib/index.ts +8 -3
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +77 -65
- package/src/LlamaContext.cpp +31 -34
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +15 -34
- package/src/llama.cpp/common/arg.cpp +59 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +356 -34
- package/src/llama.cpp/common/chat.h +17 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +30 -25
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +13 -6
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +215 -97
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
#include "chat.h"
|
|
4
4
|
#include "common.h"
|
|
5
|
+
#include "download.h"
|
|
5
6
|
#include "json-schema-to-grammar.h"
|
|
6
7
|
#include "log.h"
|
|
7
8
|
#include "sampling.h"
|
|
8
|
-
#include "download.h"
|
|
9
9
|
#include "preset.h"
|
|
10
10
|
|
|
11
11
|
// fix problem with std::min and std::max
|
|
@@ -48,6 +48,8 @@
|
|
|
48
48
|
|
|
49
49
|
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
|
50
50
|
|
|
51
|
+
extern const char * LICENSES[];
|
|
52
|
+
|
|
51
53
|
using json = nlohmann::ordered_json;
|
|
52
54
|
using namespace common_arg_utils;
|
|
53
55
|
|
|
@@ -279,12 +281,20 @@ static std::string clean_file_name(const std::string & fname) {
|
|
|
279
281
|
static bool common_params_handle_remote_preset(common_params & params, llama_example ex) {
|
|
280
282
|
GGML_ASSERT(!params.model.hf_repo.empty());
|
|
281
283
|
|
|
284
|
+
// the returned hf_repo is without tag
|
|
285
|
+
auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo);
|
|
286
|
+
|
|
287
|
+
// "latest" tag (default if not specified) is translated to "default" preset
|
|
288
|
+
if (hf_tag == "latest") {
|
|
289
|
+
hf_tag = "default";
|
|
290
|
+
}
|
|
291
|
+
|
|
282
292
|
const bool offline = params.offline;
|
|
283
293
|
std::string model_endpoint = get_model_endpoint();
|
|
284
|
-
auto preset_url = model_endpoint +
|
|
294
|
+
auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini";
|
|
285
295
|
|
|
286
296
|
// prepare local path for caching
|
|
287
|
-
auto preset_fname = clean_file_name(
|
|
297
|
+
auto preset_fname = clean_file_name(hf_repo + "_preset.ini");
|
|
288
298
|
auto preset_path = fs_get_cache_file(preset_fname);
|
|
289
299
|
const int status = common_download_file_single(preset_url, preset_path, params.hf_token, offline);
|
|
290
300
|
const bool has_preset = status >= 200 && status < 400;
|
|
@@ -293,14 +303,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
|
|
|
293
303
|
if (has_preset) {
|
|
294
304
|
LOG_INF("applying remote preset from %s\n", preset_url.c_str());
|
|
295
305
|
common_preset_context ctx(ex, /* only_remote_allowed */ true);
|
|
296
|
-
common_preset global;
|
|
306
|
+
common_preset global;
|
|
297
307
|
auto remote_presets = ctx.load_from_ini(preset_path, global);
|
|
298
|
-
|
|
299
|
-
|
|
308
|
+
remote_presets = ctx.cascade(global, remote_presets);
|
|
309
|
+
if (remote_presets.find(hf_tag) != remote_presets.end()) {
|
|
310
|
+
common_preset preset = remote_presets.at(hf_tag);
|
|
300
311
|
LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline
|
|
301
312
|
preset.apply_to_params(params);
|
|
302
313
|
} else {
|
|
303
|
-
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(
|
|
314
|
+
throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
|
|
304
315
|
}
|
|
305
316
|
} else {
|
|
306
317
|
LOG_INF("%s", "no remote preset found, skipping\n");
|
|
@@ -330,7 +341,7 @@ static handle_model_result common_params_handle_model(
|
|
|
330
341
|
if (model.path.empty()) {
|
|
331
342
|
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
|
|
332
343
|
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
|
333
|
-
exit(1); //
|
|
344
|
+
exit(1); // error message already printed
|
|
334
345
|
}
|
|
335
346
|
model.name = model.hf_repo; // repo name with tag
|
|
336
347
|
model.hf_repo = auto_detected.repo; // repo name without tag
|
|
@@ -1030,6 +1041,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1030
1041
|
exit(0);
|
|
1031
1042
|
}
|
|
1032
1043
|
));
|
|
1044
|
+
add_opt(common_arg(
|
|
1045
|
+
{"--license"},
|
|
1046
|
+
"show source code license and dependencies",
|
|
1047
|
+
[](common_params &) {
|
|
1048
|
+
for (int i = 0; LICENSES[i]; ++i) {
|
|
1049
|
+
printf("%s\n", LICENSES[i]);
|
|
1050
|
+
}
|
|
1051
|
+
exit(0);
|
|
1052
|
+
}
|
|
1053
|
+
));
|
|
1033
1054
|
add_opt(common_arg(
|
|
1034
1055
|
{"-cl", "--cache-list"},
|
|
1035
1056
|
"show list of models in cache",
|
|
@@ -1274,7 +1295,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1274
1295
|
[](common_params & params) {
|
|
1275
1296
|
params.kv_unified = true;
|
|
1276
1297
|
}
|
|
1277
|
-
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY}));
|
|
1298
|
+
).set_env("LLAMA_ARG_KV_UNIFIED").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_BATCHED}));
|
|
1278
1299
|
add_opt(common_arg(
|
|
1279
1300
|
{"--context-shift"},
|
|
1280
1301
|
{"--no-context-shift"},
|
|
@@ -1708,6 +1729,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1708
1729
|
}
|
|
1709
1730
|
}
|
|
1710
1731
|
).set_sparam());
|
|
1732
|
+
add_opt(common_arg(
|
|
1733
|
+
{"--adaptive-target"}, "N",
|
|
1734
|
+
string_format("adaptive-p: select tokens near this probability (valid range 0.0 "
|
|
1735
|
+
"to 1.0; negative = disabled) (default: %.2f)\n"
|
|
1736
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927)",
|
|
1737
|
+
(double)params.sampling.adaptive_target),
|
|
1738
|
+
[](common_params & params, const std::string & value) {
|
|
1739
|
+
params.sampling.adaptive_target = std::stof(value);
|
|
1740
|
+
}
|
|
1741
|
+
).set_sparam());
|
|
1742
|
+
add_opt(common_arg(
|
|
1743
|
+
{"--adaptive-decay"}, "N",
|
|
1744
|
+
string_format("adaptive-p: decay rate for target adaptation over time. lower values "
|
|
1745
|
+
"are more reactive, higher values are more stable.\n"
|
|
1746
|
+
"(valid range 0.0 to 0.99) (default: %.2f)",
|
|
1747
|
+
(double)params.sampling.adaptive_decay),
|
|
1748
|
+
[](common_params & params, const std::string & value) {
|
|
1749
|
+
params.sampling.adaptive_decay = std::stof(value);
|
|
1750
|
+
}
|
|
1751
|
+
).set_sparam());
|
|
1711
1752
|
add_opt(common_arg(
|
|
1712
1753
|
{"--dynatemp-range"}, "N",
|
|
1713
1754
|
string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
|
|
@@ -2856,10 +2897,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2856
2897
|
params.n_threads_http = value;
|
|
2857
2898
|
}
|
|
2858
2899
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
|
|
2900
|
+
add_opt(common_arg(
|
|
2901
|
+
{"--cache-prompt"},
|
|
2902
|
+
{"--no-cache-prompt"},
|
|
2903
|
+
string_format("whether to enable prompt caching (default: %s)", params.cache_prompt ? "enabled" : "disabled"),
|
|
2904
|
+
[](common_params & params, bool value) {
|
|
2905
|
+
params.cache_prompt = value;
|
|
2906
|
+
}
|
|
2907
|
+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_PROMPT"));
|
|
2859
2908
|
add_opt(common_arg(
|
|
2860
2909
|
{"--cache-reuse"}, "N",
|
|
2861
2910
|
string_format(
|
|
2862
|
-
"min chunk size to attempt reusing from the cache via KV shifting (default: %d)\n"
|
|
2911
|
+
"min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: %d)\n"
|
|
2863
2912
|
"[(card)](https://ggml.ai/f0.png)", params.n_cache_reuse
|
|
2864
2913
|
),
|
|
2865
2914
|
[](common_params & params, int value) {
|
|
@@ -1403,6 +1403,118 @@ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
|
|
|
1403
1403
|
builder.add_content(builder.consume_rest());
|
|
1404
1404
|
}
|
|
1405
1405
|
|
|
1406
|
+
// Parses the (post-reasoning) content of an EXAONE MoE response and extracts tool calls.
//
// Two tool call payload shapes are accepted inside <tool_call>...</tool_call>:
//   1) <tool_call>{ "name": "...", "arguments": {...} }</tool_call>
//   2) <tool_call>{ "id": "...", "type": "function", "function": { "name": "...", "arguments": {...} } }</tool_call>
// The JSON payload may optionally be wrapped in a ``` / ```json fence.
//
// When tool call parsing is disabled in the syntax, the remaining input is added as content.
// A JSON payload from which no usable (string) name can be extracted is re-emitted as content.
//
// Throws common_chat_msg_partial_exception when the closing </tool_call> tag is missing or
// when builder.add_tool_call() rejects the call.
static void common_chat_parse_exaone_moe_content(common_chat_msg_parser & builder) {
    static const common_regex tool_call_open(R"(<tool_call[^>]*>)");

    if (!builder.syntax().parse_tool_calls) {
        LOG_DBG("%s: not parse_tool_calls\n", __func__);
        builder.add_content(builder.consume_rest());
        return;
    }

    LOG_DBG("%s: parse_tool_calls\n", __func__);

    // Find all <tool_call></tool_call> blocks
    // NOTE(review): per the flag name, text preceding each opening tag is presumably added as content - confirm
    while (auto first = builder.try_find_regex(tool_call_open, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(first->groups[0].end);
        builder.consume_spaces();

        // optional opening code fence around the JSON payload
        builder.try_consume_literal("```json");
        builder.try_consume_literal("```");
        builder.consume_spaces();

        // Consume JSON object
        auto data = builder.consume_json();

        // optional closing code fence
        builder.consume_spaces();
        builder.try_consume_literal("```");
        builder.consume_spaces();

        if (!builder.try_consume_literal("</tool_call>")) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
        builder.consume_spaces();

        // Extract name and arguments
        std::string name;
        std::string id;
        nlohmann::ordered_json arguments;

        // Fills name/arguments (and id, when present as a string) from obj.
        // Returns false, leaving the outputs untouched, when obj does not carry a usable call.
        const auto extract_args = [&](const nlohmann::ordered_json & obj) -> bool {
            if (!obj.contains("name") || !obj.contains("arguments")) {
                return false;
            }
            const auto & name_field = obj.at("name");
            if (!name_field.is_string()) {
                // get<std::string>() on a non-string value throws nlohmann's type_error;
                // treat such a payload as "no name" so it falls back to plain content below
                return false;
            }
            name      = name_field.get<std::string>();
            arguments = obj.at("arguments");
            if (obj.contains("id") && obj.at("id").is_string()) {
                id = obj.at("id").get<std::string>();
            }
            return true;
        };

        // shape 1) first, then shape 2) with the call nested under "function"
        if (!extract_args(data.json)) {
            if (data.json.contains("function") && data.json.at("function").is_object()) {
                const auto & fn = data.json.at("function");
                extract_args(fn);
                // in the nested shape the id lives next to "function" rather than inside it
                if (id.empty() && data.json.contains("id") && data.json.at("id").is_string()) {
                    id = data.json.at("id").get<std::string>();
                }
            }
        }

        // If name is empty, treat the JSON object as content
        if (name.empty()) {
            LOG_DBG("%s: tool call missing name, treating as content\n", __func__);
            builder.add_content(data.json.dump());
            continue;
        }

        std::string args_str = arguments.dump();
        if (!builder.add_tool_call(name, id, args_str)) {
            throw common_chat_msg_partial_exception("incomplete tool call");
        }
    }

    // whatever follows the last tool call is regular content
    builder.add_content(builder.consume_rest());
}
|
|
1482
|
+
|
|
1483
|
+
// Entry point for the EXAONE MoE chat format.
//
// The model emits reasoning between "<think>" and "</think>", followed by regular content
// (which may contain tool calls, handled by common_chat_parse_exaone_moe_content).
static void common_chat_parse_exaone_moe(common_chat_msg_parser & builder) {
    LOG_DBG("%s: parsing exaone_moe\n", __func__);
    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());

    // Look ahead for a closing think tag, then rewind so the actual parsing starts from the same position
    const auto rewind_pos = builder.pos();
    const auto end_think  = builder.try_find_literal("</think>");
    builder.move_to(rewind_pos);

    // Complete message, thinking forced open, but no closing tag at all
    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !end_think) {
        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // Standard reasoning parsing; on success whatever remains is regular content
    if (builder.try_parse_reasoning("<think>", "</think>")) {
        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
        LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // No reasoning tags found: without forced-open thinking everything is regular content
    if (!builder.syntax().thinking_forced_open) {
        LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
        common_chat_parse_exaone_moe_content(builder);
        return;
    }

    // Thinking is forced open but no tags were found: treat everything as reasoning
    LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
    builder.add_reasoning_content(builder.consume_rest());
}
|
|
1517
|
+
|
|
1406
1518
|
static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
|
|
1407
1519
|
builder.try_parse_reasoning("<think>", "</think>");
|
|
1408
1520
|
builder.add_content(builder.consume_rest());
|
|
@@ -1490,6 +1602,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
|
|
|
1490
1602
|
case COMMON_CHAT_FORMAT_SOLAR_OPEN:
|
|
1491
1603
|
common_chat_parse_solar_open(builder);
|
|
1492
1604
|
break;
|
|
1605
|
+
case COMMON_CHAT_FORMAT_EXAONE_MOE:
|
|
1606
|
+
common_chat_parse_exaone_moe(builder);
|
|
1607
|
+
break;
|
|
1493
1608
|
default:
|
|
1494
1609
|
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
|
|
1495
1610
|
}
|