@fugood/llama.node 1.1.6 → 1.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +9 -9
- package/src/LlamaCompletionWorker.cpp +73 -20
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/LlamaContext.cpp +9 -0
- package/src/common.hpp +8 -1
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +132 -41
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +311 -9
- package/src/llama.cpp/common/chat.h +4 -1
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +46 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +28 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/include/llama.h +25 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +2 -4
- package/src/llama.cpp/src/llama-context.cpp +29 -22
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model.cpp +81 -70
- package/src/llama.cpp/src/llama-model.h +2 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
package/src/llama.cpp/common/chat.cpp

@@ -283,6 +283,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
+            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;
@@ -459,11 +460,12 @@ std::string common_chat_format_single(
     return ss.str();
 }

-std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
     inputs.add_bos = tmpls->add_bos;
     inputs.add_eos = tmpls->add_eos;
+    inputs.chat_template_kwargs = chat_template_kwargs;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;
@@ -539,6 +541,17 @@ common_chat_templates_ptr common_chat_templates_init(
             default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+
+    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+    if (default_template_src.find("<|channel|>") != std::string::npos
+            // search for the error message and patch it
+            && default_template_src.find("in message.content or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
     bool add_bos = false;
@@ -593,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
@@ -610,6 +624,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
     }
 }

+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+    if (format == "none") {
+        return COMMON_REASONING_FORMAT_NONE;
+    } else if (format == "auto") {
+        return COMMON_REASONING_FORMAT_AUTO;
+    } else if (format == "deepseek") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK;
+    } else if (format == "deepseek-legacy") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    }
+    throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
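The helper added above maps the user-facing reasoning-format names onto the enum. A minimal usage sketch follows; it assumes a `common_params`-style struct with a `reasoning_format` field as the destination, which is not part of this hunk:

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include "chat.h"   // declares common_reasoning_format_from_name() per the header hunk below

// Hypothetical CLI glue: translate a --reasoning-format style argument into the enum.
static bool set_reasoning_format(common_params & params, const std::string & value) {
    try {
        // accepts "none", "auto", "deepseek" and "deepseek-legacy"
        params.reasoning_format = common_reasoning_format_from_name(value);
        return true;
    } catch (const std::runtime_error & e) {
        std::fprintf(stderr, "%s\n", e.what()); // unknown names throw
        return false;
    }
}
```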
@@ -1299,16 +1326,164 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;

-    //
+    // These special tokens are required to parse properly, so we include them
+    // even if parse_tool_calls is false.
+    data.preserved_tokens = {
+        "<|channel|>",
+        "<|constrain|>",
+        "<|message|>",
+        "<|start|>",
+        "<|end|>",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            // tool calls can appear in commentary or analysis channels
+            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
+
+            std::vector<std::string> tool_rules_recipient_in_role;
+            std::vector<std::string> tool_rules_recipient_in_channel;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                tool_rules_recipient_in_role.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+
+                tool_rules_recipient_in_channel.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+            });
+
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
+                channel + " \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_channel, " | ") + " )"
+            );
+
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+
+            // Trigger on tool calls that appear in the commentary channel
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|channel\\|>(commentary|analysis) to"
+            });
+
+            // Trigger tool calls that appear in the role section, either at the
+            // start or in the middle.
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                "^ to"
+            });
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|start\\|>assistant to"
+            });
+        });
+    }

     return data;
 }
 static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
-    [... 5 removed lines; their content is not shown in this diff rendering ...]
+    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+
+    static const common_regex start_regex("<\\|start\\|>assistant");
+    static const common_regex analysis_regex("<\\|channel\\|>analysis");
+    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
+    static const common_regex preamble_regex("<\\|channel\\|>commentary");
+    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
+    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
+
+    auto consume_end = [&](bool include_end = false) {
+        if (auto res = builder.try_find_literal("<|end|>")) {
+            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
+        }
+        return builder.consume_rest();
+    };
+
+    auto handle_tool_call = [&](const std::string & name) {
+        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
+            if (builder.syntax().parse_tool_calls) {
+                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
+                    throw common_chat_msg_partial_exception("incomplete tool call");
+                }
+            } else if (args->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
+            }
+        }
+    };
+
+    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
+        auto match = regex.search(input, 0, true);
+        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
+            return match;
+        }
+        return std::nullopt;
+    };
+
+    do {
+        auto header_start_pos = builder.pos();
+        auto content_start = builder.try_find_literal("<|message|>");
+        if (!content_start) {
+            throw common_chat_msg_partial_exception("incomplete header");
+        }
+
+        auto header = content_start->prelude;
+
+        if (auto match = regex_match(tool_call1_regex, header)) {
+            auto group = match->groups[1];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (auto match = regex_match(tool_call2_regex, header)) {
+            auto group = match->groups[2];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (regex_match(analysis_regex, header)) {
+            builder.move_to(header_start_pos);
+            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+                builder.add_content(consume_end(true));
+            } else {
+                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
+            }
+            continue;
+        }
+
+        if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
+            builder.add_content(consume_end());
+            continue;
+        }
+
+        // Possibly a malformed message, attempt to recover by rolling
+        // back to pick up the next <|start|>
+        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
+        builder.move_to(header_start_pos);
+    } while (builder.try_find_regex(start_regex, std::string::npos, false));
+
+    auto remaining = builder.consume_rest();
+    if (!remaining.empty()) {
+        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
     }
 }

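For orientation, the shape of assistant output that the rewritten gpt-oss parser above accepts is sketched below. The literal text is a hand-written illustration of the channel syntax matched by the regexes in the hunk, not output captured from the package, and `get_weather` is a made-up tool name:

```cpp
// Illustrative gpt-oss style message, as the parser above would see it:
//  - the analysis channel segment is extracted as reasoning content,
//  - the "to=functions.NAME" header turns into a tool call with JSON arguments,
//  - the final channel segment becomes the visible response.
static const char * example_gpt_oss_output =
    "<|channel|>analysis<|message|>The user wants the weather in Paris.<|end|>"
    "<|start|>assistant to=functions.get_weather<|channel|>commentary"
    " <|constrain|>json<|message|>{\"location\": \"Paris\"}<|end|>"
    "<|start|>assistant<|channel|>final<|message|>Let me check the forecast.";
```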
@@ -1721,6 +1896,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }

+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for Granite template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (!inputs.tools.is_null()) {
+        // Granite uses <|tool_call|> followed by JSON list
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+                    "-args", {
+                        {"type", "object"},
+                        {"properties", {
+                            {"name", {{"const", name}}},
+                            {"arguments", parameters},
+                        }},
+                        {"required", json::array({"name", "arguments"})},
+                    })));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+            if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+            } else {
+                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+            }
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                "<|tool_call|>"
+            });
+
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+                "<|tool_call|>",
+            };
+        });
+    } else {
+        // Handle thinking tags for non-tool responses
+        if (data.thinking_forced_open && inputs.enable_thinking) {
+            data.grammar_lazy = false;
+            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+            };
+        }
+    }
+
+    return data;
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
+    }
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        }
+    } else {
+        builder.add_content(builder.consume_rest());
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
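Similarly, the Granite parser added above expects output of roughly the following shape. This string is an illustrative example assembled from the tags the parser looks for (`<think>`, `<response>`, `<|tool_call|>`), with a made-up function name:

```cpp
// Illustrative Granite-style message for common_chat_parse_granite() above:
//  - <think>...</think> is extracted as reasoning,
//  - <response>...</response> becomes the visible content,
//  - <|tool_call|> is followed by a JSON array of {name, arguments} objects.
static const char * example_granite_output =
    "<think>I should call the weather tool.</think>"
    "<response>Give me a moment to look that up.</response>"
    "<|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]";
```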
@@ -1754,8 +2047,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
-    params.add_bos =
-    params.add_eos =
+    params.add_bos = tmpls->add_bos;
+    params.add_eos = tmpls->add_eos;

     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
@@ -1792,6 +2085,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }

+    // Granite (IBM) - detects thinking / tools support
+    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+        return common_chat_params_init_granite(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -1852,6 +2150,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     std::vector<std::string> contents;
+
     for (const auto & msg : inputs.messages) {
         auto content = msg.content;
         for (const auto & part : msg.content_parts) {
@@ -1953,6 +2252,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GRANITE:
+            common_chat_parse_granite(builder);
+            break;
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;
package/src/llama.cpp/common/chat.h

@@ -120,6 +120,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
@@ -197,10 +198,12 @@ std::string common_chat_format_single(
 // Returns an example of formatted chat
 std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
-    bool use_jinja);
+    bool use_jinja,
+    const std::map<std::string, std::string> & chat_template_kwargs);

 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
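A call site for the widened `common_chat_format_example` declaration above now passes the template kwargs explicitly. The sketch below is illustrative only and assumes the simplest caller with no extra kwargs; `tmpls` is assumed to be a valid pointer obtained elsewhere:

```cpp
#include <map>
#include <string>
#include "chat.h"

// Usage sketch of the new signature; an empty kwargs map corresponds to the
// previous two-argument call common_chat_format_example(tmpls, use_jinja).
std::string preview_chat_format(const common_chat_templates * tmpls, bool use_jinja) {
    const std::map<std::string, std::string> chat_template_kwargs; // none by default
    return common_chat_format_example(tmpls, use_jinja, chat_template_kwargs);
}
```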
package/src/llama.cpp/common/common.cpp

@@ -41,6 +41,7 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <string.h>
 #include <fcntl.h>
 #include <io.h>
 #else
@@ -1566,3 +1567,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

     return result;
 }
+
+ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+    const lr_opt & d = *(lr_opt *) userdata;
+    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
+    result.sgd.wd = result.adamw.wd = d.wd;
+    return result;
+}
+
+// TODO make all command line args case-insensitive
+static inline bool eq_case_insensitive(char const* a, char const* b) {
+    return !
+#if defined(_MSC_VER)
+        _stricmp
+#else
+        strcasecmp
+#endif // defined(_MSC_VER)
+        (a, b);
+}
+
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
+    if (eq_case_insensitive("adamw", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    }
+    if (eq_case_insensitive("sgd", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_SGD;
+    }
+    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
+}
+
+// TODO simplify to use just log and exp
+static float const k_log_2 = std::log(2.f);
+
+void lr_opt::init() {
+    if (lr_min > 0 && lr_min < lr0) {
+        float nhalf = std::log(lr0 / lr_min) / k_log_2;
+        float e = epochs;
+        if (decay_epochs > 0 && decay_epochs < e) {
+            e = decay_epochs;
+        } else {
+            decay_epochs = e;
+        }
+        scale_epoch = nhalf / e;
+    }
+}
+
+float lr_opt::get_lr(float epoch) const {
+    float r = lr_min <= 0 ? lr0 :
+        epoch >= decay_epochs ? lr_min :
+        lr0 * std::pow(0.5f, epoch * scale_epoch);
+    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
+    return r;
+}
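The schedule implemented by `lr_opt` above halves the learning rate every `1/scale_epoch` epochs, reaching `lr_min` after `decay_epochs` and staying there. A small standalone sketch of the same arithmetic, with made-up hyperparameters, shows the resulting curve:

```cpp
// Standalone illustration of the lr_opt decay above:
//   scale_epoch = log2(lr0 / lr_min) / decay_epochs
//   lr(e)       = lr0 * 0.5^(e * scale_epoch)   for e < decay_epochs, else lr_min
#include <cmath>
#include <cstdio>

int main() {
    const float lr0          = 1e-4f;  // learning rate at epoch 0
    const float lr_min       = 1e-5f;  // floor reached after decay_epochs
    const float decay_epochs = 10.0f;
    const float scale_epoch  = std::log(lr0 / lr_min) / std::log(2.0f) / decay_epochs;

    for (int e = 0; e <= 12; e += 2) {
        const float lr = e >= decay_epochs ? lr_min
                                           : lr0 * std::pow(0.5f, e * scale_epoch);
        std::printf("epoch %2d  lr = %.3g\n", e, lr);  // e.g. epoch 10 -> 1e-05
    }
    return 0;
}
```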
package/src/llama.cpp/common/common.h

@@ -2,14 +2,17 @@

 #pragma once

-#include "llama-cpp.h"
-
 #include <set>
+#include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
 #include <sstream>
+#include <cmath>
+
+#include "ggml-opt.h"
+#include "llama-cpp.h"

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +85,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
+    LLAMA_EXAMPLE_FINETUNE,

     LLAMA_EXAMPLE_COUNT,
 };
@@ -202,6 +206,7 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
     std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -234,13 +239,36 @@ struct common_params_diffusion {
     bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };

+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };

+
+struct lr_opt {
+    float lr0 = 1e-5; // learning rate at first epoch
+    float lr_min = -1;
+    float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
+    float scale_epoch = 0;
+    float wd = 0;
+    unsigned epochs = 2;
+
+    unsigned epoch; // set by optimizer outer (epochs) loop
+    // learning rate decay - constant LR per epoch only for now
+    float get_lr(float e) const;
+    float get_lr() const { return get_lr(epoch); }
+    // must call after arg parse, before get_lr
+    void init();
+};
+
+struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
+
 struct common_params {
     bool vocab_only = false;
     int32_t n_predict = -1; // new tokens to predict
@@ -348,7 +376,7 @@ struct common_params {
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
-    bool ctx_shift =
+    bool ctx_shift = false; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache

@@ -376,6 +404,11 @@ struct common_params {
     bool no_mmproj = false; // explicitly disable multimodal model
     std::vector<std::string> image; // path to image file(s)

+    // finetune
+    struct lr_opt lr;
+    enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    float val_split = 0.05f; // fraction of the data used for the validation set
+
     // embedding
     bool embedding = false; // get only sentence embedding
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@@ -384,11 +417,12 @@ struct common_params {
     std::string cls_sep = "\t"; // separator of classification sequences

     // server params
-    int32_t port
-    int32_t timeout_read
-    int32_t timeout_write
-    int32_t n_threads_http
-    int32_t n_cache_reuse
+    int32_t port = 8080; // server listens on this network port
+    int32_t timeout_read = 600; // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+    int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot

     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
@@ -703,3 +737,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 //

 ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+
+// "adamw" or "sgd" (case insensitive)
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -176,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
+option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
@@ -187,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_WEBGPU "ggml: use WebGPU" OFF)
 option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
+option(GGML_ZDNN "ggml: use zDNN" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)