@fugood/llama.node 1.1.5 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +4 -0
- package/lib/index.js +6 -1
- package/lib/index.ts +6 -0
- package/lib/version.js +5 -0
- package/lib/version.ts +2 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +19 -15
- package/src/LlamaCompletionWorker.cpp +73 -18
- package/src/LlamaCompletionWorker.h +8 -0
- package/src/llama.cpp/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +147 -46
- package/src/llama.cpp/common/chat-parser.cpp +9 -1
- package/src/llama.cpp/common/chat.cpp +350 -3
- package/src/llama.cpp/common/chat.h +11 -3
- package/src/llama.cpp/common/common.cpp +54 -0
- package/src/llama.cpp/common/common.h +44 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
- package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/src/llama.cpp/ggml/include/ggml.h +65 -3
- package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +26 -0
- package/src/llama.cpp/src/llama-arch.cpp +65 -0
- package/src/llama.cpp/src/llama-arch.h +10 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +15 -4
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +37 -25
- package/src/llama.cpp/src/llama-context.h +6 -5
- package/src/llama.cpp/src/llama-graph.cpp +118 -9
- package/src/llama.cpp/src/llama-graph.h +38 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
- package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
- package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
- package/src/llama.cpp/src/llama-memory.h +2 -2
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +500 -4
- package/src/llama.cpp/src/llama-model.h +25 -4
- package/src/llama.cpp/src/llama-quant.cpp +37 -1
- package/src/llama.cpp/src/llama-vocab.cpp +43 -0
package/src/llama.cpp/common/chat.cpp

@@ -132,6 +132,8 @@ struct templates_params {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     json extra_context;
+    bool add_bos;
+    bool add_eos;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -281,6 +283,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
+            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;
@@ -434,6 +437,8 @@ std::string common_chat_format_single(
 
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
 
     std::string fmt_past_msg;
     if (!past_msg.empty()) {
@@ -455,9 +460,12 @@ std::string common_chat_format_single(
     return ss.str();
 }
 
-std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
+    inputs.chat_template_kwargs = chat_template_kwargs;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;
@@ -533,8 +541,21 @@ common_chat_templates_ptr common_chat_templates_init(
             default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+
+    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+    if (default_template_src.find("<|channel|>") != std::string::npos
+            // search for the error message and patch it
+            && default_template_src.find("in message.content or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
+    bool add_bos = false;
+    bool add_eos = false;
     if (model) {
         const auto * vocab = llama_model_get_vocab(model);
         const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +570,13 @@ common_chat_templates_ptr common_chat_templates_init(
         };
         token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
         token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+        add_bos = llama_vocab_get_add_bos(vocab);
+        add_eos = llama_vocab_get_add_eos(vocab);
     }
     common_chat_templates_ptr tmpls(new common_chat_templates());
     tmpls->has_explicit_template = has_explicit_template;
+    tmpls->add_bos = add_bos;
+    tmpls->add_eos = add_eos;
     try {
         tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
     } catch (const std::exception & e) {
@@ -581,6 +606,8 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
+        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -589,13 +616,28 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
 }
 
+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+    if (format == "none") {
+        return COMMON_REASONING_FORMAT_NONE;
+    } else if (format == "auto") {
+        return COMMON_REASONING_FORMAT_AUTO;
+    } else if (format == "deepseek") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK;
+    } else if (format == "deepseek-legacy") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    }
+    throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
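Note on the pair above: common_reasoning_format_name and the new common_reasoning_format_from_name are inverses for the names they both know (from_name deliberately has no "granite" branch, since that format is only ever produced internally). A minimal usage sketch, assuming common/chat.h is on the include path; it is not part of the diff:

    #include "chat.h"
    #include <cstdio>

    int main() {
        // Round-trip a known name; unknown names make from_name throw std::runtime_error.
        common_reasoning_format fmt = common_reasoning_format_from_name("deepseek");
        printf("%s\n", common_reasoning_format_name(fmt)); // prints "deepseek"
        return 0;
    }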
@@ -737,10 +779,10 @@ static std::string apply(
     // instead of using `chat_template_options.use_bos_token = false`, since these tokens
     // may be needed inside the template / between messages too.
     auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (string_starts_with(result, tmpl.bos_token())) {
+    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
         result = result.substr(tmpl.bos_token().size());
     }
-    if (string_ends_with(result, tmpl.eos_token())) {
+    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
         result = result.substr(0, result.size() - tmpl.eos_token().size());
     }
     return result;
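The new add_bos/add_eos guards make the stripping conditional: per the comment above, the rendered BOS/EOS are removed because tokenization adds them again, so with this change a template-supplied BOS is now kept whenever the vocab would not re-add one. A standalone sketch of the same rule, using a hypothetical helper that only mirrors the diff's logic:

    #include <string>

    // Hypothetical mirror of the logic above: only strip the leading BOS when
    // the tokenizer would add another one anyway.
    static std::string strip_leading_bos(std::string result, const std::string & bos, bool add_bos) {
        if (add_bos && result.compare(0, bos.size(), bos) == 0) { // starts_with
            result = result.substr(bos.size());
        }
        return result;
    }

    // strip_leading_bos("<s>Hello", "<s>", true)  -> "Hello"
    // strip_leading_bos("<s>Hello", "<s>", false) -> "<s>Hello" (template-supplied BOS kept)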
@@ -1278,6 +1320,174 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // These special tokens are required to parse properly, so we include them
+    // even if parse_tool_calls is false.
+    data.preserved_tokens = {
+        "<|channel|>",
+        "<|constrain|>",
+        "<|message|>",
+        "<|start|>",
+        "<|end|>",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            // tool calls can appear in commentary or analysis channels
+            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
+
+            std::vector<std::string> tool_rules_recipient_in_role;
+            std::vector<std::string> tool_rules_recipient_in_channel;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                tool_rules_recipient_in_role.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+
+                tool_rules_recipient_in_channel.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+            });
+
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
+                channel + " \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_channel, " | ") + " )"
+            );
+
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+
+            // Trigger on tool calls that appear in the commentary channel
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|channel\\|>(commentary|analysis) to"
+            });
+
+            // Trigger tool calls that appear in the role section, either at the
+            // start or in the middle.
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                "^ to"
+            });
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|start\\|>assistant to"
+            });
+        });
+    }
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+
+    static const common_regex start_regex("<\\|start\\|>assistant");
+    static const common_regex analysis_regex("<\\|channel\\|>analysis");
+    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
+    static const common_regex preamble_regex("<\\|channel\\|>commentary");
+    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
+    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
+
+    auto consume_end = [&](bool include_end = false) {
+        if (auto res = builder.try_find_literal("<|end|>")) {
+            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
+        }
+        return builder.consume_rest();
+    };
+
+    auto handle_tool_call = [&](const std::string & name) {
+        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
+            if (builder.syntax().parse_tool_calls) {
+                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
+                    throw common_chat_msg_partial_exception("incomplete tool call");
+                }
+            } else if (args->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
+            }
+        }
+    };
+
+    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
+        auto match = regex.search(input, 0, true);
+        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
+            return match;
+        }
+        return std::nullopt;
+    };
+
+    do {
+        auto header_start_pos = builder.pos();
+        auto content_start = builder.try_find_literal("<|message|>");
+        if (!content_start) {
+            throw common_chat_msg_partial_exception("incomplete header");
+        }
+
+        auto header = content_start->prelude;
+
+        if (auto match = regex_match(tool_call1_regex, header)) {
+            auto group = match->groups[1];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (auto match = regex_match(tool_call2_regex, header)) {
+            auto group = match->groups[2];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (regex_match(analysis_regex, header)) {
+            builder.move_to(header_start_pos);
+            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+                builder.add_content(consume_end(true));
+            } else {
+                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
+            }
+            continue;
+        }
+
+        if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
+            builder.add_content(consume_end());
+            continue;
+        }
+
+        // Possibly a malformed message, attempt to recover by rolling
+        // back to pick up the next <|start|>
+        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
+        builder.move_to(header_start_pos);
+    } while (builder.try_find_regex(start_regex, std::string::npos, false));
+
+    auto remaining = builder.consume_rest();
+    if (!remaining.empty()) {
+        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
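To make the parser above easier to read, here is a hedged sketch of feeding a Harmony-style completion through the public entry point declared in common/chat.h; the sample string and expected results are illustrative assumptions, not part of the diff:

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GPT_OSS;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;

    // The analysis channel becomes reasoning_content; the final channel becomes content.
    common_chat_msg msg = common_chat_parse(
        "<|channel|>analysis<|message|>User asks about weather.<|end|>"
        "<|start|>assistant<|channel|>final<|message|>It is sunny.",
        /* is_partial */ false, syntax);
    // msg.reasoning_content == "User asks about weather."
    // msg.content           == "It is sunny."

A "to=functions.NAME" recipient in a header (either in the role section or in the commentary/analysis channel) would instead be routed into msg.tool_calls via handle_tool_call.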
@@ -1687,6 +1897,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }
 
+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for Granite template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (!inputs.tools.is_null()) {
+        // Granite uses <|tool_call|> followed by JSON list
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+                    "-args", {
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {{"const", name}}},
+                        {"arguments", parameters},
+                    }},
+                    {"required", json::array({"name", "arguments"})},
+                })));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+            if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+            } else {
+                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+            }
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                "<|tool_call|>"
+            });
+
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+                "<|tool_call|>",
+            };
+        });
+    } else {
+        // Handle thinking tags for non-tool responses
+        if (data.thinking_forced_open && inputs.enable_thinking) {
+            data.grammar_lazy = false;
+            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+            };
+        }
+    }
+
+    return data;
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
+    }
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        }
+    } else {
+        builder.add_content(builder.consume_rest());
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
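As with GPT-OSS, a hedged sketch of what common_chat_parse_granite extracts from a typical Granite completion; the sample string and expected fields are assumptions for illustration only:

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GRANITE;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;

    // <think> goes to reasoning_content, <response> to content,
    // and the <|tool_call|> JSON array to tool_calls.
    common_chat_msg msg = common_chat_parse(
        "<think>Need the forecast first.</think>"
        "<response>Let me check.</response>"
        "<|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]",
        /* is_partial */ false, syntax);
    // msg.reasoning_content  == "Need the forecast first."
    // msg.content            == "Let me check."
    // msg.tool_calls[0].name == "get_weather"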
@@ -1720,6 +2048,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+    params.add_bos = tmpls->add_bos;
+    params.add_eos = tmpls->add_eos;
 
     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
@@ -1756,11 +2086,21 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }
 
+    // Granite (IBM) - detects thinking / tools support
+    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+        return common_chat_params_init_granite(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1811,6 +2151,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     std::vector<std::string> contents;
+
     for (const auto & msg : inputs.messages) {
         auto content = msg.content;
         for (const auto & part : msg.content_parts) {
@@ -1912,6 +2253,12 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GRANITE:
+            common_chat_parse_granite(builder);
+            break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/chat.h

@@ -9,12 +9,14 @@
 #include <vector>
 #include <map>
 
-#include <minja/chat-template.hpp>
-#include <minja/minja.hpp>
+#include "minja/chat-template.hpp"
+#include "minja/minja.hpp"
 
 typedef minja::chat_template common_chat_template;
 
 struct common_chat_templates {
+    bool add_bos;
+    bool add_eos;
     bool has_explicit_template; // Model had builtin template or template overridde was specified.
     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
     std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,8 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
+    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -136,6 +140,8 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };
 
 struct common_chat_params {
@@ -192,10 +198,12 @@ std::string common_chat_format_single(
 // Returns an example of formatted chat
 std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
-    bool use_jinja);
+    bool use_jinja,
+    const std::map<std::string, std::string> & chat_template_kwargs);
 
 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
package/src/llama.cpp/common/common.cpp

@@ -41,6 +41,7 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <string.h>
 #include <fcntl.h>
 #include <io.h>
 #else
@@ -1566,3 +1567,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
 
     return result;
 }
+
+ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+    const lr_opt & d = *(lr_opt *) userdata;
+    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
+    result.sgd.wd = result.adamw.wd = d.wd;
+    return result;
+}
+
+// TODO make all command line args case-insensitive
+static inline bool eq_case_insensitive(char const* a, char const* b) {
+    return !
+#if defined(_MSC_VER)
+        _stricmp
+#else
+        strcasecmp
+#endif // defined(_MSC_VER)
+        (a, b);
+}
+
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
+    if (eq_case_insensitive("adamw", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    }
+    if (eq_case_insensitive("sgd", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_SGD;
+    }
+    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
+}
+
+// TODO simplify to use just log and exp
+static float const k_log_2 = std::log(2.f);
+
+void lr_opt::init() {
+    if (lr_min > 0 && lr_min < lr0) {
+        float nhalf = std::log(lr0 / lr_min) / k_log_2;
+        float e = epochs;
+        if (decay_epochs > 0 && decay_epochs < e) {
+            e = decay_epochs;
+        } else {
+            decay_epochs = e;
+        }
+        scale_epoch = nhalf / e;
+    }
+}
+
+float lr_opt::get_lr(float epoch) const {
+    float r = lr_min <= 0 ? lr0 :
+        epoch >= decay_epochs ? lr_min :
+        lr0 * std::pow(0.5f, epoch * scale_epoch);
+    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
+    return r;
+}