@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/common/chat.cpp:

```diff
@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
```
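Note: the helper above detects support by rendering the same dummy conversation twice, with `enable_thinking` off and on, and comparing the resulting prompts. A minimal caller might look like the sketch below (the `configure_thinking` wrapper is hypothetical, not part of this package; `templates` would come from `common_chat_templates_init`):

```cpp
// Sketch only: wire the probe into input preparation.
#include "chat.h"

void configure_thinking(common_chat_templates * templates, common_chat_templates_inputs & inputs) {
    // Only enable the thinking toggle when the template actually reacts to it;
    // otherwise enable_thinking would silently be a no-op.
    inputs.enable_thinking = common_chat_templates_support_enable_thinking(templates);
}
```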
```diff
@@ -605,11 +618,13 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
```
```diff
@@ -671,11 +686,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
             ? builder.try_find_regex(*function_regex, from)
             : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -710,6 +727,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
```
```diff
@@ -1170,6 +1189,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
```
```diff
@@ -1299,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
```
```diff
@@ -1320,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
```
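Note: a rough sketch of driving the new DeepSeek V3.1 parser through `common_chat_parse` (whose signature appears in the chat.h hunk below). The response text and the `get_weather` tool are made up for illustration; the `common_chat_syntax` fields set here are the ones this parser consults:

```cpp
// Sketch only: parsing a hypothetical, complete DeepSeek V3.1 generation.
#include "chat.h"
#include <string>

common_chat_msg parse_v3_1_example() {
    common_chat_syntax syntax;
    syntax.format               = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
    syntax.reasoning_format     = COMMON_REASONING_FORMAT_DEEPSEEK;
    syntax.thinking_forced_open = true;  // the rendered prompt ended with "<think>"
    syntax.parse_tool_calls     = true;

    // Reasoning first, then a tool call in the format the grammar above constrains.
    const std::string response =
        "I should look this up.</think>"
        "<|tool▁calls▁begin|><|tool▁call▁begin|>get_weather<|tool▁sep|>"
        "{\"location\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>";

    // is_partial == false: the generation is complete
    return common_chat_parse(response, /* is_partial= */ false, syntax);
}
```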
```diff
@@ -1816,7 +2021,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "("
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
```
```diff
@@ -2046,6 +2251,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
```
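Note: the Nemotron V2 parser above expects an optional `<think>` block followed by a JSON array wrapped in `<TOOLCALL>`/`</TOOLCALL>` tags. A hypothetical end-to-end parse (the response string and tool name are invented for illustration):

```cpp
// Sketch only: parsing a made-up Nemotron V2 style response.
#include "chat.h"
#include <string>

common_chat_msg parse_nemotron_example() {
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_NEMOTRON_V2;
    syntax.parse_tool_calls = true;

    // Optional reasoning block, then the JSON array the grammar constrains.
    const std::string response =
        "<think>need a lookup</think>"
        "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]</TOOLCALL>";

    return common_chat_parse(response, /* is_partial= */ false, syntax);
}
```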
```diff
@@ -2249,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+        params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_deepseek_r1(tmpl, params);
```
```diff
@@ -2279,6 +2517,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
```
```diff
@@ -2416,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            common_chat_parse_deepseek_r1(builder);
            break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
```
```diff
@@ -2440,6 +2686,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
             common_chat_parse_seed_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```
package/src/llama.cpp/common/chat.h:

```diff
@@ -118,11 +118,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```
```diff
@@ -209,6 +211,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
```
package/src/llama.cpp/common/common.h:

```diff
@@ -445,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui            = true;
-    bool endpoint_slots   = false;
+    bool endpoint_slots   = true;
     bool endpoint_props   = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
```
package/src/llama.cpp/common/json-schema-to-grammar.cpp:

```diff
@@ -843,9 +843,10 @@
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +879,17 @@
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
```
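Note: the new `allOf` branch tallies each component's `enum` values and, when a value is present in every component, emits the intersection as the rule for the whole schema. A hypothetical schema exercising it (only `"b"` survives, since it is the one value common to both components); the entry point assumed here is the public `json_schema_to_grammar()` from common:

```cpp
// Sketch only: a made-up allOf/enum schema and its grammar conversion.
#include "json-schema-to-grammar.h"
#include <nlohmann/json.hpp>
#include <string>

std::string allof_enum_grammar() {
    // Both components constrain the same string value; the generated rule is
    // the intersection of the two enums, i.e. ("\"b\"") space.
    auto schema = nlohmann::ordered_json::parse(R"({
        "type": "string",
        "allOf": [
            { "enum": ["a", "b"] },
            { "enum": ["b", "c"] }
        ]
    })");
    return json_schema_to_grammar(schema);
}
```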
package/src/llama.cpp/common/log.cpp:

```diff
@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
```
package/src/llama.cpp/common/log.h:

```diff
@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO     = -1,
+    LOG_COLORS_DISABLED =  0,
+    LOG_COLORS_ENABLED  =  1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file      (struct common_log * log, const char * file);
-void common_log_set_colors    (struct common_log * log, bool colors);
-void common_log_set_prefix    (struct common_log * log, bool prefix);     // whether to output prefix to each log
-void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
```
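Note: taken together, the log changes let callers either force colors or defer to the new detection logic. A minimal sketch (the `init_logging` wrapper is hypothetical; both functions it calls are declared in the hunks above):

```cpp
// Sketch only: opting the default logger into color auto-detection.
#include "log.h"

void init_logging() {
    // LOG_COLORS_AUTO honors NO_COLOR and TERM=dumb, and enables colors only
    // when stdout or stderr is attached to a terminal; the other two enum
    // values force colors on or off unconditionally.
    common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
}
```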
package/src/llama.cpp/common/sampling.cpp:

```diff
@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
```
package/src/llama.cpp/common/sampling.h:

```diff
@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
```
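Note: callers that need ordered candidates now opt in via `do_sort`, as `common_speculative_gen_draft` does in the next hunk. A hypothetical inspection helper (assumes `gsmpl` is an initialized `common_sampler` on which a token was just sampled):

```cpp
// Sketch only: dump the top candidates after sampling.
#include "sampling.h"
#include "log.h"

void dump_top_candidates(struct common_sampler * gsmpl) {
    // do_sort == true guarantees descending order of probability on return
    const auto * cur_p = common_sampler_get_candidates(gsmpl, /* do_sort= */ true);

    for (size_t i = 0; i < cur_p->size && i < 3; ++i) {
        LOG_INF("candidate %zu: token %d, p = %.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
    }
}
```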
package/src/llama.cpp/common/speculative.cpp:

```diff
@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
         common_sampler_sample(smpl, ctx_dft, 0, true);
 
-        const auto * cur_p = common_sampler_get_candidates(smpl);
+        const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
         for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
             LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
```
package/src/llama.cpp/ggml/CMakeLists.txt:

```diff
@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX         "ggml: enable lasx"         ON)
 option(GGML_LSX          "ggml: enable lsx"          ON)
 option(GGML_RVV          "ggml: enable rvv"          ON)
-option(GGML_RV_ZFH       "ggml: enable riscv zfh"    OFF)
+option(GGML_RV_ZFH       "ggml: enable riscv zfh"    ON)
+option(GGML_RV_ZVFH      "ggml: enable riscv zvfh"   ON)
+option(GGML_RV_ZICBOP    "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE          "ggml: enable vxe"          ON)
-option(GGML_NNPA         "ggml: enable nnpa"         OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
```
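Note: the RISC-V extension toggles are ordinary CMake options, so consumers can override them from the cache; the removed `GGML_NNPA` option no longer has any effect. A hypothetical override from a parent project:

```cmake
# Sketch only: overriding the new RISC-V toggles before including ggml
# (all three default to ON per the hunk above).
set(GGML_RV_ZFH    ON  CACHE BOOL "ggml: enable riscv zfh"    FORCE)
set(GGML_RV_ZVFH   ON  CACHE BOOL "ggml: enable riscv zvfh"   FORCE)
set(GGML_RV_ZICBOP OFF CACHE BOOL "ggml: enable riscv zicbop" FORCE)
```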
package/src/llama.cpp/ggml/include/ggml-backend.h:

```diff
@@ -132,6 +132,8 @@ extern "C" {
         GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
 
     // all the device properties
     struct ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct ggml_backend_dev_caps caps;
     };
 
@@ -307,6 +319,9 @@ extern "C" {
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
```
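Note: the extended `ggml_backend_dev_props` (new `device_id` field, per-field comments) and the `IGPU` device type can be inspected through ggml's existing device registry API (`ggml_backend_dev_count` / `ggml_backend_dev_get` / `ggml_backend_dev_get_props`). A hypothetical enumeration sketch:

```cpp
// Sketch only: list all registered backend devices and their properties.
#include "ggml-backend.h"
#include <cstdio>

void list_devices() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(ggml_backend_dev_get(i), &props);

        // device_id may be NULL when the backend cannot determine it
        printf("%s (%s): type=%d, id=%s, free=%zu/%zu bytes\n",
               props.name, props.description, (int) props.type,
               props.device_id ? props.device_id : "unknown",
               props.memory_free, props.memory_total);
    }
}
```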