@fugood/llama.node 1.1.10 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +20 -2
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +174 -388
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +67 -37
- package/src/llama.cpp/common/chat.cpp +263 -2
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +5 -2
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
- package/src/llama.cpp/include/llama.h +32 -7
- package/src/llama.cpp/src/llama-adapter.cpp +101 -4
- package/src/llama.cpp/src/llama-adapter.h +6 -0
- package/src/llama.cpp/src/llama-arch.cpp +69 -2
- package/src/llama.cpp/src/llama-arch.h +6 -0
- package/src/llama.cpp/src/llama-context.cpp +92 -45
- package/src/llama.cpp/src/llama-context.h +1 -5
- package/src/llama.cpp/src/llama-graph.cpp +74 -19
- package/src/llama.cpp/src/llama-graph.h +10 -1
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
- package/src/llama.cpp/src/llama-kv-cache.h +4 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +434 -21
- package/src/llama.cpp/src/llama-model.h +1 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/common/chat.cpp

@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
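The new helper probes a chat template by rendering the same dummy message twice, once with enable_thinking off and once with it on, and reports whether the rendered prompt changes. A minimal sketch of how a caller might use it; common_chat_templates_init() and its smart-pointer return type are assumed from the surrounding llama.cpp common API, only common_chat_templates_support_enable_thinking() itself comes from this diff:

```cpp
#include "chat.h"

// Sketch: decide whether a "thinking" toggle is meaningful for the loaded model.
static bool model_has_thinking_toggle(const llama_model * model) {
    auto tmpls = common_chat_templates_init(model, /* chat_template_override */ "");
    // If toggling enable_thinking changes the rendered prompt, the template
    // actually reacts to the flag and the toggle is worth exposing.
    return common_chat_templates_support_enable_thinking(tmpls.get());
}
```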
@@ -609,6 +622,8 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }

@@ -1169,6 +1184,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
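For reference, the grammar built above constrains tool-calling output to a single <TOOLCALL>...</TOOLCALL> block wrapping a JSON array of {name, arguments} objects (no tool-call IDs, unlike Command R). A hypothetical conforming completion, with the tool name and arguments invented purely for illustration:

```cpp
// Hypothetical Nemotron V2 model output that the grammar above allows and that
// common_chat_parse_nemotron_v2() (added further down) turns into tool calls.
static const char * example_nemotron_output =
    "<think>reasoning...</think>\n"
    "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]</TOOLCALL>";
```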
@@ -1815,7 +1891,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"

@@ -2045,6 +2121,121 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+    // Parse thinking tags first - this handles the main reasoning content
+    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+    static const common_regex tool_call_begin_regex("<seed:tool_call>");
+    static const common_regex tool_call_end_regex("</seed:tool_call>");
+    static const common_regex function_regex("<function=([^>]+)>");
+    static const common_regex param_regex("<parameter=([^>]+)>");
+
+    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+        builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+        // Look for function call inside tool call, ignore any content before it
+        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+            auto function_name = builder.str(func_res->groups[1]);
+
+            // Parse Seed-OSS parameters <parameter=name>value</parameter>
+            json args = json::object();
+            // Parse all parameters
+            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+                // again, ignore noise around parameters
+                auto param_name = builder.str(param_res->groups[1]);
+                builder.move_to(param_res->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after parameter
+                auto savedPos = builder.pos();
+                if (auto param_parse = builder.try_find_literal("</parameter>")) {
+                    auto param = param_parse->prelude;
+                    builder.move_to(savedPos);
+                    try {
+                        if (auto param_res = builder.try_consume_json()) {
+                            args[param_name] = param_res->json;
+                        } else {
+                            args[param_name] = param;
+                        }
+                    } catch (json::exception &) {
+                        args[param_name] = param;
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+                }
+            }
+            // Look for closing function tag
+            auto end_func = builder.try_find_literal("</function>");
+            if (end_func) {
+                builder.move_to(end_func->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after </function>
+
+                // Add the tool call with parsed arguments, but only if we REALLY got the literal
+                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+                auto funlen = std::string("</function>").length();
+                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+                    if (!builder.add_tool_call(function_name, "", args.dump())) {
+                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            // Look for closing tool call tag
+            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+                builder.move_to(end_tool->groups[0].end);
+                builder.consume_spaces(); // Consume trailing whitespace after tool call
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+        } else {
+            // No function found - don't consume content here, let it be handled at the end
+            break;
+        }
+    }
+
+    // Consume any remaining whitespace after all tool call processing
+    builder.consume_spaces();
+    auto remaining = builder.consume_rest();
+    // If there's any non-whitespace content remaining, add it as content
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
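By contrast, Seed-OSS emits tag-structured tool calls rather than a JSON array: the parser above looks for <seed:tool_call>, reads the function name from <function=...>, and collects each <parameter=...> value, trying JSON first and falling back to the raw string. A hypothetical fragment it would accept, with the function and parameter names invented for illustration:

```cpp
// Hypothetical Seed-OSS style output handled by common_chat_parse_seed_oss().
static const char * example_seed_oss_output =
    "<seed:think>short reasoning...</seed:think>"
    "<seed:tool_call>"
    "<function=get_weather>"
    "<parameter=city>Paris</parameter>"
    "<parameter=days>3</parameter>"
    "</function>"
    "</seed:tool_call>";
```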
@@ -2061,8 +2252,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
     return data;
 }
 
+static common_chat_params common_chat_params_init_seed_oss(
+        const common_chat_template & tmpl,
+        templates_params & params,
+        const common_chat_templates_inputs & inputs)
+{
+    common_chat_params data;
+    data.prompt = apply(tmpl, params);
+    data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+    if (string_ends_with(data.prompt, "<seed:think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</seed:think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (params.tools.is_array() && !params.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(params.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                // Create rule for Seed-OSS function call format
+                std::string param_rules;
+                if (parameters.contains("properties")) {
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+                                       "\"</parameter>\"";
+                    }
+                }
+
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+                    param_rules +
+                    " \"</function>\" space \"</seed:tool_call>\""));
+            });
+
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+            data.preserved_tokens = {
+                "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+                "<function=", "</function>", "<parameter=", "</parameter>",
+            };
+
+            builder.add_rule("root", string_join(tool_rules, " | "));
+        });
+    }
+    return data;
+}
+
 static common_chat_params common_chat_templates_apply_jinja(
-    const struct common_chat_templates
+    const struct common_chat_templates * tmpls,
     const struct common_chat_templates_inputs & inputs)
 {
     templates_params params;

@@ -2131,6 +2376,16 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
+    // Seed-OSS
+    if (src.find("<seed:think>") != std::string::npos) {
+        return common_chat_params_init_seed_oss(tmpl, params, inputs);
+    }
+
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -2289,6 +2544,12 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_SEED_OSS:
+            common_chat_parse_seed_oss(builder);
+            break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/chat.h

@@ -122,6 +122,8 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
+    COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };

@@ -208,6 +210,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
package/src/llama.cpp/common/common.cpp

@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n",
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         return iparams;
     }
 

@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n",
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }

@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
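The new llama_adapter_meta_val_str() call copies adapter GGUF metadata into a caller-provided buffer, and common_adapter_lora_info (see the common.h hunk below) now carries the task name and prompt prefix alongside the adapter pointer. A minimal sketch of consuming those fields after common_init_from_params(); the logging and the empty-string checks are illustrative, not part of the diff:

```cpp
// Sketch: surface per-adapter metadata loaded by common_init_from_params().
// 'params' is the common_params instance that was passed in.
for (const auto & la : params.lora_adapters) {
    if (!la.task_name.empty()) {
        LOG_INF("lora '%s': task '%s'\n", la.path.c_str(), la.task_name.c_str());
    }
    if (!la.prompt_prefix.empty()) {
        // A non-empty prompt_prefix suggests the adapter expects this text to be
        // prepended to prompts when it is active.
        LOG_INF("lora '%s': prompt prefix '%s'\n", la.path.c_str(), la.prompt_prefix.c_str());
    }
}
```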
@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
package/src/llama.cpp/common/common.h

@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
 

@@ -310,6 +313,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;

@@ -373,7 +377,6 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
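Note the flag migration: the old boolean common_params::flash_attn is removed and replaced by the llama_flash_attn_type enum, which common_context_params_to_llama() (hunk above) now forwards to llama_context_params::flash_attn_type. A minimal sketch of the new configuration path; LLAMA_FLASH_ATTN_TYPE_AUTO is the only enum value visible in this diff, other values are assumed to exist:

```cpp
// Sketch: configuring flash attention through the new enum instead of a bool.
common_params params;
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // let the backend decide

llama_context_params cparams = common_context_params_to_llama(params);
// cparams.flash_attn_type now carries the choice to llama_init_from_model().
```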
@@ -442,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots =
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
package/src/llama.cpp/common/log.cpp

@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }

@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }

@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log,
-
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
package/src/llama.cpp/common/log.h

@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO = -1,
+    LOG_COLORS_DISABLED = 0,
+    LOG_COLORS_ENABLED = 1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;

@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file (struct common_log * log, const char * file);
-void common_log_set_colors (struct common_log * log,
-void common_log_set_prefix (struct common_log * log,
-void common_log_set_timestamps(struct common_log * log,
+void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
+void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
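With the new log_colors enum, callers can force colors on or off or defer to the NO_COLOR/TERM/isatty heuristic shown in log.cpp, and common_log_main() now applies that auto-detection once on first use via std::call_once. A minimal sketch of wiring a hypothetical --color=auto|on|off flag to the new API (the flag itself is an assumption, only the enum and setter come from this diff):

```cpp
#include "log.h"
#include <string>

// Sketch: map a hypothetical --color flag value onto the new log_colors enum.
void configure_log_colors(const std::string & flag) {
    log_colors colors = LOG_COLORS_AUTO;              // default: NO_COLOR/TERM/tty heuristic
    if (flag == "on")  { colors = LOG_COLORS_ENABLED;  }
    if (flag == "off") { colors = LOG_COLORS_DISABLED; }
    common_log_set_colors(common_log_main(), colors); // not thread-safe, per the header note
}
```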
package/src/llama.cpp/common/sampling.cpp

@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {

package/src/llama.cpp/common/sampling.h

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
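The extra do_sort parameter makes sorting opt-in: callers that only need the selected token can skip the O(n log n) sort, while callers that inspect the top of the distribution (such as the speculative-decoding change below) request it explicitly. A minimal sketch of the consuming pattern, where smpl and ctx are assumed to be an already initialized common_sampler and llama_context:

```cpp
// Sketch: sample, then inspect the top candidates in probability order.
common_sampler_sample(smpl, ctx, /* idx */ 0, /* grammar_first */ true);

const llama_token_data_array * cur_p = common_sampler_get_candidates(smpl, /* do_sort */ true);
for (size_t i = 0; i < cur_p->size && i < 5; ++i) {
    // With do_sort == true the entries are in descending order of probability.
    printf("cand %zu: token %d p=%.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
}
```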
package/src/llama.cpp/common/speculative.cpp

@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
     common_sampler_sample(smpl, ctx_dft, 0, true);
 
-    const auto * cur_p = common_sampler_get_candidates(smpl);
+    const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
     for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
         LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
-project("ggml" C CXX)
+project("ggml" C CXX ASM)
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
-option(GGML_RV_ZFH "ggml: enable riscv zfh"
+option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
+option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
+option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -307,6 +307,9 @@ extern "C" {
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
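ggml_backend_sched_split_graph() exposes the scheduler's graph-splitting pass without allocating buffers, which lets callers inspect or pre-plan how a graph will be divided across backends before committing memory. A minimal sketch, assuming sched and graph were built through the usual ggml-backend flow; whether a prior reset is required before splitting is an assumption, not something this diff states:

```cpp
// Sketch: split a graph across backends first, allocate and compute later.
// 'sched' is an existing ggml_backend_sched_t, 'graph' a built ggml_cgraph *.
ggml_backend_sched_reset(sched);               // existing API: clear previous assignments (assumed step)
ggml_backend_sched_split_graph(sched, graph);  // new in this diff: compute the splits only

// When memory should actually be committed:
if (ggml_backend_sched_alloc_graph(sched, graph)) {
    ggml_backend_sched_graph_compute(sched, graph);
}
```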
package/src/llama.cpp/ggml/include/ggml-cpu.h

@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
 