@fugood/llama.node 1.1.11 → 1.2.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +111 -1
- package/src/llama.cpp/common/chat.h +3 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
- package/src/llama.cpp/ggml/include/ggml.h +50 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +14 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +0 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +218 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +27 -4
- package/src/llama.cpp/src/llama-arch.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +62 -56
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +54 -9
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +1 -23
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +159 -1
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103

package/src/llama.cpp/common/chat.cpp:

```diff
@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
```
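
Note: the new `common_chat_templates_support_enable_thinking` helper probes a template by rendering a dummy conversation with the flag off and on. A minimal caller-side sketch; the wrapper name and surrounding code are illustrative, not part of the package:

```cpp
#include "chat.h"

// Hypothetical wrapper: `tmpls` is assumed to have been initialized elsewhere
// (e.g. from the model's built-in chat template).
void configure_thinking(const common_chat_templates * tmpls, bool user_wants_thinking) {
    if (common_chat_templates_support_enable_thinking(tmpls)) {
        // The template renders differently with enable_thinking on vs. off,
        // so the user's preference can actually take effect.
    } else if (user_wants_thinking) {
        // The template ignores the flag; warn the user or fall back here.
    }
}
```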

```diff
@@ -610,6 +623,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
```

```diff
@@ -1170,6 +1184,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                             std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                                 "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                                 " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                                          // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                                          // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                                          std::string(data.thinking_forced_open ?
+                                                          "[\\s\\S]*?(</think>\\s*)" :
+                                                          "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                                              "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
```

````diff
@@ -1816,7 +1891,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "(\\s*"
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
````

```diff
@@ -2046,6 +2121,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
```
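
Note: to illustrate the wire format this parser targets, here is a hedged sketch of running a hypothetical (not captured from a real model) Nemotron V2 completion through the public entry point; Nemotron V2 wraps tool calls in a JSON array between `<TOOLCALL>` and `</TOOLCALL>`, optionally preceded by a `<think>` block:

```cpp
#include "chat.h"

// Sketch only: the sample output string is invented for illustration.
common_chat_msg parse_nemotron_example() {
    common_chat_syntax syntax;
    syntax.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;

    const std::string output =
        "<think>need the weather, so call get_weather</think>"
        "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]</TOOLCALL>";

    // is_partial = false: treat the generation as complete. The parser handles
    // the <think> block (per syntax.reasoning_format), consumes the JSON array
    // after <TOOLCALL>, and requires the closing </TOOLCALL> tag.
    return common_chat_parse(output, /* is_partial */ false, syntax);
}
```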

```diff
@@ -2279,6 +2381,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
```

```diff
@@ -2440,6 +2547,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
             common_chat_parse_seed_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```

package/src/llama.cpp/common/chat.h:

```diff
@@ -123,6 +123,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
```

```diff
@@ -209,6 +210,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
```

package/src/llama.cpp/common/common.h:

```diff
@@ -445,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
```

package/src/llama.cpp/common/log.cpp:

```diff
@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
```

```diff
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }
```

```diff
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
```

package/src/llama.cpp/common/log.h:

```diff
@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO = -1,
+    LOG_COLORS_DISABLED = 0,
+    LOG_COLORS_ENABLED = 1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;
```

```diff
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file      (struct common_log * log, const char * file);
-void common_log_set_colors    (struct common_log * log, bool colors);
-void common_log_set_prefix    (struct common_log * log, bool prefix);
-void common_log_set_timestamps(struct common_log * log, bool timestamps);
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
```
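
Note: callers now pass the tri-state enum instead of a bool. A minimal sketch; the function name is illustrative:

```cpp
#include "log.h"

// LOG_COLORS_AUTO re-runs the same detection used at startup:
// NO_COLOR set, TERM=dumb, and whether stdout/stderr are TTYs.
void init_logging(bool force_plain) {
    common_log_set_colors(common_log_main(),
                          force_plain ? LOG_COLORS_DISABLED : LOG_COLORS_AUTO);
}
```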

package/src/llama.cpp/common/sampling.cpp:

```diff
@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
```

package/src/llama.cpp/common/sampling.h:

```diff
@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
```
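
Note: a sketch of how a caller might use the new `do_sort` parameter; `log_top_candidates` is an illustrative name, and `LOG_INF` is the logging macro from common/log.h:

```cpp
#include "log.h"
#include "sampling.h"

// do_sort = true guarantees descending order by probability; pass false when
// order does not matter and consult the .sorted flag instead.
void log_top_candidates(struct common_sampler * gsmpl, int n) {
    const llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /* do_sort */ true);
    for (int i = 0; i < n && i < (int) cur_p->size; ++i) {
        LOG_INF("cand %2d: token %6d, p = %.4f\n", i, cur_p->data[i].id, cur_p->data[i].p);
    }
}
```

The speculative-decoding hunk below is the in-tree caller that relies on sorted candidates.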

package/src/llama.cpp/common/speculative.cpp:

```diff
@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
     common_sampler_sample(smpl, ctx_dft, 0, true);
 
-    const auto * cur_p = common_sampler_get_candidates(smpl);
+    const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
     for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
         LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
```

package/src/llama.cpp/ggml/CMakeLists.txt:

```diff
@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
-option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
+option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
+option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
```

package/src/llama.cpp/ggml/include/ggml-backend.h:

```diff
@@ -307,6 +307,9 @@ extern "C" {
     GGML_API void           ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
```

package/src/llama.cpp/ggml/include/ggml-cpu.h:

```diff
@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
```

package/src/llama.cpp/ggml/include/ggml.h:

```diff
@@ -511,6 +511,7 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_IM2COL_3D,
         GGML_OP_CONV_2D,
         GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
```

```diff
@@ -1870,6 +1871,41 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_im2col_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2, // dilation depth
+            enum ggml_type        dst_type);
+
+    // a: [OC*IC, KD, KH, KW]
+    // b: [N*IC, ID, IH, IW]
+    // result: [N*OC, OD, OH, OW]
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int64_t               IC,
+            int                   s0, // stride width
+            int                   s1, // stride height
+            int                   s2, // stride depth
+            int                   p0, // padding width
+            int                   p1, // padding height
+            int                   p2, // padding depth
+            int                   d0, // dilation width
+            int                   d1, // dilation height
+            int                   d2  // dilation depth
+            );
+
     // kernel size is a->ne[0] x a->ne[1]
     // stride is equal to kernel size
     // padding is zero
```
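
Note: a hedged usage sketch for the new im2col-based `ggml_conv_3d`; the wrapper name and the unit stride/padding/dilation choices are illustrative, and tensor layouts follow the header comments above:

```cpp
#include "ggml.h"

static struct ggml_tensor * build_conv3d(struct ggml_context * ctx,
                                         struct ggml_tensor  * a,  // kernel [OC*IC, KD, KH, KW]
                                         struct ggml_tensor  * b,  // input  [N*IC, ID, IH, IW]
                                         int64_t               IC) {
    // stride 1, no padding, dilation 1 in all three spatial dims
    return ggml_conv_3d(ctx, a, b, IC,
                        /* s0, s1, s2 */ 1, 1, 1,
                        /* p0, p1, p2 */ 0, 0, 0,
                        /* d0, d1, d2 */ 1, 1, 1);
}
```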

```diff
@@ -1941,7 +1977,7 @@ extern "C" {
             int                   d0, // dilation dimension 0
             int                   d1); // dilation dimension 1
 
-    GGML_API struct ggml_tensor * ggml_conv_3d(
+    GGML_API struct ggml_tensor * ggml_conv_3d_direct(
             struct ggml_context * ctx,
             struct ggml_tensor  * a, // kernel [KW, KH, KD, IC * OC]
             struct ggml_tensor  * b, // input [W, H, D, C * N]
```

```diff
@@ -2048,6 +2084,19 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    GGML_API struct ggml_tensor * ggml_pad_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   lp0,
+            int                   rp0,
+            int                   lp1,
+            int                   rp1,
+            int                   lp2,
+            int                   rp2,
+            int                   lp3,
+            int                   rp3
+            );
+
     // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
     GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
             struct ggml_context * ctx,
```
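
Note: `ggml_pad_ext` generalizes `ggml_pad` with independent leading (`lp*`) and trailing (`rp*`) zero-padding per dimension. A small sketch; the wrapper name is illustrative:

```cpp
#include "ggml.h"

// Pad only dim 0: one zero before and two after, other dims untouched,
// e.g. a row [a, b, c] becomes [0, a, b, c, 0, 0].
static struct ggml_tensor * pad_dim0(struct ggml_context * ctx, struct ggml_tensor * t) {
    return ggml_pad_ext(ctx, t,
                        /* lp0, rp0 */ 1, 2,
                        /* lp1, rp1 */ 0, 0,
                        /* lp2, rp2 */ 0, 0,
                        /* lp3, rp3 */ 0, 0);
}
```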

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt:

```diff
@@ -433,15 +433,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             ggml-cpu/arch/riscv/quants.c
             ggml-cpu/arch/riscv/repack.cpp
             )
-        if (GGML_RVV)
-            if (GGML_XTHEADVECTOR)
-                list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
-            elseif (GGML_RV_ZFH)
-                list(APPEND ARCH_FLAGS -march=rv64gcv_zfh -mabi=lp64d)
-            else()
-                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+        set(MARCH_STR "rv64gc")
+        if (GGML_RV_ZFH)
+            string(APPEND MARCH_STR "_zfh")
+        endif()
+        if (GGML_XTHEADVECTOR)
+            string(APPEND MARCH_STR "_xtheadvector")
+        elseif (GGML_RVV)
+            string(APPEND MARCH_STR "_v")
+            if (GGML_RV_ZVFH)
+                string(APPEND MARCH_STR "_zvfh")
             endif()
         endif()
+        if (GGML_RV_ZICBOP)
+            string(APPEND MARCH_STR "_zicbop")
+        endif()
+        list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
     elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
         message(STATUS "s390x detected")
         list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
```

```diff
@@ -450,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
-            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
```

```diff
@@ -472,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
         endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-        endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
```
|