@fugood/llama.node 1.1.10 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +2 -1
- package/package.json +14 -14
- package/src/LlamaContext.cpp +17 -1
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +152 -1
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +32 -97
- package/src/llama.cpp/src/llama-kv-cache.h +3 -13
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +275 -20
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
package/lib/binding.ts
CHANGED
@@ -27,7 +27,8 @@ export type LlamaModelOptions = {
   n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
-  flash_attn?: boolean
+  flash_attn_type?: 'auto' | 'on' | 'off'
+  flash_attn?: boolean // Deprecated: use flash_attn_type instead
   cache_type_k?:
     | 'f16'
     | 'f32'
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.10",
+  "version": "1.1.11",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.10",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.10",
-    "@fugood/node-llama-linux-arm64": "1.1.10",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
-    "@fugood/node-llama-win32-x64": "1.1.10",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.10",
-    "@fugood/node-llama-win32-arm64": "1.1.10",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
-    "@fugood/node-llama-darwin-x64": "1.1.10",
-    "@fugood/node-llama-darwin-arm64": "1.1.10"
+    "@fugood/node-llama-linux-x64": "1.1.11",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.11",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.11",
+    "@fugood/node-llama-linux-arm64": "1.1.11",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.11",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.11",
+    "@fugood/node-llama-win32-x64": "1.1.11",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.11",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.11",
+    "@fugood/node-llama-win32-arm64": "1.1.11",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.11",
+    "@fugood/node-llama-darwin-x64": "1.1.11",
+    "@fugood/node-llama-darwin-arm64": "1.1.11"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -190,6 +190,15 @@ static ggml_type kv_cache_type_from_str(const std::string &s) {
   throw std::runtime_error("Unsupported cache type: " + s);
 }
 
+static enum llama_flash_attn_type flash_attn_type_from_str(const std::string &s) {
+  if (s == "on")
+    return LLAMA_FLASH_ATTN_TYPE_ENABLED;
+  if (s == "off")
+    return LLAMA_FLASH_ATTN_TYPE_DISABLED;
+  return LLAMA_FLASH_ATTN_TYPE_AUTO;
+}
+
+
 static int32_t pooling_type_from_str(const std::string &s) {
   if (s == "none")
     return LLAMA_POOLING_TYPE_NONE;
@@ -242,7 +251,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.cpuparams.n_threads =
       get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
-
+
+  auto flash_attn_type = get_option<std::string>(options, "flash_attn_type", "auto");
+  if (!flash_attn_type.empty()) {
+    params.flash_attn_type = (enum llama_flash_attn_type)flash_attn_type_from_str(flash_attn_type.c_str());
+  } else {
+    params.flash_attn_type = get_option<bool>(options, "flash_attn", false) ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+  }
+
   params.cache_type_k = kv_cache_type_from_str(
       get_option<std::string>(options, "cache_type_k", "f16").c_str());
   params.cache_type_v = kv_cache_type_from_str(
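
The option handling above maps the string `flash_attn_type` (with the deprecated boolean `flash_attn` as a fallback) onto the tri-state `llama_flash_attn_type` enum that this release picks up from upstream llama.cpp. Below is a minimal sketch of the same selection against the C API, assuming only `llama_context_default_params()` and the enum values visible in this diff; the helper name and the surrounding program are illustrative, not part of the package.

```cpp
#include <string>
#include "llama.h"

// Illustrative helper mirroring flash_attn_type_from_str() above:
// "on" / "off" select explicitly, anything else falls back to auto.
static enum llama_flash_attn_type parse_flash_attn(const std::string & s) {
    if (s == "on")  return LLAMA_FLASH_ATTN_TYPE_ENABLED;
    if (s == "off") return LLAMA_FLASH_ATTN_TYPE_DISABLED;
    return LLAMA_FLASH_ATTN_TYPE_AUTO;
}

int main() {
    llama_context_params cparams = llama_context_default_params();
    // Select Flash Attention through the new tri-state field rather than the old boolean flag.
    cparams.flash_attn_type = parse_flash_attn("auto");
    // ... load a model and create the context with cparams as usual ...
    return 0;
}
```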

package/src/llama.cpp/common/arg.cpp
CHANGED

@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");
 
     printf(" case \"$prev\" in\n");
-    printf(" --model)\n");
+    printf(" --model|-m)\n");
     printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf(" return 0\n");
     printf(" ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     }
 ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
 add_opt(common_arg(
-    {"-fa", "--flash-attn"},
-    string_format("
-    [](common_params & params) {
-
+    {"-fa", "--flash-attn"}, "FA",
+    string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+    [](common_params & params, const std::string & value) {
+        if (value == "on" || value == "enabled") {
+            params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+        } else if (value == "off" || value == "disabled") {
+            params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+        } else if (value == "auto") {
+            params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+        } else {
+            throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+        }
     }
 ).set_env("LLAMA_ARG_FLASH_ATTN"));
 add_opt(common_arg(
@@ -2555,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     {"--lora"}, "FNAME",
     "path to LoRA adapter (can be repeated to use multiple adapters)",
     [](common_params & params, const std::string & value) {
-        params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+        params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
     }
     // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2563,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     {"--lora-scaled"}, "FNAME", "SCALE",
     "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
     [](common_params & params, const std::string & fname, const std::string & scale) {
-        params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+        params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
     }
     // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
 ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
     params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
     params.port = 8012;
-    params.n_gpu_layers = 99;
-    params.flash_attn = true;
     params.n_ubatch = 1024;
     params.n_batch = 1024;
     params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
     params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
     params.port = 8012;
-    params.n_gpu_layers = 99;
-    params.flash_attn = true;
     params.n_ubatch = 1024;
     params.n_batch = 1024;
     params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
     params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
     params.port = 8012;
-    params.n_gpu_layers = 99;
-    params.flash_attn = true;
     params.n_ubatch = 1024;
     params.n_batch = 1024;
     params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
     params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
     params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-    params.speculative.n_gpu_layers = 99;
     params.port = 8012;
-    params.n_gpu_layers = 99;
-    params.flash_attn = true;
     params.n_ubatch = 1024;
     params.n_batch = 1024;
     params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
     params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
     params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-    params.speculative.n_gpu_layers = 99;
     params.port = 8012;
-    params.
-    params.
+    params.n_ubatch = 1024;
+    params.n_batch = 1024;
+    params.n_ctx = 0;
+    params.n_cache_reuse = 256;
+    }
+).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+add_opt(common_arg(
+    {"--fim-qwen-30b-default"},
+    string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+    [](common_params & params) {
+        params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+        params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+        params.port = 8012;
     params.n_ubatch = 1024;
     params.n_batch = 1024;
     params.n_ctx = 0;

package/src/llama.cpp/common/chat.cpp
CHANGED

@@ -609,6 +609,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -2045,6 +2046,94 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+    // Parse thinking tags first - this handles the main reasoning content
+    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+    static const common_regex tool_call_begin_regex("<seed:tool_call>");
+    static const common_regex tool_call_end_regex("</seed:tool_call>");
+    static const common_regex function_regex("<function=([^>]+)>");
+    static const common_regex param_regex("<parameter=([^>]+)>");
+
+    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+        builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+        // Look for function call inside tool call, ignore any content before it
+        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+            auto function_name = builder.str(func_res->groups[1]);
+
+            // Parse Seed-OSS parameters <parameter=name>value</parameter>
+            json args = json::object();
+            // Parse all parameters
+            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+                // again, ignore noise around parameters
+                auto param_name = builder.str(param_res->groups[1]);
+                builder.move_to(param_res->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after parameter
+                auto savedPos = builder.pos();
+                if (auto param_parse = builder.try_find_literal("</parameter>")) {
+                    auto param = param_parse->prelude;
+                    builder.move_to(savedPos);
+                    try {
+                        if (auto param_res = builder.try_consume_json()) {
+                            args[param_name] = param_res->json;
+                        } else {
+                            args[param_name] = param;
+                        }
+                    } catch (json::exception &) {
+                        args[param_name] = param;
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+                }
+            }
+            // Look for closing function tag
+            auto end_func = builder.try_find_literal("</function>");
+            if (end_func) {
+                builder.move_to(end_func->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after </function>
+
+                // Add the tool call with parsed arguments, but only if we REALLY got the literal
+                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+                auto funlen = std::string("</function>").length();
+                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+                    if (!builder.add_tool_call(function_name, "", args.dump())) {
+                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            // Look for closing tool call tag
+            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+                builder.move_to(end_tool->groups[0].end);
+                builder.consume_spaces(); // Consume trailing whitespace after tool call
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+        } else {
+            // No function found - don't consume content here, let it be handled at the end
+            break;
+        }
+    }
+
+    // Consume any remaining whitespace after all tool call processing
+    builder.consume_spaces();
+    auto remaining = builder.consume_rest();
+    // If there's any non-whitespace content remaining, add it as content
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
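
For orientation, the regexes and literals above define the exact tag syntax this parser expects from Seed-OSS output. A small sketch of a conforming generation follows; the function and parameter names are made up purely for illustration and are not part of the diff.

```cpp
#include <string>

// Example Seed-OSS output that common_chat_parse_seed_oss() is written to handle.
// "get_weather" and "location" are hypothetical names used only for illustration.
static const std::string sample_seed_oss_output =
    "<seed:think>The user asked about the weather, so call the tool.</seed:think>\n"
    "<seed:tool_call>\n"
    "<function=get_weather>\n"
    "<parameter=location>\"Tokyo\"</parameter>\n"
    "</function>\n"
    "</seed:tool_call>\n";

// The parser strips the <seed:think> block as reasoning content, takes the function
// name from <function=...>, and collects each <parameter=...>...</parameter> pair into
// a JSON arguments object (values are parsed as JSON where possible, else kept as text).
```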
@@ -2061,8 +2150,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     return data;
 }
 
+static common_chat_params common_chat_params_init_seed_oss(
+        const common_chat_template & tmpl,
+        templates_params & params,
+        const common_chat_templates_inputs & inputs)
+{
+    common_chat_params data;
+    data.prompt = apply(tmpl, params);
+    data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+    if (string_ends_with(data.prompt, "<seed:think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</seed:think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (params.tools.is_array() && !params.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(params.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                // Create rule for Seed-OSS function call format
+                std::string param_rules;
+                if (parameters.contains("properties")) {
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+                                       "\"</parameter>\"";
+                    }
+                }
+
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+                    param_rules +
+                    " \"</function>\" space \"</seed:tool_call>\""));
+            });
+
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+            data.preserved_tokens = {
+                "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+                "<function=", "</function>", "<parameter=", "</parameter>",
+            };
+
+            builder.add_rule("root", string_join(tool_rules, " | "));
+        });
+    }
+    return data;
+}
+
 static common_chat_params common_chat_templates_apply_jinja(
-    const struct common_chat_templates
+    const struct common_chat_templates * tmpls,
     const struct common_chat_templates_inputs & inputs)
 {
     templates_params params;
@@ -2131,6 +2274,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
+    // Seed-OSS
+    if (src.find("<seed:think>") != std::string::npos) {
+        return common_chat_params_init_seed_oss(tmpl, params, inputs);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2289,6 +2437,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_SEED_OSS:
+            common_chat_parse_seed_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }

package/src/llama.cpp/common/common.cpp
CHANGED

@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n",
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n",
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
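
The adapter block above reads two GGUF metadata keys through the new `llama_adapter_meta_val_str` accessor (added on the llama.h / llama-adapter side per the file list) into a fixed stack buffer. A hedged sketch of the same call pattern wrapped as a helper; the key strings are the ones used in this diff, the rest is illustrative.

```cpp
#include <string>
#include "llama.h"

// Wraps the buffer-based accessor used in common_init_from_params() above.
// The buffer is zero-initialized, so if the accessor writes nothing for a
// missing key (an assumption here), the result is an empty string.
static std::string adapter_meta(struct llama_adapter_lora * adapter, const char * key) {
    char buf[1024] = {0};
    llama_adapter_meta_val_str(adapter, key, buf, sizeof(buf));
    return std::string(buf);
}

// Usage, mirroring the diff:
//   la.task_name     = adapter_meta(la.ptr, "adapter.lora.task_name");
//   la.prompt_prefix = adapter_meta(la.ptr, "adapter.lora.prompt_prefix");
```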

package/src/llama.cpp/common/common.h
CHANGED

@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
 
@@ -310,6 +313,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -373,7 +377,6 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
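
Because `common_adapter_lora_info` gains two `std::string` members between `scale` and `ptr`, aggregate initializers at call sites now have to supply placeholders for them, which is exactly what the `--lora` / `--lora-scaled` changes in arg.cpp above do. A self-contained sketch of the updated layout, assuming only what this diff shows; the adapter path is a placeholder and the pointer member is simplified.

```cpp
#include <string>
#include <vector>

// Mirror of the updated common_adapter_lora_info layout from common.h:
// path, scale, task_name, prompt_prefix, ptr (simplified to void* here).
struct adapter_lora_info {
    std::string path;
    float scale;
    std::string task_name;
    std::string prompt_prefix;
    void * ptr;
};

int main() {
    std::vector<adapter_lora_info> lora_adapters;
    // Same shape as the updated arg.cpp call sites: the two new strings start out
    // empty and are filled from adapter metadata once the adapter is loaded.
    lora_adapters.push_back({ "adapter.gguf", 1.0f, "", "", nullptr });
    return 0;
}
```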

package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
CHANGED

@@ -435,7 +435,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         )
         if (GGML_RVV)
             if (GGML_XTHEADVECTOR)
-                list(APPEND ARCH_FLAGS -march=
+                list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
             elseif (GGML_RV_ZFH)
                 list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
             else()
@@ -497,9 +497,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # Fetch KleidiAI sources:
         include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.
+        set(KLEIDIAI_COMMIT_TAG "v1.13.0")
         set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5 "
+        set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
 
         if (POLICY CMP0135)
             cmake_policy(SET CMP0135 NEW)
@@ -555,6 +555,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         list(APPEND GGML_KLEIDIAI_SOURCES
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@@ -576,7 +577,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
             ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
-            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+            ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
         set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
     endif()
 

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h
CHANGED

@@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
 /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
-inline float vec_hsum(float32x4_t v) {
+inline static float vec_hsum(float32x4_t v) {
     float32x4_t v_temp = v + vec_reve(v);
     return v_temp[0] + v_temp[1];
 }