@fugood/llama.node 1.1.9 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +7 -1
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +15 -5
- package/src/LlamaCompletionWorker.cpp +12 -3
- package/src/LlamaCompletionWorker.h +3 -1
- package/src/LlamaContext.cpp +20 -2
- package/src/llama.cpp/common/arg.cpp +29 -19
- package/src/llama.cpp/common/chat.cpp +153 -3
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +10 -3
- package/src/llama.cpp/common/common.h +4 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
- package/src/llama.cpp/include/llama.h +27 -1
- package/src/llama.cpp/src/llama-adapter.cpp +68 -4
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +46 -2
- package/src/llama.cpp/src/llama-arch.h +4 -0
- package/src/llama.cpp/src/llama-context.cpp +80 -39
- package/src/llama.cpp/src/llama-context.h +0 -4
- package/src/llama.cpp/src/llama-graph.cpp +20 -10
- package/src/llama.cpp/src/llama-graph.h +2 -1
- package/src/llama.cpp/src/llama-hparams.cpp +25 -0
- package/src/llama.cpp/src/llama-hparams.h +6 -0
- package/src/llama.cpp/src/llama-impl.h +2 -0
- package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
- package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
- package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
- package/src/llama.cpp/src/llama-kv-cache.h +16 -28
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
- package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
- package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
- package/src/llama.cpp/src/llama-memory.h +8 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +302 -31
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1 -1
- package/src/llama.cpp/src/llama.cpp +12 -0
package/lib/binding.ts
CHANGED
@@ -27,7 +27,8 @@ export type LlamaModelOptions = {
   n_ubatch?: number
   n_threads?: number
   n_gpu_layers?: number
-  flash_attn?: boolean
+  flash_attn_type?: 'auto' | 'on' | 'off'
+  flash_attn?: boolean // Deprecated: use flash_attn_type instead
   cache_type_k?:
     | 'f16'
     | 'f32'
@@ -100,6 +101,11 @@ export type LlamaCompletionOptions = {
   enable_thinking?: boolean
   thinking_forced_open?: boolean
   prompt?: string
+  /**
+   * Text to prefill the response with.
+   * This text will be added to the beginning of the generated response.
+   */
+  prefill_text?: string
   temperature?: number
   top_k?: number
   top_p?: number
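Note: a minimal TypeScript sketch of the new model option, assuming LlamaModelOptions is re-exported from the package root (the option names come from the binding.ts diff above; the import path and the values are illustrative):

import type { LlamaModelOptions } from '@fugood/llama.node'

// Prefer the tri-state flash_attn_type; 'auto' lets the backend decide.
// The boolean flash_attn flag remains only for backward compatibility.
const modelOptions: Partial<LlamaModelOptions> = {
  n_gpu_layers: 99,        // illustrative value
  flash_attn_type: 'auto', // 'auto' | 'on' | 'off'
  // flash_attn: true,     // deprecated: use flash_attn_type instead
}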
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.1.9",
+  "version": "1.1.11",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -71,19 +71,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.1.9",
-    "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
-    "@fugood/node-llama-linux-x64-cuda": "1.1.9",
-    "@fugood/node-llama-linux-arm64": "1.1.9",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
-    "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
-    "@fugood/node-llama-win32-x64": "1.1.9",
-    "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
-    "@fugood/node-llama-win32-x64-cuda": "1.1.9",
-    "@fugood/node-llama-win32-arm64": "1.1.9",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
-    "@fugood/node-llama-darwin-x64": "1.1.9",
-    "@fugood/node-llama-darwin-arm64": "1.1.9"
+    "@fugood/node-llama-linux-x64": "1.1.11",
+    "@fugood/node-llama-linux-x64-vulkan": "1.1.11",
+    "@fugood/node-llama-linux-x64-cuda": "1.1.11",
+    "@fugood/node-llama-linux-arm64": "1.1.11",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.1.11",
+    "@fugood/node-llama-linux-arm64-cuda": "1.1.11",
+    "@fugood/node-llama-win32-x64": "1.1.11",
+    "@fugood/node-llama-win32-x64-vulkan": "1.1.11",
+    "@fugood/node-llama-win32-x64-cuda": "1.1.11",
+    "@fugood/node-llama-win32-arm64": "1.1.11",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.1.11",
+    "@fugood/node-llama-darwin-x64": "1.1.11",
+    "@fugood/node-llama-darwin-arm64": "1.1.11"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch
CHANGED
@@ -1,5 +1,5 @@
 diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
-index
+index 111b4a21b..16ce87672 100644
 --- a/src/llama.cpp/common/chat.cpp
 +++ b/src/llama.cpp/common/chat.cpp
 @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
     struct templates_params {
         json messages;
         json tools;
+@@ -784,8 +771,7 @@ static std::string apply(
+     if (additional_context) {
+         tmpl_inputs.extra_context.merge_patch(*additional_context);
+     }
+-    // TODO: add flag to control date/time, if only for testing purposes.
+-    // tmpl_inputs.now = std::chrono::system_clock::now();
++    tmpl_inputs.now = inputs.now;
+
+     minja::chat_template_options tmpl_opts;
+     // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
 diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
 index d1e480c91..437e64e29 100644
 --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
     struct common_chat_tool_call {
         std::string name;
 diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
-index
+index fdce1dcde..55aac3412 100644
 --- a/src/llama.cpp/common/common.cpp
 +++ b/src/llama.cpp/common/common.cpp
-@@ -
+@@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.n_gpu_layers = params.n_gpu_layers;
     }

@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
 diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
-index
+index 390dda5e5..f259ca785 100644
 --- a/src/llama.cpp/common/common.h
 +++ b/src/llama.cpp/common/common.h
-@@ -
+@@ -270,6 +270,7 @@ struct lr_opt {
     struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

     struct common_params {
package/src/LlamaCompletionWorker.cpp
CHANGED
@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
     const std::vector<std::string> &media_paths,
     const std::vector<llama_token> &guide_tokens,
     bool has_vocoder,
-    tts_type tts_type_val
+    tts_type tts_type_val,
+    const std::string &prefill_text)
     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
       _params(params), _stop_words(stop_words), _chat_format(chat_format),
       _thinking_forced_open(thinking_forced_open),
       _reasoning_format(reasoning_format),
       _media_paths(media_paths), _guide_tokens(guide_tokens),
+      _prefill_text(prefill_text),
       _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
   if (!callback.IsEmpty()) {
     _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con

   chat_syntax.parse_tool_calls = true;

+  // Combine prefill_text with generated_text for parsing
+  std::string full_text = _prefill_text + generated_text;
+
   // Use is_partial=true for streaming partial output
-  common_chat_msg parsed_msg = common_chat_parse(
+  common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);

   result.content = parsed_msg.content;
   result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
   auto embd = _sess->tokens_ptr();
   embd->reserve(embd->size() + max_len);

+
   if (is_enc_dec) {
     if (n_input > 0) {
       // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
   chat_syntax.thinking_forced_open = _thinking_forced_open;

   chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+  // Combine prefill_text with generated_text for final parsing
+  std::string full_text = _prefill_text + _result.text;
   common_chat_msg message = common_chat_parse(
-      _result.text,
+      full_text,
       false,
       chat_syntax
   );
package/src/LlamaCompletionWorker.h
CHANGED
@@ -26,7 +26,8 @@ public:
       const std::vector<std::string> &media_paths = {},
       const std::vector<llama_token> &guide_tokens = {},
       bool has_vocoder = false,
-      tts_type tts_type_val = UNKNOWN
+      tts_type tts_type_val = UNKNOWN,
+      const std::string &prefill_text = "");

   ~LlamaCompletionWorker();

@@ -58,6 +59,7 @@ private:
   std::string _reasoning_format;
   std::vector<std::string> _media_paths;
   std::vector<llama_token> _guide_tokens;
+  std::string _prefill_text;
   std::function<void()> _onComplete;
   bool _has_callback = false;
   bool _interrupted = false;
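Note: a minimal TypeScript sketch of the new completion option, assuming LlamaCompletionOptions is re-exported from the package root (option names come from the binding.ts diff above; the import path and values are illustrative). As the worker changes above show, prefill_text is prepended to the generated text before chat parsing, so the parsed content starts from the seeded prefix:

import type { LlamaCompletionOptions } from '@fugood/llama.node'

// Seed the assistant reply; generation continues after prefill_text.
const completionOptions: Partial<LlamaCompletionOptions> = {
  prompt: 'List three prime numbers.', // illustrative prompt
  prefill_text: 'Sure, here are three primes: ',
  temperature: 0.8,
}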
package/src/LlamaContext.cpp
CHANGED
@@ -190,6 +190,15 @@ static ggml_type kv_cache_type_from_str(const std::string &s) {
   throw std::runtime_error("Unsupported cache type: " + s);
 }

+static enum llama_flash_attn_type flash_attn_type_from_str(const std::string &s) {
+  if (s == "on")
+    return LLAMA_FLASH_ATTN_TYPE_ENABLED;
+  if (s == "off")
+    return LLAMA_FLASH_ATTN_TYPE_DISABLED;
+  return LLAMA_FLASH_ATTN_TYPE_AUTO;
+}
+
+
 static int32_t pooling_type_from_str(const std::string &s) {
   if (s == "none")
     return LLAMA_POOLING_TYPE_NONE;
@@ -242,7 +251,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.cpuparams.n_threads =
       get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
   params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
-
+
+  auto flash_attn_type = get_option<std::string>(options, "flash_attn_type", "auto");
+  if (!flash_attn_type.empty()) {
+    params.flash_attn_type = (enum llama_flash_attn_type)flash_attn_type_from_str(flash_attn_type.c_str());
+  } else {
+    params.flash_attn_type = get_option<bool>(options, "flash_attn", false) ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+  }
+
   params.cache_type_k = kv_cache_type_from_str(
       get_option<std::string>(options, "cache_type_k", "f16").c_str());
   params.cache_type_v = kv_cache_type_from_str(
@@ -935,6 +951,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
         json_schema_to_grammar(json::parse(json_schema_str));
   }

+  std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
   params.n_predict = get_option<int32_t>(options, "n_predict", -1);
   params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
   params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -1007,7 +1025,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
   auto *worker =
       new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
                                 chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
-                                _has_vocoder, _tts_type);
+                                _has_vocoder, _tts_type, prefill_text);
   worker->Queue();
   _wip = worker;
   worker->OnComplete([this]() { _wip = nullptr; });
package/src/llama.cpp/common/arg.cpp
CHANGED
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
     printf("\"\n\n");

     printf(" case \"$prev\" in\n");
-    printf(" --model)\n");
+    printf(" --model|-m)\n");
     printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
     printf(" return 0\n");
     printf(" ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
     add_opt(common_arg(
-        {"-fa", "--flash-attn"},
-        string_format("
-        [](common_params & params) {
-
+        {"-fa", "--flash-attn"}, "FA",
+        string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+        [](common_params & params, const std::string & value) {
+            if (value == "on" || value == "enabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+            } else if (value == "off" || value == "disabled") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            } else if (value == "auto") {
+                params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+            } else {
+                throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+            }
         }
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(common_arg(
@@ -2555,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+            params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2563,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+            params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
             params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
             params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
             params.port = 8012;
-            params.
-            params.
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-30b-default"},
+        string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+            params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+            params.port = 8012;
             params.n_ubatch = 1024;
             params.n_batch = 1024;
             params.n_ctx = 0;
package/src/llama.cpp/common/chat.cpp
CHANGED
@@ -609,6 +609,7 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -771,8 +772,7 @@ static std::string apply(
     if (additional_context) {
         tmpl_inputs.extra_context.merge_patch(*additional_context);
     }
-    // TODO: add flag to control date/time, if only for testing purposes.
-    // tmpl_inputs.now = std::chrono::system_clock::now();
+    tmpl_inputs.now = inputs.now;

     minja::chat_template_options tmpl_opts;
     // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -2046,6 +2046,94 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }

+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+    // Parse thinking tags first - this handles the main reasoning content
+    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+    static const common_regex tool_call_begin_regex("<seed:tool_call>");
+    static const common_regex tool_call_end_regex("</seed:tool_call>");
+    static const common_regex function_regex("<function=([^>]+)>");
+    static const common_regex param_regex("<parameter=([^>]+)>");
+
+    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+        builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+        // Look for function call inside tool call, ignore any content before it
+        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+            auto function_name = builder.str(func_res->groups[1]);
+
+            // Parse Seed-OSS parameters <parameter=name>value</parameter>
+            json args = json::object();
+            // Parse all parameters
+            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+                // again, ignore noise around parameters
+                auto param_name = builder.str(param_res->groups[1]);
+                builder.move_to(param_res->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after parameter
+                auto savedPos = builder.pos();
+                if (auto param_parse = builder.try_find_literal("</parameter>")) {
+                    auto param = param_parse->prelude;
+                    builder.move_to(savedPos);
+                    try {
+                        if (auto param_res = builder.try_consume_json()) {
+                            args[param_name] = param_res->json;
+                        } else {
+                            args[param_name] = param;
+                        }
+                    } catch (json::exception &) {
+                        args[param_name] = param;
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+                }
+            }
+            // Look for closing function tag
+            auto end_func = builder.try_find_literal("</function>");
+            if (end_func) {
+                builder.move_to(end_func->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after </function>
+
+                // Add the tool call with parsed arguments, but only if we REALLY got the literal
+                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+                auto funlen = std::string("</function>").length();
+                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+                    if (!builder.add_tool_call(function_name, "", args.dump())) {
+                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            // Look for closing tool call tag
+            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+                builder.move_to(end_tool->groups[0].end);
+                builder.consume_spaces(); // Consume trailing whitespace after tool call
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+        } else {
+            // No function found - don't consume content here, let it be handled at the end
+            break;
+        }
+    }
+
+    // Consume any remaining whitespace after all tool call processing
+    builder.consume_spaces();
+    auto remaining = builder.consume_rest();
+    // If there's any non-whitespace content remaining, add it as content
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
@@ -2062,8 +2150,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
     return data;
 }

+static common_chat_params common_chat_params_init_seed_oss(
+    const common_chat_template & tmpl,
+    templates_params & params,
+    const common_chat_templates_inputs & inputs)
+{
+    common_chat_params data;
+    data.prompt = apply(tmpl, params);
+    data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+    if (string_ends_with(data.prompt, "<seed:think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</seed:think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (params.tools.is_array() && !params.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(params.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                // Create rule for Seed-OSS function call format
+                std::string param_rules;
+                if (parameters.contains("properties")) {
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+                            "\"</parameter>\"";
+                    }
+                }
+
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+                    param_rules +
+                    " \"</function>\" space \"</seed:tool_call>\""));
+            });
+
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+            data.preserved_tokens = {
+                "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+                "<function=", "</function>", "<parameter=", "</parameter>",
+            };
+
+            builder.add_rule("root", string_join(tool_rules, " | "));
+        });
+    }
+    return data;
+}
+
 static common_chat_params common_chat_templates_apply_jinja(
-    const struct common_chat_templates
+    const struct common_chat_templates * tmpls,
     const struct common_chat_templates_inputs & inputs)
 {
     templates_params params;
@@ -2132,6 +2274,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_gpt_oss(tmpl, params);
     }

+    // Seed-OSS
+    if (src.find("<seed:think>") != std::string::npos) {
+        return common_chat_params_init_seed_oss(tmpl, params, inputs);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2290,6 +2437,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_SEED_OSS:
+            common_chat_parse_seed_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {

     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n",
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         return iparams;
     }

@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {

     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n",
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }

+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }

@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
package/src/llama.cpp/common/common.h
CHANGED
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;

+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };

@@ -310,6 +313,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -373,7 +377,6 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)