@fugood/llama.node 1.1.9 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/lib/binding.ts +7 -1
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +15 -5
  4. package/src/LlamaCompletionWorker.cpp +12 -3
  5. package/src/LlamaCompletionWorker.h +3 -1
  6. package/src/LlamaContext.cpp +20 -2
  7. package/src/llama.cpp/common/arg.cpp +29 -19
  8. package/src/llama.cpp/common/chat.cpp +153 -3
  9. package/src/llama.cpp/common/chat.h +1 -0
  10. package/src/llama.cpp/common/common.cpp +10 -3
  11. package/src/llama.cpp/common/common.h +4 -1
  12. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  19. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  20. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  21. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  23. package/src/llama.cpp/include/llama.h +27 -1
  24. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  25. package/src/llama.cpp/src/llama-adapter.h +3 -0
  26. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  27. package/src/llama.cpp/src/llama-arch.h +4 -0
  28. package/src/llama.cpp/src/llama-context.cpp +80 -39
  29. package/src/llama.cpp/src/llama-context.h +0 -4
  30. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  31. package/src/llama.cpp/src/llama-graph.h +2 -1
  32. package/src/llama.cpp/src/llama-hparams.cpp +25 -0
  33. package/src/llama.cpp/src/llama-hparams.h +6 -0
  34. package/src/llama.cpp/src/llama-impl.h +2 -0
  35. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +24 -7
  36. package/src/llama.cpp/src/llama-kv-cache-iswa.h +4 -2
  37. package/src/llama.cpp/src/llama-kv-cache.cpp +67 -130
  38. package/src/llama.cpp/src/llama-kv-cache.h +16 -28
  39. package/src/llama.cpp/src/llama-memory-hybrid.cpp +29 -28
  40. package/src/llama.cpp/src/llama-memory-hybrid.h +18 -22
  41. package/src/llama.cpp/src/llama-memory-recurrent.cpp +7 -7
  42. package/src/llama.cpp/src/llama-memory-recurrent.h +7 -11
  43. package/src/llama.cpp/src/llama-memory.h +8 -0
  44. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  45. package/src/llama.cpp/src/llama-model.cpp +302 -31
  46. package/src/llama.cpp/src/llama-model.h +1 -0
  47. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  48. package/src/llama.cpp/src/llama.cpp +12 -0
package/lib/binding.ts CHANGED
@@ -27,7 +27,8 @@ export type LlamaModelOptions = {
  n_ubatch?: number
  n_threads?: number
  n_gpu_layers?: number
- flash_attn?: boolean
+ flash_attn_type?: 'auto' | 'on' | 'off'
+ flash_attn?: boolean // Deprecated: use flash_attn_type instead
  cache_type_k?:
  | 'f16'
  | 'f32'
@@ -100,6 +101,11 @@ export type LlamaCompletionOptions = {
  enable_thinking?: boolean
  thinking_forced_open?: boolean
  prompt?: string
+ /**
+ * Text to prefill the response with.
+ * This text will be added to the beginning of the generated response.
+ */
+ prefill_text?: string
  temperature?: number
  top_k?: number
  top_p?: number
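For reference, a minimal TypeScript sketch of how the two new option fields above might be used; only fields visible in this diff are assumed, and the call sites that consume these objects are omitted.

// Model options: flash_attn_type supersedes the deprecated boolean flash_attn.
const modelOptions = {
  n_gpu_layers: -1,
  flash_attn_type: 'auto' as const, // 'auto' | 'on' | 'off'
}

// Completion options: prefill_text is prepended to the generated response
// before chat-format parsing (see the LlamaCompletionWorker hunks below).
const completionOptions = {
  prompt: 'Write a haiku about autumn.',
  prefill_text: 'Sure, here it is:\n',
  temperature: 0.8,
}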
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.9",
+ "version": "1.1.11",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.9",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.9",
- "@fugood/node-llama-linux-x64-cuda": "1.1.9",
- "@fugood/node-llama-linux-arm64": "1.1.9",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.9",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.9",
- "@fugood/node-llama-win32-x64": "1.1.9",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.9",
- "@fugood/node-llama-win32-x64-cuda": "1.1.9",
- "@fugood/node-llama-win32-arm64": "1.1.9",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.9",
- "@fugood/node-llama-darwin-x64": "1.1.9",
- "@fugood/node-llama-darwin-arm64": "1.1.9"
+ "@fugood/node-llama-linux-x64": "1.1.11",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.11",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.11",
+ "@fugood/node-llama-linux-arm64": "1.1.11",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.11",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.11",
+ "@fugood/node-llama-win32-x64": "1.1.11",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.11",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.11",
+ "@fugood/node-llama-win32-arm64": "1.1.11",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.11",
+ "@fugood/node-llama-darwin-x64": "1.1.11",
+ "@fugood/node-llama-darwin-arm64": "1.1.11"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -1,5 +1,5 @@
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 23d3828f9..ca48af00c 100644
+ index 111b4a21b..16ce87672 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -6,9 +6,6 @@
@@ -29,6 +29,16 @@ index 23d3828f9..ca48af00c 100644
  struct templates_params {
  json messages;
  json tools;
+ @@ -784,8 +771,7 @@ static std::string apply(
+ if (additional_context) {
+ tmpl_inputs.extra_context.merge_patch(*additional_context);
+ }
+ - // TODO: add flag to control date/time, if only for testing purposes.
+ - // tmpl_inputs.now = std::chrono::system_clock::now();
+ + tmpl_inputs.now = inputs.now;
+
+ minja::chat_template_options tmpl_opts;
+ // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
  index d1e480c91..437e64e29 100644
  --- a/src/llama.cpp/common/chat.h
@@ -54,10 +64,10 @@ index d1e480c91..437e64e29 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index 67dd5404f..909a97c66 100644
+ index fdce1dcde..55aac3412 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1117,6 +1117,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ @@ -1103,6 +1103,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }

@@ -66,10 +76,10 @@ index 67dd5404f..909a97c66 100644
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 75596e6b3..0e04694c8 100644
+ index 390dda5e5..f259ca785 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
- @@ -267,6 +267,7 @@ struct lr_opt {
+ @@ -270,6 +270,7 @@ struct lr_opt {
  struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

  struct common_params {
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -35,12 +35,14 @@ LlamaCompletionWorker::LlamaCompletionWorker(
  const std::vector<std::string> &media_paths,
  const std::vector<llama_token> &guide_tokens,
  bool has_vocoder,
- tts_type tts_type_val)
+ tts_type tts_type_val,
+ const std::string &prefill_text)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words), _chat_format(chat_format),
  _thinking_forced_open(thinking_forced_open),
  _reasoning_format(reasoning_format),
  _media_paths(media_paths), _guide_tokens(guide_tokens),
+ _prefill_text(prefill_text),
  _has_vocoder(has_vocoder), _tts_type(tts_type_val) {
  if (!callback.IsEmpty()) {
  _tsfn = Napi::ThreadSafeFunction::New(info.Env(), callback,
@@ -68,8 +70,11 @@ LlamaCompletionWorker::PartialOutput LlamaCompletionWorker::getPartialOutput(con

  chat_syntax.parse_tool_calls = true;

+ // Combine prefill_text with generated_text for parsing
+ std::string full_text = _prefill_text + generated_text;
+
  // Use is_partial=true for streaming partial output
- common_chat_msg parsed_msg = common_chat_parse(generated_text, true, chat_syntax);
+ common_chat_msg parsed_msg = common_chat_parse(full_text, true, chat_syntax);

  result.content = parsed_msg.content;
  result.reasoning_content = parsed_msg.reasoning_content;
@@ -156,6 +161,7 @@ void LlamaCompletionWorker::Execute() {
  auto embd = _sess->tokens_ptr();
  embd->reserve(embd->size() + max_len);

+
  if (is_enc_dec) {
  if (n_input > 0) {
  // Decode tokens in batches using n_batch as chunk size
@@ -378,8 +384,11 @@ void LlamaCompletionWorker::OnOK() {
  chat_syntax.thinking_forced_open = _thinking_forced_open;

  chat_syntax.reasoning_format = common_reasoning_format_from_name(_reasoning_format);
+
+ // Combine prefill_text with generated_text for final parsing
+ std::string full_text = _prefill_text + _result.text;
  common_chat_msg message = common_chat_parse(
- _result.text,
+ full_text,
  false,
  chat_syntax
  );
package/src/LlamaCompletionWorker.h CHANGED
@@ -26,7 +26,8 @@ public:
  const std::vector<std::string> &media_paths = {},
  const std::vector<llama_token> &guide_tokens = {},
  bool has_vocoder = false,
- tts_type tts_type_val = UNKNOWN);
+ tts_type tts_type_val = UNKNOWN,
+ const std::string &prefill_text = "");

  ~LlamaCompletionWorker();

@@ -58,6 +59,7 @@ private:
  std::string _reasoning_format;
  std::vector<std::string> _media_paths;
  std::vector<llama_token> _guide_tokens;
+ std::string _prefill_text;
  std::function<void()> _onComplete;
  bool _has_callback = false;
  bool _interrupted = false;
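The worker changes above plumb prefill_text into both the streaming (getPartialOutput) and final (OnOK) parse paths, so the chat parser always sees the prefill plus the generated text. A trivial TypeScript restatement of that concatenation, illustrative only and not part of the package API:

// Mirrors full_text = _prefill_text + generated_text in LlamaCompletionWorker.
function textForParsing(prefillText: string, generatedText: string): string {
  return prefillText + generatedText
}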
package/src/LlamaContext.cpp CHANGED
@@ -190,6 +190,15 @@ static ggml_type kv_cache_type_from_str(const std::string &s) {
  throw std::runtime_error("Unsupported cache type: " + s);
  }

+ static enum llama_flash_attn_type flash_attn_type_from_str(const std::string &s) {
+ if (s == "on")
+ return LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ if (s == "off")
+ return LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ return LLAMA_FLASH_ATTN_TYPE_AUTO;
+ }
+
+
  static int32_t pooling_type_from_str(const std::string &s) {
  if (s == "none")
  return LLAMA_POOLING_TYPE_NONE;
@@ -242,7 +251,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.cpuparams.n_threads =
  get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
  params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
- params.flash_attn = get_option<bool>(options, "flash_attn", false);
+
+ auto flash_attn_type = get_option<std::string>(options, "flash_attn_type", "auto");
+ if (!flash_attn_type.empty()) {
+ params.flash_attn_type = (enum llama_flash_attn_type)flash_attn_type_from_str(flash_attn_type.c_str());
+ } else {
+ params.flash_attn_type = get_option<bool>(options, "flash_attn", false) ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ }
+
  params.cache_type_k = kv_cache_type_from_str(
  get_option<std::string>(options, "cache_type_k", "f16").c_str());
  params.cache_type_v = kv_cache_type_from_str(
@@ -935,6 +951,8 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  json_schema_to_grammar(json::parse(json_schema_str));
  }

+ std::string prefill_text = get_option<std::string>(options, "prefill_text", "");
+
  params.n_predict = get_option<int32_t>(options, "n_predict", -1);
  params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
  params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
@@ -1007,7 +1025,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  auto *worker =
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words,
  chat_format, thinking_forced_open, reasoning_format, media_paths, guide_tokens,
- _has_vocoder, _tts_type);
+ _has_vocoder, _tts_type, prefill_text);
  worker->Queue();
  _wip = worker;
  worker->OnComplete([this]() { _wip = nullptr; });
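A compact TypeScript restatement of the option resolution added in LlamaContext.cpp above; the helper name is made up, but the mapping follows flash_attn_type_from_str ('on' → enabled, 'off' → disabled, anything else → auto), with the deprecated boolean only consulted when flash_attn_type resolves to an empty string.

type FlashAttnType = 'auto' | 'on' | 'off'

function resolveFlashAttn(flashAttnType: string = 'auto', legacyFlashAttn = false): FlashAttnType {
  if (flashAttnType !== '') {
    // flash_attn_type_from_str: 'on' and 'off' are explicit, everything else is auto.
    return flashAttnType === 'on' || flashAttnType === 'off' ? flashAttnType : 'auto'
  }
  // Deprecated flash_attn fallback, as in the else branch above.
  return legacyFlashAttn ? 'on' : 'off'
}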
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
  printf("\"\n\n");

  printf(" case \"$prev\" in\n");
- printf(" --model)\n");
+ printf(" --model|-m)\n");
  printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
  printf(" return 0\n");
  printf(" ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
  add_opt(common_arg(
- {"-fa", "--flash-attn"},
- string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
- [](common_params & params) {
- params.flash_attn = true;
+ {"-fa", "--flash-attn"}, "FA",
+ string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+ [](common_params & params, const std::string & value) {
+ if (value == "on" || value == "enabled") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ } else if (value == "off" || value == "disabled") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ } else if (value == "auto") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+ } else {
+ throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ }
  }
  ).set_env("LLAMA_ARG_FLASH_ATTN"));
  add_opt(common_arg(
@@ -2555,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora"}, "FNAME",
  "path to LoRA adapter (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+ params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2563,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora-scaled"}, "FNAME", "SCALE",
  "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+ params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-30b-default"},
+ string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+ params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+ params.port = 8012;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -609,6 +609,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
  case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
  case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+ case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
  default:
  throw std::runtime_error("Unknown chat format");
  }
@@ -771,8 +772,7 @@ static std::string apply(
  if (additional_context) {
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
- // TODO: add flag to control date/time, if only for testing purposes.
- // tmpl_inputs.now = std::chrono::system_clock::now();
+ tmpl_inputs.now = inputs.now;

  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
@@ -2046,6 +2046,94 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
  }
  }

+ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+ // Parse thinking tags first - this handles the main reasoning content
+ builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+ static const common_regex tool_call_begin_regex("<seed:tool_call>");
+ static const common_regex tool_call_end_regex("</seed:tool_call>");
+ static const common_regex function_regex("<function=([^>]+)>");
+ static const common_regex param_regex("<parameter=([^>]+)>");
+
+ while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+ builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+ // Look for function call inside tool call, ignore any content before it
+ if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+ auto function_name = builder.str(func_res->groups[1]);
+
+ // Parse Seed-OSS parameters <parameter=name>value</parameter>
+ json args = json::object();
+ // Parse all parameters
+ while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+ // again, ignore noise around parameters
+ auto param_name = builder.str(param_res->groups[1]);
+ builder.move_to(param_res->groups[0].end);
+ builder.consume_spaces(); // Consume whitespace after parameter
+ auto savedPos = builder.pos();
+ if (auto param_parse = builder.try_find_literal("</parameter>")) {
+ auto param = param_parse->prelude;
+ builder.move_to(savedPos);
+ try {
+ if (auto param_res = builder.try_consume_json()) {
+ args[param_name] = param_res->json;
+ } else {
+ args[param_name] = param;
+ }
+ } catch (json::exception &) {
+ args[param_name] = param;
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool parameter");
+ }
+ }
+ // Look for closing function tag
+ auto end_func = builder.try_find_literal("</function>");
+ if (end_func) {
+ builder.move_to(end_func->groups[0].end);
+ builder.consume_spaces(); // Consume whitespace after </function>
+
+ // Add the tool call with parsed arguments, but only if we REALLY got the literal
+ auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+ auto funlen = std::string("</function>").length();
+ if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+ if (!builder.add_tool_call(function_name, "", args.dump())) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ // Look for closing tool call tag
+ if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+ builder.move_to(end_tool->groups[0].end);
+ builder.consume_spaces(); // Consume trailing whitespace after tool call
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ // No function found - don't consume content here, let it be handled at the end
+ break;
+ }
+ }
+
+ // Consume any remaining whitespace after all tool call processing
+ builder.consume_spaces();
+ auto remaining = builder.consume_rest();
+ // If there's any non-whitespace content remaining, add it as content
+ if (!string_strip(remaining).empty()) {
+ builder.add_content(remaining);
+ }
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2062,8 +2150,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
  return data;
  }

+ static common_chat_params common_chat_params_init_seed_oss(
+ const common_chat_template & tmpl,
+ templates_params & params,
+ const common_chat_templates_inputs & inputs)
+ {
+ common_chat_params data;
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+ if (string_ends_with(data.prompt, "<seed:think>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</seed:think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (params.tools.is_array() && !params.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(params.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ // Create rule for Seed-OSS function call format
+ std::string param_rules;
+ if (parameters.contains("properties")) {
+ for (const auto & [key, value] : parameters.at("properties").items()) {
+ param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+ "\"</parameter>\"";
+ }
+ }
+
+ tool_rules.push_back(builder.add_rule(name + "-call",
+ "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+ param_rules +
+ " \"</function>\" space \"</seed:tool_call>\""));
+ });
+
+ data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+ data.preserved_tokens = {
+ "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+ "<function=", "</function>", "<parameter=", "</parameter>",
+ };
+
+ builder.add_rule("root", string_join(tool_rules, " | "));
+ });
+ }
+ return data;
+ }
+
  static common_chat_params common_chat_templates_apply_jinja(
- const struct common_chat_templates * tmpls,
+ const struct common_chat_templates * tmpls,
  const struct common_chat_templates_inputs & inputs)
  {
  templates_params params;
@@ -2132,6 +2274,11 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_gpt_oss(tmpl, params);
  }

+ // Seed-OSS
+ if (src.find("<seed:think>") != std::string::npos) {
+ return common_chat_params_init_seed_oss(tmpl, params, inputs);
+ }
+
  // Use generic handler when mixing tools + JSON schema.
  // TODO: support that mix in handlers below.
  if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2290,6 +2437,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_GPT_OSS:
  common_chat_parse_gpt_oss(builder);
  break;
+ case COMMON_CHAT_FORMAT_SEED_OSS:
+ common_chat_parse_seed_oss(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
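For orientation, a made-up Seed-OSS style response using the tags handled by common_chat_parse_seed_oss above; the expected extraction sketched in the trailing comment is conceptual, not output copied from the parser.

// A hypothetical model response in the Seed-OSS tag layout (TypeScript sample data).
const seedOssSample = [
  '<seed:think>Need the weather, so call get_weather.</seed:think>',
  '<seed:tool_call>',
  '<function=get_weather>',
  '<parameter=city>Berlin</parameter>',
  '</function>',
  '</seed:tool_call>',
].join('\n')

// Conceptually, the parser yields:
//   reasoning_content: 'Need the weather, so call get_weather.'
//   tool_calls: [{ name: 'get_weather', arguments: '{"city":"Berlin"}' }]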
package/src/llama.cpp/common/chat.h CHANGED
@@ -122,6 +122,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_COMMAND_R7B,
  COMMON_CHAT_FORMAT_GRANITE,
  COMMON_CHAT_FORMAT_GPT_OSS,
+ COMMON_CHAT_FORMAT_SEED_OSS,

  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
  };
package/src/llama.cpp/common/common.cpp CHANGED
@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {

  llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
  if (model == NULL) {
- LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+ LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+ __func__, params.model.path.c_str());
  return iparams;
  }

@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {

  llama_context * lctx = llama_init_from_model(model, cparams);
  if (lctx == NULL) {
- LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+ LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+ __func__, params.model.path.c_str());
  llama_model_free(model);
  return iparams;
  }
@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

+ char buf[1024];
  la.ptr = lora.get();
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+ la.task_name = buf;
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+ la.prompt_prefix = buf;
  iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }

@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
  cparams.pooling_type = params.pooling_type;
  cparams.attention_type = params.attention_type;
+ cparams.flash_attn_type = params.flash_attn_type;
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
- cparams.flash_attn = params.flash_attn;
  cparams.no_perf = params.no_perf;
  cparams.op_offload = !params.no_op_offload;
  cparams.swa_full = params.swa_full;
package/src/llama.cpp/common/common.h CHANGED
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
  std::string path;
  float scale;

+ std::string task_name;
+ std::string prompt_prefix;
+
  struct llama_adapter_lora * ptr;
  };

@@ -310,6 +313,7 @@ struct common_params {
  enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+ enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
  struct common_params_sampling sampling;
  struct common_params_speculative speculative;
@@ -373,7 +377,6 @@ struct common_params {
  bool multiline_input = false; // reverse the usage of `\`
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
- bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
  bool ctx_shift = false; // context shift on infinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
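After the common.h change above, each LoRA adapter entry carries two extra metadata strings, populated in common.cpp from the GGUF keys shown in the earlier hunk. A TypeScript sketch of the resulting shape; the field names come from the diff, but the interface itself is illustrative and not an exported type.

interface AdapterLoraInfo {
  path: string
  scale: number
  task_name: string     // read via llama_adapter_meta_val_str("adapter.lora.task_name")
  prompt_prefix: string // read via llama_adapter_meta_val_str("adapter.lora.prompt_prefix")
}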
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -1,5 +1,5 @@
  cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
- project("ggml" C CXX)
+ project("ggml" C CXX ASM)
  include(CheckIncludeFileCXX)

  set(CMAKE_EXPORT_COMPILE_COMMANDS ON)