@fugood/llama.node 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/lib/index.js +26 -20
  19. package/lib/index.ts +32 -28
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +14 -0
  22. package/src/LlamaContext.cpp +13 -4
  23. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  27. package/src/llama.cpp/common/arg.cpp +180 -3
  28. package/src/llama.cpp/common/chat-template.hpp +21 -7
  29. package/src/llama.cpp/common/chat.cpp +220 -101
  30. package/src/llama.cpp/common/chat.hpp +3 -0
  31. package/src/llama.cpp/common/common.h +15 -7
  32. package/src/llama.cpp/common/llguidance.cpp +3 -3
  33. package/src/llama.cpp/common/log.cpp +1 -0
  34. package/src/llama.cpp/common/log.h +2 -1
  35. package/src/llama.cpp/common/minja.hpp +24 -9
  36. package/src/llama.cpp/common/sampling.cpp +52 -46
  37. package/src/llama.cpp/common/speculative.h +1 -1
  38. package/src/llama.cpp/docs/build.md +2 -2
  39. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  40. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  43. package/src/llama.cpp/examples/run/run.cpp +5 -12
  44. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/server/httplib.h +381 -292
  46. package/src/llama.cpp/examples/server/server.cpp +58 -47
  47. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  48. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  49. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  50. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  51. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  52. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  55. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  58. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  59. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  60. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  61. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  62. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  63. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  64. package/src/llama.cpp/include/llama.h +14 -10
  65. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  66. package/src/llama.cpp/src/llama-grammar.h +1 -1
  67. package/src/llama.cpp/src/llama-impl.h +6 -6
  68. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  69. package/src/llama.cpp/src/llama-mmap.h +1 -0
  70. package/src/llama.cpp/src/llama-model.cpp +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  72. package/src/llama.cpp/src/llama.cpp +7 -5
  73. package/src/llama.cpp/src/unicode.cpp +9 -2
  74. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  75. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  76. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  77. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
package/src/llama.cpp/src/llama.cpp
@@ -8801,12 +8801,14 @@ static int llama_decode_impl(
     //llama_synchronize(&lctx);
 
     // decide if we need to defrag the kv cache
-    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
-        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
+    if (cparams.causal_attn && cparams.defrag_thold > 0.0f) {
+        // - do not defrag small contexts (i.e. < 2048 tokens)
+        // - count the padding towards the number of used tokens
+        const float fragmentation = kv_self.n >= 2048 ? std::max(0.0f, 1.0f - float(kv_self.used + llama_kv_cache_get_padding(cparams))/float(kv_self.n)) : 0.0f;
 
         // queue defragmentation for next llama_kv_cache_update
         if (fragmentation > cparams.defrag_thold) {
-            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+            LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
 
             llama_kv_cache_defrag(kv_self);
         }
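Note on the hunk above: the defrag trigger is now gated on a strictly positive threshold, skips contexts smaller than 2048 cells, counts KV-cache padding as used, and clamps the metric at zero. A minimal standalone sketch of that check (kv_fragmentation and should_defrag are illustrative helpers, not llama.cpp API; the integer arguments stand in for kv_self.n, kv_self.used and llama_kv_cache_get_padding(cparams)):

// Illustrative only: restates the defrag gate from the hunk above.
#include <algorithm>
#include <cstdint>

static float kv_fragmentation(uint32_t n_cells, uint32_t n_used, uint32_t padding) {
    if (n_cells < 2048) {
        return 0.0f; // small contexts are never defragmented
    }
    // padding counts towards "used"; the result is clamped at 0
    return std::max(0.0f, 1.0f - float(n_used + padding) / float(n_cells));
}

static bool should_defrag(float defrag_thold, uint32_t n_cells, uint32_t n_used, uint32_t padding) {
    // a threshold <= 0 now disables defragmentation entirely (previously >= 0.0f was enough to enable it)
    return defrag_thold > 0.0f && kv_fragmentation(n_cells, n_used, padding) > defrag_thold;
}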
@@ -9428,8 +9430,6 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
-    llama_model * model = new llama_model(params);
-
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -9447,6 +9447,8 @@ static struct llama_model * llama_model_load_from_file_impl(
         };
     }
 
+    llama_model * model = new llama_model(params);
+
     // create list of devices to use with this model
     if (params.devices) {
         for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
package/src/llama.cpp/src/unicode.cpp
@@ -618,7 +618,14 @@ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
     result.reserve(utf8.size());
     size_t offset = 0;
     while (offset < utf8.size()) {
-        result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        try {
+            result.push_back(unicode_cpt_from_utf8(utf8, offset));
+        }
+        catch (const std::invalid_argument & /*ex*/) {
+            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
+            ++offset;
+            result.emplace_back(0xFFFD); // replacement character
+        }
     }
     return result;
 }
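Note on the hunk above: malformed UTF-8 no longer throws out of unicode_cpts_from_utf8 (and therefore out of llama_tokenize); each offending byte is skipped and U+FFFD is emitted instead. A self-contained sketch of the same fallback pattern; toy_decode is a deliberately simplified stand-in for unicode_cpt_from_utf8 (it accepts ASCII only), but the catch branch mirrors the diff:

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// Toy single-byte decoder: returns the codepoint at `offset`, advances `offset`,
// throws std::invalid_argument on anything outside ASCII.
static uint32_t toy_decode(const std::string & s, size_t & offset) {
    unsigned char c = s[offset];
    if (c >= 0x80) {
        throw std::invalid_argument("invalid byte");
    }
    ++offset;
    return c;
}

int main() {
    const std::string input = "ab\xFFz"; // contains one invalid byte
    std::vector<uint32_t> cpts;
    size_t offset = 0;
    while (offset < input.size()) {
        try {
            cpts.push_back(toy_decode(input, offset));
        } catch (const std::invalid_argument &) {
            ++offset;               // skip the offending byte
            cpts.push_back(0xFFFD); // U+FFFD REPLACEMENT CHARACTER
        }
    }
    for (uint32_t cpt : cpts) {
        printf("U+%04X\n", cpt);    // prints U+0061, U+0062, U+FFFD, U+007A
    }
    return 0;
}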
@@ -701,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);
 
     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories
package/src/llama.cpp/tests/test-backend-ops.cpp
@@ -1254,7 +1254,7 @@ struct test_count_equal : public test_case {
         ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_name(b, "b");
 
-        ggml_tensor * b_argmax = ggml_argmax(ctx, a);
+        ggml_tensor * b_argmax = ggml_argmax(ctx, b);
         ggml_set_name(b_argmax, "b_argmax");
 
         ggml_tensor * out = ggml_count_equal(ctx, a_argmax, b_argmax);
@@ -1511,6 +1511,7 @@ struct test_cont : public test_case {
 };
 
 // GGML_OP_ADD
+// GGML_OP_SUB
 // GGML_OP_MUL
 // GGML_OP_DIV
 struct test_bin_bcast : public test_case {
@@ -3860,7 +3861,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    test_cases.emplace_back(new test_count_equal());
+    test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1}));
+    test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
 
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 1, 1, 1}));
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {100, 10, 1, 1}));
@@ -3885,8 +3887,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 2, 1, 1}, view));
         test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 2, 1}, view));
         test_cases.emplace_back(new test_repeat_back(GGML_TYPE_F32, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
-        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I32, {8, 6, 4, 2}, {2, 1, 1, 1}, view));
-        test_cases.emplace_back(new test_repeat_back(GGML_TYPE_I16, {8, 6, 4, 2}, {1, 1, 1, 2}, view));
     }
 
     test_cases.emplace_back(new test_dup(GGML_TYPE_F32));
@@ -3938,7 +3938,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
 
     auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
-        for (auto op : {ggml_add, ggml_mul, ggml_div}) {
+        for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {
            test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr));
        }
    };
package/src/llama.cpp/tests/test-chat.cpp
@@ -24,7 +24,10 @@ static common_chat_msg msg_from_json(const json & message) {
         ret.content = message.at("content");
     }
     if (message.contains("tool_plan")) {
-        ret.tool_plan = message.at("tool_plan");
+        ret.reasoning_content = message.at("tool_plan");
+    }
+    if (message.contains("reasoning_content")) {
+        ret.reasoning_content = message.at("reasoning_content");
     }
     auto has_tool_calls = message.contains("tool_calls");
     if (has_tool_calls) {
@@ -105,6 +108,7 @@ static std::string dump(const json & j) {
 static void assert_msg_equals(const common_chat_msg & expected, const common_chat_msg & actual) {
     assert_equals(expected.role, actual.role);
     assert_equals(expected.content, actual.content);
+    assert_equals(expected.reasoning_content, actual.reasoning_content);
     assert_equals(expected.tool_calls.size(), actual.tool_calls.size());
     for (size_t i = 0; i < expected.tool_calls.size(); i++) {
         const auto & expected_tool_call = expected.tool_calls[i];
@@ -176,13 +180,15 @@ struct delta_data {
 
 static delta_data init_delta(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens,
                              const json & user_message, const json & delta_message, const json & tools,
-                             const json & tool_choice) {
+                             const json & tool_choice,
+                             bool think = false) {
     common_chat_inputs inputs;
     inputs.parallel_tool_calls = true;
     inputs.messages = json::array();
     inputs.messages.push_back(user_message);
     inputs.tools = tools;
     inputs.tool_choice = tool_choice;
+    inputs.extract_reasoning = think;
     auto params_prefix = common_chat_params_init(tmpl, inputs);
 
     inputs.messages.push_back(delta_message);
@@ -192,17 +198,24 @@ static delta_data init_delta(const common_chat_template & tmpl, const std::vecto
     std::string prefix = params_prefix.prompt;
     std::string full = params_full.prompt;
 
-    // Check full starts with prefix
-    if (full.find(prefix) != 0) {
-        fprintf(stderr, "Full:\n%s\n\nPrefix:\n%s\n\n", full.c_str(), prefix.c_str());
-        throw std::runtime_error("Full message does not start with prefix");
-    }
-
     if (full == prefix) {
         throw std::runtime_error("Full message is the same as the prefix");
     }
 
-    auto delta = full.substr(prefix.size());
+    size_t common_prefix_length = 0;
+    for (size_t i = 0; i < prefix.size() && i < full.size(); ++i) {
+        if (prefix[i] != full[i]) {
+            break;
+        }
+        if (prefix[i] == '<') {
+            // DeepSeek R1's template (as of 20250209) adds a trailing <think> if add_generation_prompt,
+            // but it removes thinking tags for past messages.
+            // The prefix and full strings diverge at <think> vs. <|tool▁calls▁begin|>, we avoid consuming the leading <.
+            continue;
+        }
+        common_prefix_length = i + 1;
+    }
+    auto delta = full.substr(common_prefix_length);
 
     // Strip end tokens
     for (const auto & end_token : end_tokens) {
@@ -223,7 +236,9 @@ static delta_data init_delta(const common_chat_template & tmpl, const std::vecto
  */
 static void test_template(const common_chat_template & tmpl, const std::vector<std::string> & end_tokens,
                           const json & test_message, const json & tools = {}, const std::string & expected_delta = "",
-                          bool expect_grammar_triggered = true) {
+                          bool expect_grammar_triggered = true,
+                          bool test_grammar_if_triggered = true,
+                          bool think = false) {
     common_chat_msg expected_msg = msg_from_json(test_message);
 
     auto user_message = json{
@@ -232,7 +247,7 @@ static void test_template(const common_chat_template & tmpl, const std::vector<s
     };
 
     for (const auto & tool_choice : json({ "auto", "required" })) {
-        auto data = init_delta(tmpl, end_tokens, user_message, test_message, tools, tool_choice);
+        auto data = init_delta(tmpl, end_tokens, user_message, test_message, tools, tool_choice, think);
         if (!expected_delta.empty()) {
             assert_equals(expected_delta, data.delta);
         }
@@ -274,7 +289,7 @@ static void test_template(const common_chat_template & tmpl, const std::vector<s
             assert_equals(expect_grammar_triggered, grammar_triggered);
         }
 
-        if (grammar_triggered && !match_string(constrained, grammar.get())) {
+        if (grammar_triggered && test_grammar_if_triggered && !match_string(constrained, grammar.get())) {
            throw std::runtime_error("Failed to match delta against grammar:\n\n" + data.delta +
                                     "\n\nGrammar: " + data.params.grammar);
        }
@@ -283,16 +298,33 @@ static void test_template(const common_chat_template & tmpl, const std::vector<s
 }
 
 static void test_template_output_parsers() {
-    json text_message {
+    json message_user {
+        { "role", "user" },
+        { "content", "Hey there!" },
+    };
+    json message_assist {
+        { "role", "assistant" },
+        { "content", "Hello, world!\nWhat's up?" },
+    };
+    json message_assist_thoughts_unparsed_think {
+        { "role", "assistant" },
+        { "content", "<think>I'm thinking</think>Hello, world!\nWhat's up?" },
+    };
+    json message_assist_thoughts_unparsed_r7b {
+        { "role", "assistant" },
+        { "content", "<|START_THINKING|>I'm thinking<|END_THINKING|>Hello, world!\nWhat's up?" },
+    };
+    json message_assist_thoughts {
         { "role", "assistant" },
         { "content", "Hello, world!\nWhat's up?" },
+        { "reasoning_content", "I'm thinking" },
     };
     json tool_calls = json::array({{
         { "type", "function" },
         { "function", { { "name", "special_function" }, { "arguments", "{\"arg1\": 1}" } } },
     }});
 
-    json tool_call_message {
+    json message_assist_call {
         { "role", "assistant"},
         { "content", {}},
         { "tool_calls", {
@@ -305,7 +337,34 @@ static void test_template_output_parsers() {
             },
         }},
     };
-    json tool_call_message_with_id {
+    json message_assist_call_thoughts = {
+        { "role", "assistant" },
+        { "content", nullptr },
+        { "reasoning_content", "I'm\nthinking" },
+        { "tool_calls", {
+            {
+                { "type", "function" },
+                { "function", {
+                    { "name", "special_function" },
+                    { "arguments", "{\"arg1\": 1}" },
+                }},
+            },
+        }},
+    };
+    json message_assist_call_thoughts_unparsed = {
+        { "role", "assistant" },
+        { "content", "<think>I'm\nthinking</think>" },
+        { "tool_calls", {
+            {
+                { "type", "function" },
+                { "function", {
+                    { "name", "special_function" },
+                    { "arguments", "{\"arg1\": 1}" },
+                }},
+            },
+        }},
+    };
+    json message_assist_call_id {
         { "role", "assistant"},
         { "content", {}},
         { "tool_calls", {
@@ -322,10 +381,9 @@ static void test_template_output_parsers() {
         { "content", {} },
         { "tool_calls", tool_calls }
     };
-    json tool_call_plan_message_with_idx {
+    json message_assist_call_idx {
         { "role", "assistant"},
         { "content", {}},
-        { "tool_plan", "I'm not so sure"},
         { "tool_calls", {
             {
                 { "type", "function" },
@@ -341,8 +399,10 @@ static void test_template_output_parsers() {
         { "content", {} },
         { "tool_calls", tool_calls }
     };
+    json message_assist_call_tool_plan_idx = message_assist_call_idx;
+    message_assist_call_tool_plan_idx["tool_plan"] = "I'm thinking";
 
-    auto python_tool_call_message = json{
+    auto python_message_assist_call = json{
         { "role", "assistant" },
         { "content", {} },
         { "tool_calls", json{ {
@@ -357,7 +417,7 @@ static void test_template_output_parsers() {
         } },
       } } }
     };
-    auto code_interpreter_tool_call_message = json{
+    auto code_interpreter_message_assist_call = json{
         { "role", "assistant" },
         { "content", {} },
         { "tool_calls", json{ {
@@ -374,17 +434,27 @@ static void test_template_output_parsers() {
     };
 
     common_chat_inputs inputs_no_tools;
-    inputs_no_tools.messages = {
-        { { "role", "user" }, { "content", "Hey\nThere" } }
-    };
+    inputs_no_tools.messages = json::array({message_user});
+    inputs_no_tools.extract_reasoning = false;
 
-    common_chat_inputs inputs_tools = inputs_no_tools;
-    inputs_tools.tools = json::array();
-    inputs_tools.tools.push_back(special_function_tool);
+    common_chat_inputs inputs_no_tools_think;
+    inputs_no_tools_think.messages = json::array({message_user});
+    inputs_no_tools_think.extract_reasoning = true;
 
-    common_chat_inputs inputs_tools_builtin = inputs_no_tools;
-    inputs_tools_builtin.tools = json::array();
-    inputs_tools_builtin.tools.push_back(python_tool);
+    common_chat_inputs inputs_tools;
+    inputs_tools.messages = json::array({message_user});
+    inputs_tools.tools = json::array({special_function_tool});
+    inputs_tools.extract_reasoning = false;
+
+    common_chat_inputs inputs_tools_think;
+    inputs_tools_think.messages = json::array({message_user});
+    inputs_tools_think.tools = json::array({special_function_tool});
+    inputs_tools_think.extract_reasoning = true;
+
+    common_chat_inputs inputs_tools_builtin;
+    inputs_tools_builtin.messages = json::array({message_user});
+    inputs_tools_builtin.tools = json::array({python_tool});
+    inputs_tools_builtin.extract_reasoning = false;
 
     {
         // Not supported yet
@@ -395,15 +465,53 @@ static void test_template_output_parsers() {
         const common_chat_template tmpl(read_file("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"), "<s>", "</s>");
         std::vector<std::string> end_tokens{ "<|END_OF_TURN_TOKEN|>" };
 
-        assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_params_init(tmpl, inputs_no_tools).format);
-        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_tools).format);
-
-        test_template(tmpl, end_tokens, tool_call_plan_message_with_idx, tools,
-            "<|START_THINKING|>I'm not so sure<|END_THINKING|>"
+        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING, common_chat_params_init(tmpl, inputs_tools_think).format);
+
+        assert_msg_equals(msg_from_json(message_assist),
+            common_chat_parse(
+                "Hello, world!\nWhat's up?",
+                COMMON_CHAT_FORMAT_COMMAND_R7B));
+        assert_msg_equals(msg_from_json(message_assist),
+            common_chat_parse(
+                "Hello, world!\nWhat's up?<|END_RESPONSE|>",
+                COMMON_CHAT_FORMAT_COMMAND_R7B));
+        assert_msg_equals(msg_from_json(message_assist),
+            common_chat_parse(
+                "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
+                COMMON_CHAT_FORMAT_COMMAND_R7B));
+        assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_r7b),
+            common_chat_parse(
+                "<|START_THINKING|>I'm thinking<|END_THINKING|>"
+                "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
+                COMMON_CHAT_FORMAT_COMMAND_R7B));
+        assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_r7b),
+            common_chat_parse(
+                "<|START_THINKING|>I'm thinking<|END_THINKING|>"
+                "Hello, world!\nWhat's up?<|END_RESPONSE|>",
+                COMMON_CHAT_FORMAT_COMMAND_R7B));
+
+        assert_msg_equals(msg_from_json(message_assist_thoughts),
+            common_chat_parse(
+                "<|START_THINKING|>I'm thinking<|END_THINKING|>"
+                "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
+                COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING));
+
+        test_template(tmpl, end_tokens, message_assist_call_idx, tools,
+            "<|START_THINKING|><|END_THINKING|>"
             "<|START_ACTION|>[\n"
             " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
             "]<|END_ACTION|>");
-        test_template(tmpl, end_tokens, text_message, tools,
+        test_template(tmpl, end_tokens, message_assist_call_tool_plan_idx, tools,
+            "<|START_THINKING|>I'm thinking<|END_THINKING|>"
+            "<|START_ACTION|>[\n"
+            " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
+            "]<|END_ACTION|>",
+            /* expect_grammar_triggered= */ true,
+            /* test_grammar_if_triggered= */ true,
+            /* think= */ true);
+        test_template(tmpl, end_tokens, message_assist, tools,
            "<|START_RESPONSE|>Hello, world!\n"
            "What's up?<|END_RESPONSE|>",
            /* expect_grammar_triggered= */ false);
@@ -423,12 +531,12 @@ static void test_template_output_parsers() {
 
         // Generic tool calls doesn't generate / parse content-only messages symmetrically.
 
-        assert_msg_equals(msg_from_json(text_message),
+        assert_msg_equals(msg_from_json(message_assist),
            common_chat_parse("{\n"
                              " \"response\": \"Hello, world!\\nWhat's up?\"\n"
                              "}",
                              common_chat_params_init(tmpl, inputs_tools).format));
-        test_template(tmpl, end_tokens, tool_call_message_with_id, tools,
+        test_template(tmpl, end_tokens, message_assist_call_id, tools,
            "{\n"
            " \"tool_calls\": [\n"
            " {\n"
@@ -448,9 +556,9 @@ static void test_template_output_parsers() {
 
         assert_equals(COMMON_CHAT_FORMAT_MISTRAL_NEMO, common_chat_params_init(tmpl, inputs_tools).format);
 
-        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
         test_template(
-            tmpl, end_tokens, tool_call_message_with_id, tools,
+            tmpl, end_tokens, message_assist_call_id, tools,
            "[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]");
     }
     {
@@ -473,12 +581,12 @@ static void test_template_output_parsers() {
                      inputs_tools)
                .format);
 
-        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
-        test_template(tmpl, end_tokens, tool_call_message, tools,
+        test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, message_assist_call, tools,
            "<tool_call>\n"
            "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
            "</tool_call>");
-        test_template(tmpl, end_tokens, python_tool_call_message, tools,
+        test_template(tmpl, end_tokens, python_message_assist_call, tools,
            "<tool_call>\n"
            "{\"name\": \"python\", \"arguments\": {\"code\": \"print('hey')\"}}\n"
            "</tool_call>");
@@ -498,12 +606,12 @@ static void test_template_output_parsers() {
                      inputs_tools_builtin)
                .format);
 
-        // test_template(tmpl, end_tokens, text_message, tools, R"(?)", /* expect_grammar_triggered= */ false);
-        test_template(tmpl, end_tokens, code_interpreter_tool_call_message, llama_3_1_tools,
+        // test_template(tmpl, end_tokens, message_assist, tools, R"(?)", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, code_interpreter_message_assist_call, llama_3_1_tools,
            "<|python_tag|>code_interpreter.call(code=\"print('hey')\")");
-        test_template(tmpl, end_tokens, python_tool_call_message, tools,
+        test_template(tmpl, end_tokens, python_message_assist_call, tools,
            "<|python_tag|>python.call(code=\"print('hey')\")");
-        test_template(tmpl, end_tokens, tool_call_message, tools,
+        test_template(tmpl, end_tokens, message_assist_call, tools,
            "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
     }
     {
@@ -513,8 +621,8 @@ static void test_template_output_parsers() {
 
         assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X, common_chat_params_init(tmpl, inputs_tools).format);
 
-        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
-        test_template(tmpl, end_tokens, tool_call_message, tools,
+        test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, message_assist_call, tools,
            "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}");
     }
     {
@@ -525,8 +633,8 @@ static void test_template_output_parsers() {
         assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
                      common_chat_params_init(tmpl, inputs_tools).format);
 
-        test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
-        test_template(tmpl, end_tokens, tool_call_message, tools,
+        test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
+        test_template(tmpl, end_tokens, message_assist_call, tools,
            "<function=special_function>{\"arg1\": 1}</function>");
     }
     {
@@ -537,12 +645,12 @@ static void test_template_output_parsers() {
         assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_params_init(tmpl, inputs_no_tools).format);
         assert_equals(COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2, common_chat_params_init(tmpl, inputs_tools).format);
 
-        test_template(tmpl, end_tokens, text_message, {},
+        test_template(tmpl, end_tokens, message_assist, {},
            "all\n"
            "Hello, world!\n"
            "What's up?",
            /* expect_grammar_triggered= */ false);
-        test_template(tmpl, end_tokens, tool_call_message, tools,
+        test_template(tmpl, end_tokens, message_assist_call, tools,
            "special_function\n"
            "{\"arg1\": 1}");
     }
@@ -553,23 +661,79 @@ static void test_template_output_parsers() {
553
661
 
554
662
  assert_equals(COMMON_CHAT_FORMAT_FIREFUNCTION_V2, common_chat_params_init(tmpl, inputs_tools).format);
555
663
 
556
- test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
557
- test_template(tmpl, end_tokens, tool_call_message, tools,
664
+ test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
665
+ test_template(tmpl, end_tokens, message_assist_call, tools,
558
666
  " functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]");
559
667
  }
560
668
  {
669
+ // Original DeepSeek R1 template. Leaves <|tool▁calls▁begin|> and others unclosed. Our logic fixes the prompt.
561
670
  const common_chat_template tmpl(read_file("models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja"),
562
671
  "<s>", "</s>");
563
672
  std::vector<std::string> end_tokens{ "<|end▁of▁sentence|>" };
564
673
 
565
- assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format);
674
+ assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format);
675
+ assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, common_chat_params_init(tmpl, inputs_tools_think).format);
676
+
677
+ test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
678
+ test_template(tmpl, end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
679
+ assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_think),
680
+ common_chat_parse("<think>I'm thinking</think>Hello, world!\nWhat's up?",
681
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1));
682
+ assert_msg_equals(msg_from_json(message_assist_thoughts),
683
+ common_chat_parse("<think>I'm thinking</think>Hello, world!\nWhat's up?",
684
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING));
685
+ assert_msg_equals(msg_from_json(message_assist_thoughts),
686
+ // Latest template update (ast of 20250209) adds a trailing <think>\n if add_generation_prompt is true.
687
+ common_chat_parse("I'm thinking</think>Hello, world!\nWhat's up?",
688
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING));
689
+ // test_template(tmpl, end_tokens, message_assist_call, tools,
690
+ // "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
691
+ // "```json\n"
692
+ // "{\"arg1\": 1}\n"
693
+ // // Look what's not here: <|tool▁calls▁end|> (also missing the <|end▁of▁sentence|>, but that is removed lazily by the test's delta logic)
694
+ // "```<|tool▁call▁end|>",
695
+ // /* expect_grammar_triggered= */ true,
696
+ // /* test_grammar_if_triggered= */ false);
697
+ }
698
+ {
699
+ // Replacement DeepSeek R1 template. Makes the Distill Qwen 7B/32B models happy to call tools and all.
700
+ const common_chat_template tmpl(read_file("models/templates/llama-cpp-deepseek-r1.jinja"),
701
+ "<s>", "</s>");
702
+ std::vector<std::string> end_tokens{ "<|end▁of▁sentence|>" };
566
703
 
567
- test_template(tmpl, end_tokens, text_message, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
568
- test_template(tmpl, end_tokens, tool_call_message, tools,
569
- "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
570
- "```json\n"
571
- "{\"arg1\": 1}\n"
572
- "```<|tool▁call▁end|>");
704
+ assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format);
705
+ assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING, common_chat_params_init(tmpl, inputs_tools_think).format);
706
+
707
+ test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
708
+ test_template(tmpl, end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
709
+ assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_think),
710
+ common_chat_parse("<think>I'm thinking</think>Hello, world!\nWhat's up?",
711
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1));
712
+ assert_msg_equals(msg_from_json(message_assist_thoughts),
713
+ common_chat_parse("<think>I'm thinking</think>Hello, world!\nWhat's up?",
714
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING));
715
+
716
+ assert_msg_equals(msg_from_json(message_assist_call_thoughts_unparsed),
717
+ common_chat_parse(
718
+ "<think>I'm\nthinking</think>\n\n"
719
+ "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
720
+ "```json\n"
721
+ "{\"arg1\": 1}\n"
722
+ "```<|tool▁call▁end|><|tool▁calls▁end|>",
723
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1));
724
+ assert_msg_equals(msg_from_json(message_assist_call_thoughts),
725
+ common_chat_parse(
726
+ "<think>I'm\nthinking</think>\n\n"
727
+ "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
728
+ "```json\n"
729
+ "{\"arg1\": 1}\n"
730
+ "```<|tool▁call▁end|><|tool▁calls▁end|>",
731
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING));
732
+ test_template(tmpl, end_tokens, message_assist_call, tools,
733
+ "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n"
734
+ "```json\n"
735
+ "{\"arg1\": 1}\n"
736
+ "```<|tool▁call▁end|><|tool▁calls▁end|>");
573
737
  }
574
738
  }
575
739
 
@@ -586,16 +750,20 @@ int main(int argc, char ** argv) {
         std::cout << "|----------|--------|\n";
 
         for (int i = 1; i < argc; i++) {
-            std::string path = argv[i];
-            if (path.rfind(".jinja") != path.size() - 6) {
-                std::cerr << "Skipping non-jinja file: " << path << std::endl;
-                continue;
+            try {
+                std::string path = argv[i];
+                if (path.rfind(".jinja") != path.size() - 6) {
+                    std::cerr << "Skipping non-jinja file: " << path << std::endl;
+                    continue;
+                }
+                common_chat_template tmpl(read_file(path), "", "");
+                auto parts = string_split(path, "/");
+                auto name = parts[parts.size() - 1];
+                auto format = common_chat_format_name(common_chat_params_init(tmpl, inputs).format);
+                std::cout << "| " << name << " | " << format << " |\n";
+            } catch (const std::exception & e) {
+                std::cerr << "Failed to process " << argv[i] << ": " << e.what() << std::endl;
             }
-            common_chat_template tmpl(read_file(path), "", "");
-            auto parts = string_split(path, "/");
-            auto name = parts[parts.size() - 1];
-            std::cout << "| " << name << " | " << common_chat_format_name(common_chat_params_init(tmpl, inputs).format)
-                      << " |\n";
         }
     } else
 #endif
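Note on the test-chat.cpp hunks above: they exercise a new reasoning-extraction path, where common_chat_inputs gains an extract_reasoning flag, common_chat_msg gains a reasoning_content field, and the *_EXTRACT_REASONING formats split <think>...</think> (or Command R7B's <|START_THINKING|> block) out of the message content. A condensed sketch of the call pattern, assuming the common library's chat.hpp header as referenced in this diff (it links against llama.cpp's common code, so it is not a standalone program):

// Parse a DeepSeek-R1-style reply with and without reasoning extraction,
// using the enums and fields introduced in this diff.
#include <cassert>
#include <string>
#include "chat.hpp"

int main() {
    const std::string reply = "<think>I'm thinking</think>Hello, world!\nWhat's up?";

    // Without extraction, the <think> block stays inside content.
    common_chat_msg raw = common_chat_parse(reply, COMMON_CHAT_FORMAT_DEEPSEEK_R1);
    assert(raw.content == reply);

    // With the new *_EXTRACT_REASONING variant, it is split into reasoning_content.
    common_chat_msg parsed = common_chat_parse(reply, COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING);
    assert(parsed.reasoning_content == "I'm thinking");
    assert(parsed.content == "Hello, world!\nWhat's up?");
    return 0;
}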
package/src/llama.cpp/tests/test-gguf.cpp
@@ -697,8 +697,8 @@ static std::pair<int, int> test_handcrafted_file(const unsigned int seed) {
 
 #ifdef _WIN32
     if (!file) {
-        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
-        printf("%s: skipping tests");
+        printf("failed to create tmpfile(), needs elevated privileges on Windows");
+        printf("skipping tests");
         continue;
     }
 #else
@@ -1086,8 +1086,8 @@ static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned
 
 #ifdef _WIN32
     if (!file) {
-        printf("%s: failed to create tmpfile(), needs elevated privileges on Windows");
-        printf("%s: skipping tests");
+        printf("failed to create tmpfile(), needs elevated privileges on Windows");
+        printf("skipping tests");
         return std::make_pair(0, 0);
     }
 #else