@fugood/llama.node 0.3.11 → 0.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -0
  18. package/lib/index.js +26 -20
  19. package/lib/index.ts +32 -28
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +14 -0
  22. package/src/LlamaContext.cpp +13 -4
  23. package/src/llama.cpp/.github/workflows/build.yml +35 -3
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/common/CMakeLists.txt +20 -3
  27. package/src/llama.cpp/common/arg.cpp +180 -3
  28. package/src/llama.cpp/common/chat-template.hpp +21 -7
  29. package/src/llama.cpp/common/chat.cpp +220 -101
  30. package/src/llama.cpp/common/chat.hpp +3 -0
  31. package/src/llama.cpp/common/common.h +15 -7
  32. package/src/llama.cpp/common/llguidance.cpp +3 -3
  33. package/src/llama.cpp/common/log.cpp +1 -0
  34. package/src/llama.cpp/common/log.h +2 -1
  35. package/src/llama.cpp/common/minja.hpp +24 -9
  36. package/src/llama.cpp/common/sampling.cpp +52 -46
  37. package/src/llama.cpp/common/speculative.h +1 -1
  38. package/src/llama.cpp/docs/build.md +2 -2
  39. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -1
  40. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  43. package/src/llama.cpp/examples/run/run.cpp +5 -12
  44. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/server/httplib.h +381 -292
  46. package/src/llama.cpp/examples/server/server.cpp +58 -47
  47. package/src/llama.cpp/examples/server/utils.hpp +7 -5
  48. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  49. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  50. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  51. package/src/llama.cpp/ggml/include/ggml.h +1 -1
  52. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  53. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -12
  54. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +852 -268
  55. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +200 -107
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -5
  57. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  58. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +2 -2
  59. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +26 -4
  60. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -7
  61. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +812 -569
  62. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +25 -1
  63. package/src/llama.cpp/ggml/src/ggml.c +1 -1
  64. package/src/llama.cpp/include/llama.h +14 -10
  65. package/src/llama.cpp/src/llama-grammar.cpp +1 -1
  66. package/src/llama.cpp/src/llama-grammar.h +1 -1
  67. package/src/llama.cpp/src/llama-impl.h +6 -6
  68. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  69. package/src/llama.cpp/src/llama-mmap.h +1 -0
  70. package/src/llama.cpp/src/llama-model.cpp +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +131 -57
  72. package/src/llama.cpp/src/llama.cpp +7 -5
  73. package/src/llama.cpp/src/unicode.cpp +9 -2
  74. package/src/llama.cpp/tests/test-backend-ops.cpp +5 -5
  75. package/src/llama.cpp/tests/test-chat.cpp +237 -69
  76. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  77. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
@@ -12,11 +12,13 @@ std::string common_chat_format_name(common_chat_format format) {
12
12
  case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
13
13
  case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
14
14
  case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
15
+ case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING: return "DeepSeek R1 (extract reasoning)";
15
16
  case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
16
17
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
17
18
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
18
19
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
19
20
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
21
+ case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)";
20
22
  default:
21
23
  throw std::runtime_error("Unknown chat format");
22
24
  }
@@ -105,7 +107,6 @@ static common_chat_msg parse_json_tool_calls(
105
107
  std::sregex_iterator rend;
106
108
  std::sregex_iterator rit(it, end, function_regex);
107
109
  if (rit == rend) {
108
- fprintf(stderr, "No more tool calls found\n");
109
110
  result.content += std::string(it, end);
110
111
  break;
111
112
  }
@@ -115,14 +116,21 @@ static common_chat_msg parse_json_tool_calls(
115
116
 
116
117
  json arguments;
117
118
  if (!parse_json(it, end, arguments)) {
118
- throw std::runtime_error("Failed to parse json tool call arguments");
119
+ throw std::runtime_error("Failed to parse json tool call arguments: " + input);
119
120
  }
120
121
  if (!std::regex_search(it, end, match, close_regex)) {
121
- throw std::runtime_error("Malformed input, missing closing pattern");
122
+ throw std::runtime_error("Malformed input, missing closing pattern: " + input);
122
123
  }
123
124
  it = match.suffix().first;
124
125
  result.tool_calls.push_back({name, arguments.is_string() ? arguments.get<std::string>() : arguments.dump(), /* id= */ ""});
125
126
  }
127
+
128
+ if (!result.tool_calls.empty()) {
129
+ if (!string_strip(result.content).empty()) {
130
+ LOG_WRN("Content found with tool calls: %s\n", result.content.c_str());
131
+ }
132
+ result.content = "";
133
+ }
126
134
  return result;
127
135
  }
128
136
 
@@ -134,11 +142,11 @@ static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& in
134
142
  result.role = "assistant";
135
143
  const auto process_tool_calls = [&](const json & tool_calls) {
136
144
  for (const auto & tool_call : tool_calls) {
137
- const auto & arguments = tool_call["arguments"];
145
+ const auto & arguments = tool_call.at("arguments");
138
146
  result.tool_calls.push_back({
139
- tool_call["name"],
147
+ tool_call.at("name"),
140
148
  arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
141
- tool_call.contains("id") ? tool_call["id"] : "",
149
+ tool_call.contains("id") ? tool_call.at("id") : "",
142
150
  });
143
151
  }
144
152
  };
@@ -155,7 +163,7 @@ static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& in
155
163
 
156
164
  static void foreach_function(const json & tools, const std::function<void(const json &)> & fn) {
157
165
  for (const auto & tool : tools) {
158
- if (!tool.contains("type") || tool["type"] != "function" || !tool.contains("function")) {
166
+ if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
159
167
  LOG_INF("Skipping tool without function: %s", tool.dump(2).c_str());
160
168
  continue;
161
169
  }
@@ -190,27 +198,27 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
190
198
 
191
199
  auto tool_call_schemas = json::array();
192
200
  foreach_function(inputs.tools, [&](const json & tool) {
193
- const auto & function = tool["function"];
201
+ const auto & function = tool.at("function");
194
202
  auto tool_schema = json {
195
203
  {"type", "object"},
196
204
  {"properties", {
197
205
  {"name", {
198
206
  {"type", "string"},
199
- {"const", function["name"]},
207
+ {"const", function.at("name")},
200
208
  }},
201
- {"arguments", function["parameters"]},
209
+ {"arguments", function.at("parameters")},
202
210
  }},
203
211
  {"required", json::array({"name", "arguments"})},
204
212
  };
205
213
  if (function.contains("description")) {
206
- tool_schema["description"] = function["description"];
214
+ tool_schema["description"] = function.at("description");
207
215
  }
208
216
  if (inputs.parallel_tool_calls) {
209
- tool_schema["properties"]["id"] = {
217
+ tool_schema.at("properties")["id"] = {
210
218
  {"type", "string"},
211
219
  {"minLength", 4},
212
220
  };
213
- tool_schema["required"].push_back("id");
221
+ tool_schema.at("required").push_back("id");
214
222
  }
215
223
  tool_call_schemas.emplace_back(tool_schema);
216
224
  });
@@ -275,21 +283,21 @@ static common_chat_msg common_chat_parse_generic(const std::string & input) {
275
283
  common_chat_msg result;
276
284
  result.role = "assistant";
277
285
  if (data.contains("tool_calls")) {
278
- for (const auto & tool_call : data["tool_calls"]) {
286
+ for (const auto & tool_call : data.at("tool_calls")) {
279
287
  result.tool_calls.push_back({
280
- tool_call["name"],
281
- tool_call["arguments"].dump(),
282
- tool_call.contains("id") ? tool_call["id"] : "",
288
+ tool_call.at("name"),
289
+ tool_call.at("arguments").dump(),
290
+ tool_call.contains("id") ? tool_call.at("id") : "",
283
291
  });
284
292
  }
285
293
  } else if (data.contains("tool_call")) {
286
294
  result.tool_calls.push_back({
287
- data["tool_call"]["name"],
288
- data["tool_call"]["arguments"].dump(),
295
+ data.at("tool_call").at("name"),
296
+ data.at("tool_call").at("arguments").dump(),
289
297
  /* id= */ "",
290
298
  });
291
299
  } else if (data.contains("response")) {
292
- const auto & response = data["response"];
300
+ const auto & response = data.at("response");
293
301
  result.content = response.is_string() ? response.get<std::string>() : response.dump(2);
294
302
  }
295
303
  return result;
@@ -301,7 +309,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
301
309
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
302
310
  auto schemas = json::array();
303
311
  foreach_function(inputs.tools, [&](const json & tool) {
304
- const auto & function = tool["function"];
312
+ const auto & function = tool.at("function");
305
313
  schemas.push_back({
306
314
  {"type", "object"},
307
315
  {"properties", {
@@ -309,9 +317,9 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
309
317
  // It's hard to constrain that for now (while reusing the JSON schema conversion), so we're just expecting a plain object.
310
318
  {"name", {
311
319
  {"type", "string"},
312
- {"const", function["name"]},
320
+ {"const", function.at("name")},
313
321
  }},
314
- {"arguments", function["parameters"]},
322
+ {"arguments", function.at("parameters")},
315
323
  {"id", {
316
324
  {"type", "string"},
317
325
  // Nemo's template expects a 9-character alphanumeric ID.
@@ -346,7 +354,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
346
354
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
347
355
  auto schemas = json::array();
348
356
  foreach_function(inputs.tools, [&](const json & tool) {
349
- const auto & function = tool["function"];
357
+ const auto & function = tool.at("function");
350
358
  schemas.push_back({
351
359
  {"type", "object"},
352
360
  {"properties", {
@@ -357,9 +365,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
357
365
  }},
358
366
  {"tool_name", {
359
367
  {"type", "string"},
360
- {"const", function["name"]},
368
+ {"const", function.at("name")},
361
369
  }},
362
- {"parameters", function["parameters"]},
370
+ {"parameters", function.at("parameters")},
363
371
  }},
364
372
  {"required", json::array({"tool_call_id", "tool_name", "parameters"})},
365
373
  });
@@ -382,39 +390,65 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
382
390
  "<|END_THINKING|>",
383
391
  "<|END_ACTION|>",
384
392
  };
385
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
386
- data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
393
+ auto adjusted_messages = json::array();
394
+ for (const auto & msg : inputs.messages) {
395
+ auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
396
+ auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
397
+ if (has_reasoning_content && has_tool_calls) {
398
+ auto adjusted_message = msg;
399
+ adjusted_message["tool_plan"] = msg.at("reasoning_content");
400
+ adjusted_message.erase("reasoning_content");
401
+ adjusted_messages.push_back(adjusted_message);
402
+ } else {
403
+ adjusted_messages.push_back(msg);
404
+ }
405
+ }
406
+ data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
407
+ data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING : COMMON_CHAT_FORMAT_COMMAND_R7B;
387
408
  return data;
388
409
  }
389
- static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
390
- static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
391
- static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
410
+ static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) {
411
+ static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
412
+ static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
413
+ static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
414
+
392
415
  std::smatch match;
393
416
 
394
417
  common_chat_msg result;
395
418
  result.role = "assistant";
396
- if (std::regex_match(input, match, response_regex)) {
397
- result.content = match[1].str();
398
- } else if (std::regex_match(input, match, thought_action_regex)) {
399
- result.tool_plan = match[1].str();
400
- auto actions_str = match[2].str();
419
+
420
+ std::string rest = input;
421
+
422
+ if (std::regex_match(rest, match, thought_regex)) {
423
+ if (extract_reasoning) {
424
+ result.reasoning_content = match[2].str();
425
+ } else if (!match[2].str().empty()) {
426
+ // Let the unparsed thinking tags through in content only if their insides aren't empty.
427
+ result.content = match[1].str();
428
+ }
429
+ rest = match[3].str();
430
+ }
431
+ if (std::regex_match(rest, match, action_regex)) {
432
+ auto actions_str = match[1].str();
401
433
  auto actions = json::parse(actions_str);
402
434
  for (const auto & action : actions) {
403
435
  result.tool_calls.push_back({
404
- /* .name = */ action["tool_name"],
405
- /* .arguments = */ action["parameters"].dump(),
406
- /* .id = */ action["tool_call_id"],
436
+ /* .name = */ action.at("tool_name"),
437
+ /* .arguments = */ action.at("parameters").dump(),
438
+ /* .id = */ action.at("tool_call_id"),
407
439
  });
408
440
  }
441
+ } else if (std::regex_match(rest, match, response_regex)) {
442
+ auto response = match[1].str();
443
+ result.content += response;
409
444
  } else {
410
- LOG_ERR("Failed to parse command_r output");
411
- result.content = input;
445
+ result.content += rest;
412
446
  }
413
447
  return result;
414
448
  }
415
449
 
416
450
  static void expect_tool_parameters(const std::string & name, const json & parameters, const std::vector<std::string> & expected_properties) {
417
- if (!parameters.is_object() || !parameters.contains("type") || parameters["type"] != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
451
+ if (!parameters.is_object() || !parameters.contains("type") || parameters.at("type") != "object" || !parameters.contains("properties") || !parameters.contains("required")) {
418
452
  throw std::runtime_error("Parameters of tool " + name + " must be an object w/ required properties");
419
453
  }
420
454
  const auto & parameters_properties = parameters.at("properties");
@@ -468,9 +502,9 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com
468
502
  };
469
503
 
470
504
  foreach_function(inputs.tools, [&](const json & tool) {
471
- const auto & function = tool["function"];
472
- std::string name = function["name"];
473
- auto parameters = function["parameters"];
505
+ const auto & function = tool.at("function");
506
+ std::string name = function.at("name");
507
+ auto parameters = function.at("parameters");
474
508
  builder.resolve_refs(parameters);
475
509
 
476
510
  // https://github.com/meta-llama/llama-stack/tree/main/llama_stack/providers/remote/tool_runtime
@@ -546,34 +580,90 @@ static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bo
546
580
 
547
581
  static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
548
582
  common_chat_params data;
549
- data.grammar_lazy = inputs.tool_choice != "required";
550
- data.grammar = build_grammar([&](const common_grammar_builder & builder) {
551
- std::vector<std::string> tool_rules;
552
- foreach_function(inputs.tools, [&](const json & tool) {
553
- const auto & function = tool["function"];
554
- std::string name = function["name"];
555
- auto parameters = function["parameters"];
556
- auto args_rule = builder.add_schema(name + "-args", parameters);
557
- tool_rules.push_back(builder.add_rule(name + "-call",
558
- "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
559
- });
560
- data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
561
- data.preserved_tokens = {
562
- "<|tool▁sep|>",
563
- "<|tool▁call▁end|>",
564
- };
565
- builder.add_rule("root", "\"<|tool▁calls▁begin|>\" (" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " space");
566
- }, grammar_options);
583
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
584
+ data.grammar_lazy = inputs.tool_choice != "required" && inputs.json_schema.is_null();
585
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
586
+ std::vector<std::string> tool_rules;
587
+ foreach_function(inputs.tools, [&](const json & tool) {
588
+ const auto & function = tool.at("function");
589
+ std::string name = function.at("name");
590
+ auto parameters = function.at("parameters");
591
+ auto args_rule = builder.add_schema(name + "-args", parameters);
592
+ tool_rules.push_back(builder.add_rule(name + "-call",
593
+ "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n"
594
+ "```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\""));
595
+ });
596
+ // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
597
+ // so we accept common variants (then it's all constrained)
598
+ builder.add_rule("root",
599
+ "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" ) "
600
+ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
601
+ "\"<|tool▁calls▁end|>\""
602
+ " space");
603
+ data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false});
604
+ data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false});
605
+ data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false});
606
+ data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false});
607
+ data.preserved_tokens = {
608
+ "<think>",
609
+ "</think>",
610
+ "<|tool▁sep|>",
611
+ "<|tool▁calls▁end|",
612
+ "<|tool▁call▁end|>",
613
+ };
614
+ }, grammar_options);
615
+ }
567
616
  auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
617
+
618
+ // Hacks to fix the official (broken) prompt.
619
+ // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
620
+ // until the official template is fixed.
621
+ if (tmpl.source().find("{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}") != std::string::npos) {
622
+ // Don't leave the chat dangling after tool results
623
+ if (string_ends_with(prompt, "<|tool▁outputs▁end|>")) {
624
+ prompt += "<|end▁of▁sentence|>";
625
+ if (inputs.add_generation_prompt) {
626
+ prompt += "<|Assistant|>";
627
+ }
628
+ }
629
+ // Fix up tool call delta example added by Minja
630
+ prompt = std::regex_replace(
631
+ prompt,
632
+ std::regex("(<|tool▁call▁end|>)[\\s\\r\\n]*(<|tool▁outputs▁begin|>|<|User|>)"),
633
+ "$1<|tool▁calls▁end|><|end▁of▁sentence|>$2");
634
+ }
568
635
  data.prompt = prompt;
569
- data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
636
+ data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1;
570
637
  return data;
571
638
  }
572
- static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input) {
573
- static std::regex trigger_regex("<|tool▁calls▁begin|>");
639
+ static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) {
574
640
  static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n");
575
- static std::regex close_regex("```<|tool▁call▁end|>");
576
- return parse_json_tool_calls(input, trigger_regex, function_regex, close_regex);
641
+ static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>");
642
+ static std::regex reasoning_content_regex("((?:<think>)?([\\s\\S\\r\\n]*?)</think>)?([\\s\\S\\r\\n]*)");
643
+ static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>");
644
+ common_chat_msg msg;
645
+ msg.role = "assistant";
646
+ std::smatch match;
647
+ if (std::regex_match(input, match, reasoning_content_regex)) {
648
+ std::string rest;
649
+ if (extract_reasoning) {
650
+ msg.reasoning_content = string_strip(match[2].str());
651
+ } else {
652
+ msg.content = match[1].str();
653
+ }
654
+ rest = match[3].str();
655
+
656
+ if (std::regex_search(rest, match, tool_calls_regex)) {
657
+ auto tool_calls = match[1].str();
658
+ auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex);
659
+ msg.tool_calls = std::move(msg2.tool_calls);
660
+ } else {
661
+ msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end());
662
+ }
663
+ } else {
664
+ msg.content = input;
665
+ }
666
+ return msg;
577
667
  }
578
668
 
579
669
  static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
@@ -583,20 +673,20 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c
583
673
  {"datetime", "Jan 29 2025 13:00:00 GMT"},
584
674
  {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
585
675
  });
586
- if (!inputs.tools.is_null() && !inputs.tools.empty()) {
676
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
587
677
  data.grammar_lazy = inputs.tool_choice != "required";
588
678
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
589
679
  auto schemas = json::array();
590
680
  foreach_function(inputs.tools, [&](const json & tool) {
591
- const auto & function = tool["function"];
681
+ const auto & function = tool.at("function");
592
682
  schemas.push_back({
593
683
  {"type", "object"},
594
684
  {"properties", {
595
685
  {"name", {
596
686
  {"type", "string"},
597
- {"const", function["name"]},
687
+ {"const", function.at("name")},
598
688
  }},
599
- {"arguments", function["parameters"]},
689
+ {"arguments", function.at("parameters")},
600
690
  }},
601
691
  {"required", json::array({"name", "arguments", "id"})},
602
692
  });
@@ -628,15 +718,15 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
628
718
  common_chat_params data;
629
719
  data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
630
720
  data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
631
- if (!inputs.tools.is_null() && !inputs.tools.empty()) {
721
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
632
722
  data.grammar_lazy = inputs.tool_choice != "required";
633
723
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
634
724
  std::vector<std::string> first_tool_rules;
635
725
  std::vector<std::string> subsequent_tool_rules;
636
726
  foreach_function(inputs.tools, [&](const json & tool) {
637
- const auto & function = tool["function"];
638
- std::string name = function["name"];
639
- auto parameters = function["parameters"];
727
+ const auto & function = tool.at("function");
728
+ std::string name = function.at("name");
729
+ auto parameters = function.at("parameters");
640
730
  auto args_rule = builder.add_schema(name + "-args", parameters);
641
731
  first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule));
642
732
  subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule));
@@ -716,9 +806,9 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
716
806
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
717
807
  std::vector<std::string> tool_rules;
718
808
  foreach_function(inputs.tools, [&](const json & tool) {
719
- const auto & function = tool["function"];
720
- const auto & parameters = function["parameters"];
721
- std::string name = function["name"];
809
+ const auto & function = tool.at("function");
810
+ const auto & parameters = function.at("parameters");
811
+ std::string name = function.at("name");
722
812
  if (name == "python" || name == "ipython") {
723
813
  if (!parameters.contains("type")) {
724
814
  throw std::runtime_error("Missing type in python tool");
@@ -789,9 +879,9 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
789
879
  data.grammar = build_grammar([&](const common_grammar_builder & builder) {
790
880
  std::vector<std::string> tool_rules;
791
881
  foreach_function(inputs.tools, [&](const json & tool) {
792
- const auto & function = tool["function"];
793
- std::string name = function["name"];
794
- auto parameters = function["parameters"];
882
+ const auto & function = tool.at("function");
883
+ std::string name = function.at("name");
884
+ auto parameters = function.at("parameters");
795
885
  builder.resolve_refs(parameters);
796
886
  tool_rules.push_back(builder.add_schema(name + "-call", {
797
887
  {"type", "object"},
@@ -839,9 +929,9 @@ static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input)
839
929
  if (!parse_json(it, end, call)) {
840
930
  throw std::runtime_error("Failed to parse json tool call");
841
931
  }
842
- const auto & arguments = call["arguments"];
932
+ const auto & arguments = call.at("arguments");
843
933
  result.tool_calls.push_back({
844
- call["name"],
934
+ call.at("name"),
845
935
  arguments.dump(),
846
936
  // arguments.is_string() ? arguments.get<std::string>() : arguments.dump(),
847
937
  /* id= */ "",
@@ -878,53 +968,78 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
878
968
  }
879
969
  data.grammar = json_schema_to_grammar(inputs.json_schema);
880
970
  } else {
881
- data.grammar = inputs.grammar.empty();
971
+ data.grammar = inputs.grammar;
882
972
  }
883
973
  return data;
884
974
  }
885
975
 
886
976
  common_chat_params common_chat_params_init(const common_chat_template & tmpl, const struct common_chat_inputs & inputs) {
887
- auto has_tools = !inputs.tools.is_null() && inputs.tool_choice != "none";
888
- LOG_DBG("[%s] has_tools=%s\n", __func__, has_tools ? "true" : "false");
977
+ const auto & src = tmpl.source();
978
+ const auto & caps = tmpl.original_caps();
889
979
 
890
- if (has_tools && !inputs.grammar.empty()) {
891
- throw std::runtime_error("Cannot specify grammar with tools");
980
+ if (inputs.tools.is_array()) {
981
+ if (inputs.tool_choice != "none" && !inputs.grammar.empty()) {
982
+ throw std::runtime_error("Cannot specify grammar with tools");
983
+ }
984
+ if (caps.supports_tool_calls && !caps.supports_tools) {
985
+ LOG_WRN("Template supports tool calls but does not natively describe tools. The fallback behaviour used may produce bad results, inspect prompt w/ --verbose & consider overriding the template.\n");
986
+ }
892
987
  }
893
988
 
894
- const auto & src = tmpl.source();
989
+ // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
990
+ if (src.find("<|tool▁calls▁begin|>") != std::string::npos && inputs.json_schema.is_null()) {
991
+ return common_chat_params_init_deepseek_r1(tmpl, inputs);
992
+ }
993
+
994
+ // Command R7B: : use handler in all cases except json schema (thinking / tools).
995
+ if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && inputs.json_schema.is_null()) {
996
+ return common_chat_params_init_command_r7b(tmpl, inputs);
997
+ }
998
+
999
+ // Use generic handler when mixing tools + JSON schema.
1000
+ // TODO: support that mix in handlers below.
1001
+ if ((!inputs.tools.is_array() && inputs.json_schema.is_object())) {
1002
+ return common_chat_params_init_generic(tmpl, inputs);
1003
+ }
1004
+
1005
+ // Functionary prepends "all\n" to plain content outputs, so we use its handler in all cases.
895
1006
  if (src.find(">>>all") != std::string::npos) {
896
- // Functionary prepends "all\n" to plain content outputs, so we use the parser no matter when
897
1007
  return common_chat_params_init_functionary_v3_2(tmpl, inputs);
898
1008
  }
1009
+
1010
+ // Firefunction v2 requires datetime and functions in the context even w/o tools, so we also use its handler in all cases.
899
1011
  if (src.find(" functools[") != std::string::npos) {
900
- // Firefunction v2 requires datetime and functions in the context, even w/o tools.
901
1012
  return common_chat_params_init_firefunction_v2(tmpl, inputs);
902
1013
  }
903
1014
 
904
- if (!has_tools) {
1015
+ // Plain handler (no tools)
1016
+ if (inputs.tools.is_null() || inputs.tool_choice == "none") {
905
1017
  return common_chat_params_init_without_tools(tmpl, inputs);
906
1018
  }
907
1019
 
1020
+ // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
908
1021
  if (src.find("<tool_call>") != std::string::npos) {
909
1022
  return common_chat_params_init_hermes_2_pro(tmpl, inputs);
910
1023
  }
1024
+
1025
+ // Functionary v3.1 (w/ tools)
911
1026
  if (src.find("<|start_header_id|>") != std::string::npos
912
1027
  && src.find("<function=") != std::string::npos) {
913
1028
  return common_chat_params_init_functionary_v3_1_llama_3_1(tmpl, inputs);
914
1029
  }
1030
+
1031
+ // Llama 3.1, 3.2, 3.3 (w/ tools)
915
1032
  if (src.find("<|start_header_id|>ipython<|end_header_id|>") != std::string::npos) {
916
1033
  auto allow_python_tag_builtin_tools = src.find("<|python_tag|>") != std::string::npos;
917
1034
  return common_chat_params_init_llama_3_1_tool_calls(tmpl, inputs, allow_python_tag_builtin_tools);
918
1035
  }
919
- if (src.find("<|tool▁calls▁begin|>") != std::string::npos) {
920
- return common_chat_params_init_deepseek_r1(tmpl, inputs);
921
- }
1036
+
1037
+ // Mistral Nemo (w/ tools)
922
1038
  if (src.find("[TOOL_CALLS]") != std::string::npos) {
923
1039
  return common_chat_params_init_mistral_nemo(tmpl, inputs);
924
1040
  }
925
- if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
926
- return common_chat_params_init_command_r7b(tmpl, inputs);
927
- }
1041
+
1042
+ // Generic fallback
928
1043
  return common_chat_params_init_generic(tmpl, inputs);
929
1044
  }
930
1045
 
@@ -949,7 +1064,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
949
1064
  case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
950
1065
  return common_chat_parse_llama_3_1(input, /* with_builtin_tools= */ true);
951
1066
  case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
952
- return common_chat_parse_deepseek_r1(input);
1067
+ return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ false);
1068
+ case COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING:
1069
+ return common_chat_parse_deepseek_r1(input, /* extract_reasoning= */ true);
953
1070
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
954
1071
  return common_chat_parse_functionary_v3_2(input);
955
1072
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1:
@@ -959,7 +1076,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
959
1076
  case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
960
1077
  return common_chat_parse_firefunction_v2(input);
961
1078
  case COMMON_CHAT_FORMAT_COMMAND_R7B:
962
- return common_chat_parse_command_r7b(input);
1079
+ return common_chat_parse_command_r7b(input, /* extract_reasoning= */ false);
1080
+ case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING:
1081
+ return common_chat_parse_command_r7b(input, /* extract_reasoning= */ true);
963
1082
  default:
964
1083
  throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
965
1084
  }
@@ -19,6 +19,7 @@ struct common_chat_inputs {
19
19
  bool stream;
20
20
  std::string grammar;
21
21
  bool add_generation_prompt = true;
22
+ bool extract_reasoning = true;
22
23
  };
23
24
 
24
25
  enum common_chat_format {
@@ -28,11 +29,13 @@ enum common_chat_format {
28
29
  COMMON_CHAT_FORMAT_LLAMA_3_X,
29
30
  COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
30
31
  COMMON_CHAT_FORMAT_DEEPSEEK_R1,
32
+ COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
31
33
  COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
32
34
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
33
35
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
34
36
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
35
37
  COMMON_CHAT_FORMAT_COMMAND_R7B,
38
+ COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
36
39
 
37
40
  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
38
41
  };