@fugood/llama.node 1.1.6 → 1.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +9 -9
  8. package/src/LlamaCompletionWorker.cpp +73 -20
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/LlamaContext.cpp +9 -0
  11. package/src/common.hpp +8 -1
  12. package/src/llama.cpp/CMakeLists.txt +2 -0
  13. package/src/llama.cpp/common/arg.cpp +132 -41
  14. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  15. package/src/llama.cpp/common/chat.cpp +311 -9
  16. package/src/llama.cpp/common/chat.h +4 -1
  17. package/src/llama.cpp/common/common.cpp +54 -0
  18. package/src/llama.cpp/common/common.h +46 -9
  19. package/src/llama.cpp/ggml/CMakeLists.txt +2 -0
  20. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  21. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  22. package/src/llama.cpp/ggml/include/ggml.h +28 -2
  23. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +66 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  27. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +14 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -2
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -1
  33. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  37. package/src/llama.cpp/include/llama.h +25 -0
  38. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  39. package/src/llama.cpp/src/llama-chat.cpp +2 -4
  40. package/src/llama.cpp/src/llama-context.cpp +29 -22
  41. package/src/llama.cpp/src/llama-context.h +6 -5
  42. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  43. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  44. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +89 -69
  45. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  46. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  47. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  48. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  49. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  50. package/src/llama.cpp/src/llama-memory.h +2 -2
  51. package/src/llama.cpp/src/llama-model.cpp +81 -70
  52. package/src/llama.cpp/src/llama-model.h +2 -0
  53. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  54. package/src/llama.cpp/src/llama-vocab.cpp +2 -1

package/src/llama.cpp/common/chat.cpp

@@ -283,6 +283,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
  }
  if (!msg.reasoning_content.empty()) {
  jmsg["reasoning_content"] = msg.reasoning_content;
+ jmsg["thinking"] = msg.reasoning_content; // gpt-oss
  }
  if (!msg.tool_name.empty()) {
  jmsg["name"] = msg.tool_name;
@@ -459,11 +460,12 @@ std::string common_chat_format_single(
  return ss.str();
  }

- std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+ std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
  common_chat_templates_inputs inputs;
  inputs.use_jinja = use_jinja;
  inputs.add_bos = tmpls->add_bos;
  inputs.add_eos = tmpls->add_eos;
+ inputs.chat_template_kwargs = chat_template_kwargs;
  auto add_simple_msg = [&](auto role, auto content) {
  common_chat_msg msg;
  msg.role = role;
@@ -539,6 +541,17 @@ common_chat_templates_ptr common_chat_templates_init(
  default_template_src = CHATML_TEMPLATE_SRC;
  }
  }
+
+ // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+ // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+ if (default_template_src.find("<|channel|>") != std::string::npos
+ // search for the error message and patch it
+ && default_template_src.find("in message.content or") != std::string::npos) {
+ string_replace_all(default_template_src,
+ "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+ "{%- if false %}");
+ }
+
  std::string token_bos = bos_token_override;
  std::string token_eos = eos_token_override;
  bool add_bos = false;
@@ -593,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
  case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+ case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
  case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
  default:
  throw std::runtime_error("Unknown chat format");
@@ -610,6 +624,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
  }
  }

+ common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+ if (format == "none") {
+ return COMMON_REASONING_FORMAT_NONE;
+ } else if (format == "auto") {
+ return COMMON_REASONING_FORMAT_AUTO;
+ } else if (format == "deepseek") {
+ return COMMON_REASONING_FORMAT_DEEPSEEK;
+ } else if (format == "deepseek-legacy") {
+ return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+ }
+ throw std::runtime_error("Unknown reasoning format: " + format);
+ }
+
  static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
  std::string arguments;
  if (builder.is_partial()) {
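
Note: a minimal usage sketch (not part of the diff) for the new common_reasoning_format_from_name() helper; the caller below is hypothetical, and the common_params::reasoning_format field is assumed to match upstream llama.cpp:

    // hypothetical handler for a --reasoning-format CLI value
    static void apply_reasoning_flag(common_params & params, const std::string & value) {
        // throws std::runtime_error for anything other than
        // "none", "auto", "deepseek" or "deepseek-legacy"
        params.reasoning_format = common_reasoning_format_from_name(value);
    }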
@@ -1299,16 +1326,164 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
  data.prompt = prompt;
  data.format = COMMON_CHAT_FORMAT_GPT_OSS;

- // TODO: support tool calls in GPT-OSS?
+ // These special tokens are required to parse properly, so we include them
+ // even if parse_tool_calls is false.
+ data.preserved_tokens = {
+ "<|channel|>",
+ "<|constrain|>",
+ "<|message|>",
+ "<|start|>",
+ "<|end|>",
+ };
+
+ if (inputs.tools.is_array() && !inputs.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ // tool calls can appear in commentary or analysis channels
+ auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
+
+ std::vector<std::string> tool_rules_recipient_in_role;
+ std::vector<std::string> tool_rules_recipient_in_channel;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ tool_rules_recipient_in_role.push_back(
+ builder.add_rule(name + "-call",
+ "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
+ builder.add_schema(name + "-args", parameters)
+ )
+ );
+
+ tool_rules_recipient_in_channel.push_back(
+ builder.add_rule(name + "-call",
+ "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
+ builder.add_schema(name + "-args", parameters)
+ )
+ );
+ });
+
+ auto recipient_in_role = builder.add_rule("recipient_in_role",
+ "\"<|start|>assistant\"? \" to=functions.\" ( " +
+ string_join(tool_rules_recipient_in_role, " | ") + " )"
+ );
+
+ auto recipient_in_channel = builder.add_rule("recipient_in_channel",
+ channel + " \" to=functions.\" ( " +
+ string_join(tool_rules_recipient_in_channel, " | ") + " )"
+ );
+
+ builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+
+ // Trigger on tool calls that appear in the commentary channel
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ "<\\|channel\\|>(commentary|analysis) to"
+ });
+
+ // Trigger tool calls that appear in the role section, either at the
+ // start or in the middle.
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ "^ to"
+ });
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ "<\\|start\\|>assistant to"
+ });
+ });
+ }

  return data;
  }
  static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
- // TODO @ngxson : this won't work with --special enabled, we should fix that
- builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
- if (!builder.syntax().parse_tool_calls) {
- builder.add_content(builder.consume_rest());
- return;
+ static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+ static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+
+ static const common_regex start_regex("<\\|start\\|>assistant");
+ static const common_regex analysis_regex("<\\|channel\\|>analysis");
+ static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
+ static const common_regex preamble_regex("<\\|channel\\|>commentary");
+ static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
+ static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
+
+ auto consume_end = [&](bool include_end = false) {
+ if (auto res = builder.try_find_literal("<|end|>")) {
+ return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
+ }
+ return builder.consume_rest();
+ };
+
+ auto handle_tool_call = [&](const std::string & name) {
+ if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
+ if (builder.syntax().parse_tool_calls) {
+ if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ } else if (args->is_partial) {
+ throw common_chat_msg_partial_exception("incomplete tool call");
+ }
+ }
+ };
+
+ auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
+ auto match = regex.search(input, 0, true);
+ if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
+ return match;
+ }
+ return std::nullopt;
+ };
+
+ do {
+ auto header_start_pos = builder.pos();
+ auto content_start = builder.try_find_literal("<|message|>");
+ if (!content_start) {
+ throw common_chat_msg_partial_exception("incomplete header");
+ }
+
+ auto header = content_start->prelude;
+
+ if (auto match = regex_match(tool_call1_regex, header)) {
+ auto group = match->groups[1];
+ auto name = header.substr(group.begin, group.end - group.begin);
+ handle_tool_call(name);
+ continue;
+ }
+
+ if (auto match = regex_match(tool_call2_regex, header)) {
+ auto group = match->groups[2];
+ auto name = header.substr(group.begin, group.end - group.begin);
+ handle_tool_call(name);
+ continue;
+ }
+
+ if (regex_match(analysis_regex, header)) {
+ builder.move_to(header_start_pos);
+ if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+ builder.add_content(consume_end(true));
+ } else {
+ builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
+ }
+ continue;
+ }
+
+ if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
+ builder.add_content(consume_end());
+ continue;
+ }
+
+ // Possibly a malformed message, attempt to recover by rolling
+ // back to pick up the next <|start|>
+ LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
+ builder.move_to(header_start_pos);
+ } while (builder.try_find_regex(start_regex, std::string::npos, false));
+
+ auto remaining = builder.consume_rest();
+ if (!remaining.empty()) {
+ LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
  }
  }
 
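
Note: for orientation, a hedged sketch of the Harmony-style output the rewritten GPT-OSS parser above is built to handle; the sample string is illustrative, and the common_chat_syntax fields are assumed to match their upstream llama.cpp definitions:

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GPT_OSS;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    syntax.parse_tool_calls = true;

    // reasoning on the analysis channel, a tool call addressed via
    // "to=functions.<name>", then user-visible text on the final channel
    std::string out =
        "<|channel|>analysis<|message|>user wants the weather<|end|>"
        "<|start|>assistant to=functions.get_weather<|channel|>commentary "
        "<|constrain|>json<|message|>{\"city\":\"Paris\"}<|end|>"
        "<|start|>assistant<|channel|>final<|message|>Here you go.";

    // expected: msg.reasoning_content holds the analysis text, one tool call
    // named "get_weather" is extracted, and msg.content is "Here you go."
    common_chat_msg msg = common_chat_parse(out, /* is_partial */ false, syntax);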
@@ -1721,6 +1896,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
  builder.add_content(builder.consume_rest());
  }

+ static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // Pass thinking context for Granite template
+ json additional_context = {
+ {"thinking", inputs.enable_thinking},
+ };
+
+ data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+ if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (!inputs.tools.is_null()) {
+ // Granite uses <|tool_call|> followed by JSON list
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+ "-args", {
+ {"type", "object"},
+ {"properties", {
+ {"name", {{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ })));
+ });
+
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+ auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+ if (data.thinking_forced_open) {
+ builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+ } else {
+ builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+ }
+
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+ "<|tool_call|>"
+ });
+
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<response>",
+ "</response>",
+ "<|tool_call|>",
+ };
+ });
+ } else {
+ // Handle thinking tags for non-tool responses
+ if (data.thinking_forced_open && inputs.enable_thinking) {
+ data.grammar_lazy = false;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+ });
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<response>",
+ "</response>",
+ };
+ }
+ }
+
+ return data;
+ }
+
+ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+ // Parse thinking tags
+ builder.try_parse_reasoning("<think>", "</think>");
+
+ // Parse response tags using regex
+ static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+ if (auto res = builder.try_find_regex(response_regex)) {
+ // Extract the content between the tags (capture group 1)
+ auto content = builder.str(res->groups[1]);
+ builder.add_content(content);
+ builder.move_to(res->groups[0].end);
+ }
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Look for tool calls
+ static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+ if (auto res = builder.try_find_regex(tool_call_regex)) {
+ builder.move_to(res->groups[0].end);
+
+ // Expect JSON array of tool calls
+ auto tool_calls_data = builder.consume_json();
+ if (tool_calls_data.json.is_array()) {
+ if (!builder.add_tool_calls(tool_calls_data.json)) {
+ builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+ }
+ } else {
+ builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+ }
+ } else {
+ builder.add_content(builder.consume_rest());
+ }
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
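
Note: in the same spirit, a hedged sketch (sample text is illustrative only) of the Granite wire format that common_chat_parse_granite() above targets, i.e. optional <think> reasoning, a <response> block, then <|tool_call|> followed by a JSON array:

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GRANITE;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    syntax.parse_tool_calls = true;

    std::string out =
        "<think>need the forecast</think>"
        "<response>Let me look that up.</response>"
        "<|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]";

    // expected: reasoning_content from <think>, content from <response>,
    // and one tool call parsed out of the JSON array
    common_chat_msg msg = common_chat_parse(out, /* is_partial */ false, syntax);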
@@ -1754,8 +2047,8 @@ static common_chat_params common_chat_templates_apply_jinja(
  params.enable_thinking = inputs.enable_thinking;
  params.grammar = inputs.grammar;
  params.now = inputs.now;
- params.add_bos = inputs.add_bos;
- params.add_eos = inputs.add_eos;
+ params.add_bos = tmpls->add_bos;
+ params.add_eos = tmpls->add_eos;

  params.extra_context = json::object();
  for (auto el : inputs.chat_template_kwargs) {
@@ -1792,6 +2085,11 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_command_r7b(tmpl, params);
  }

+ // Granite (IBM) - detects thinking / tools support
+ if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+ return common_chat_params_init_granite(tmpl, params);
+ }
+
  // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
  if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
  return common_chat_params_init_hermes_2_pro(tmpl, params);
@@ -1852,6 +2150,7 @@ static common_chat_params common_chat_templates_apply_legacy(
  int alloc_size = 0;
  std::vector<llama_chat_message> chat;
  std::vector<std::string> contents;
+
  for (const auto & msg : inputs.messages) {
  auto content = msg.content;
  for (const auto & part : msg.content_parts) {
@@ -1953,6 +2252,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_COMMAND_R7B:
  common_chat_parse_command_r7b(builder);
  break;
+ case COMMON_CHAT_FORMAT_GRANITE:
+ common_chat_parse_granite(builder);
+ break;
  case COMMON_CHAT_FORMAT_GPT_OSS:
  common_chat_parse_gpt_oss(builder);
  break;

package/src/llama.cpp/common/chat.h

@@ -120,6 +120,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
  COMMON_CHAT_FORMAT_HERMES_2_PRO,
  COMMON_CHAT_FORMAT_COMMAND_R7B,
+ COMMON_CHAT_FORMAT_GRANITE,
  COMMON_CHAT_FORMAT_GPT_OSS,

  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
@@ -197,10 +198,12 @@ std::string common_chat_format_single(
  // Returns an example of formatted chat
  std::string common_chat_format_example(
  const struct common_chat_templates * tmpls,
- bool use_jinja);
+ bool use_jinja,
+ const std::map<std::string, std::string> & chat_template_kwargs);

  const char* common_chat_format_name(common_chat_format format);
  const char* common_reasoning_format_name(common_reasoning_format format);
+ common_reasoning_format common_reasoning_format_from_name(const std::string & format);
  common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

  common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
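
Note: because common_chat_format_example() now takes a chat_template_kwargs map (see the hunk above), existing call sites need a third argument; a minimal, hypothetical sketch of an updated caller, with tmpls and params assumed to be the usual common_chat_templates_ptr and common_params objects:

    std::map<std::string, std::string> kwargs;   // e.g. filled from --chat-template-kwargs
    std::string example = common_chat_format_example(tmpls.get(), params.use_jinja, kwargs);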

package/src/llama.cpp/common/common.cpp

@@ -41,6 +41,7 @@
  #endif
  #include <locale>
  #include <windows.h>
+ #include <string.h>
  #include <fcntl.h>
  #include <io.h>
  #else
@@ -1566,3 +1567,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

  return result;
  }
+
+ ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
+ ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+ const lr_opt & d = *(lr_opt *) userdata;
+ result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
+ result.sgd.wd = result.adamw.wd = d.wd;
+ return result;
+ }
+
+ // TODO make all command line args case-insensitive
+ static inline bool eq_case_insensitive(char const* a, char const* b) {
+ return !
+ #if defined(_MSC_VER)
+ _stricmp
+ #else
+ strcasecmp
+ #endif // defined(_MSC_VER)
+ (a, b);
+ }
+
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
+ if (eq_case_insensitive("adamw", n)) {
+ return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+ }
+ if (eq_case_insensitive("sgd", n)) {
+ return GGML_OPT_OPTIMIZER_TYPE_SGD;
+ }
+ return GGML_OPT_OPTIMIZER_TYPE_COUNT;
+ }
+
+ // TODO simplify to use just log and exp
+ static float const k_log_2 = std::log(2.f);
+
+ void lr_opt::init() {
+ if (lr_min > 0 && lr_min < lr0) {
+ float nhalf = std::log(lr0 / lr_min) / k_log_2;
+ float e = epochs;
+ if (decay_epochs > 0 && decay_epochs < e) {
+ e = decay_epochs;
+ } else {
+ decay_epochs = e;
+ }
+ scale_epoch = nhalf / e;
+ }
+ }
+
+ float lr_opt::get_lr(float epoch) const {
+ float r = lr_min <= 0 ? lr0 :
+ epoch >= decay_epochs ? lr_min :
+ lr0 * std::pow(0.5f, epoch * scale_epoch);
+ LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
+ return r;
+ }
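
Note: in closed form, the schedule implemented above is lr(e) = lr0 * 0.5^(e * scale_epoch) with scale_epoch = log2(lr0 / lr_min) / decay_epochs, so the rate reaches lr_min exactly at decay_epochs and is held there afterwards. With illustrative numbers (not from the diff), lr0 = 1e-5, lr_min = 1e-6 and decay_epochs = 10 give scale_epoch ≈ 0.33, so the rate passes through roughly 3.2e-6 at epoch 5 and settles at 1e-6 from epoch 10 on.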

package/src/llama.cpp/common/common.h

@@ -2,14 +2,17 @@

  #pragma once

- #include "llama-cpp.h"
-
  #include <set>
+ #include <sstream>
  #include <string>
  #include <string_view>
  #include <vector>
  #include <map>
  #include <sstream>
+ #include <cmath>
+
+ #include "ggml-opt.h"
+ #include "llama-cpp.h"

  #ifdef _WIN32
  #define DIRECTORY_SEPARATOR '\\'
@@ -82,6 +85,7 @@ enum llama_example {
  LLAMA_EXAMPLE_PARALLEL,
  LLAMA_EXAMPLE_TTS,
  LLAMA_EXAMPLE_DIFFUSION,
+ LLAMA_EXAMPLE_FINETUNE,

  LLAMA_EXAMPLE_COUNT,
  };
@@ -202,6 +206,7 @@ struct common_params_speculative {
  float p_split = 0.1f; // speculative decoding split probability
  float p_min = 0.75f; // minimum speculative decoding probability (greedy)
  std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
+ std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

  ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
  ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
@@ -234,13 +239,36 @@ struct common_params_diffusion {
  bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
  };

+ // reasoning API response format (not to be confused as chat template's reasoning format)
  enum common_reasoning_format {
  COMMON_REASONING_FORMAT_NONE,
- COMMON_REASONING_FORMAT_AUTO,
+ COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
  COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+ // do not extend this enum unless you absolutely have to
+ // in most cases, use COMMON_REASONING_FORMAT_AUTO
+ // see: https://github.com/ggml-org/llama.cpp/pull/15408
  };

+
+ struct lr_opt {
+ float lr0 = 1e-5; // learning rate at first epoch
+ float lr_min = -1;
+ float decay_epochs = -1; // if >0, the learning rate starts at lr0 and decays to lr_min after this many epochs
+ float scale_epoch = 0;
+ float wd = 0;
+ unsigned epochs = 2;
+
+ unsigned epoch; // set by optimizer outer (epochs) loop
+ // learning rate decay - constant LR per epoch only for now
+ float get_lr(float e) const;
+ float get_lr() const { return get_lr(epoch); }
+ // must call after arg parse, before get_lr
+ void init();
+ };
+
+ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
+
  struct common_params {
  bool vocab_only = false;
  int32_t n_predict = -1; // new tokens to predict
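
Note: a minimal sketch (not part of the diff) of how the new finetune fields are meant to fit together; the epoch loop is hypothetical, and only lr_opt, common_opt_lr_pars() and common_opt_get_optimizer() come from the hunks above:

    common_params params;
    params.optimizer       = common_opt_get_optimizer("sgd"); // or "adamw"
    params.lr.lr0          = 1e-5f;
    params.lr.lr_min       = 1e-6f;
    params.lr.decay_epochs = 4;
    params.lr.epochs       = 8;
    params.lr.init();                         // must run after argument parsing

    for (unsigned ep = 0; ep < params.lr.epochs; ++ep) {
        params.lr.epoch = ep;
        // common_opt_lr_pars is written to serve as the ggml-opt
        // "get optimizer params" callback, with &params.lr as its userdata
        ggml_opt_optimizer_params opt_pars = common_opt_lr_pars(&params.lr);
        (void) opt_pars; // per-epoch alpha (learning rate) and wd for adamw/sgd
    }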
@@ -348,7 +376,7 @@ struct common_params {
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
  bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
- bool ctx_shift = true; // context shift on inifinite text generation
+ bool ctx_shift = false; // context shift on inifinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
  bool kv_unified = false; // enable unified KV cache

@@ -376,6 +404,11 @@ struct common_params {
  bool no_mmproj = false; // explicitly disable multimodal model
  std::vector<std::string> image; // path to image file(s)

+ // finetune
+ struct lr_opt lr;
+ enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+ float val_split = 0.05f; // fraction of the data used for the validation set
+
  // embedding
  bool embedding = false; // get only sentence embedding
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
@@ -384,11 +417,12 @@ struct common_params {
  std::string cls_sep = "\t"; // separator of classification sequences

  // server params
- int32_t port = 8080; // server listens on this network port
- int32_t timeout_read = 600; // http read timeout in seconds
- int32_t timeout_write = timeout_read; // http write timeout in seconds
- int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
- int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+ int32_t port = 8080; // server listens on this network port
+ int32_t timeout_read = 600; // http read timeout in seconds
+ int32_t timeout_write = timeout_read; // http write timeout in seconds
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
+ int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
@@ -703,3 +737,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
  //

  ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
+
+ // "adamw" or "sgd" (case insensitive)
+ enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);

package/src/llama.cpp/ggml/CMakeLists.txt

@@ -176,6 +176,7 @@ option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM"
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
  option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
+ option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
  option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
  option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
@@ -187,6 +188,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
  option(GGML_WEBGPU "ggml: use WebGPU" OFF)
  option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
+ option(GGML_ZDNN "ggml: use zDNN" OFF)
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
  option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)