@fugood/llama.node 1.1.5 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/lib/binding.ts +4 -0
  2. package/lib/index.js +6 -1
  3. package/lib/index.ts +6 -0
  4. package/lib/version.js +5 -0
  5. package/lib/version.ts +2 -0
  6. package/package.json +14 -14
  7. package/scripts/llama.cpp.patch +19 -15
  8. package/src/LlamaCompletionWorker.cpp +73 -18
  9. package/src/LlamaCompletionWorker.h +8 -0
  10. package/src/llama.cpp/CMakeLists.txt +2 -0
  11. package/src/llama.cpp/common/arg.cpp +147 -46
  12. package/src/llama.cpp/common/chat-parser.cpp +9 -1
  13. package/src/llama.cpp/common/chat.cpp +350 -3
  14. package/src/llama.cpp/common/chat.h +11 -3
  15. package/src/llama.cpp/common/common.cpp +54 -0
  16. package/src/llama.cpp/common/common.h +44 -9
  17. package/src/llama.cpp/ggml/CMakeLists.txt +5 -2
  18. package/src/llama.cpp/ggml/include/ggml-opt.h +25 -6
  19. package/src/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
  20. package/src/llama.cpp/ggml/include/ggml.h +65 -3
  21. package/src/llama.cpp/ggml/src/CMakeLists.txt +13 -1
  22. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +1 -1
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +1136 -1077
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +20 -1
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +270 -11
  31. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +3 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +200 -51
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
  37. package/src/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  39. package/src/llama.cpp/include/llama.h +26 -0
  40. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  41. package/src/llama.cpp/src/llama-arch.h +10 -0
  42. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  43. package/src/llama.cpp/src/llama-chat.cpp +15 -4
  44. package/src/llama.cpp/src/llama-chat.h +1 -0
  45. package/src/llama.cpp/src/llama-context.cpp +37 -25
  46. package/src/llama.cpp/src/llama-context.h +6 -5
  47. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  48. package/src/llama.cpp/src/llama-graph.h +38 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -3
  50. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +12 -6
  51. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +2 -2
  52. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +93 -69
  53. package/src/llama.cpp/src/llama-kv-cache-unified.h +2 -2
  54. package/src/llama.cpp/src/llama-memory-hybrid.cpp +6 -2
  55. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -2
  56. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -2
  57. package/src/llama.cpp/src/llama-memory-recurrent.h +2 -2
  58. package/src/llama.cpp/src/llama-memory.h +2 -2
  59. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  60. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  61. package/src/llama.cpp/src/llama-model.cpp +500 -4
  62. package/src/llama.cpp/src/llama-model.h +25 -4
  63. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  64. package/src/llama.cpp/src/llama-vocab.cpp +43 -0

package/src/llama.cpp/common/chat.cpp

@@ -132,6 +132,8 @@ struct templates_params {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     json extra_context;
+    bool add_bos;
+    bool add_eos;
 };

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -281,6 +283,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
         }
         if (!msg.reasoning_content.empty()) {
             jmsg["reasoning_content"] = msg.reasoning_content;
+            jmsg["thinking"] = msg.reasoning_content; // gpt-oss
         }
         if (!msg.tool_name.empty()) {
             jmsg["name"] = msg.tool_name;
@@ -434,6 +437,8 @@ std::string common_chat_format_single(

     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;

     std::string fmt_past_msg;
     if (!past_msg.empty()) {
@@ -455,9 +460,12 @@ std::string common_chat_format_single(
     return ss.str();
 }

-std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
+    inputs.chat_template_kwargs = chat_template_kwargs;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;
@@ -533,8 +541,21 @@ common_chat_templates_ptr common_chat_templates_init(
             default_template_src = CHATML_TEMPLATE_SRC;
         }
     }
+
+    // TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
+    // Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
+    if (default_template_src.find("<|channel|>") != std::string::npos
+            // search for the error message and patch it
+            && default_template_src.find("in message.content or") != std::string::npos) {
+        string_replace_all(default_template_src,
+            "{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
+            "{%- if false %}");
+    }
+
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
+    bool add_bos = false;
+    bool add_eos = false;
     if (model) {
         const auto * vocab = llama_model_get_vocab(model);
         const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -549,9 +570,13 @@ common_chat_templates_ptr common_chat_templates_init(
         };
         token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
         token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+        add_bos = llama_vocab_get_add_bos(vocab);
+        add_eos = llama_vocab_get_add_eos(vocab);
     }
     common_chat_templates_ptr tmpls(new common_chat_templates());
     tmpls->has_explicit_template = has_explicit_template;
+    tmpls->add_bos = add_bos;
+    tmpls->add_eos = add_eos;
     try {
         tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
     } catch (const std::exception & e) {
@@ -581,6 +606,8 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
+        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -589,13 +616,28 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
     switch (format) {
         case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_GRANITE: return "granite";
        default:
            throw std::runtime_error("Unknown reasoning format");
    }
 }

+common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
+    if (format == "none") {
+        return COMMON_REASONING_FORMAT_NONE;
+    } else if (format == "auto") {
+        return COMMON_REASONING_FORMAT_AUTO;
+    } else if (format == "deepseek") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK;
+    } else if (format == "deepseek-legacy") {
+        return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    }
+    throw std::runtime_error("Unknown reasoning format: " + format);
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
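
A minimal usage sketch of the new name-to-enum round trip (hypothetical caller; only common_reasoning_format_from_name and common_reasoning_format_name come from chat.h). Note the asymmetry visible above: "granite" is printable via common_reasoning_format_name but has no branch in common_reasoning_format_from_name, so parsing it throws.

    // Sketch: round-trip a reasoning-format name. Assumes common/chat.h is on the include path.
    #include "chat.h"
    #include <cstdio>
    #include <stdexcept>

    static void check_reasoning_format(const std::string & name) {
        try {
            const common_reasoning_format fmt = common_reasoning_format_from_name(name);
            printf("%s -> %s\n", name.c_str(), common_reasoning_format_name(fmt));
        } catch (const std::runtime_error & e) {
            printf("%s\n", e.what()); // e.g. name == "granite" lands here in this version
        }
    }
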
@@ -737,10 +779,10 @@ static std::string apply(
     // instead of using `chat_template_options.use_bos_token = false`, since these tokens
     // may be needed inside the template / between messages too.
     auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (string_starts_with(result, tmpl.bos_token())) {
+    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
         result = result.substr(tmpl.bos_token().size());
     }
-    if (string_ends_with(result, tmpl.eos_token())) {
+    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
         result = result.substr(0, result.size() - tmpl.eos_token().size());
     }
     return result;
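
To see what the new gating protects, here is an illustrative before/after (the token and prompt strings are made up; string_starts_with is the same helper used above). When the vocab does not auto-add BOS at tokenization time, the literal BOS rendered by the template is the only one the prompt will ever get, so it must not be stripped:

    std::string result  = "<s>[INST] hi [/INST]";   // rendered template output (illustrative)
    bool        add_bos = false;                    // from llama_vocab_get_add_bos(vocab)
    if (add_bos && string_starts_with(result, "<s>")) {
        result = result.substr(3);                  // strip only when tokenization re-adds BOS
    }
    // Before this change the substr ran unconditionally, so such prompts lost their BOS.
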
@@ -1278,6 +1320,174 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }

+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // These special tokens are required to parse properly, so we include them
+    // even if parse_tool_calls is false.
+    data.preserved_tokens = {
+        "<|channel|>",
+        "<|constrain|>",
+        "<|message|>",
+        "<|start|>",
+        "<|end|>",
+    };
+
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            // tool calls can appear in commentary or analysis channels
+            auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
+
+            std::vector<std::string> tool_rules_recipient_in_role;
+            std::vector<std::string> tool_rules_recipient_in_channel;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                tool_rules_recipient_in_role.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+
+                tool_rules_recipient_in_channel.push_back(
+                    builder.add_rule(name + "-call",
+                        "\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
+                        builder.add_schema(name + "-args", parameters)
+                    )
+                );
+            });
+
+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
+            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
+                channel + " \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_channel, " | ") + " )"
+            );
+
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
+
+            // Trigger on tool calls that appear in the commentary channel
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|channel\\|>(commentary|analysis) to"
+            });
+
+            // Trigger tool calls that appear in the role section, either at the
+            // start or in the middle.
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                "^ to"
+            });
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+                "<\\|start\\|>assistant to"
+            });
+        });
+    }
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+    static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+
+    static const common_regex start_regex("<\\|start\\|>assistant");
+    static const common_regex analysis_regex("<\\|channel\\|>analysis");
+    static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
+    static const common_regex preamble_regex("<\\|channel\\|>commentary");
+    static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
+    static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
+
+    auto consume_end = [&](bool include_end = false) {
+        if (auto res = builder.try_find_literal("<|end|>")) {
+            return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
+        }
+        return builder.consume_rest();
+    };
+
+    auto handle_tool_call = [&](const std::string & name) {
+        if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
+            if (builder.syntax().parse_tool_calls) {
+                if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
+                    throw common_chat_msg_partial_exception("incomplete tool call");
+                }
+            } else if (args->is_partial) {
+                throw common_chat_msg_partial_exception("incomplete tool call");
+            }
+        }
+    };
+
+    auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
+        auto match = regex.search(input, 0, true);
+        if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
+            return match;
+        }
+        return std::nullopt;
+    };
+
+    do {
+        auto header_start_pos = builder.pos();
+        auto content_start = builder.try_find_literal("<|message|>");
+        if (!content_start) {
+            throw common_chat_msg_partial_exception("incomplete header");
+        }
+
+        auto header = content_start->prelude;
+
+        if (auto match = regex_match(tool_call1_regex, header)) {
+            auto group = match->groups[1];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (auto match = regex_match(tool_call2_regex, header)) {
+            auto group = match->groups[2];
+            auto name = header.substr(group.begin, group.end - group.begin);
+            handle_tool_call(name);
+            continue;
+        }
+
+        if (regex_match(analysis_regex, header)) {
+            builder.move_to(header_start_pos);
+            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+                builder.add_content(consume_end(true));
+            } else {
+                builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
+            }
+            continue;
+        }
+
+        if (regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
+            builder.add_content(consume_end());
+            continue;
+        }
+
+        // Possibly a malformed message, attempt to recover by rolling
+        // back to pick up the next <|start|>
+        LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
+        builder.move_to(header_start_pos);
+    } while (builder.try_find_regex(start_regex, std::string::npos, false));
+
+    auto remaining = builder.consume_rest();
+    if (!remaining.empty()) {
+        LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
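
As a rough sketch of the wire format those regexes target, the transcript below is hand-written (not captured from a model) and fed through the public common_chat_parse entry point declared in chat.h; the common_chat_syntax fields follow the usage in the parser above:

    // Illustrative gpt-oss output: an analysis channel (reasoning), then a
    // commentary-channel tool call addressed to functions.get_weather.
    std::string raw =
        "<|channel|>analysis<|message|>User wants the weather.<|end|>"
        "<|start|>assistant<|channel|>commentary to=functions.get_weather "
        "<|constrain|>json<|message|>{\"city\": \"Paris\"}";

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GPT_OSS;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    syntax.parse_tool_calls = true;

    // Expected per the parser above: msg.reasoning_content holds the analysis
    // text, msg.tool_calls holds one get_weather call with the raw JSON args.
    common_chat_msg msg = common_chat_parse(raw, /* is_partial= */ false, syntax);
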
@@ -1687,6 +1897,124 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
     builder.add_content(builder.consume_rest());
 }

+static common_chat_params common_chat_params_init_granite(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for Granite template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, additional_context);
+    data.format = COMMON_CHAT_FORMAT_GRANITE;
+
+    if (string_ends_with(data.prompt, "<think>\n") || string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (!inputs.tools.is_null()) {
+        // Granite uses <|tool_call|> followed by JSON list
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call", builder.add_schema(name +
+                    "-args", {
+                    {"type", "object"},
+                    {"properties", {
+                        {"name", {{"const", name}}},
+                        {"arguments", parameters},
+                    }},
+                    {"required", json::array({"name", "arguments"})},
+                })));
+            });
+
+            auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | "));
+            auto tool_list = builder.add_rule("tool_list", "\"[\" space " + tool_call + " (\",\" space " + tool_call + ")* space \"]\"");
+
+            if (data.thinking_forced_open) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space [^<]* \"</response>\" space \"<|tool_call|>\" space " + tool_list);
+            } else {
+                builder.add_rule("root", "\"<|tool_call|>\" space " + tool_list);
+            }
+
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+                "<|tool_call|>"
+            });
+
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+                "<|tool_call|>",
+            };
+        });
+    } else {
+        // Handle thinking tags for non-tool responses
+        if (data.thinking_forced_open && inputs.enable_thinking) {
+            data.grammar_lazy = false;
+            data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+                builder.add_rule("root", "\"</think>\" space \"<response>\" space .* \"</response>\" space");
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<response>",
+                "</response>",
+            };
+        }
+    }
+
+    return data;
+}
+
+static void common_chat_parse_granite(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
+    }
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<|tool_call|>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
+        }
+    } else {
+        builder.add_content(builder.consume_rest());
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
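
A companion sketch for the Granite path (the sample output is hand-written, not captured from a model): <think> blocks become reasoning, the <response> body becomes content, and the trailing <|tool_call|> JSON array becomes tool calls:

    std::string raw =
        "<think>Need a calculator for this.</think>"
        "<response>Let me compute that.</response>"
        "<|tool_call|>[{\"name\": \"calc\", \"arguments\": {\"expr\": \"2+2\"}}]";

    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_GRANITE;
    syntax.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    syntax.parse_tool_calls = true;

    common_chat_msg msg = common_chat_parse(raw, /* is_partial= */ false, syntax);
    // msg.reasoning_content == "Need a calculator for this."
    // msg.content contains   "Let me compute that."
    // msg.tool_calls[0] is the "calc" call with arguments {"expr": "2+2"}
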
@@ -1720,6 +2048,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+    params.add_bos = tmpls->add_bos;
+    params.add_eos = tmpls->add_eos;

     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
@@ -1756,11 +2086,21 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_command_r7b(tmpl, params);
     }

+    // Granite (IBM) - detects thinking / tools support
+    if (src.find("elif thinking") != std::string::npos && src.find("<|tool_call|>") != std::string::npos) {
+        return common_chat_params_init_granite(tmpl, params);
+    }
+
     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
     if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }

+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1811,6 +2151,7 @@ static common_chat_params common_chat_templates_apply_legacy(
     int alloc_size = 0;
     std::vector<llama_chat_message> chat;
     std::vector<std::string> contents;
+
     for (const auto & msg : inputs.messages) {
         auto content = msg.content;
         for (const auto & part : msg.content_parts) {
@@ -1912,6 +2253,12 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GRANITE:
+            common_chat_parse_granite(builder);
+            break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }

package/src/llama.cpp/common/chat.h

@@ -9,12 +9,14 @@
 #include <vector>
 #include <map>

-#include <minja/chat-template.hpp>
-#include <minja/minja.hpp>
+#include "minja/chat-template.hpp"
+#include "minja/minja.hpp"

 typedef minja::chat_template common_chat_template;

 struct common_chat_templates {
+    bool add_bos;
+    bool add_eos;
     bool has_explicit_template; // Model had builtin template or template overridde was specified.
     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
     std::unique_ptr<common_chat_template> template_tool_use;
@@ -118,6 +120,8 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GRANITE,
+    COMMON_CHAT_FORMAT_GPT_OSS,

     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -136,6 +140,8 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };

 struct common_chat_params {
@@ -192,10 +198,12 @@ std::string common_chat_format_single(
 // Returns an example of formatted chat
 std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
-    bool use_jinja);
+    bool use_jinja,
+    const std::map<std::string, std::string> & chat_template_kwargs);

 const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
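
The common_chat_format_example signature change above is breaking for callers. A minimal call-site sketch (variable names are hypothetical; passing an empty map reproduces the old behavior):

    std::map<std::string, std::string> kwargs;  // e.g. {{"enable_thinking", "false"}}
    std::string example = common_chat_format_example(tmpls.get(), /* use_jinja= */ true, kwargs);
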
package/src/llama.cpp/common/common.cpp

@@ -41,6 +41,7 @@
 #endif
 #include <locale>
 #include <windows.h>
+#include <string.h>
 #include <fcntl.h>
 #include <io.h>
 #else
@@ -1566,3 +1567,56 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

     return result;
 }
+
+ggml_opt_optimizer_params common_opt_lr_pars(void * userdata) {
+    ggml_opt_optimizer_params result = ggml_opt_get_default_optimizer_params(nullptr);
+    const lr_opt & d = *(lr_opt *) userdata;
+    result.adamw.alpha = result.sgd.alpha = d.get_lr(d.epoch);
+    result.sgd.wd = result.adamw.wd = d.wd;
+    return result;
+}
+
+// TODO make all command line args case-insensitive
+static inline bool eq_case_insensitive(char const* a, char const* b) {
+    return !
+#if defined(_MSC_VER)
+        _stricmp
+#else
+        strcasecmp
+#endif // defined(_MSC_VER)
+        (a, b);
+}
+
+enum ggml_opt_optimizer_type common_opt_get_optimizer(const char * n) {
+    if (eq_case_insensitive("adamw", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_ADAMW;
+    }
+    if (eq_case_insensitive("sgd", n)) {
+        return GGML_OPT_OPTIMIZER_TYPE_SGD;
+    }
+    return GGML_OPT_OPTIMIZER_TYPE_COUNT;
+}
+
+// TODO simplify to use just log and exp
+static float const k_log_2 = std::log(2.f);
+
+void lr_opt::init() {
+    if (lr_min > 0 && lr_min < lr0) {
+        float nhalf = std::log(lr0 / lr_min) / k_log_2;
+        float e = epochs;
+        if (decay_epochs > 0 && decay_epochs < e) {
+            e = decay_epochs;
+        } else {
+            decay_epochs = e;
+        }
+        scale_epoch = nhalf / e;
+    }
+}
+
+float lr_opt::get_lr(float epoch) const {
+    float r = lr_min <= 0 ? lr0 :
+        epoch >= decay_epochs ? lr_min :
+        lr0 * std::pow(0.5f, epoch * scale_epoch);
+    LOG_INF("epoch %.2g lr=%.2g\n", epoch, r);
+    return r;
+}
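
Read as a formula, lr_opt::init sets scale_epoch = log2(lr0 / lr_min) / decay_epochs, so get_lr halves the rate every decay_epochs / log2(lr0 / lr_min) epochs, reaches lr_min exactly at decay_epochs, and clamps thereafter. A standalone sketch with made-up values (only the math is taken from the code above):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float lr0 = 1e-3f, lr_min = 1e-4f, decay_epochs = 10.0f;
        const float nhalf       = std::log(lr0 / lr_min) / std::log(2.0f); // ~3.32 halvings total
        const float scale_epoch = nhalf / decay_epochs;
        for (int epoch = 0; epoch <= 12; epoch += 4) {
            const float lr = epoch >= decay_epochs
                ? lr_min
                : lr0 * std::pow(0.5f, epoch * scale_epoch);
            printf("epoch %2d lr=%.3g\n", epoch, lr); // 0.001, ~0.000398, ~0.000158, 0.0001
        }
        return 0;
    }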