@fugood/llama.node 1.1.11 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +18 -1
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +166 -396
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +50 -30
  27. package/src/llama.cpp/common/chat.cpp +250 -1
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.h +1 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
  39. package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
  40. package/src/llama.cpp/ggml/include/ggml.h +56 -2
  41. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
  42. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  43. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
  45. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
  46. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
  53. package/src/llama.cpp/include/llama.h +5 -6
  54. package/src/llama.cpp/src/llama-adapter.cpp +33 -0
  55. package/src/llama.cpp/src/llama-adapter.h +3 -0
  56. package/src/llama.cpp/src/llama-arch.cpp +28 -4
  57. package/src/llama.cpp/src/llama-arch.h +3 -0
  58. package/src/llama.cpp/src/llama-context.cpp +65 -57
  59. package/src/llama.cpp/src/llama-context.h +1 -1
  60. package/src/llama.cpp/src/llama-graph.cpp +57 -11
  61. package/src/llama.cpp/src/llama-graph.h +8 -0
  62. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  63. package/src/llama.cpp/src/llama-hparams.h +10 -3
  64. package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
  65. package/src/llama.cpp/src/llama-kv-cache.h +9 -0
  66. package/src/llama.cpp/src/llama-model.cpp +217 -97
  67. package/src/llama.cpp/src/llama-model.h +0 -1
  68. package/src/llama.cpp/src/llama-quant.cpp +3 -3
  69. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  70. package/src/llama.cpp/src/llama.cpp +53 -10
  71. package/src/anyascii.c +0 -22223
  72. package/src/anyascii.h +0 -42
  73. package/src/tts_utils.cpp +0 -371
  74. package/src/tts_utils.h +0 -103
package/src/llama.cpp/common/chat.cpp

@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
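The new helper probes whether a chat template actually reacts to enable_thinking by rendering the same one-message conversation twice and comparing the resulting prompts. A minimal call-site sketch (hypothetical variable names; assumes the templates were loaded via common_chat_templates_init from chat.h):

    // decide whether a "thinking" toggle is meaningful for the loaded model
    common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* template override */ "");
    if (common_chat_templates_support_enable_thinking(tmpls.get())) {
        // the template renders differently with enable_thinking, so expose the toggle
    }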
@@ -605,11 +618,13 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
         case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -671,11 +686,13 @@ static void parse_json_tool_calls(
     size_t from = std::string::npos;
     auto first = true;
     while (true) {
+        auto start_pos = builder.pos();
         auto res = function_regex_start_only && first
             ? builder.try_consume_regex(*function_regex_start_only)
             : function_regex
                 ? builder.try_find_regex(*function_regex, from)
                 : std::nullopt;
+
         if (res) {
             std::string name;
             if (get_function_name) {
@@ -710,6 +727,8 @@ static void parse_json_tool_calls(
                 return;
             }
             throw common_chat_msg_partial_exception("incomplete tool call");
+        } else {
+            builder.move_to(start_pos);
         }
         break;
     }
@@ -1170,6 +1189,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_template & tmpl, const struct templates_params & inputs) {
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+            // If thinking_forced_open, then we capture the </think> tag in the grammar,
+            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+            std::string(data.thinking_forced_open ?
+                "[\\s\\S]*?(</think>\\s*)" :
+                "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+            "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
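For reference, the grammar and trigger above constrain tool-call output of the shape (illustrative example, not taken from the diff):

    <TOOLCALL>[{"name": "get_weather", "arguments": {"location": "Paris"}}]</TOOLCALL>

i.e. a JSON array of {name, arguments} objects with no per-call ID, optionally preceded by a forced-open <think>...</think> block.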
@@ -1299,6 +1379,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     }
     return data;
 }
+
+static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Pass thinking context for DeepSeek V3.1 template
+    json additional_context = {
+        {"thinking", inputs.enable_thinking},
+    };
+
+    auto prompt = apply(tmpl, inputs,
+                        /* messages_override= */ inputs.messages,
+                        /* tools_override= */ std::nullopt,
+                        additional_context);
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    if (string_ends_with(data.prompt, "<think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+    if (inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                    "( \"<|tool▁call▁begin|>\" )? \"" + name + "<|tool▁sep|>"
+                    "\" " + builder.add_schema(name + "-args", parameters) + " "
+                    "\"<|tool▁call▁end|>\""));
+            });
+            // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
+            // so we accept common variants (then it's all constrained)
+            builder.add_rule("root",
+                std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                "( \"<|tool▁calls▁begin|>\" | \"<|tool_calls_begin|>\" | \"<|tool calls begin|>\" | \"<|tool\\\\_calls\\\\_begin|>\" | \"<|tool▁calls|>\" ) "
+                "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
+                "\"<|tool▁calls▁end|>\""
+                " space");
+            data.grammar_triggers.push_back({
+                COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                "(<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)[\\s\\S]*"
+            });
+            data.preserved_tokens = {
+                "<think>",
+                "</think>",
+                "<|tool▁calls▁begin|>",
+                "<|tool▁call▁begin|>",
+                "<|tool▁sep|>",
+                "<|tool▁call▁end|>",
+                "<|tool▁calls▁end|>",
+            };
+        });
+    }
+    return data;
+}
+
 static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
     builder.try_parse_reasoning("<think>", "</think>");
     if (!builder.syntax().parse_tool_calls) {
@@ -1320,6 +1465,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
+    static const common_regex function_regex("(?:<|tool▁call▁begin|>)?([^\\n<]+)(?:<|tool▁sep|>)");
+
+    static const common_regex close_regex("(?:[\\s]*)?<|tool▁call▁end|>");
+    static const common_regex tool_calls_begin("(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>|<|tool▁calls|>)");
+    static const common_regex tool_calls_end("<|tool▁calls▁end|>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        LOG_DBG("%s: not parse_tool_calls\n", __func__);
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    LOG_DBG("%s: parse_tool_calls\n", __func__);
+
+    parse_json_tool_calls(
+        builder,
+        /* block_open= */ tool_calls_begin,
+        /* function_regex_start_only= */ std::nullopt,
+        function_regex,
+        close_regex,
+        tool_calls_end);
+}
+
+static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
+    // DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
+    // First try to parse using the standard reasoning parsing method
+    LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
+
+    auto start_pos = builder.pos();
+    auto found_end_think = builder.try_find_literal("</think>");
+    builder.move_to(start_pos);
+
+    if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
+        LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else if (builder.try_parse_reasoning("<think>", "</think>")) {
+        // If reasoning was parsed successfully, the remaining content is regular content
+        LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
+        // </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
+        common_chat_parse_deepseek_v3_1_content(builder);
+    } else {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+            LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
+            common_chat_parse_deepseek_v3_1_content(builder);
+            return;
+        }
+        // If no reasoning tags found, check if we should treat everything as reasoning
+        if (builder.syntax().thinking_forced_open) {
+            // If thinking is forced open but no tags found, treat everything as reasoning
+            LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
+            builder.add_reasoning_content(builder.consume_rest());
+        } else {
+            LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
+            // <|tool▁call▁begin|>NAME<|tool▁sep|>JSON<|tool▁call▁end|>
+            common_chat_parse_deepseek_v3_1_content(builder);
+        }
+    }
+}
+
 static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
@@ -1816,7 +2021,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
             // If thinking_forced_open, then we capture the </think> tag in the grammar,
             // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
             std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-                "(\\s*"
+                "\\s*("
                 "(?:<tool_call>"
                 "|<function"
                 "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2046,6 +2251,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
     // Parse thinking tags first - this handles the main reasoning content
     builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2249,6 +2481,12 @@ static common_chat_params common_chat_templates_apply_jinja(
         }
     }
 
+    // DeepSeek V3.1: detect based on specific patterns in the template
+    if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
+            params.json_schema.is_null()) {
+        return common_chat_params_init_deepseek_v3_1(tmpl, params);
+    }
+
     // DeepSeek R1: use handler in all cases except json schema (thinking / tools).
     if (src.find("<|tool▁calls▁begin|>") != std::string::npos && params.json_schema.is_null()) {
        return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2279,6 +2517,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_seed_oss(tmpl, params, inputs);
     }
 
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2416,6 +2659,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
            common_chat_parse_deepseek_r1(builder);
            break;
+        case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
+            common_chat_parse_deepseek_v3_1(builder);
+            break;
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
             common_chat_parse_functionary_v3_2(builder);
             break;
@@ -2440,6 +2686,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_SEED_OSS:
             common_chat_parse_seed_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
package/src/llama.cpp/common/chat.h

@@ -118,11 +118,13 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
     COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -209,6 +211,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
package/src/llama.cpp/common/common.h

@@ -445,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -843,9 +843,10 @@ public:
                 _build_object_rule(
                     properties, required, name,
                     schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
-        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+        } else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
             std::unordered_set<std::string> required;
             std::vector<std::pair<std::string, json>> properties;
+            std::map<std::string, size_t> enum_values;
             std::string hybrid_name = name;
             std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
                 if (comp_schema.contains("$ref")) {
@@ -857,6 +858,14 @@ public:
                             required.insert(prop.key());
                         }
                     }
+                } else if (comp_schema.contains("enum")) {
+                    for (const auto & v : comp_schema["enum"]) {
+                        const auto rule = _generate_constant_rule(v);
+                        if (enum_values.find(rule) == enum_values.end()) {
+                            enum_values[rule] = 0;
+                        }
+                        enum_values[rule] += 1;
+                    }
                 } else {
                     // todo warning
                 }
@@ -870,6 +879,17 @@ public:
                     add_component(t, true);
                 }
             }
+            if (!enum_values.empty()) {
+                std::vector<std::string> enum_intersection;
+                for (const auto & p : enum_values) {
+                    if (p.second == schema["allOf"].size()) {
+                        enum_intersection.push_back(p.first);
+                    }
+                }
+                if (!enum_intersection.empty()) {
+                    return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
+                }
+            }
             return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
         } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
             json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
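To illustrate the new allOf handling (the example schema is ours, not from the diff): components that are plain enums now contribute to an intersection instead of hitting the "todo warning" branch. Given

    { "allOf": [ { "enum": ["a", "b"] }, { "enum": ["b", "c"] } ] }

only "b" occurs in every allOf component, so the generated rule is approximately root ::= ("\"b\"") space, rather than the degenerate object rule the old code fell through to.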
package/src/llama.cpp/common/log.cpp

@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
package/src/llama.cpp/common/log.h

@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO     = -1,
+    LOG_COLORS_DISABLED =  0,
+    LOG_COLORS_ENABLED  =  1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
-void common_log_set_colors    (struct common_log * log, bool colors);       // not thread-safe
-void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
-void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors colors); // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
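Taken together, the logging changes replace the boolean color toggle with a tri-state: LOG_COLORS_AUTO consults NO_COLOR, TERM=dumb, and whether stdout/stderr is a TTY, and common_log_main() now runs that detection once on first use. A minimal sketch of overriding it from application code (both functions appear in this diff):

    #include "log.h"

    // opt out of the once-per-process auto-detection
    common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED); // or LOG_COLORS_ENABLED / LOG_COLORS_AUTO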
package/src/llama.cpp/common/sampling.cpp

@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
package/src/llama.cpp/common/sampling.h

@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
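The sort happens lazily: only when do_sort == true and the array is not already sorted, with selected re-pointed at the same token afterwards. Callers that do not care about ordering can pass false and consult the returned .sorted flag; the speculative-decoding call site below passes true because it reads the top candidates in order.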
package/src/llama.cpp/common/speculative.cpp

@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
         common_sampler_sample(smpl, ctx_dft, 0, true);
 
-        const auto * cur_p = common_sampler_get_candidates(smpl);
+        const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
         for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
             LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX         "ggml: enable lasx"         ON)
 option(GGML_LSX          "ggml: enable lsx"          ON)
 option(GGML_RVV          "ggml: enable rvv"          ON)
-option(GGML_RV_ZFH       "ggml: enable riscv zfh"    OFF)
+option(GGML_RV_ZFH       "ggml: enable riscv zfh"    ON)
+option(GGML_RV_ZVFH      "ggml: enable riscv zvfh"   ON)
+option(GGML_RV_ZICBOP    "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE          "ggml: enable vxe"          ON)
-option(GGML_NNPA         "ggml: enable nnpa"         OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
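With zfh now defaulting to ON and the new zvfh/zicbop toggles, a RISC-V build for a core lacking those extensions would opt out at configure time, along the lines of (illustrative invocation, using only options defined above):

    cmake -B build -DGGML_RVV=ON -DGGML_RV_ZFH=OFF -DGGML_RV_ZVFH=OFF -DGGML_RV_ZICBOP=OFF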
package/src/llama.cpp/ggml/include/ggml-backend.h

@@ -132,6 +132,8 @@ extern "C" {
         GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
 
     // all the device properties
     struct ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct ggml_backend_dev_caps caps;
     };
 
@@ -307,6 +319,9 @@ extern "C" {
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
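The new device_id field and IGPU device type are visible through the existing device registry. A minimal sketch (assumes the ggml_backend_dev_count / ggml_backend_dev_get / ggml_backend_dev_get_props registry API from this same header):

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(ggml_backend_dev_get(i), &props);
            std::printf("%s: %s, id=%s, igpu=%d\n",
                        props.name, props.description,
                        props.device_id ? props.device_id : "(unknown)", // NULL when the id is unknown
                        props.type == GGML_BACKEND_DEVICE_TYPE_IGPU);
        }
        return 0;
    }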