@fugood/llama.node 1.1.10 → 1.2.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/CMakeLists.txt +5 -8
  2. package/lib/binding.ts +20 -2
  3. package/lib/index.js +2 -2
  4. package/lib/index.ts +2 -2
  5. package/package.json +20 -16
  6. package/src/DecodeAudioTokenWorker.cpp +23 -26
  7. package/src/DecodeAudioTokenWorker.h +6 -8
  8. package/src/DetokenizeWorker.cpp +5 -8
  9. package/src/DetokenizeWorker.h +6 -5
  10. package/src/DisposeWorker.cpp +23 -3
  11. package/src/DisposeWorker.h +4 -2
  12. package/src/EmbeddingWorker.cpp +9 -35
  13. package/src/EmbeddingWorker.h +3 -2
  14. package/src/LlamaCompletionWorker.cpp +217 -315
  15. package/src/LlamaCompletionWorker.h +6 -12
  16. package/src/LlamaContext.cpp +174 -388
  17. package/src/LlamaContext.h +8 -13
  18. package/src/LoadSessionWorker.cpp +22 -19
  19. package/src/LoadSessionWorker.h +3 -2
  20. package/src/RerankWorker.h +3 -2
  21. package/src/SaveSessionWorker.cpp +22 -19
  22. package/src/SaveSessionWorker.h +3 -2
  23. package/src/TokenizeWorker.cpp +38 -35
  24. package/src/TokenizeWorker.h +12 -3
  25. package/src/common.hpp +0 -458
  26. package/src/llama.cpp/common/arg.cpp +67 -37
  27. package/src/llama.cpp/common/chat.cpp +263 -2
  28. package/src/llama.cpp/common/chat.h +4 -0
  29. package/src/llama.cpp/common/common.cpp +10 -3
  30. package/src/llama.cpp/common/common.h +5 -2
  31. package/src/llama.cpp/common/log.cpp +53 -2
  32. package/src/llama.cpp/common/log.h +10 -4
  33. package/src/llama.cpp/common/sampling.cpp +23 -2
  34. package/src/llama.cpp/common/sampling.h +3 -1
  35. package/src/llama.cpp/common/speculative.cpp +1 -1
  36. package/src/llama.cpp/ggml/CMakeLists.txt +4 -3
  37. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -0
  38. package/src/llama.cpp/ggml/include/ggml-cpu.h +0 -1
  39. package/src/llama.cpp/ggml/include/ggml.h +50 -1
  40. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +19 -16
  41. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
  42. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -7
  43. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +11 -37
  44. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +3 -4
  45. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  46. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  47. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +18 -18
  48. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +234 -16
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +80 -51
  52. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +161 -20
  53. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +399 -50
  54. package/src/llama.cpp/include/llama.h +32 -7
  55. package/src/llama.cpp/src/llama-adapter.cpp +101 -4
  56. package/src/llama.cpp/src/llama-adapter.h +6 -0
  57. package/src/llama.cpp/src/llama-arch.cpp +69 -2
  58. package/src/llama.cpp/src/llama-arch.h +6 -0
  59. package/src/llama.cpp/src/llama-context.cpp +92 -45
  60. package/src/llama.cpp/src/llama-context.h +1 -5
  61. package/src/llama.cpp/src/llama-graph.cpp +74 -19
  62. package/src/llama.cpp/src/llama-graph.h +10 -1
  63. package/src/llama.cpp/src/llama-hparams.cpp +37 -0
  64. package/src/llama.cpp/src/llama-hparams.h +9 -3
  65. package/src/llama.cpp/src/llama-impl.h +2 -0
  66. package/src/llama.cpp/src/llama-kv-cache.cpp +33 -120
  67. package/src/llama.cpp/src/llama-kv-cache.h +4 -13
  68. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  69. package/src/llama.cpp/src/llama-model.cpp +434 -21
  70. package/src/llama.cpp/src/llama-model.h +1 -1
  71. package/src/llama.cpp/src/llama-sampling.cpp +226 -126
  72. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  73. package/src/llama.cpp/src/llama.cpp +12 -0
  74. package/src/anyascii.c +0 -22223
  75. package/src/anyascii.h +0 -42
  76. package/src/tts_utils.cpp +0 -371
  77. package/src/tts_utils.h +0 -103
@@ -150,6 +150,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
     throw std::runtime_error("Invalid tool_choice: " + tool_choice);
 }
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
+    common_chat_templates_inputs dummy_inputs;
+    common_chat_msg msg;
+    msg.role = "user";
+    msg.content = "test";
+    dummy_inputs.messages = {msg};
+    dummy_inputs.enable_thinking = false;
+    const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    dummy_inputs.enable_thinking = true;
+    const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
+    return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
+}
+
 template <>
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
     std::vector<common_chat_msg> msgs;
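A minimal usage sketch (not part of this diff; the wrapper name and parameters are hypothetical): a caller can probe the loaded templates once with the new helper and only honor a user-facing "thinking" toggle when the Jinja template actually reacts to enable_thinking.

    // Sketch: `tmpls` is assumed to have been obtained elsewhere from the model's chat template.
    void configure_thinking(const common_chat_templates * tmpls,
                            common_chat_templates_inputs & inputs,
                            bool user_wants_thinking) {
        // If the template ignores enable_thinking, forcing it on would be a silent no-op.
        inputs.enable_thinking = user_wants_thinking &&
                                 common_chat_templates_support_enable_thinking(tmpls);
    }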
@@ -609,6 +622,8 @@ const char * common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
         case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
         case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -1169,6 +1184,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     });
     return data;
 }
+
+static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+
+    // Generate the prompt using the apply() function with the template
+    data.prompt = apply(tmpl, inputs);
+    data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
+
+    // Handle thinking tags appropriately based on inputs.enable_thinking
+    if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    // When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
+    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
+        data.grammar_lazy = true;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schemas = json::array();
+            foreach_function(inputs.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                schemas.push_back({
+                    { "type", "object" },
+                    { "properties",
+                      {
+                          { "name",
+                            {
+                                { "type", "string" },
+                                { "const", function.at("name") },
+                            } },
+                          { "arguments", function.at("parameters") },
+                      } },
+                    { "required", json::array({ "name", "arguments" }) },
+                });
+            });
+            auto schema = json{
+                { "type", "array" },
+                { "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
+                { "minItems", 1 },
+            };
+            if (!inputs.parallel_tool_calls) {
+                schema["maxItems"] = 1;
+            }
+            builder.add_rule("root",
+                             std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+                             "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
+                             " \"</TOOLCALL>\"");
+        });
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+                                          // If thinking_forced_open, then we capture the </think> tag in the grammar,
+                                          // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+                                          std::string(data.thinking_forced_open ?
+                                                      "[\\s\\S]*?(</think>\\s*)" :
+                                                      "(?:<think>[\\s\\S]*?</think>\\s*)?") +
+                                          "(<TOOLCALL>)[\\s\\S]*" });
+    }
+    return data;
+}
 
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
     if (!builder.syntax().parse_tool_calls) {
         builder.add_content(builder.consume_rest());
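For orientation, an illustrative example (not taken from this diff; the tool name is hypothetical) of the Nemotron V2 output shape that the grammar above constrains and that common_chat_parse_nemotron_v2 further below consumes: a JSON array of {name, arguments} objects wrapped in <TOOLCALL> tags.

    // Illustrative assistant output accepted by the grammar above:
    static const char * example_nemotron_v2_output =
        "<think>...reasoning...</think>\n"
        "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]</TOOLCALL>";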
@@ -1815,7 +1891,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
         // If thinking_forced_open, then we capture the </think> tag in the grammar,
         // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
         std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
-            "(\\s*"
+            "\\s*("
             "(?:<tool_call>"
             "|<function"
             "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2045,6 +2121,121 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
     }
 }
 
+static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
+    // Parse thinking tags
+    builder.try_parse_reasoning("<think>", "</think>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Look for tool calls
+    static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
+    if (auto res = builder.try_find_regex(tool_call_regex)) {
+        builder.move_to(res->groups[0].end);
+
+        // Expect JSON array of tool calls
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.try_consume_literal("</TOOLCALL>")) {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            builder.add_tool_calls(tool_calls_data.json);
+        } else {
+            throw common_chat_msg_partial_exception("Incomplete tool call");
+        }
+    }
+    builder.add_content(builder.consume_rest());
+}
+
+static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+    // Parse thinking tags first - this handles the main reasoning content
+    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+
+    // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+    static const common_regex tool_call_begin_regex("<seed:tool_call>");
+    static const common_regex tool_call_end_regex("</seed:tool_call>");
+    static const common_regex function_regex("<function=([^>]+)>");
+    static const common_regex param_regex("<parameter=([^>]+)>");
+
+    while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+        builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+        // Look for function call inside tool call, ignore any content before it
+        if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+            auto function_name = builder.str(func_res->groups[1]);
+
+            // Parse Seed-OSS parameters <parameter=name>value</parameter>
+            json args = json::object();
+            // Parse all parameters
+            while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+                // again, ignore noise around parameters
+                auto param_name = builder.str(param_res->groups[1]);
+                builder.move_to(param_res->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after parameter
+                auto savedPos = builder.pos();
+                if (auto param_parse = builder.try_find_literal("</parameter>")) {
+                    auto param = param_parse->prelude;
+                    builder.move_to(savedPos);
+                    try {
+                        if (auto param_res = builder.try_consume_json()) {
+                            args[param_name] = param_res->json;
+                        } else {
+                            args[param_name] = param;
+                        }
+                    } catch (json::exception &) {
+                        args[param_name] = param;
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool parameter");
+                }
+            }
+            // Look for closing function tag
+            auto end_func = builder.try_find_literal("</function>");
+            if (end_func) {
+                builder.move_to(end_func->groups[0].end);
+                builder.consume_spaces(); // Consume whitespace after </function>
+
+                // Add the tool call with parsed arguments, but only if we REALLY got the literal
+                auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+                auto funlen = std::string("</function>").length();
+                if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+                    if (!builder.add_tool_call(function_name, "", args.dump())) {
+                        throw common_chat_msg_partial_exception("Incomplete tool call");
+                    }
+                } else {
+                    throw common_chat_msg_partial_exception("Incomplete tool call");
+                }
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+            // Look for closing tool call tag
+            if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+                builder.move_to(end_tool->groups[0].end);
+                builder.consume_spaces(); // Consume trailing whitespace after tool call
+            } else {
+                throw common_chat_msg_partial_exception("Incomplete tool call");
+            }
+        } else {
+            // No function found - don't consume content here, let it be handled at the end
+            break;
+        }
+    }
+
+    // Consume any remaining whitespace after all tool call processing
+    builder.consume_spaces();
+    auto remaining = builder.consume_rest();
+    // If there's any non-whitespace content remaining, add it as content
+    if (!string_strip(remaining).empty()) {
+        builder.add_content(remaining);
+    }
+}
+
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
     data.prompt = apply(tmpl, inputs);
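Again for orientation, an illustrative example (not from this diff; the tool and parameter names are hypothetical) of the Seed-OSS output shape that common_chat_parse_seed_oss above is written to handle: reasoning inside <seed:think> tags, followed by a function call with tagged parameters.

    // Illustrative assistant output handled by the parser above:
    static const char * example_seed_oss_output =
        "<seed:think>...reasoning...</seed:think>"
        "<seed:tool_call><function=get_weather>"
        "<parameter=location>Paris</parameter>"
        "</function></seed:tool_call>";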
@@ -2061,8 +2252,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
     return data;
 }
 
+static common_chat_params common_chat_params_init_seed_oss(
+        const common_chat_template & tmpl,
+        templates_params & params,
+        const common_chat_templates_inputs & inputs)
+{
+    common_chat_params data;
+    data.prompt = apply(tmpl, params);
+    data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+    if (string_ends_with(data.prompt, "<seed:think>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</seed:think>";
+        } else {
+            data.thinking_forced_open = true;
+        }
+    }
+
+    if (params.tools.is_array() && !params.tools.empty()) {
+        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            std::vector<std::string> tool_rules;
+            foreach_function(params.tools, [&](const json & tool) {
+                const auto & function = tool.at("function");
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                // Create rule for Seed-OSS function call format
+                std::string param_rules;
+                if (parameters.contains("properties")) {
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+                                       "\"</parameter>\"";
+                    }
+                }
+
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                                                      "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+                                                      param_rules +
+                                                      " \"</function>\" space \"</seed:tool_call>\""));
+            });
+
+            data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+            data.preserved_tokens = {
+                "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+                "<function=", "</function>", "<parameter=", "</parameter>",
+            };
+
+            builder.add_rule("root", string_join(tool_rules, " | "));
+        });
+    }
+    return data;
+}
+
 static common_chat_params common_chat_templates_apply_jinja(
-    const struct common_chat_templates * tmpls,
+    const struct common_chat_templates * tmpls,
     const struct common_chat_templates_inputs & inputs)
 {
     templates_params params;
@@ -2131,6 +2376,16 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
+    // Seed-OSS
+    if (src.find("<seed:think>") != std::string::npos) {
+        return common_chat_params_init_seed_oss(tmpl, params, inputs);
+    }
+
+    // Nemotron v2
+    if (src.find("<SPECIAL_10>") != std::string::npos) {
+        return common_chat_params_init_nemotron_v2(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2289,6 +2544,12 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_GPT_OSS:
             common_chat_parse_gpt_oss(builder);
             break;
+        case COMMON_CHAT_FORMAT_SEED_OSS:
+            common_chat_parse_seed_oss(builder);
+            break;
+        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
+            common_chat_parse_nemotron_v2(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
@@ -122,6 +122,8 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_COMMAND_R7B,
     COMMON_CHAT_FORMAT_GRANITE,
     COMMON_CHAT_FORMAT_GPT_OSS,
+    COMMON_CHAT_FORMAT_SEED_OSS,
+    COMMON_CHAT_FORMAT_NEMOTRON_V2,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -208,6 +210,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
+bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
+
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
     if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         return iparams;
     }
 
@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+                __func__, params.model.path.c_str());
         llama_model_free(model);
         return iparams;
     }
@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
+        char buf[1024];
         la.ptr = lora.get();
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+        la.task_name = buf;
+        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+        la.prompt_prefix = buf;
         iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
 
@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
     cparams.attention_type = params.attention_type;
+    cparams.flash_attn_type = params.flash_attn_type;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv = !params.no_kv_offload;
-    cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
     std::string path;
     float scale;
 
+    std::string task_name;
+    std::string prompt_prefix;
+
     struct llama_adapter_lora * ptr;
 };
 
@@ -310,6 +313,7 @@ struct common_params {
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+    enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
 
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
@@ -373,7 +377,6 @@ struct common_params {
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
-    bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
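Taken together with the common.cpp hunk above, the boolean flash_attn flag is replaced by an enum field. A hedged migration sketch for embedders of common (the DISABLED/ENABLED enumerators are assumed from llama.h and do not appear in this excerpt):

    common_params params;
    // before: params.flash_attn = true;
    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;        // let llama.cpp decide per device
    // params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;  // assumed enumerator: force on
    // params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED; // assumed enumerator: force off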
@@ -442,7 +445,7 @@ struct common_params {
 
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
-    bool endpoint_slots = false;
+    bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;
 
@@ -4,17 +4,52 @@
 #include <condition_variable>
 #include <cstdarg>
 #include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <mutex>
 #include <sstream>
 #include <thread>
 #include <vector>
 
+#if defined(_WIN32)
+#    include <io.h>
+#    include <windows.h>
+#    define isatty _isatty
+#    define fileno _fileno
+#else
+#    include <unistd.h>
+#endif // defined(_WIN32)
+
 int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
 
 void common_log_set_verbosity_thold(int verbosity) {
     common_log_verbosity_thold = verbosity;
 }
 
+// Auto-detect if colors should be enabled based on terminal and environment
+static bool common_log_should_use_colors_auto() {
+    // Check NO_COLOR environment variable (https://no-color.org/)
+    if (const char * no_color = std::getenv("NO_COLOR")) {
+        if (no_color[0] != '\0') {
+            return false;
+        }
+    }
+
+    // Check TERM environment variable
+    if (const char * term = std::getenv("TERM")) {
+        if (std::strcmp(term, "dumb") == 0) {
+            return false;
+        }
+    }
+
+    // Check if stdout and stderr are connected to a terminal
+    // We check both because log messages can go to either
+    bool stdout_is_tty = isatty(fileno(stdout));
+    bool stderr_is_tty = isatty(fileno(stderr));
+
+    return stdout_is_tty || stderr_is_tty;
+}
+
 static int64_t t_us() {
     return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
 }
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
 
 struct common_log * common_log_main() {
     static struct common_log log;
+    static std::once_flag init_flag;
+    std::call_once(init_flag, [&]() {
+        // Set default to auto-detect colors
+        log.set_colors(common_log_should_use_colors_auto());
+    });
 
     return &log;
 }
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
     log->set_file(file);
 }
 
-void common_log_set_colors(struct common_log * log, bool colors) {
-    log->set_colors(colors);
+void common_log_set_colors(struct common_log * log, log_colors colors) {
+    if (colors == LOG_COLORS_AUTO) {
+        log->set_colors(common_log_should_use_colors_auto());
+        return;
+    }
+
+    if (colors == LOG_COLORS_DISABLED) {
+        log->set_colors(false);
+        return;
+    }
+
+    GGML_ASSERT(colors == LOG_COLORS_ENABLED);
+    log->set_colors(true);
 }
 
 void common_log_set_prefix(struct common_log * log, bool prefix) {
@@ -24,6 +24,12 @@
 #define LOG_DEFAULT_DEBUG 1
 #define LOG_DEFAULT_LLAMA 0
 
+enum log_colors {
+    LOG_COLORS_AUTO = -1,
+    LOG_COLORS_DISABLED = 0,
+    LOG_COLORS_ENABLED = 1,
+};
+
 // needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
 // set via common_log_set_verbosity()
 extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
 // D - debug (stderr, V = LOG_DEFAULT_DEBUG)
 //
 
-void common_log_set_file      (struct common_log * log, const char * file); // not thread-safe
-void common_log_set_colors    (struct common_log * log, bool colors);       // not thread-safe
-void common_log_set_prefix    (struct common_log * log, bool prefix);       // whether to output prefix to each log
-void common_log_set_timestamps(struct common_log * log, bool timestamps);   // whether to output timestamps in the prefix
+void common_log_set_file      (struct common_log * log, const char * file);  // not thread-safe
+void common_log_set_colors    (struct common_log * log, log_colors colors);  // not thread-safe
+void common_log_set_prefix    (struct common_log * log, bool prefix);        // whether to output prefix to each log
+void common_log_set_timestamps(struct common_log * log, bool timestamps);    // whether to output timestamps in the prefix
 
 // helper macros for logging
 // use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
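A short usage sketch (not from this diff): callers that previously passed a bool now pass one of the log_colors values; LOG_COLORS_AUTO defers to the NO_COLOR/TERM/isatty detection added in log.cpp above.

    // Enable automatic color detection for the default logger:
    common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
    // Or force colors off, e.g. when output is redirected to a file:
    common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);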
@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 
 // helpers
 
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    auto * res = &gsmpl->cur_p;
+
+    if (do_sort && !res->sorted) {
+        // remember the selected token before sorting
+        const llama_token id = res->data[res->selected].id;
+
+        std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.p > b.p;
+        });
+
+        // restore the selected token after sorting
+        for (size_t i = 0; i < res->size; ++i) {
+            if (res->data[i].id == id) {
+                res->selected = i;
+                break;
+            }
+        }
+
+        res->sorted = true;
+    }
+
+    return res;
 }
 
 llama_token common_sampler_last(const struct common_sampler * gsmpl) {
@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
 // helpers
 
 // access the internal list of current candidate tokens
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
+// the .sorted flag of the result indicates whether the returned candidates are sorted
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
 
 // get the last accepted token
 llama_token common_sampler_last(const struct common_sampler * gsmpl);
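A hedged caller-side sketch of the new contract (gsmpl and ctx assumed initialized elsewhere; mirrors the speculative.cpp usage in the next hunk): request sorting only when candidates will be read in probability order; the .sorted flag reports whether sorting has already happened.

    common_sampler_sample(gsmpl, ctx, /* idx */ 0, /* grammar_first */ true);
    const auto * cur_p = common_sampler_get_candidates(gsmpl, /* do_sort */ true);
    for (size_t i = 0; i < std::min<size_t>(5, cur_p->size); ++i) {
        printf("candidate %zu: token %d, p = %.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
    }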
@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
 
         common_sampler_sample(smpl, ctx_dft, 0, true);
 
-        const auto * cur_p = common_sampler_get_candidates(smpl);
+        const auto * cur_p = common_sampler_get_candidates(smpl, true);
 
         for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
             LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
-project("ggml" C CXX)
+project("ggml" C CXX ASM)
 include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -129,10 +129,11 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_RVV "ggml: enable rvv" ON)
-option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
+option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
+option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
 option(GGML_VXE "ggml: enable vxe" ON)
-option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -307,6 +307,9 @@ extern "C" {
     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
 
+    // Split graph without allocating it
+    GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+
     // Allocate and compute graph on the backend scheduler
     GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
     GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
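A hedged sketch of how the newly exported entry point could be used to inspect backend placement without allocating the graph (sched and graph assumed set up elsewhere; ggml_graph_n_nodes, ggml_graph_node, and ggml_backend_name are existing ggml APIs assumed available here):

    ggml_backend_sched_split_graph(sched, graph);
    for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
        struct ggml_tensor * node = ggml_graph_node(graph, i);
        ggml_backend_t be = ggml_backend_sched_get_tensor_backend(sched, node);
        printf("%s -> %s\n", node->name, be ? ggml_backend_name(be) : "(unassigned)");
    }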
@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);