@fugood/llama.node 1.1.10 → 1.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/lib/binding.ts +2 -1
  2. package/package.json +14 -14
  3. package/src/LlamaContext.cpp +17 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -19
  5. package/src/llama.cpp/common/chat.cpp +152 -1
  6. package/src/llama.cpp/common/chat.h +1 -0
  7. package/src/llama.cpp/common/common.cpp +10 -3
  8. package/src/llama.cpp/common/common.h +4 -1
  9. package/src/llama.cpp/ggml/CMakeLists.txt +1 -1
  10. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -4
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +43 -6
  13. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +4 -1
  14. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +14 -9
  15. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -12
  17. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +103 -1
  20. package/src/llama.cpp/include/llama.h +27 -1
  21. package/src/llama.cpp/src/llama-adapter.cpp +68 -4
  22. package/src/llama.cpp/src/llama-adapter.h +3 -0
  23. package/src/llama.cpp/src/llama-arch.cpp +46 -2
  24. package/src/llama.cpp/src/llama-arch.h +4 -0
  25. package/src/llama.cpp/src/llama-context.cpp +80 -39
  26. package/src/llama.cpp/src/llama-context.h +0 -4
  27. package/src/llama.cpp/src/llama-graph.cpp +20 -10
  28. package/src/llama.cpp/src/llama-graph.h +2 -1
  29. package/src/llama.cpp/src/llama-impl.h +2 -0
  30. package/src/llama.cpp/src/llama-kv-cache.cpp +32 -97
  31. package/src/llama.cpp/src/llama-kv-cache.h +3 -13
  32. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  33. package/src/llama.cpp/src/llama-model.cpp +275 -20
  34. package/src/llama.cpp/src/llama-model.h +1 -0
  35. package/src/llama.cpp/src/llama-vocab.cpp +1 -1
  36. package/src/llama.cpp/src/llama.cpp +12 -0
package/lib/binding.ts CHANGED
@@ -27,7 +27,8 @@ export type LlamaModelOptions = {
  n_ubatch?: number
  n_threads?: number
  n_gpu_layers?: number
- flash_attn?: boolean
+ flash_attn_type?: 'auto' | 'on' | 'off'
+ flash_attn?: boolean // Deprecated: use flash_attn_type instead
  cache_type_k?:
  | 'f16'
  | 'f32'
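
For context, a minimal sketch of how the new option can be passed from JavaScript. The `loadModel` entry point and the `model` field name are assumptions for illustration (they are not part of this diff); the flash-attention options match the `LlamaModelOptions` typing above.

  // Sketch only: `loadModel` and `model` are illustrative names; the
  // flash-attention fields come from the LlamaModelOptions change above.
  import { loadModel } from '@fugood/llama.node'

  const context = await loadModel({
    model: './models/qwen2.5-coder-1.5b-q8_0.gguf',
    n_gpu_layers: 99,
    flash_attn_type: 'auto', // new tri-state control: 'auto' | 'on' | 'off'
    // flash_attn: true,     // deprecated boolean, kept for backward compatibility
  })
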
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.1.10",
+ "version": "1.1.11",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -71,19 +71,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.1.10",
- "@fugood/node-llama-linux-x64-vulkan": "1.1.10",
- "@fugood/node-llama-linux-x64-cuda": "1.1.10",
- "@fugood/node-llama-linux-arm64": "1.1.10",
- "@fugood/node-llama-linux-arm64-vulkan": "1.1.10",
- "@fugood/node-llama-linux-arm64-cuda": "1.1.10",
- "@fugood/node-llama-win32-x64": "1.1.10",
- "@fugood/node-llama-win32-x64-vulkan": "1.1.10",
- "@fugood/node-llama-win32-x64-cuda": "1.1.10",
- "@fugood/node-llama-win32-arm64": "1.1.10",
- "@fugood/node-llama-win32-arm64-vulkan": "1.1.10",
- "@fugood/node-llama-darwin-x64": "1.1.10",
- "@fugood/node-llama-darwin-arm64": "1.1.10"
+ "@fugood/node-llama-linux-x64": "1.1.11",
+ "@fugood/node-llama-linux-x64-vulkan": "1.1.11",
+ "@fugood/node-llama-linux-x64-cuda": "1.1.11",
+ "@fugood/node-llama-linux-arm64": "1.1.11",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.1.11",
+ "@fugood/node-llama-linux-arm64-cuda": "1.1.11",
+ "@fugood/node-llama-win32-x64": "1.1.11",
+ "@fugood/node-llama-win32-x64-vulkan": "1.1.11",
+ "@fugood/node-llama-win32-x64-cuda": "1.1.11",
+ "@fugood/node-llama-win32-arm64": "1.1.11",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.1.11",
+ "@fugood/node-llama-darwin-x64": "1.1.11",
+ "@fugood/node-llama-darwin-arm64": "1.1.11"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp CHANGED
@@ -190,6 +190,15 @@ static ggml_type kv_cache_type_from_str(const std::string &s) {
  throw std::runtime_error("Unsupported cache type: " + s);
  }

+ static enum llama_flash_attn_type flash_attn_type_from_str(const std::string &s) {
+ if (s == "on")
+ return LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ if (s == "off")
+ return LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ return LLAMA_FLASH_ATTN_TYPE_AUTO;
+ }
+
+
  static int32_t pooling_type_from_str(const std::string &s) {
  if (s == "none")
  return LLAMA_POOLING_TYPE_NONE;
@@ -242,7 +251,14 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.cpuparams.n_threads =
  get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
  params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
- params.flash_attn = get_option<bool>(options, "flash_attn", false);
+
+ auto flash_attn_type = get_option<std::string>(options, "flash_attn_type", "auto");
+ if (!flash_attn_type.empty()) {
+ params.flash_attn_type = (enum llama_flash_attn_type)flash_attn_type_from_str(flash_attn_type.c_str());
+ } else {
+ params.flash_attn_type = get_option<bool>(options, "flash_attn", false) ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ }
+
  params.cache_type_k = kv_cache_type_from_str(
  get_option<std::string>(options, "cache_type_k", "f16").c_str());
  params.cache_type_v = kv_cache_type_from_str(
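
A compact restatement of the mapping implemented by flash_attn_type_from_str above, written as a TypeScript lookup purely for reference (the enum names mirror the llama.cpp C API; any value other than 'on' or 'off' falls through to AUTO).

  // Mirrors flash_attn_type_from_str in the hunk above: unknown strings map to AUTO.
  const flashAttnTypeFromStr: Record<string, string> = {
    on: 'LLAMA_FLASH_ATTN_TYPE_ENABLED',
    off: 'LLAMA_FLASH_ATTN_TYPE_DISABLED',
    auto: 'LLAMA_FLASH_ATTN_TYPE_AUTO',
  }
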
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1106,7 +1106,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
  printf("\"\n\n");

  printf(" case \"$prev\" in\n");
- printf(" --model)\n");
+ printf(" --model|-m)\n");
  printf(" COMPREPLY=( $(compgen -f -X '!*.gguf' -- \"$cur\") $(compgen -d -- \"$cur\") )\n");
  printf(" return 0\n");
  printf(" ;;\n");
@@ -1545,10 +1545,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
  add_opt(common_arg(
- {"-fa", "--flash-attn"},
- string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"),
- [](common_params & params) {
- params.flash_attn = true;
+ {"-fa", "--flash-attn"}, "FA",
+ string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
+ [](common_params & params, const std::string & value) {
+ if (value == "on" || value == "enabled") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
+ } else if (value == "off" || value == "disabled") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
+ } else if (value == "auto") {
+ params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
+ } else {
+ throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
+ }
  }
  ).set_env("LLAMA_ARG_FLASH_ATTN"));
  add_opt(common_arg(
@@ -2555,7 +2563,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora"}, "FNAME",
  "path to LoRA adapter (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & value) {
- params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
+ params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -2563,7 +2571,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  {"--lora-scaled"}, "FNAME", "SCALE",
  "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
  [](common_params & params, const std::string & fname, const std::string & scale) {
- params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
+ params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr });
  }
  // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
  ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
@@ -3459,8 +3467,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3475,8 +3481,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3491,8 +3495,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3508,10 +3510,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
@@ -3527,10 +3526,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
  params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
  params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
- params.speculative.n_gpu_layers = 99;
  params.port = 8012;
- params.n_gpu_layers = 99;
- params.flash_attn = true;
+ params.n_ubatch = 1024;
+ params.n_batch = 1024;
+ params.n_ctx = 0;
+ params.n_cache_reuse = 256;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+ add_opt(common_arg(
+ {"--fim-qwen-30b-default"},
+ string_format("use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet)"),
+ [](common_params & params) {
+ params.model.hf_repo = "ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF";
+ params.model.hf_file = "qwen3-coder-30b-a3b-instruct-q8_0.gguf";
+ params.port = 8012;
  params.n_ubatch = 1024;
  params.n_batch = 1024;
  params.n_ctx = 0;
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -609,6 +609,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
  case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
  case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
+ case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
  default:
  throw std::runtime_error("Unknown chat format");
  }
@@ -2045,6 +2046,94 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
  }
  }

+ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
+ // Parse thinking tags first - this handles the main reasoning content
+ builder.try_parse_reasoning("<seed:think>", "</seed:think>");
+
+ if (!builder.syntax().parse_tool_calls) {
+ builder.add_content(builder.consume_rest());
+ return;
+ }
+
+ // Parse tool calls - Seed-OSS uses <seed:tool_call> format
+ static const common_regex tool_call_begin_regex("<seed:tool_call>");
+ static const common_regex tool_call_end_regex("</seed:tool_call>");
+ static const common_regex function_regex("<function=([^>]+)>");
+ static const common_regex param_regex("<parameter=([^>]+)>");
+
+ while (auto tool_res = builder.try_find_regex(tool_call_begin_regex)) {
+ builder.consume_spaces(); // Consume whitespace after <seed:tool_call>
+
+ // Look for function call inside tool call, ignore any content before it
+ if (auto func_res = builder.try_find_regex(function_regex, std::string::npos, false)) {
+ auto function_name = builder.str(func_res->groups[1]);
+
+ // Parse Seed-OSS parameters <parameter=name>value</parameter>
+ json args = json::object();
+ // Parse all parameters
+ while (auto param_res = builder.try_find_regex(param_regex, std::string::npos, false)) {
+ // again, ignore noise around parameters
+ auto param_name = builder.str(param_res->groups[1]);
+ builder.move_to(param_res->groups[0].end);
+ builder.consume_spaces(); // Consume whitespace after parameter
+ auto savedPos = builder.pos();
+ if (auto param_parse = builder.try_find_literal("</parameter>")) {
+ auto param = param_parse->prelude;
+ builder.move_to(savedPos);
+ try {
+ if (auto param_res = builder.try_consume_json()) {
+ args[param_name] = param_res->json;
+ } else {
+ args[param_name] = param;
+ }
+ } catch (json::exception &) {
+ args[param_name] = param;
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool parameter");
+ }
+ }
+ // Look for closing function tag
+ auto end_func = builder.try_find_literal("</function>");
+ if (end_func) {
+ builder.move_to(end_func->groups[0].end);
+ builder.consume_spaces(); // Consume whitespace after </function>
+
+ // Add the tool call with parsed arguments, but only if we REALLY got the literal
+ auto eaten_fragment = builder.input().substr(end_func->groups[0].begin, end_func->groups[0].end);
+ auto funlen = std::string("</function>").length();
+ if (eaten_fragment.length() >= funlen && eaten_fragment.substr(0, funlen) == std::string("</function>")) {
+ if (!builder.add_tool_call(function_name, "", args.dump())) {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ // Look for closing tool call tag
+ if (auto end_tool = builder.try_find_regex(tool_call_end_regex, std::string::npos, false)) {
+ builder.move_to(end_tool->groups[0].end);
+ builder.consume_spaces(); // Consume trailing whitespace after tool call
+ } else {
+ throw common_chat_msg_partial_exception("Incomplete tool call");
+ }
+ } else {
+ // No function found - don't consume content here, let it be handled at the end
+ break;
+ }
+ }
+
+ // Consume any remaining whitespace after all tool call processing
+ builder.consume_spaces();
+ auto remaining = builder.consume_rest();
+ // If there's any non-whitespace content remaining, add it as content
+ if (!string_strip(remaining).empty()) {
+ builder.add_content(remaining);
+ }
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2061,8 +2150,62 @@ static common_chat_params common_chat_params_init_without_tools(const common_cha
  return data;
  }

+ static common_chat_params common_chat_params_init_seed_oss(
+ const common_chat_template & tmpl,
+ templates_params & params,
+ const common_chat_templates_inputs & inputs)
+ {
+ common_chat_params data;
+ data.prompt = apply(tmpl, params);
+ data.format = COMMON_CHAT_FORMAT_SEED_OSS;
+ if (string_ends_with(data.prompt, "<seed:think>")) {
+ if (!inputs.enable_thinking) {
+ data.prompt += "</seed:think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ }
+
+ if (params.tools.is_array() && !params.tools.empty()) {
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ foreach_function(params.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+
+ // Create rule for Seed-OSS function call format
+ std::string param_rules;
+ if (parameters.contains("properties")) {
+ for (const auto & [key, value] : parameters.at("properties").items()) {
+ param_rules += "\"<parameter=" + key + ">\"" + builder.add_schema(name + "-arg-" + key, value) +
+ "\"</parameter>\"";
+ }
+ }
+
+ tool_rules.push_back(builder.add_rule(name + "-call",
+ "\"<seed:tool_call>\" space \"<function=" + name + ">\" space " +
+ param_rules +
+ " \"</function>\" space \"</seed:tool_call>\""));
+ });
+
+ data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<seed:tool_call>" });
+
+ data.preserved_tokens = {
+ "<seed:think>", "</seed:think>", "<seed:tool_call>", "</seed:tool_call>",
+ "<function=", "</function>", "<parameter=", "</parameter>",
+ };
+
+ builder.add_rule("root", string_join(tool_rules, " | "));
+ });
+ }
+ return data;
+ }
+
  static common_chat_params common_chat_templates_apply_jinja(
- const struct common_chat_templates * tmpls,
+ const struct common_chat_templates * tmpls,
  const struct common_chat_templates_inputs & inputs)
  {
  templates_params params;
@@ -2131,6 +2274,11 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_gpt_oss(tmpl, params);
  }

+ // Seed-OSS
+ if (src.find("<seed:think>") != std::string::npos) {
+ return common_chat_params_init_seed_oss(tmpl, params, inputs);
+ }
+
  // Use generic handler when mixing tools + JSON schema.
  // TODO: support that mix in handlers below.
  if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2289,6 +2437,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_GPT_OSS:
  common_chat_parse_gpt_oss(builder);
  break;
+ case COMMON_CHAT_FORMAT_SEED_OSS:
+ common_chat_parse_seed_oss(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
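
To make the new Seed-OSS handling concrete, here is an illustrative example of the tag format the parser above recognizes, written as a TypeScript string purely for reference; the parsed result in the comment is a simplification of what common_chat_parse_seed_oss produces, not an exact API shape.

  // Illustrative Seed-OSS assistant output using the tags matched by the new parser
  // (<seed:think>, <seed:tool_call>, <function=...>, <parameter=...>).
  const seedOssSample = [
    '<seed:think>Need the weather, so call the tool.</seed:think>',
    '<seed:tool_call>',
    '<function=get_weather>',
    '<parameter=city>Tokyo</parameter>',
    '</function>',
    '</seed:tool_call>',
  ].join('\n')

  // Roughly, the parser extracts the <seed:think> text as reasoning content and one
  // tool call along the lines of { name: 'get_weather', arguments: '{"city":"Tokyo"}' }.
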
package/src/llama.cpp/common/chat.h CHANGED
@@ -122,6 +122,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_COMMAND_R7B,
  COMMON_CHAT_FORMAT_GRANITE,
  COMMON_CHAT_FORMAT_GPT_OSS,
+ COMMON_CHAT_FORMAT_SEED_OSS,

  COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
  };
package/src/llama.cpp/common/common.cpp CHANGED
@@ -901,7 +901,8 @@ struct common_init_result common_init_from_params(common_params & params) {

  llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
  if (model == NULL) {
- LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
+ LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+ __func__, params.model.path.c_str());
  return iparams;
  }

@@ -911,7 +912,8 @@ struct common_init_result common_init_from_params(common_params & params) {

  llama_context * lctx = llama_init_from_model(model, cparams);
  if (lctx == NULL) {
- LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
+ LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+ __func__, params.model.path.c_str());
  llama_model_free(model);
  return iparams;
  }
@@ -988,7 +990,12 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

+ char buf[1024];
  la.ptr = lora.get();
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+ la.task_name = buf;
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+ la.prompt_prefix = buf;
  iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
  }

@@ -1153,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.yarn_orig_ctx = params.yarn_orig_ctx;
  cparams.pooling_type = params.pooling_type;
  cparams.attention_type = params.attention_type;
+ cparams.flash_attn_type = params.flash_attn_type;
  cparams.cb_eval = params.cb_eval;
  cparams.cb_eval_user_data = params.cb_eval_user_data;
  cparams.offload_kqv = !params.no_kv_offload;
- cparams.flash_attn = params.flash_attn;
  cparams.no_perf = params.no_perf;
  cparams.op_offload = !params.no_op_offload;
  cparams.swa_full = params.swa_full;
package/src/llama.cpp/common/common.h CHANGED
@@ -34,6 +34,9 @@ struct common_adapter_lora_info {
  std::string path;
  float scale;

+ std::string task_name;
+ std::string prompt_prefix;
+
  struct llama_adapter_lora * ptr;
  };

@@ -310,6 +313,7 @@ struct common_params {
  enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
+ enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention

  struct common_params_sampling sampling;
  struct common_params_speculative speculative;
@@ -373,7 +377,6 @@ struct common_params {
  bool multiline_input = false; // reverse the usage of `\`
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
- bool flash_attn = false; // flash attention
  bool no_perf = false; // disable performance metrics
  bool ctx_shift = false; // context shift on infinite text generation
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -1,5 +1,5 @@
  cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
- project("ggml" C CXX)
+ project("ggml" C CXX ASM)
  include(CheckIncludeFileCXX)

  set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -435,7 +435,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  )
  if (GGML_RVV)
  if (GGML_XTHEADVECTOR)
- list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
+ list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
  elseif (GGML_RV_ZFH)
  list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
  else()
@@ -497,9 +497,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # Fetch KleidiAI sources:
  include(FetchContent)
- set(KLEIDIAI_COMMIT_TAG "v1.11.0")
+ set(KLEIDIAI_COMMIT_TAG "v1.13.0")
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
- set(KLEIDIAI_ARCHIVE_MD5 "3fe9e5ab964c375c53839296eb71eaa2")
+ set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")

  if (POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
@@ -555,6 +555,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  list(APPEND GGML_KLEIDIAI_SOURCES
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p4x8sb_f32_neon.c
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
@@ -576,7 +577,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
- ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
+ ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
  set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
  endif()

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -489,7 +489,7 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
  /**
  * @see https://github.com/ggml-org/llama.cpp/pull/14037
  */
- inline float vec_hsum(float32x4_t v) {
+ inline static float vec_hsum(float32x4_t v) {
  float32x4_t v_temp = v + vec_reve(v);
  return v_temp[0] + v_temp[1];
  }