@fugood/llama.node 1.4.10 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +30 -30
  3. package/src/LlamaContext.cpp +1 -1
  4. package/src/llama.cpp/common/arg.cpp +29 -14
  5. package/src/llama.cpp/common/arg.h +1 -0
  6. package/src/llama.cpp/common/chat-parser.cpp +11 -0
  7. package/src/llama.cpp/common/chat.cpp +32 -3
  8. package/src/llama.cpp/common/chat.h +1 -0
  9. package/src/llama.cpp/common/common.cpp +23 -23
  10. package/src/llama.cpp/common/common.h +1 -1
  11. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  12. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
  13. package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
  14. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
  17. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
  20. package/src/llama.cpp/include/llama.h +13 -4
  21. package/src/llama.cpp/src/CMakeLists.txt +4 -0
  22. package/src/llama.cpp/src/llama-adapter.cpp +12 -3
  23. package/src/llama.cpp/src/llama-adapter.h +7 -1
  24. package/src/llama.cpp/src/llama-arch.cpp +76 -0
  25. package/src/llama.cpp/src/llama-arch.h +7 -0
  26. package/src/llama.cpp/src/llama-chat.cpp +11 -0
  27. package/src/llama.cpp/src/llama-chat.h +1 -0
  28. package/src/llama.cpp/src/llama-context.cpp +22 -21
  29. package/src/llama.cpp/src/llama-hparams.h +4 -3
  30. package/src/llama.cpp/src/llama-kv-cache.h +1 -1
  31. package/src/llama.cpp/src/llama-mmap.cpp +11 -4
  32. package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
  33. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  34. package/src/llama.cpp/src/llama-model.cpp +287 -16
  35. package/src/llama.cpp/src/llama-model.h +13 -2
  36. package/src/llama.cpp/src/llama-sampling.cpp +44 -33
  37. package/src/llama.cpp/src/llama-sampling.h +3 -0
  38. package/src/llama.cpp/src/llama-vocab.cpp +101 -33
  39. package/src/llama.cpp/src/llama-vocab.h +2 -0
  40. package/src/llama.cpp/src/llama.cpp +52 -37
  41. package/src/llama.cpp/src/models/bert.cpp +4 -2
  42. package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
  43. package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
  44. package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
  45. package/src/llama.cpp/src/models/gemma3.cpp +3 -4
  46. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
  47. package/src/llama.cpp/src/models/llama.cpp +19 -6
  48. package/src/llama.cpp/src/models/maincoder.cpp +117 -0
  49. package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
  50. package/src/llama.cpp/src/models/models.h +18 -0
  51. package/src/llama.cpp/src/models/modern-bert.cpp +115 -0
  52. package/src/llama.cpp/src/models/plamo3.cpp +128 -0
  53. package/src/llama.cpp/src/unicode.cpp +23 -14
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.4.10",
+ "version": "1.4.12",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,20 +72,20 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-darwin-arm64": "1.4.10",
- "@fugood/node-llama-darwin-x64": "1.4.10",
- "@fugood/node-llama-linux-arm64": "1.4.10",
- "@fugood/node-llama-linux-arm64-cuda": "1.4.10",
- "@fugood/node-llama-linux-arm64-snapdragon": "1.4.10",
- "@fugood/node-llama-linux-arm64-vulkan": "1.4.10",
- "@fugood/node-llama-linux-x64": "1.4.10",
- "@fugood/node-llama-linux-x64-cuda": "1.4.10",
- "@fugood/node-llama-linux-x64-vulkan": "1.4.10",
- "@fugood/node-llama-win32-arm64": "1.4.10",
- "@fugood/node-llama-win32-arm64-vulkan": "1.4.10",
- "@fugood/node-llama-win32-x64": "1.4.10",
- "@fugood/node-llama-win32-x64-cuda": "1.4.10",
- "@fugood/node-llama-win32-x64-vulkan": "1.4.10"
+ "@fugood/node-llama-darwin-arm64": "1.4.12",
+ "@fugood/node-llama-darwin-x64": "1.4.12",
+ "@fugood/node-llama-linux-arm64": "1.4.12",
+ "@fugood/node-llama-linux-arm64-cuda": "1.4.12",
+ "@fugood/node-llama-linux-arm64-snapdragon": "1.4.12",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.4.12",
+ "@fugood/node-llama-linux-x64": "1.4.12",
+ "@fugood/node-llama-linux-x64-cuda": "1.4.12",
+ "@fugood/node-llama-linux-x64-vulkan": "1.4.12",
+ "@fugood/node-llama-win32-arm64": "1.4.12",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.4.12",
+ "@fugood/node-llama-win32-x64": "1.4.12",
+ "@fugood/node-llama-win32-x64-cuda": "1.4.12",
+ "@fugood/node-llama-win32-x64-vulkan": "1.4.12"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
package/scripts/llama.cpp.patch CHANGED
@@ -32,7 +32,7 @@ index 1bcba9cd8..b7cd68734 100644
  static std::string_view trim_trailing_space(std::string_view sv, int max = -1) {
  int count = 0;
  diff --git a/src/llama.cpp/common/chat.cpp b/src/llama.cpp/common/chat.cpp
- index 0a426f447..ab02be247 100644
+ index b98ab21ce..2f782837a 100644
  --- a/src/llama.cpp/common/chat.cpp
  +++ b/src/llama.cpp/common/chat.cpp
  @@ -7,9 +7,6 @@
@@ -62,7 +62,7 @@ index 0a426f447..ab02be247 100644
  struct templates_params {
  json messages;
  json tools;
- @@ -751,7 +738,7 @@ static std::string apply(
+ @@ -752,7 +739,7 @@ static std::string apply(
  tmpl_inputs.extra_context.merge_patch(*additional_context);
  }
  // TODO: add flag to control date/time, if only for testing purposes.
@@ -72,7 +72,7 @@ index 0a426f447..ab02be247 100644
  minja::chat_template_options tmpl_opts;
  // To avoid double BOS / EOS tokens, we're manually removing begining / trailing tokens
  diff --git a/src/llama.cpp/common/chat.h b/src/llama.cpp/common/chat.h
- index 6085510a4..263076ce2 100644
+ index 8bd4a325f..333b3301f 100644
  --- a/src/llama.cpp/common/chat.h
  +++ b/src/llama.cpp/common/chat.h
  @@ -10,7 +10,18 @@
@@ -96,19 +96,19 @@ index 6085510a4..263076ce2 100644
  struct common_chat_tool_call {
  std::string name;
  diff --git a/src/llama.cpp/common/common.cpp b/src/llama.cpp/common/common.cpp
- index d4e8c7405..af3dec813 100644
+ index 79c475612..cf189f8bc 100644
  --- a/src/llama.cpp/common/common.cpp
  +++ b/src/llama.cpp/common/common.cpp
- @@ -1343,6 +1343,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
- mparams.n_gpu_layers = params.n_gpu_layers;
+ @@ -1342,6 +1342,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
+ mparams.devices = params.devices.data();
  }

  + mparams.vocab_only = params.vocab_only;
+ mparams.n_gpu_layers = params.n_gpu_layers;
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
- mparams.tensor_split = params.tensor_split;
  diff --git a/src/llama.cpp/common/common.h b/src/llama.cpp/common/common.h
- index 334372073..e912b593a 100644
+ index f8bc686b6..555ba044a 100644
  --- a/src/llama.cpp/common/common.h
  +++ b/src/llama.cpp/common/common.h
  @@ -307,6 +307,7 @@ struct lr_opt {
@@ -120,7 +120,7 @@ index 334372073..e912b593a 100644
  int32_t n_ctx = 0; // context size, 0 == context the model was trained with
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  diff --git a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
- index 28fb7612e..63f7e1ca1 100644
+ index 7622d0bf4..d2edcfddb 100644
  --- a/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  +++ b/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
  @@ -106,7 +106,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
@@ -133,14 +133,13 @@ index 28fb7612e..63f7e1ca1 100644
  check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E)
  if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
  diff --git a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
- index 6a00abacc..9e12459b6 100644
+ index 13b96d61f..5fa163442 100644
  --- a/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
  +++ b/src/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp
- @@ -3226,11 +3226,26 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
+ @@ -2680,9 +2680,24 @@ static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev
  GGML_UNUSED(dev);
  }

- +
  +// ~2GB per session for now
  +#define GGML_HEXAGON_SESSION_MEMORY_DEFAULT (2ULL * 1024 * 1024 * 1024)
  +// Max to 3.5GB
@@ -149,7 +148,6 @@ index 6a00abacc..9e12459b6 100644
  static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
  - // ~2GB per session for now
  - *free = 2ULL * 1024 * 1024 * 1024;
- - *total = *free;
  + const char * str_mem = getenv("GGML_HEXAGON_SESSION_MEMORY");
  + if (str_mem) {
  + *free = std::stoull(str_mem);
@@ -161,32 +159,34 @@ index 6a00abacc..9e12459b6 100644
  + } else {
  + *free = GGML_HEXAGON_SESSION_MEMORY_DEFAULT;
  + }
+ +
+ *total = *free;

- + *total = *free;
  GGML_UNUSED(dev);
- }
-
- @@ -3413,10 +3428,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+ @@ -2879,10 +2894,17 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
  }
  }

- +#if defined(__ANDROID__)
- if(opt_arch < 75) {
- opt_ndev = 1;
+ - if (opt_arch < 75) {
+ - opt_ndev = 1;
  - GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
- + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
- + }
- +#else
- + if(opt_arch < 73) {
- + opt_ndev = 1;
- + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
- }
- +#endif
+ - }
+ + #if defined(__ANDROID__)
+ + if(opt_arch < 75) {
+ + opt_ndev = 1;
+ + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75 for Android.\n");
+ + }
+ + #else
+ + if(opt_arch < 73) {
+ + opt_ndev = 1;
+ + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v73 for Linux and Windows.\n");
+ + }
+ + #endif

  GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);

- @@ -3429,6 +3451,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
- } catch (std::exception const &exc) {
+ @@ -2895,6 +2917,8 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
+ } catch (const std::exception & exc) {
  GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
  devices[i].context = nullptr;
  + opt_ndev = i;
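
Note on the Hexagon change above: the patched ggml-hexagon backend now reads its per-session memory budget from the GGML_HEXAGON_SESSION_MEMORY environment variable (a byte count parsed with std::stoull, defaulting to 2 GB and capped near 3.5 GB). A minimal usage sketch, assuming a POSIX environment where the variable is set before the backend is initialized; the surrounding program is hypothetical, not part of this release:

    #include <cstdlib> // setenv (POSIX)

    int main() {
        // Request ~3 GiB per Hexagon session; the backend falls back to its
        // 2 GB default when the variable is unset.
        setenv("GGML_HEXAGON_SESSION_MEMORY", "3221225472", /*overwrite=*/1);
        // ... initialize ggml / llama.cpp with the Hexagon backend as usual ...
        return 0;
    }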
package/src/LlamaContext.cpp CHANGED
@@ -585,7 +585,7 @@ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
  for (int i = 0; i < count; i++) {
  char key[256];
  llama_model_meta_key_by_index(model, i, key, sizeof(key));
- char val[4096];
+ char val[16384];
  llama_model_meta_val_str_by_index(model, i, val, sizeof(val));

  metadata.Set(key, val);
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -2017,7 +2017,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  if (llama_supports_rpc()) {
  add_opt(common_arg(
  {"--rpc"}, "SERVERS",
- "comma separated list of RPC servers",
+ "comma separated list of RPC servers (host:port)",
  [](common_params & params, const std::string & value) {
  add_rpc_devices(value);
  GGML_UNUSED(params);
@@ -2087,7 +2087,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  "override tensor buffer type", [](common_params & params, const std::string & value) {
  parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
  }
- ));
+ ).set_env("LLAMA_ARG_OVERRIDE_TENSOR"));
  add_opt(common_arg(
  {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
  "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
@@ -2137,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  }
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+ GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
  add_opt(common_arg(
  {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
- string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
- [](common_params & params, int value) {
- params.n_gpu_layers = value;
+ string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.n_gpu_layers = -2;
+ } else {
+ params.n_gpu_layers = std::stoi(value);
+ }
  if (!llama_supports_gpu_offload()) {
  fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
  fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3175,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
  params.speculative.devices = parse_device_list(value);
  }
  ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+ GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
  add_opt(common_arg(
  {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
- "number of layers to store in VRAM for the draft model",
- [](common_params & params, int value) {
- params.speculative.n_gpu_layers = value;
+ string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+ params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+ [](common_params & params, const std::string & value) {
+ if (value == "auto") {
+ params.speculative.n_gpu_layers = -1;
+ } else if (value == "all") {
+ params.speculative.n_gpu_layers = -2;
+ } else {
+ params.speculative.n_gpu_layers = std::stoi(value);
+ }
  if (!llama_supports_gpu_offload()) {
  fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
  fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3518,15 +3533,15 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
  [](common_params &, const std::string &) { /* unused */ }
  ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());

+ args.push_back(common_arg(
+ {"stop-timeout"}, "SECONDS",
+ "in server router mode, force-kill model instance after this many seconds of graceful shutdown",
+ [](common_params &, int) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
+
  // args.push_back(common_arg(
  // {"pin"},
  // "in server router mode, do not unload this model if models_max is exceeded",
  // [](common_params &) { /* unused */ }
  // ).set_preset_only());
-
- // args.push_back(common_arg(
- // {"unload-idle-seconds"}, "SECONDS",
- // "in server router mode, unload models idle for more than this many seconds",
- // [](common_params &, int) { /* unused */ }
- // ).set_preset_only());
  }
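
Note on the arg.cpp changes above: -ngl/--gpu-layers and -ngld/--gpu-layers-draft now accept 'auto', 'all', or an explicit layer count, stored as -1, -2, or the given number respectively. A standalone sketch of that mapping for illustration only; the helper name is hypothetical and not part of the patch:

    #include <string>

    // Mirrors the new CLI parsing: -1 = auto, -2 = all, otherwise an explicit layer count.
    static int parse_gpu_layers_value(const std::string & value) {
        if (value == "auto") { return -1; }
        if (value == "all")  { return -2; }
        return std::stoi(value); // throws std::invalid_argument on non-numeric input
    }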
package/src/llama.cpp/common/arg.h CHANGED
@@ -10,6 +10,7 @@

  // pseudo-env variable to identify preset-only arguments
  #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+ #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"

  //
  // CLI argument parsing
package/src/llama.cpp/common/chat-parser.cpp CHANGED
@@ -1395,6 +1395,14 @@ static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
  builder.consume_reasoning_with_xml_tool_calls(form, "<seed:think>", "</seed:think>");
  }

+ static void common_chat_parse_solar_open(common_chat_msg_parser & builder) {
+ builder.try_parse_reasoning("<|think|>", "<|end|><|begin|>assistant<|content|>");
+
+ // TODO: Tool calling
+
+ builder.add_content(builder.consume_rest());
+ }
+
  static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
  builder.try_parse_reasoning("<think>", "</think>");
  builder.add_content(builder.consume_rest());
@@ -1479,6 +1487,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO:
  common_chat_parse_xiaomi_mimo(builder);
  break;
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN:
+ common_chat_parse_solar_open(builder);
+ break;
  default:
  throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
  }
package/src/llama.cpp/common/chat.cpp CHANGED
@@ -306,7 +306,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
  }
  }
  } else {
- jmsg["content"] = json(); // null
+ jmsg["content"] = "";
  }
  if (!msg.reasoning_content.empty()) {
  jmsg["reasoning_content"] = msg.reasoning_content;
@@ -367,8 +367,8 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
  const auto & function = tool.at("function");
  result.push_back({
  /* .name = */ function.at("name"),
- /* .description = */ function.at("description"),
- /* .parameters = */ function.at("parameters").dump(),
+ /* .description = */ function.value("description", ""),
+ /* .parameters = */ function.value("parameters", json::object()).dump(),
  });
  }
  }
@@ -656,6 +656,7 @@ const char * common_chat_format_name(common_chat_format format) {
  case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder";
  case COMMON_CHAT_FORMAT_APRIEL_1_5: return "Apriel 1.5";
  case COMMON_CHAT_FORMAT_XIAOMI_MIMO: return "Xiaomi MiMo";
+ case COMMON_CHAT_FORMAT_SOLAR_OPEN: return "Solar Open";
  case COMMON_CHAT_FORMAT_PEG_SIMPLE: return "peg-simple";
  case COMMON_CHAT_FORMAT_PEG_NATIVE: return "peg-native";
  case COMMON_CHAT_FORMAT_PEG_CONSTRUCTED: return "peg-constructed";
@@ -2504,6 +2505,27 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
  return data;
  }

+ static common_chat_params common_chat_params_init_solar_open(const common_chat_template & tmpl, const struct templates_params & inputs) {
+ common_chat_params data;
+
+ // TODO: Reasoning effort
+ json additional_context = {};
+
+ data.prompt = apply(tmpl, inputs, std::nullopt, std::nullopt, additional_context);
+ data.format = COMMON_CHAT_FORMAT_SOLAR_OPEN;
+
+ data.preserved_tokens = {
+ "<|think|>",
+ "<|content|>",
+ "<|begin|>",
+ "<|end|>",
+ };
+
+ // TODO: Tool calling
+
+ return data;
+ }
+
  static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
  common_chat_params data;
  data.prompt = apply(tmpl, inputs);
@@ -2767,6 +2789,13 @@ static common_chat_params common_chat_templates_apply_jinja(
  return common_chat_params_init_magistral(tmpl, params);
  }

+ // Solar Open
+ if (src.find("<|tool_response:begin|>") != std::string::npos &&
+ src.find("<|tool_response:name|>") != std::string::npos &&
+ src.find("<|tool_response:result|>") != std::string::npos) {
+ return common_chat_params_init_solar_open(tmpl, params);
+ }
+
  // Plain handler (no tools)
  if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
  return common_chat_params_init_without_tools(tmpl, params);
package/src/llama.cpp/common/chat.h CHANGED
@@ -135,6 +135,7 @@ enum common_chat_format {
  COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
  COMMON_CHAT_FORMAT_APRIEL_1_5,
  COMMON_CHAT_FORMAT_XIAOMI_MIMO,
+ COMMON_CHAT_FORMAT_SOLAR_OPEN,

  // These are intended to be parsed by the PEG parser
  COMMON_CHAT_FORMAT_PEG_SIMPLE,
package/src/llama.cpp/common/common.cpp CHANGED
@@ -251,7 +251,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
  case GGML_SCHED_PRIO_REALTIME: p = -20; break;
  }

- if (!setpriority(PRIO_PROCESS, 0, p)) {
+ if (setpriority(PRIO_PROCESS, 0, p) != 0) {
  LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
  return false;
  }
@@ -1078,6 +1078,8 @@ struct common_init_result::impl {
  impl() = default;
  ~impl() = default;

+ // note: the order in which model, context, etc. are declared matters because their destructors will be called bottom-to-top
+
  llama_model_ptr model;
  llama_context_ptr context;

@@ -1107,6 +1109,25 @@ common_init_result::common_init_result(common_params & params) :

  const llama_vocab * vocab = llama_model_get_vocab(model);

+ // load and optionally apply lora adapters (must be loaded before context creation)
+ for (auto & la : params.lora_adapters) {
+ llama_adapter_lora_ptr lora;
+ lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
+ if (lora == nullptr) {
+ LOG_ERR("%s: failed to load lora adapter '%s'\n", __func__, la.path.c_str());
+ pimpl->model.reset(model);
+ return;
+ }
+
+ char buf[1024];
+ la.ptr = lora.get();
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
+ la.task_name = buf;
+ llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
+ la.prompt_prefix = buf;
+ pimpl->lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+ }
+
  // updates params.sampling
  // TODO: fix naming
  common_init_sampler_from_model(model, params.sampling);
@@ -1243,24 +1264,6 @@ common_init_result_ptr common_init_from_params(common_params & params) {
  }
  }

- // load and optionally apply lora adapters
- for (auto & la : params.lora_adapters) {
- llama_adapter_lora_ptr lora;
- lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
- if (lora == nullptr) {
- LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
- return res;
- }
-
- char buf[1024];
- la.ptr = lora.get();
- llama_adapter_meta_val_str(la.ptr, "adapter.lora.task_name", buf, sizeof(buf));
- la.task_name = buf;
- llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
- la.prompt_prefix = buf;
- res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
- }
-
  if (!params.lora_init_without_apply) {
  common_set_adapter_lora(lctx, params.lora_adapters);
  }
@@ -1339,11 +1342,8 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
  mparams.devices = params.devices.data();
  }

- if (params.n_gpu_layers != -1) {
- mparams.n_gpu_layers = params.n_gpu_layers;
- }
-
  mparams.vocab_only = params.vocab_only;
+ mparams.n_gpu_layers = params.n_gpu_layers;
  mparams.main_gpu = params.main_gpu;
  mparams.split_mode = params.split_mode;
  mparams.tensor_split = params.tensor_split;
package/src/llama.cpp/common/common.h CHANGED
@@ -330,7 +330,7 @@ struct common_params {
  // offload params
  std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
  int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
  float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
  bool fit_params = true; // whether to fit unset model/context parameters to free device memory
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
  ### GGML Version
  set(GGML_VERSION_MAJOR 0)
  set(GGML_VERSION_MINOR 9)
- set(GGML_VERSION_PATCH 4)
+ set(GGML_VERSION_PATCH 5)
  set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

  find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -430,10 +430,22 @@ if (MSVC)
  configure_msvc_target(ggml-cpu-x64)
  configure_msvc_target(ggml-cpu-sse42)
  configure_msvc_target(ggml-cpu-sandybridge)
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ # skipping ggml-cpu-ivybridge
+ # skipping ggml-cpu-piledriver
  configure_msvc_target(ggml-cpu-haswell)
  configure_msvc_target(ggml-cpu-skylakex)
+ configure_msvc_target(ggml-cpu-cannonlake)
+ configure_msvc_target(ggml-cpu-cascadelake)
  configure_msvc_target(ggml-cpu-icelake)
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+ # skipping ggml-cpu-cooperlake
+ # skipping ggml-cpu-zen4
  configure_msvc_target(ggml-cpu-alderlake)
+ # MSVC doesn't support AMX
+ # skipping ggml-cpu-sapphirerapids

  if (GGML_BUILD_EXAMPLES)
  configure_msvc_target(common-ggml)
package/src/llama.cpp/ggml/include/ggml-backend.h CHANGED
@@ -358,7 +358,7 @@ extern "C" {
  typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node);
+ GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes);

  // Tensor initialization
  GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
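
Note on the API change above: ggml_backend_compare_graph_backend now takes an array of test nodes plus a count instead of a single node. A hypothetical call-site sketch of the new form; the graph, backend, callback, and node variables are placeholders, not taken from the diff:

    // Compare the two backends only at the listed nodes of interest.
    const struct ggml_tensor * test_nodes[] = { node_a, node_b };
    bool ok = ggml_backend_compare_graph_backend(
        backend_cpu, backend_other, graph,
        eval_callback, /*user_data=*/nullptr,
        test_nodes, /*num_test_nodes=*/2);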
package/src/llama.cpp/ggml/src/CMakeLists.txt CHANGED
@@ -357,15 +357,29 @@ if (GGML_CPU_ALL_VARIANTS)
  endif()
  if (GGML_SYSTEM_ARCH STREQUAL "x86")
  ggml_add_cpu_backend_variant(x64)
- ggml_add_cpu_backend_variant(sse42 SSE42)
- ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
- ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA)
- ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512)
- ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
- ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI)
+ ggml_add_cpu_backend_variant(sse42 SSE42)
+ ggml_add_cpu_backend_variant(sandybridge SSE42 AVX)
+ if (NOT MSVC)
+ # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
+ ggml_add_cpu_backend_variant(ivybridge SSE42 AVX F16C)
+ ggml_add_cpu_backend_variant(piledriver SSE42 AVX F16C FMA)
+ endif()
+ ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C FMA AVX2 BMI2)
+ ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C FMA AVX2 BMI2 AVX512)
+ ggml_add_cpu_backend_variant(cannonlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI)
+ ggml_add_cpu_backend_variant(cascadelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI)
+ ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI)
+ if (NOT MSVC)
+ # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
+ # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
+ # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
+ ggml_add_cpu_backend_variant(cooperlake SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VNNI AVX512_BF16)
+ ggml_add_cpu_backend_variant(zen4 SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16)
+ endif()
+ ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C FMA AVX2 BMI2 AVX_VNNI)
  if (NOT MSVC)
  # MSVC doesn't support AMX
- ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+ ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C FMA AVX2 BMI2 AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
  endif()
  elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -387,8 +401,8 @@ if (GGML_CPU_ALL_VARIANTS)
  ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
  ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
  ggml_add_cpu_backend_variant(android_armv9.0_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE2)
- ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SME)
- ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_1 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SME)
+ ggml_add_cpu_backend_variant(android_armv9.2_2 DOTPROD MATMUL_INT8 FP16_VECTOR_ARITHMETIC SVE SVE2 SME)
  elseif (APPLE)
  ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
  ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -561,9 +561,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

  # Fetch KleidiAI sources:
  include(FetchContent)
- set(KLEIDIAI_COMMIT_TAG "v1.14.0")
+ set(KLEIDIAI_COMMIT_TAG "v1.16.0")
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
- set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
+ set(KLEIDIAI_ARCHIVE_MD5 "0a9e9008adb6031f9e8cf70dff4a3321")

  if (POLICY CMP0135)
  cmake_policy(SET CMP0135 NEW)
@@ -615,6 +615,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
  string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
  string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+ string(FIND "${ARCH_FLAGS_TEMP}" "+sve" SVE_ENABLED)

  set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

@@ -659,6 +660,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
  set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
  endif()

+ if (NOT SVE_ENABLED MATCHES -1)
+ list(APPEND GGML_KLEIDIAI_SOURCES
+ ${KLEIDIAI_SRC}/kai/kai_common_sve_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p8x8_1x8_sve_dotprod.c
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm_asm.S
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p8x8_16x8_sve_i8mm.c)
+ endif()
+
  set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
  list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
  endif()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h CHANGED
@@ -328,7 +328,7 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)

  #if defined(_MSC_VER) || defined(__MINGW32__)
  #include <intrin.h>
- #elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+ #elif defined(__SSE__) || defined(__SSE3__) || defined(__SSSE3__) || defined(__AVX__) || defined(__F16C__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX512BF16__)
  #include <immintrin.h>
  #endif