@fugood/llama.node 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +484 -204
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +156 -15
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +22 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/json-partial.cpp +51 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  19. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  21. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  22. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  25. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  28. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  30. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +17 -17
  31. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  32. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +11 -9
  40. package/src/llama.cpp/include/llama.h +8 -0
  41. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  42. package/src/llama.cpp/src/llama-arch.h +22 -0
  43. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  44. package/src/llama.cpp/src/llama-context.cpp +6 -0
  45. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  46. package/src/llama.cpp/src/llama-graph.h +10 -1
  47. package/src/llama.cpp/src/llama-hparams.cpp +5 -1
  48. package/src/llama.cpp/src/llama-hparams.h +17 -2
  49. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +2 -2
  50. package/src/llama.cpp/src/llama-kv-cache.cpp +2 -5
  51. package/src/llama.cpp/src/llama-memory-hybrid.cpp +11 -9
  52. package/src/llama.cpp/src/llama-memory-recurrent.cpp +11 -3
  53. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  54. package/src/llama.cpp/src/llama-model.cpp +572 -45
  55. package/src/llama.cpp/src/llama-model.h +18 -0
  56. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  57. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  58. package/src/llama.cpp/src/llama-vocab.h +41 -40
  59. package/src/llama.cpp/src/unicode.h +43 -0
@@ -379,7 +379,7 @@ struct common_params {
379
379
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
380
380
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
381
381
  bool no_perf = false; // disable performance metrics
382
- bool ctx_shift = false; // context shift on infinite text generation
382
+ bool ctx_shift = false; // context shift on infinite text generation
383
383
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
384
384
  bool kv_unified = false; // enable unified KV cache
385
385
 
@@ -393,6 +393,7 @@ struct common_params {
393
393
  bool check_tensors = false; // validate tensor data
394
394
  bool no_op_offload = false; // globally disable offload host tensor operations to device
395
395
  bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
396
+ bool no_host = false; // bypass host buffer allowing extra buffers to be used
396
397
 
397
398
  bool single_turn = false; // single turn chat conversation
398
399
 
@@ -425,7 +426,8 @@ struct common_params {
425
426
  int32_t timeout_write = timeout_read; // http write timeout in seconds
426
427
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
427
428
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
428
- int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
429
+ int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
430
+ int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
429
431
 
430
432
  std::string hostname = "127.0.0.1";
431
433
  std::string public_path = ""; // NOLINT
@@ -433,7 +435,7 @@ struct common_params {
433
435
  std::string chat_template = ""; // NOLINT
434
436
  bool use_jinja = false; // NOLINT
435
437
  bool enable_chat_template = true;
436
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
438
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
437
439
  int reasoning_budget = -1;
438
440
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
439
441
 
@@ -739,7 +741,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
739
741
  // MoE utils
740
742
  //
741
743
 
742
- const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
744
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
743
745
 
744
746
  static std::string llm_ffn_exps_block_regex(int idx) {
745
747
  return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
@@ -0,0 +1,73 @@
1
+ #pragma once
2
+
3
+ #include <cpp-httplib/httplib.h>
4
+
5
+ struct common_http_url {
6
+ std::string scheme;
7
+ std::string user;
8
+ std::string password;
9
+ std::string host;
10
+ std::string path;
11
+ };
12
+
13
+ static common_http_url common_http_parse_url(const std::string & url) {
14
+ common_http_url parts;
15
+ auto scheme_end = url.find("://");
16
+
17
+ if (scheme_end == std::string::npos) {
18
+ throw std::runtime_error("invalid URL: no scheme");
19
+ }
20
+ parts.scheme = url.substr(0, scheme_end);
21
+
22
+ if (parts.scheme != "http" && parts.scheme != "https") {
23
+ throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
24
+ }
25
+
26
+ auto rest = url.substr(scheme_end + 3);
27
+ auto at_pos = rest.find('@');
28
+
29
+ if (at_pos != std::string::npos) {
30
+ auto auth = rest.substr(0, at_pos);
31
+ auto colon_pos = auth.find(':');
32
+ if (colon_pos != std::string::npos) {
33
+ parts.user = auth.substr(0, colon_pos);
34
+ parts.password = auth.substr(colon_pos + 1);
35
+ } else {
36
+ parts.user = auth;
37
+ }
38
+ rest = rest.substr(at_pos + 1);
39
+ }
40
+
41
+ auto slash_pos = rest.find('/');
42
+
43
+ if (slash_pos != std::string::npos) {
44
+ parts.host = rest.substr(0, slash_pos);
45
+ parts.path = rest.substr(slash_pos);
46
+ } else {
47
+ parts.host = rest;
48
+ parts.path = "/";
49
+ }
50
+ return parts;
51
+ }
52
+
53
+ static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
54
+ common_http_url parts = common_http_parse_url(url);
55
+
56
+ if (parts.host.empty()) {
57
+ throw std::runtime_error("error: invalid URL format");
58
+ }
59
+
60
+ httplib::Client cli(parts.scheme + "://" + parts.host);
61
+
62
+ if (!parts.user.empty()) {
63
+ cli.set_basic_auth(parts.user, parts.password);
64
+ }
65
+
66
+ cli.set_follow_location(true);
67
+
68
+ return { std::move(cli), std::move(parts) };
69
+ }
70
+
71
+ static std::string common_http_show_masked_url(const common_http_url & parts) {
72
+ return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
73
+ }
@@ -5,6 +5,7 @@
5
5
  #include <nlohmann/json.hpp>
6
6
 
7
7
  #include <string>
8
+ #include <regex>
8
9
 
9
10
  using json = nlohmann::ordered_json;
10
11
 
@@ -168,6 +169,47 @@ bool common_json_parse(
168
169
  }
169
170
  }
170
171
 
172
+ // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
173
+ static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
174
+
175
+ auto is_high_surrogate = [&](const std::string & s) {
176
+ // Check if a partial of a high surrogate (U+D800-U+DBFF)
177
+ return s.length() >= 4 &&
178
+ s[0] == '\\' && s[1] == 'u' &&
179
+ std::tolower(s[2]) == 'd' &&
180
+ (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
181
+ };
182
+
183
+ // Initialize the unicode marker to a low surrogate to handle the edge case
184
+ // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
185
+ // backslash (\)
186
+ std::string unicode_marker_padding = "udc00";
187
+ std::smatch last_unicode_seq;
188
+
189
+ if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
190
+ std::smatch second_last_seq;
191
+ std::string prelude = str.substr(0, last_unicode_seq.position());
192
+
193
+ // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
194
+ unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
195
+
196
+ if (is_high_surrogate(last_unicode_seq.str())) {
197
+ // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
198
+ unicode_marker_padding += "\\udc00";
199
+ } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
200
+ if (is_high_surrogate(second_last_seq.str())) {
201
+ // If this follows a high surrogate, pad it to be a low surrogate
202
+ if (last_unicode_seq.length() == 2) {
203
+ unicode_marker_padding = "dc00";
204
+ } else if (last_unicode_seq.length() == 3) {
205
+ unicode_marker_padding = "c00";
206
+ } else {
207
+ // The original unicode_marker_padding is already padded with 0s
208
+ }
209
+ }
210
+ }
211
+ }
212
+
171
213
  const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";
172
214
 
173
215
  if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
186
228
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
187
229
  // Was inside an object value string after an escape
188
230
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
231
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
232
+ // Was inside an object value string after a partial unicode escape
233
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
189
234
  } else {
190
235
  // find last :
191
236
  auto last_pos = str.find_last_of(':');
@@ -205,6 +250,9 @@ bool common_json_parse(
205
250
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
206
251
  // Was inside an array value string after an escape
207
252
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
253
+ } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
254
+ // Was inside an array value string after a partial unicode escape
255
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
208
256
  } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
209
257
  // Had just finished a value
210
258
  str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
230
278
  } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
231
279
  // Was inside an object key string after an escape
232
280
  str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
281
+ } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
282
+ // Was inside an object key string after a partial unicode escape
283
+ str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
233
284
  } else {
234
285
  auto last_pos = str.find_last_of(':');
235
286
  if (last_pos == std::string::npos) {
@@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
4
4
  ### GGML Version
5
5
  set(GGML_VERSION_MAJOR 0)
6
6
  set(GGML_VERSION_MINOR 9)
7
- set(GGML_VERSION_PATCH 0)
8
- set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
7
+ set(GGML_VERSION_PATCH 4)
9
8
  set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
10
9
 
11
10
  find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -26,8 +25,8 @@ if(GIT_EXE)
26
25
  )
27
26
  endif()
28
27
 
29
- # Build the version string with optional -dev suffix and dirty flag
30
- set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
28
+ # Build the version string with optional dirty flag
29
+ set(GGML_VERSION "${GGML_VERSION_BASE}")
31
30
  if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
32
31
  set(GGML_VERSION "${GGML_VERSION}-dirty")
33
32
  endif()
@@ -177,7 +176,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
177
176
 
178
177
 
179
178
  if (MINGW)
180
- set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
179
+ set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
181
180
  endif()
182
181
 
183
182
  # ggml core
@@ -210,7 +209,6 @@ option(GGML_HIP "ggml: use HIP"
210
209
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
211
210
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
212
211
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
213
- option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
214
212
  option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
215
213
  option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
216
214
  option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
@@ -224,6 +222,9 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
224
222
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
225
223
  option(GGML_WEBGPU "ggml: use WebGPU" OFF)
226
224
  option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
225
+ option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
226
+ option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
227
+
227
228
  option(GGML_ZDNN "ggml: use zDNN" OFF)
228
229
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
229
230
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -215,6 +215,8 @@ extern "C" {
215
215
  // Backend registry
216
216
  //
217
217
 
218
+ GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
219
+
218
220
  GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
219
221
 
220
222
  // Backend (reg) enumeration
@@ -7,26 +7,25 @@
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
- #define RPC_PROTO_MAJOR_VERSION 2
10
+ #define RPC_PROTO_MAJOR_VERSION 3
11
11
  #define RPC_PROTO_MINOR_VERSION 0
12
12
  #define RPC_PROTO_PATCH_VERSION 0
13
13
  #define GGML_RPC_MAX_SERVERS 16
14
14
 
15
15
  // backend API
16
- GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
16
+ GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
17
17
  GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
18
18
 
19
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
19
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);
20
20
 
21
- GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
21
+ GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
22
22
 
23
- GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
24
- const char * cache_dir,
25
- size_t free_mem, size_t total_mem);
23
+ GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
24
+ size_t n_threads, size_t n_devices,
25
+ ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
26
26
 
27
27
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
28
-
29
- GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
28
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
30
29
 
31
30
  #ifdef __cplusplus
32
31
  }
@@ -237,6 +237,8 @@
237
237
  #define GGML_EXIT_SUCCESS 0
238
238
  #define GGML_EXIT_ABORTED 1
239
239
 
240
+ // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
241
+ #define GGML_ROPE_TYPE_NORMAL 0
240
242
  #define GGML_ROPE_TYPE_NEOX 2
241
243
  #define GGML_ROPE_TYPE_MROPE 8
242
244
  #define GGML_ROPE_TYPE_VISION 24
@@ -574,6 +576,7 @@ extern "C" {
574
576
  GGML_UNARY_OP_HARDSIGMOID,
575
577
  GGML_UNARY_OP_EXP,
576
578
  GGML_UNARY_OP_GELU_ERF,
579
+ GGML_UNARY_OP_XIELU,
577
580
 
578
581
  GGML_UNARY_OP_COUNT,
579
582
  };
@@ -1148,6 +1151,18 @@ extern "C" {
1148
1151
  struct ggml_context * ctx,
1149
1152
  struct ggml_tensor * a);
1150
1153
 
1154
+ // xIELU activation function
1155
+ // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
1156
+ // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
1157
+ // that constrain the positive and negative source alpha values respectively
1158
+ GGML_API struct ggml_tensor * ggml_xielu(
1159
+ struct ggml_context * ctx,
1160
+ struct ggml_tensor * a,
1161
+ float alpha_n,
1162
+ float alpha_p,
1163
+ float beta,
1164
+ float eps);
1165
+
1151
1166
  // gated linear unit ops
1152
1167
  // A: n columns, r rows,
1153
1168
  // result is n / 2 columns, r rows,
@@ -1615,6 +1630,13 @@ extern "C" {
1615
1630
  float scale,
1616
1631
  float max_bias);
1617
1632
 
1633
+ GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
1634
+ struct ggml_context * ctx,
1635
+ struct ggml_tensor * a,
1636
+ struct ggml_tensor * mask,
1637
+ float scale,
1638
+ float max_bias);
1639
+
1618
1640
  GGML_API void ggml_soft_max_add_sinks(
1619
1641
  struct ggml_tensor * a,
1620
1642
  struct ggml_tensor * sinks);
@@ -145,6 +145,9 @@ endif()
145
145
  # which was introduced in POSIX.1-2008, forcing us to go higher
146
146
  if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
147
147
  add_compile_definitions(_XOPEN_SOURCE=700)
148
+ elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
149
+ # Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
150
+ # in order to define _SC_PHYS_PAGES.
148
151
  else()
149
152
  add_compile_definitions(_XOPEN_SOURCE=600)
150
153
  endif()
@@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
439
439
  ggml-cpu/arch/riscv/quants.c
440
440
  ggml-cpu/arch/riscv/repack.cpp
441
441
  )
442
+ if (GGML_CPU_RISCV64_SPACEMIT)
443
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
444
+ list(APPEND GGML_CPU_SOURCES
445
+ ggml-cpu/spacemit/ime.cpp
446
+ ggml-cpu/spacemit/ime.h
447
+ ggml-cpu/spacemit/ime1_kernels.cpp
448
+ ggml-cpu/spacemit/ime_kernels.h
449
+ )
450
+ endif()
442
451
  set(MARCH_STR "rv64gc")
443
452
  if (GGML_RV_ZFH)
444
453
  string(APPEND MARCH_STR "_zfh")
@@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
504
513
 
505
514
  # Fetch KleidiAI sources:
506
515
  include(FetchContent)
507
- set(KLEIDIAI_COMMIT_TAG "v1.13.0")
516
+ set(KLEIDIAI_COMMIT_TAG "v1.14.0")
508
517
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
509
- set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
518
+ set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
510
519
 
511
520
  if (POLICY CMP0135)
512
521
  cmake_policy(SET CMP0135 NEW)
@@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
583
592
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
584
593
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
585
594
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
595
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
586
596
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
587
597
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
588
598
  ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
149
149
  if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
150
150
  is_contiguous_2d(op->src[1]) && // src1 must be contiguous
151
151
  op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
152
+ op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
152
153
  op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
153
154
  (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
154
155
  // src1 must be host buffer
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
105
105
 
106
106
  return ((v4f32)res)[0];
107
107
  }
108
+
109
+ // multiply int8_t, add results pairwise twice
110
+ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
111
+ // Get absolute values of x vectors
112
+ const __m128i ax = __lsx_vsigncov_b(x, x);
113
+ // Sign the values of the y vectors
114
+ const __m128i sy = __lsx_vsigncov_b(x, y);
115
+ // Perform multiplication and create 16-bit values
116
+ const __m128i dot = lsx_maddubs_h(ax, sy);
117
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
118
+ return lsx_madd_h(ones, dot);
119
+ }
108
120
  #endif
109
121
 
110
122
  #if defined(__loongarch_asx)
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
323
335
  }
324
336
  }
325
337
 
326
- // multiply int8_t, add results pairwise twice
327
- static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
328
- // Get absolute values of x vectors
329
- const __m128i ax = __lsx_vsigncov_b(x, x);
330
- // Sign the values of the y vectors
331
- const __m128i sy = __lsx_vsigncov_b(x, y);
332
- // Perform multiplication and create 16-bit values
333
- const __m128i dot = lsx_maddubs_h(ax, sy);
334
- const __m128i ones = __lsx_vreplgr2vr_h(1);
335
- return lsx_madd_h(ones, dot);
336
- }
337
-
338
338
  // horizontally add 8 floats
339
339
  static inline float hsum_float_8(const __m256 x) {
340
340
  __m128 res = lasx_extractf128(x, 1);
@@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
75
75
 
76
76
  for (int j = 0; j < 8; j++) {
77
77
  const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
78
- const int32x4_t vi = vec_signed(v);
78
+ /* Uses non-default rounding for vec_signed or vec_round */
79
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
79
80
 
80
81
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
81
82
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
122
123
 
123
124
  for (int j = 0; j < 8; j++) {
124
125
  const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
125
- const int32x4_t vi = vec_signed(v);
126
+ /* Uses non-default rounding for vec_signed or vec_round */
127
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
126
128
 
127
129
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
128
130
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
260
262
  #endif
261
263
  }
262
264
 
265
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
266
+ assert(nrc == 1);
267
+ UNUSED(nrc);
268
+ UNUSED(bx);
269
+ UNUSED(by);
270
+ UNUSED(bs);
271
+ assert(n % QK_MXFP4 == 0);
272
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
273
+
274
+ const int qk = QK_MXFP4;
275
+ const int nb = n / qk;
276
+
277
+ const block_mxfp4 * GGML_RESTRICT x = vx;
278
+ const block_q8_0 * GGML_RESTRICT y = vy;
279
+
280
+ int ib = 0;
281
+ float sumf = 0.0f;
282
+
283
+ #if defined(__VXE__) || defined(__VXE2__)
284
+ const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
285
+ const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
286
+
287
+ float32x4_t v_acc = vec_splats(0.0f);
288
+
289
+ #pragma GCC unroll 8
290
+ for (; ib + 1 < nb; ib += 2) {
291
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
292
+ const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
293
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
294
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
295
+
296
+ const uint8x16_t v_x0 = vec_xl(0, x0->qs);
297
+ const uint8x16_t v_x1 = vec_xl(0, x1->qs);
298
+
299
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
300
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
301
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
302
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
303
+
304
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
305
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
306
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
307
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
308
+
309
+ const int8x16_t v_y0l = vec_xl(0, y0->qs);
310
+ const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
311
+ const int8x16_t v_y1l = vec_xl(0, y1->qs);
312
+ const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
313
+
314
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
315
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
316
+
317
+ const float32x4_t v_xy0f = vec_float(v_xy0);
318
+ const float32x4_t v_xy1f = vec_float(v_xy1);
319
+
320
+ const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
321
+ const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
322
+
323
+ v_acc = vec_madd(v_xy0f, v_d0, v_acc);
324
+ v_acc = vec_madd(v_xy1f, v_d1, v_acc);
325
+ }
326
+
327
+ for (; ib < nb; ++ib) {
328
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
329
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
330
+
331
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
332
+
333
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
334
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
335
+
336
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
337
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
338
+
339
+ const int8x16_t v_yl = vec_xl(0, y0->qs);
340
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
341
+
342
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
343
+ const float32x4_t v_xyf = vec_float(v_xy);
344
+
345
+ const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
346
+ v_acc = vec_madd(v_xyf, v_d, v_acc);
347
+ }
348
+
349
+ sumf = vec_hsum_f32x4(v_acc);
350
+ *s = sumf;
351
+ #else
352
+ UNUSED(x);
353
+ UNUSED(y);
354
+ UNUSED(ib);
355
+ UNUSED(sumf);
356
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
357
+ #endif
358
+ }
359
+
263
360
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
264
361
  const int qk = QK8_0;
265
362
  const int nb = n / qk;
@@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
636
733
  uint8x16_t q3h[4];
637
734
  uint8x16_t q3b[2];
638
735
  int8x16_t q3bytes[4];
639
- int8x16_t q8bytes[4];
736
+ int8x16_t q8bytes[8];
640
737
  uint8x16_t qhbits[2];
641
738
 
642
739
  float sum = 0;
@@ -160,7 +160,6 @@
160
160
  #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
161
161
  #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
162
162
  #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
163
- #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
164
163
  // repack.cpp
165
164
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
166
165
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
2187
2187
  case GGML_UNARY_OP_GELU_ERF:
2188
2188
  case GGML_UNARY_OP_GELU_QUICK:
2189
2189
  case GGML_UNARY_OP_SILU:
2190
+ case GGML_UNARY_OP_XIELU:
2190
2191
  {
2191
2192
  n_tasks = n_threads;
2192
2193
  } break;
@@ -18,6 +18,10 @@
18
18
  # include "kleidiai/kleidiai.h"
19
19
  #endif
20
20
 
21
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
22
+ # include "spacemit/ime.h"
23
+ #endif
24
+
21
25
  #if defined(_WIN32)
22
26
  # define WIN32_LEAN_AND_MEAN
23
27
  # ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
45
49
  }
46
50
  #endif
47
51
 
52
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
53
+ if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
54
+ bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
55
+ }
56
+ #endif
57
+
48
58
  #ifdef GGML_USE_CPU_KLEIDIAI
49
59
  if (ggml_backend_cpu_kleidiai_buffer_type()) {
50
60
  bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());