@fugood/llama.node 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +33 -11
  3. package/src/llama.cpp/CMakeLists.txt +1 -0
  4. package/src/llama.cpp/common/CMakeLists.txt +46 -2
  5. package/src/llama.cpp/common/arg.cpp +423 -186
  6. package/src/llama.cpp/common/arg.h +0 -1
  7. package/src/llama.cpp/common/chat-parser.cpp +154 -13
  8. package/src/llama.cpp/common/chat-parser.h +3 -0
  9. package/src/llama.cpp/common/chat.cpp +217 -6
  10. package/src/llama.cpp/common/chat.h +5 -3
  11. package/src/llama.cpp/common/common.cpp +23 -6
  12. package/src/llama.cpp/common/common.h +6 -4
  13. package/src/llama.cpp/common/http.h +73 -0
  14. package/src/llama.cpp/common/sampling.cpp +1 -0
  15. package/src/llama.cpp/ggml/CMakeLists.txt +7 -6
  16. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -1
  17. package/src/llama.cpp/ggml/include/ggml-rpc.h +8 -9
  18. package/src/llama.cpp/ggml/include/ggml-zdnn.h +3 -0
  19. package/src/llama.cpp/ggml/include/ggml.h +22 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +12 -12
  24. package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +100 -3
  25. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +2 -2
  26. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
  27. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +18 -3
  28. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -0
  29. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +209 -96
  30. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +32 -44
  31. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +107 -83
  32. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +27 -19
  33. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +8 -8
  34. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1024 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +103 -0
  39. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +1 -0
  40. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +66 -0
  41. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +6 -5
  42. package/src/llama.cpp/include/llama.h +23 -11
  43. package/src/llama.cpp/src/llama-arch.cpp +93 -0
  44. package/src/llama.cpp/src/llama-arch.h +22 -0
  45. package/src/llama.cpp/src/llama-chat.cpp +1 -1
  46. package/src/llama.cpp/src/llama-context.cpp +157 -0
  47. package/src/llama.cpp/src/llama-context.h +10 -0
  48. package/src/llama.cpp/src/llama-graph.cpp +57 -22
  49. package/src/llama.cpp/src/llama-graph.h +10 -1
  50. package/src/llama.cpp/src/llama-hparams.h +17 -2
  51. package/src/llama.cpp/src/llama-kv-cache-iswa.cpp +10 -2
  52. package/src/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
  53. package/src/llama.cpp/src/llama-kv-cache.cpp +10 -5
  54. package/src/llama.cpp/src/llama-kv-cache.h +2 -0
  55. package/src/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
  56. package/src/llama.cpp/src/llama-memory-hybrid.h +2 -0
  57. package/src/llama.cpp/src/llama-memory-recurrent.cpp +19 -3
  58. package/src/llama.cpp/src/llama-memory-recurrent.h +3 -0
  59. package/src/llama.cpp/src/llama-memory.h +3 -0
  60. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  61. package/src/llama.cpp/src/llama-model.cpp +582 -45
  62. package/src/llama.cpp/src/llama-model.h +23 -1
  63. package/src/llama.cpp/src/llama-sampling.cpp +5 -0
  64. package/src/llama.cpp/src/llama-vocab.cpp +7 -1
  65. package/src/llama.cpp/src/llama-vocab.h +41 -40
  66. package/src/llama.cpp/src/unicode.h +43 -0
@@ -379,7 +379,7 @@ struct common_params {
379
379
  bool simple_io = false; // improves compatibility with subprocesses and limited consoles
380
380
  bool cont_batching = true; // insert new sequences for decoding on-the-fly
381
381
  bool no_perf = false; // disable performance metrics
382
- bool ctx_shift = false; // context shift on infinite text generation
382
+ bool ctx_shift = false; // context shift on infinite text generation
383
383
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
384
384
  bool kv_unified = false; // enable unified KV cache
385
385
 
@@ -393,6 +393,7 @@ struct common_params {
393
393
  bool check_tensors = false; // validate tensor data
394
394
  bool no_op_offload = false; // globally disable offload host tensor operations to device
395
395
  bool no_extra_bufts = false; // disable extra buffer types (used for weight repacking)
396
+ bool no_host = false; // bypass host buffer allowing extra buffers to be used
396
397
 
397
398
  bool single_turn = false; // single turn chat conversation
398
399
 
@@ -425,7 +426,8 @@ struct common_params {
425
426
  int32_t timeout_write = timeout_read; // http write timeout in seconds
426
427
  int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
427
428
  int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
428
- int32_t n_swa_checkpoints = 3; // max number of SWA checkpoints per slot
429
+ int32_t n_ctx_checkpoints = 8; // max number of context checkpoints per slot
430
+ int32_t cache_ram_mib = 8192; // 0 = no limit, 1 = 1 MiB, etc.
429
431
 
430
432
  std::string hostname = "127.0.0.1";
431
433
  std::string public_path = ""; // NOLINT
@@ -433,7 +435,7 @@ struct common_params {
433
435
  std::string chat_template = ""; // NOLINT
434
436
  bool use_jinja = false; // NOLINT
435
437
  bool enable_chat_template = true;
436
- common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
438
+ common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
437
439
  int reasoning_budget = -1;
438
440
  bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
439
441
 
@@ -739,7 +741,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
739
741
  // MoE utils
740
742
  //
741
743
 
742
- const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
744
+ const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
743
745
 
744
746
  static std::string llm_ffn_exps_block_regex(int idx) {
745
747
  return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
@@ -0,0 +1,73 @@
1
+ #pragma once
2
+
3
+ #include <cpp-httplib/httplib.h>
4
+
5
+ struct common_http_url {
6
+ std::string scheme;
7
+ std::string user;
8
+ std::string password;
9
+ std::string host;
10
+ std::string path;
11
+ };
12
+
13
+ static common_http_url common_http_parse_url(const std::string & url) {
14
+ common_http_url parts;
15
+ auto scheme_end = url.find("://");
16
+
17
+ if (scheme_end == std::string::npos) {
18
+ throw std::runtime_error("invalid URL: no scheme");
19
+ }
20
+ parts.scheme = url.substr(0, scheme_end);
21
+
22
+ if (parts.scheme != "http" && parts.scheme != "https") {
23
+ throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
24
+ }
25
+
26
+ auto rest = url.substr(scheme_end + 3);
27
+ auto at_pos = rest.find('@');
28
+
29
+ if (at_pos != std::string::npos) {
30
+ auto auth = rest.substr(0, at_pos);
31
+ auto colon_pos = auth.find(':');
32
+ if (colon_pos != std::string::npos) {
33
+ parts.user = auth.substr(0, colon_pos);
34
+ parts.password = auth.substr(colon_pos + 1);
35
+ } else {
36
+ parts.user = auth;
37
+ }
38
+ rest = rest.substr(at_pos + 1);
39
+ }
40
+
41
+ auto slash_pos = rest.find('/');
42
+
43
+ if (slash_pos != std::string::npos) {
44
+ parts.host = rest.substr(0, slash_pos);
45
+ parts.path = rest.substr(slash_pos);
46
+ } else {
47
+ parts.host = rest;
48
+ parts.path = "/";
49
+ }
50
+ return parts;
51
+ }
52
+
53
+ static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
54
+ common_http_url parts = common_http_parse_url(url);
55
+
56
+ if (parts.host.empty()) {
57
+ throw std::runtime_error("error: invalid URL format");
58
+ }
59
+
60
+ httplib::Client cli(parts.scheme + "://" + parts.host);
61
+
62
+ if (!parts.user.empty()) {
63
+ cli.set_basic_auth(parts.user, parts.password);
64
+ }
65
+
66
+ cli.set_follow_location(true);
67
+
68
+ return { std::move(cli), std::move(parts) };
69
+ }
70
+
71
+ static std::string common_http_show_masked_url(const common_http_url & parts) {
72
+ return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
73
+ }
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
332
332
  }
333
333
  if (ctx) {
334
334
  llama_perf_context_print(ctx);
335
+ llama_memory_breakdown_print(ctx);
335
336
  }
336
337
  }
337
338
 
@@ -4,8 +4,7 @@ project("ggml" C CXX ASM)
4
4
  ### GGML Version
5
5
  set(GGML_VERSION_MAJOR 0)
6
6
  set(GGML_VERSION_MINOR 9)
7
- set(GGML_VERSION_PATCH 0)
8
- set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
7
+ set(GGML_VERSION_PATCH 4)
9
8
  set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
10
9
 
11
10
  find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
@@ -26,8 +25,8 @@ if(GIT_EXE)
26
25
  )
27
26
  endif()
28
27
 
29
- # Build the version string with optional -dev suffix and dirty flag
30
- set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
28
+ # Build the version string with optional dirty flag
29
+ set(GGML_VERSION "${GGML_VERSION_BASE}")
31
30
  if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
32
31
  set(GGML_VERSION "${GGML_VERSION}-dirty")
33
32
  endif()
@@ -177,7 +176,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
177
176
 
178
177
 
179
178
  if (MINGW)
180
- set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
179
+ set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
181
180
  endif()
182
181
 
183
182
  # ggml core
@@ -210,7 +209,6 @@ option(GGML_HIP "ggml: use HIP"
210
209
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
211
210
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
212
211
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
213
- option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
214
212
  option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
215
213
  option(GGML_HIP_EXPORT_METRICS "ggml: enable kernel perf metrics output" OFF)
216
214
  option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
@@ -224,6 +222,9 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
224
222
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
225
223
  option(GGML_WEBGPU "ggml: use WebGPU" OFF)
226
224
  option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
225
+ option(GGML_WEBGPU_CPU_PROFILE "ggml: enable WebGPU profiling (CPU)" OFF)
226
+ option(GGML_WEBGPU_GPU_PROFILE "ggml: enable WebGPU profiling (GPU)" OFF)
227
+
227
228
  option(GGML_ZDNN "ggml: use zDNN" OFF)
228
229
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
229
230
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -215,6 +215,8 @@ extern "C" {
215
215
  // Backend registry
216
216
  //
217
217
 
218
+ GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
219
+
218
220
  GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
219
221
 
220
222
  // Backend (reg) enumeration
@@ -314,7 +316,8 @@ extern "C" {
314
316
  GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
315
317
  GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
316
318
 
317
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
319
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
320
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
318
321
 
319
322
  GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
320
323
  GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
@@ -7,26 +7,25 @@
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
- #define RPC_PROTO_MAJOR_VERSION 2
10
+ #define RPC_PROTO_MAJOR_VERSION 3
11
11
  #define RPC_PROTO_MINOR_VERSION 0
12
12
  #define RPC_PROTO_PATCH_VERSION 0
13
13
  #define GGML_RPC_MAX_SERVERS 16
14
14
 
15
15
  // backend API
16
- GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
16
+ GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
17
17
  GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
18
18
 
19
- GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
19
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);
20
20
 
21
- GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
21
+ GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
22
22
 
23
- GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
24
- const char * cache_dir,
25
- size_t free_mem, size_t total_mem);
23
+ GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
24
+ size_t n_threads, size_t n_devices,
25
+ ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
26
26
 
27
27
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
28
-
29
- GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
28
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
30
29
 
31
30
  #ifdef __cplusplus
32
31
  }
@@ -7,6 +7,9 @@
7
7
  extern "C" {
8
8
  #endif
9
9
 
10
+ // device buffer
11
+ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
12
+
10
13
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
11
14
 
12
15
  #ifdef __cplusplus
@@ -237,6 +237,8 @@
237
237
  #define GGML_EXIT_SUCCESS 0
238
238
  #define GGML_EXIT_ABORTED 1
239
239
 
240
+ // TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
241
+ #define GGML_ROPE_TYPE_NORMAL 0
240
242
  #define GGML_ROPE_TYPE_NEOX 2
241
243
  #define GGML_ROPE_TYPE_MROPE 8
242
244
  #define GGML_ROPE_TYPE_VISION 24
@@ -574,6 +576,7 @@ extern "C" {
574
576
  GGML_UNARY_OP_HARDSIGMOID,
575
577
  GGML_UNARY_OP_EXP,
576
578
  GGML_UNARY_OP_GELU_ERF,
579
+ GGML_UNARY_OP_XIELU,
577
580
 
578
581
  GGML_UNARY_OP_COUNT,
579
582
  };
@@ -1148,6 +1151,18 @@ extern "C" {
1148
1151
  struct ggml_context * ctx,
1149
1152
  struct ggml_tensor * a);
1150
1153
 
1154
+ // xIELU activation function
1155
+ // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
1156
+ // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
1157
+ // that constrain the positive and negative source alpha values respectively
1158
+ GGML_API struct ggml_tensor * ggml_xielu(
1159
+ struct ggml_context * ctx,
1160
+ struct ggml_tensor * a,
1161
+ float alpha_n,
1162
+ float alpha_p,
1163
+ float beta,
1164
+ float eps);
1165
+
1151
1166
  // gated linear unit ops
1152
1167
  // A: n columns, r rows,
1153
1168
  // result is n / 2 columns, r rows,
@@ -1615,6 +1630,13 @@ extern "C" {
1615
1630
  float scale,
1616
1631
  float max_bias);
1617
1632
 
1633
+ GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
1634
+ struct ggml_context * ctx,
1635
+ struct ggml_tensor * a,
1636
+ struct ggml_tensor * mask,
1637
+ float scale,
1638
+ float max_bias);
1639
+
1618
1640
  GGML_API void ggml_soft_max_add_sinks(
1619
1641
  struct ggml_tensor * a,
1620
1642
  struct ggml_tensor * sinks);
@@ -145,6 +145,9 @@ endif()
145
145
  # which was introduced in POSIX.1-2008, forcing us to go higher
146
146
  if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
147
147
  add_compile_definitions(_XOPEN_SOURCE=700)
148
+ elseif (CMAKE_SYSTEM_NAME MATCHES "AIX")
149
+ # Don't define _XOPEN_SOURCE. We need _ALL_SOURCE, which is the default,
150
+ # in order to define _SC_PHYS_PAGES.
148
151
  else()
149
152
  add_compile_definitions(_XOPEN_SOURCE=600)
150
153
  endif()
@@ -439,6 +439,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
439
439
  ggml-cpu/arch/riscv/quants.c
440
440
  ggml-cpu/arch/riscv/repack.cpp
441
441
  )
442
+ if (GGML_CPU_RISCV64_SPACEMIT)
443
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_RISCV64_SPACEMIT ${RISCV64_SPACEMIT_IME_SPEC})
444
+ list(APPEND GGML_CPU_SOURCES
445
+ ggml-cpu/spacemit/ime.cpp
446
+ ggml-cpu/spacemit/ime.h
447
+ ggml-cpu/spacemit/ime1_kernels.cpp
448
+ ggml-cpu/spacemit/ime_kernels.h
449
+ )
450
+ endif()
442
451
  set(MARCH_STR "rv64gc")
443
452
  if (GGML_RV_ZFH)
444
453
  string(APPEND MARCH_STR "_zfh")
@@ -504,9 +513,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
504
513
 
505
514
  # Fetch KleidiAI sources:
506
515
  include(FetchContent)
507
- set(KLEIDIAI_COMMIT_TAG "v1.13.0")
516
+ set(KLEIDIAI_COMMIT_TAG "v1.14.0")
508
517
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
509
- set(KLEIDIAI_ARCHIVE_MD5 "d82a8de939d9814621a5ba23907bdac1")
518
+ set(KLEIDIAI_ARCHIVE_MD5 "45e110675d93f99f82c23a1afcca76bc")
510
519
 
511
520
  if (POLICY CMP0135)
512
521
  cmake_policy(SET CMP0135 NEW)
@@ -583,6 +592,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
583
592
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
584
593
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
585
594
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
595
+ ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa_asm.S
586
596
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
587
597
  ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c
588
598
  ${KLEIDIAI_SRC}/kai/kai_common_sme_asm.S)
@@ -149,6 +149,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
149
149
  if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous
150
150
  is_contiguous_2d(op->src[1]) && // src1 must be contiguous
151
151
  op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() &&
152
+ op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315)
152
153
  op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x
153
154
  (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) {
154
155
  // src1 must be host buffer
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
105
105
 
106
106
  return ((v4f32)res)[0];
107
107
  }
108
+
109
+ // multiply int8_t, add results pairwise twice
110
+ static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
111
+ // Get absolute values of x vectors
112
+ const __m128i ax = __lsx_vsigncov_b(x, x);
113
+ // Sign the values of the y vectors
114
+ const __m128i sy = __lsx_vsigncov_b(x, y);
115
+ // Perform multiplication and create 16-bit values
116
+ const __m128i dot = lsx_maddubs_h(ax, sy);
117
+ const __m128i ones = __lsx_vreplgr2vr_h(1);
118
+ return lsx_madd_h(ones, dot);
119
+ }
108
120
  #endif
109
121
 
110
122
  #if defined(__loongarch_asx)
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
323
335
  }
324
336
  }
325
337
 
326
- // multiply int8_t, add results pairwise twice
327
- static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
328
- // Get absolute values of x vectors
329
- const __m128i ax = __lsx_vsigncov_b(x, x);
330
- // Sign the values of the y vectors
331
- const __m128i sy = __lsx_vsigncov_b(x, y);
332
- // Perform multiplication and create 16-bit values
333
- const __m128i dot = lsx_maddubs_h(ax, sy);
334
- const __m128i ones = __lsx_vreplgr2vr_h(1);
335
- return lsx_madd_h(ones, dot);
336
- }
337
-
338
338
  // horizontally add 8 floats
339
339
  static inline float hsum_float_8(const __m256 x) {
340
340
  __m128 res = lasx_extractf128(x, 1);
@@ -75,7 +75,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
75
75
 
76
76
  for (int j = 0; j < 8; j++) {
77
77
  const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
78
- const int32x4_t vi = vec_signed(v);
78
+ /* Uses non-default rounding for vec_signed or vec_round */
79
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
79
80
 
80
81
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
81
82
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -122,7 +123,8 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
122
123
 
123
124
  for (int j = 0; j < 8; j++) {
124
125
  const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
125
- const int32x4_t vi = vec_signed(v);
126
+ /* Uses non-default rounding for vec_signed or vec_round */
127
+ const int32x4_t vi = vec_signed(__builtin_s390_vfisb(v, 4, 1));
126
128
 
127
129
  y[i].qs[4*j + 0] = vec_extract(vi, 0);
128
130
  y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -260,6 +262,101 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
260
262
  #endif
261
263
  }
262
264
 
265
+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
266
+ assert(nrc == 1);
267
+ UNUSED(nrc);
268
+ UNUSED(bx);
269
+ UNUSED(by);
270
+ UNUSED(bs);
271
+ assert(n % QK_MXFP4 == 0);
272
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
273
+
274
+ const int qk = QK_MXFP4;
275
+ const int nb = n / qk;
276
+
277
+ const block_mxfp4 * GGML_RESTRICT x = vx;
278
+ const block_q8_0 * GGML_RESTRICT y = vy;
279
+
280
+ int ib = 0;
281
+ float sumf = 0.0f;
282
+
283
+ #if defined(__VXE__) || defined(__VXE2__)
284
+ const int8x16_t v_k = vec_xl(0, kvalues_mxfp4);
285
+ const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
286
+
287
+ float32x4_t v_acc = vec_splats(0.0f);
288
+
289
+ #pragma GCC unroll 8
290
+ for (; ib + 1 < nb; ib += 2) {
291
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
292
+ const block_mxfp4 * GGML_RESTRICT x1 = &x[ib + 1];
293
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
294
+ const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1];
295
+
296
+ const uint8x16_t v_x0 = vec_xl(0, x0->qs);
297
+ const uint8x16_t v_x1 = vec_xl(0, x1->qs);
298
+
299
+ int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
300
+ int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
301
+ int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
302
+ int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
303
+
304
+ v_x0l = vec_perm(v_k, v_k, (uchar8x16_t)v_x0l);
305
+ v_x0h = vec_perm(v_k, v_k, (uchar8x16_t)v_x0h);
306
+ v_x1l = vec_perm(v_k, v_k, (uchar8x16_t)v_x1l);
307
+ v_x1h = vec_perm(v_k, v_k, (uchar8x16_t)v_x1h);
308
+
309
+ const int8x16_t v_y0l = vec_xl(0, y0->qs);
310
+ const int8x16_t v_y0h = vec_xl(QK8_0/2, y0->qs);
311
+ const int8x16_t v_y1l = vec_xl(0, y1->qs);
312
+ const int8x16_t v_y1h = vec_xl(QK8_0/2, y1->qs);
313
+
314
+ const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0l, v_y0l), v_x0h, v_y0h);
315
+ const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1l, v_y1l), v_x1h, v_y1h);
316
+
317
+ const float32x4_t v_xy0f = vec_float(v_xy0);
318
+ const float32x4_t v_xy1f = vec_float(v_xy1);
319
+
320
+ const float32x4_t v_d0 = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
321
+ const float32x4_t v_d1 = vec_splats(GGML_E8M0_TO_FP32_HALF(x1->e) * GGML_CPU_FP16_TO_FP32(y1->d));
322
+
323
+ v_acc = vec_madd(v_xy0f, v_d0, v_acc);
324
+ v_acc = vec_madd(v_xy1f, v_d1, v_acc);
325
+ }
326
+
327
+ for (; ib < nb; ++ib) {
328
+ const block_mxfp4 * GGML_RESTRICT x0 = &x[ib + 0];
329
+ const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0];
330
+
331
+ const uint8x16_t v_x = vec_xl(0, x0->qs);
332
+
333
+ int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
334
+ int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
335
+
336
+ v_xl = vec_perm(v_k, v_k, (uchar8x16_t)v_xl);
337
+ v_xh = vec_perm(v_k, v_k, (uchar8x16_t)v_xh);
338
+
339
+ const int8x16_t v_yl = vec_xl(0, y0->qs);
340
+ const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
341
+
342
+ const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
343
+ const float32x4_t v_xyf = vec_float(v_xy);
344
+
345
+ const float32x4_t v_d = vec_splats(GGML_E8M0_TO_FP32_HALF(x0->e) * GGML_CPU_FP16_TO_FP32(y0->d));
346
+ v_acc = vec_madd(v_xyf, v_d, v_acc);
347
+ }
348
+
349
+ sumf = vec_hsum_f32x4(v_acc);
350
+ *s = sumf;
351
+ #else
352
+ UNUSED(x);
353
+ UNUSED(y);
354
+ UNUSED(ib);
355
+ UNUSED(sumf);
356
+ ggml_vec_dot_mxfp4_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
357
+ #endif
358
+ }
359
+
263
360
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
264
361
  const int qk = QK8_0;
265
362
  const int nb = n / qk;
@@ -636,7 +733,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
636
733
  uint8x16_t q3h[4];
637
734
  uint8x16_t q3b[2];
638
735
  int8x16_t q3bytes[4];
639
- int8x16_t q8bytes[4];
736
+ int8x16_t q8bytes[8];
640
737
  uint8x16_t qhbits[2];
641
738
 
642
739
  float sum = 0;
@@ -878,7 +878,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
878
878
  const __m256i rhs_raw_mat_89AB_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 64));
879
879
  const __m256i rhs_raw_mat_CDEF_1 = _mm256_loadu_si256((const __m256i *)(b_ptr_1[b].qs + 96));
880
880
 
881
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
881
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
882
882
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
883
883
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
884
884
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -1231,7 +1231,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
1231
1231
  const __m256i rhs_raw_mat_0123_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 64));
1232
1232
  const __m256i rhs_raw_mat_4567_1 = _mm256_loadu_si256((const __m256i *)(b_ptr[b].qs + 96));
1233
1233
 
1234
- // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of valuess
1234
+ // Save the values in the following vectors in the formats B0B1B4B5, B2B3B6B7 for further processing and storing of values
1235
1235
  const __m256i rhs_raw_mat_0145_0 = _mm256_blend_epi32(rhs_raw_mat_0123_0, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_0, requiredOrder), 240);
1236
1236
  const __m256i rhs_raw_mat_2367_0 = _mm256_blend_epi32(_mm256_permutevar8x32_epi32(rhs_raw_mat_0123_0, requiredOrder), rhs_raw_mat_4567_0, 240);
1237
1237
  const __m256i rhs_raw_mat_0145_1 = _mm256_blend_epi32(rhs_raw_mat_0123_1, _mm256_permutevar8x32_epi32(rhs_raw_mat_4567_1, requiredOrder), 240);
@@ -160,7 +160,6 @@
160
160
  #define ggml_vec_dot_iq3_s_q8_K_generic ggml_vec_dot_iq3_s_q8_K
161
161
  #define ggml_vec_dot_iq1_s_q8_K_generic ggml_vec_dot_iq1_s_q8_K
162
162
  #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K
163
- #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
164
163
  // repack.cpp
165
164
  #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
166
165
  #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8
@@ -473,10 +473,10 @@ struct ggml_threadpool {
473
473
  struct ggml_compute_state {
474
474
  #ifndef GGML_USE_OPENMP
475
475
  ggml_thread_t thrd;
476
- bool cpumask[GGML_MAX_N_THREADS];
477
476
  int last_graph;
478
477
  bool pending;
479
478
  #endif
479
+ bool cpumask[GGML_MAX_N_THREADS];
480
480
  struct ggml_threadpool * threadpool;
481
481
  int ith;
482
482
  };
@@ -2187,6 +2187,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
2187
2187
  case GGML_UNARY_OP_GELU_ERF:
2188
2188
  case GGML_UNARY_OP_GELU_QUICK:
2189
2189
  case GGML_UNARY_OP_SILU:
2190
+ case GGML_UNARY_OP_XIELU:
2190
2191
  {
2191
2192
  n_tasks = n_threads;
2192
2193
  } break;
@@ -3081,7 +3082,14 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
3081
3082
 
3082
3083
  threadpool->workers = workers;
3083
3084
 
3084
- #ifndef GGML_USE_OPENMP
3085
+ #ifdef GGML_USE_OPENMP
3086
+ int32_t cpumask_iter = 0;
3087
+
3088
+ // Compute CPU masks for each thread
3089
+ for (int j = 0; j < tpp->n_threads; j++) {
3090
+ ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter);
3091
+ }
3092
+ #else // GGML_USE_OPENMP
3085
3093
  ggml_mutex_init(&threadpool->mutex);
3086
3094
  ggml_cond_init(&threadpool->cond);
3087
3095
 
@@ -3154,7 +3162,14 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
3154
3162
  atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
3155
3163
  }
3156
3164
 
3157
- ggml_graph_compute_thread(&threadpool->workers[omp_get_thread_num()]);
3165
+ // Apply thread CPU mask and priority
3166
+ int ith = omp_get_thread_num();
3167
+
3168
+ ggml_thread_apply_priority(threadpool->prio);
3169
+ if (ggml_thread_cpumask_is_valid(threadpool->workers[ith].cpumask)) {
3170
+ ggml_thread_apply_affinity(threadpool->workers[ith].cpumask);
3171
+ }
3172
+ ggml_graph_compute_thread(&threadpool->workers[ith]);
3158
3173
  }
3159
3174
  } else {
3160
3175
  atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
@@ -18,6 +18,10 @@
18
18
  # include "kleidiai/kleidiai.h"
19
19
  #endif
20
20
 
21
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
22
+ # include "spacemit/ime.h"
23
+ #endif
24
+
21
25
  #if defined(_WIN32)
22
26
  # define WIN32_LEAN_AND_MEAN
23
27
  # ifndef NOMINMAX
@@ -45,6 +49,12 @@ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffer_type
45
49
  }
46
50
  #endif
47
51
 
52
+ #ifdef GGML_USE_CPU_RISCV64_SPACEMIT
53
+ if (ggml_backend_cpu_riscv64_spacemit_buffer_type()) {
54
+ bufts.push_back(ggml_backend_cpu_riscv64_spacemit_buffer_type());
55
+ }
56
+ #endif
57
+
48
58
  #ifdef GGML_USE_CPU_KLEIDIAI
49
59
  if (ggml_backend_cpu_kleidiai_buffer_type()) {
50
60
  bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());