@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp
@@ -796,11 +796,11 @@ static bool need_transform(ggml_type type) {
  * @param buffer The CANN buffer from which to initialize the tensor.
  * @param tensor Pointer to the tensor to be initialized.
  */
-static void ggml_backend_cann_buffer_init_tensor(
+static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
-        return;
+        return GGML_STATUS_SUCCESS;
     }

     // TODO: can backend doesn't support quantized yet. Just leave the code
@@ -817,6 +817,7 @@ static void ggml_backend_cann_buffer_init_tensor(
                                   memset_size, 0, memset_size));
         }
     }
+    return GGML_STATUS_SUCCESS;
 }

 // TODO: need handle tensor which has paddings.
@@ -1688,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
         case GGML_OP_MUL_MAT: {
             switch (op->src[0]->type) {
                 case GGML_TYPE_Q8_0:
-                    // Current groupsize should not be greater than k-1 in
-                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
-                    if (op->src[0]->ne[0] <= QK8_0) {
-                        return false;
-                    }
                 case GGML_TYPE_F16:
                 case GGML_TYPE_F32:
                 case GGML_TYPE_Q4_0:
package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp
@@ -1,7 +1,5 @@
 #include "kernel_operator.h"

-#include <cmath>
-
 using namespace AscendC;

 #define BUFFER_NUM 2
@@ -183,7 +181,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
     copy_to_ub(output_ne_gm, output_ne_ub, 32);
     copy_to_ub(output_nb_gm, output_nb_ub, 32);

-    DupByRows<float_t, float_t> op;
+    DupByRows<float, float> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
     op.dup();
 }
@@ -206,7 +204,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
     copy_to_ub(output_ne_gm, output_ne_ub, 32);
     copy_to_ub(output_nb_gm, output_nb_ub, 32);

-    DupByRows<float_t, half> op;
+    DupByRows<float, half> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
     op.dup_with_cast();
 }
@@ -230,7 +228,7 @@ extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
     copy_to_ub(output_ne_gm, output_ne_ub, 32);
     copy_to_ub(output_nb_gm, output_nb_ub, 32);

-    DupByRows<half, float_t> op;
+    DupByRows<half, float> op;
     op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
     op.dup_with_cast();
 }
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt
@@ -111,14 +111,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     function(check_arm_feature tag code)
         set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
         set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}")
-        check_cxx_source_runs(
-            "${code}"
-            GGML_MACHINE_SUPPORTS_${tag}
-        )
+        check_cxx_source_runs("${code}" GGML_MACHINE_SUPPORTS_${tag})
         if (GGML_MACHINE_SUPPORTS_${tag})
             set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE)
         else()
-            set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+            set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+no${tag}")
+            check_cxx_source_compiles("int main() { return 0; }" GGML_MACHINE_SUPPORTS_no${tag})
+            if (GGML_MACHINE_SUPPORTS_no${tag})
+                set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE)
+            endif()
         endif()
         set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
     endfunction()
@@ -126,6 +127,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     check_arm_feature(dotprod "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }")
     check_arm_feature(i8mm "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }")
     check_arm_feature(sve "#include <arm_sve.h>\nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }")
+    check_arm_feature(sme "#include <arm_sme.h>\n__arm_locally_streaming int main() { __asm__ volatile(\"smstart; smstop;\"); return 0; }")

     list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}")
 else()
@@ -150,7 +152,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (ARM_FEATURE_RESULT)
         message(WARNING "Failed to get ARM features")
     else()
-        foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
+        foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
             string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
             if (NOT ${feature_pos} EQUAL -1)
                 message(STATUS "ARM feature ${feature} enabled")
@@ -217,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (GGML_AVX_VNNI)
         list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
     endif()
+    if (GGML_BMI2)
+        # MSVC does not define macro __BMI2__
+        list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+    endif()
 else ()
     if (GGML_NATIVE)
         list(APPEND ARCH_FLAGS -march=native)
@@ -231,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         list(APPEND ARCH_FLAGS -mfma)
         list(APPEND ARCH_DEFINITIONS GGML_FMA)
     endif()
+    if (GGML_BMI2)
+        list(APPEND ARCH_FLAGS -mbmi2)
+        list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+    endif()
     if (GGML_AVX)
         list(APPEND ARCH_FLAGS -mavx)
         list(APPEND ARCH_DEFINITIONS GGML_AVX)
@@ -277,21 +287,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             endif()
         endif()
     endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
     message(STATUS "PowerPC detected")
-    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
-    string(FIND "${POWER10_M}" "POWER10" substring_index)
-    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
-        set(substring_index -1)
+    if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+        file(READ "/proc/cpuinfo" POWER10_M)
+    elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+        execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
     endif()

-    if (${substring_index} GREATER_EQUAL 0)
-        list(APPEND ARCH_FLAGS -mcpu=power10)
+    string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+    string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+
+    if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+        list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+    elseif (EXTRACTED_NUMBER EQUAL 9)
+        list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
     else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
     message(STATUS "loongarch64 detected")
@@ -308,6 +322,27 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (GGML_RVV)
         list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
     endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+    message(STATUS "s390x detected")
+    file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
+    string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
+
+    # TODO: Separation to determine activation of VX/VXE/VXE2
+    if (${S390X_M} MATCHES "8561|8562")
+        message(STATUS "z15 target")
+        list(APPEND ARCH_FLAGS -march=z15 -mtune=z15)
+    elseif (${S390X_M} MATCHES "3931")
+        message(STATUS "z16 target")
+        list(APPEND ARCH_FLAGS -march=z16 -mtune=z16)
+    else()
+        message(STATUS "Unknown target")
+        message(WARNING "Unknown target. If you are compiling for z14 and earlier, you might have to add -DGGML_VXE=OFF.")
+        list(APPEND ARCH_FLAGS -march=native -mtune=native)
+    endif()
+
+    if (GGML_VXE)
+        list(APPEND ARCH_FLAGS -mvx -mzvector)
+    endif()
 else()
     message(STATUS "Unknown architecture")
 endif()
@@ -316,6 +351,94 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
 endif()

+if (GGML_CPU_KLEIDIAI)
+    message(STATUS "Using KleidiAI optimized kernels if applicable")
+
+    # Disable the KleidiAI tests
+    set(KLEIDIAI_BUILD_TESTS OFF)
+
+    # Fetch KleidiAI sources:
+    include(FetchContent)
+    set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+    set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
+    set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
+
+    if (POLICY CMP0135)
+        cmake_policy(SET CMP0135 NEW)
+    endif()
+
+    FetchContent_Declare(KleidiAI_Download
+        URL ${KLEIDIAI_DOWNLOAD_URL}
+        DOWNLOAD_EXTRACT_TIMESTAMP NEW
+        URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
+
+    FetchContent_MakeAvailable(KleidiAI_Download)
+    FetchContent_GetProperties(KleidiAI_Download
+        SOURCE_DIR  KLEIDIAI_SRC
+        POPULATED   KLEIDIAI_POPULATED)
+
+    if (NOT KLEIDIAI_POPULATED)
+        message(FATAL_ERROR "KleidiAI source downloaded failed.")
+    endif()
+
+    add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
+
+    # Remove kleidiai target after fetching it
+    if (TARGET kleidiai)
+        set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
+    endif()
+
+    list(APPEND GGML_CPU_SOURCES
+        ggml-cpu/kleidiai/kleidiai.cpp
+        ggml-cpu/kleidiai/kernels.cpp
+        ggml-cpu/kleidiai/kleidiai.h
+        ggml-cpu/kleidiai/kernels.h
+        )
+
+    # KleidiAI
+    include_directories(
+        ${KLEIDIAI_SRC}/
+        ${KLEIDIAI_SRC}/kai/
+        ${KLEIDIAI_SRC}/kai/ukernels/
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+        ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)
+
+    set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
+    if (NOT ARCH_FLAGS_TEMP)
+        string(REGEX MATCH "-march=[^ ]+" ARCH_FLAGS_TEMP "${CMAKE_C_FLAGS}")
+    endif()
+    string(FIND "${ARCH_FLAGS_TEMP}" "+dotprod" DOTPROD_ENABLED)
+    string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
+    string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)
+
+    set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
+
+    list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
+    list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
+    list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
+    list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
+
+    if (NOT DOTPROD_ENABLED MATCHES -1)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
    endif()
+
+    if (NOT I8MM_ENABLED MATCHES -1)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.c)
+    endif()
+
+    if (NOT SME_ENABLED MATCHES -1)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
+        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
+        set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
+    endif()
+
+    set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
+    list(APPEND GGML_CPU_SOURCES ${GGML_KLEIDIAI_SOURCES})
+endif()
+
 message(STATUS "Adding CPU backend variant ${GGML_CPU_NAME}: ${ARCH_FLAGS} ${ARCH_DEFINITIONS}")
 target_sources(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_SOURCES})
 target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp
@@ -50,10 +50,11 @@ static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *) (buffer->context);
 }

-static void ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     tensor->extra = (void *) ggml::cpu::amx::get_tensor_traits(buffer, tensor);

     GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }

 static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
     if (!is.SSE42()) { return 0; }
     score += 1<<2;
 #endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
 #ifdef GGML_AVX
     if (!is.AVX()) { return 0; }
     score += 1<<4;
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
@@ -4135,10 +4135,11 @@ static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(con
     return nullptr;
 }

-static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_aarch64_get_optimal_repack_type(tensor));

     GGML_UNUSED(buffer);
+    return GGML_STATUS_SUCCESS;
 }

 static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
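Note: the CANN, AMX, and CPU-aarch64 hunks above all apply the same interface change, also reflected in the one-line edit to ggml/src/ggml-backend-impl.h in the file list: a buffer's init_tensor callback now returns enum ggml_status instead of void, so initialization failures can be reported to the caller. The sketch below is not taken from this package; it is a minimal illustration, assuming only the standard ggml headers, of what a backend callback looks like under the new signature. my_backend_buffer_init_tensor and my_backend_prepare_storage are hypothetical names used for illustration.

    // Hypothetical backend callback under the new init_tensor signature (illustration only).
    #include <stdbool.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // hypothetical helper: a real backend would prepare device-side storage here
    static bool my_backend_prepare_storage(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
        (void) buffer; (void) tensor;
        return true;
    }

    static enum ggml_status my_backend_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                                          struct ggml_tensor * tensor) {
        if (tensor->view_src != NULL) {
            // views reuse the storage of their source tensor; nothing to initialize
            return GGML_STATUS_SUCCESS;
        }
        if (!my_backend_prepare_storage(buffer, tensor)) {
            return GGML_STATUS_ALLOC_FAILED; // failure can now be propagated instead of ignored
        }
        return GGML_STATUS_SUCCESS;
    }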
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -59,6 +59,15 @@ struct ggml_compute_params {
 #endif
 #endif

+#if defined(__s390x__) && defined(__VEC__)
+#ifndef __VXE__
+#define __VXE__
+#endif
+#ifndef __VXE2__
+#define __VXE2__
+#endif
+#endif
+
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 #include <sys/prctl.h>
@@ -359,6 +368,148 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
 #endif
 #endif

+#if defined(__VXE__) || defined(__VXE2__)
+#include <vecintrin.h>
+
+#define vec_neg(a)    (-(a))                // Vector Negate
+#define vec_add(a, b) ((a) + (b))           // Vector Add
+#define vec_sub(a, b) ((a) - (b))           // Vector Subtract
+#define vec_mul(a, b) ((a) * (b))           // Vector Multiply
+#define vec_div(a, b) ((a) / (b))           // Vector Divide
+#define vec_sl(a, b)  ((a) << (b))          // Vector Shift Left
+#define vec_sra(a, b) ((a) >> (b))          // Vector Shift Right
+#define vec_sr(a, b)  ((a) >> (b))          // Vector Shift Right Algebraic
+#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
+#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
+
+#ifndef vec_and
+#define vec_and(a, b) ((a) & (b)) // Vector AND
+#endif
+
+#ifndef vec_or
+#define vec_or(a, b)  ((a) | (b)) // Vector OR
+#endif
+
+#ifndef vec_xor
+#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
+#endif
+
+typedef signed char char8x16_t __attribute__((vector_size(16)));
+typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
+
+typedef int8_t  int8x16_t __attribute__((vector_size(16)));
+typedef int16_t int16x8_t __attribute__((vector_size(16)));
+typedef int32_t int32x4_t __attribute__((vector_size(16)));
+
+typedef uint8_t  uint8x16_t __attribute__((vector_size(16)));
+typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
+typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
+
+typedef float float32x4_t __attribute__((vector_size(16)));
+typedef double double64x2_t __attribute((vector_size(16)));
+
+typedef signed long long long64x2_t __attribute((vector_size(16)));
+typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vec_xl_u8x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vec_xl_u8x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+    res.val[2] = vec_xl(32, ptr);
+    res.val[3] = vec_xl(48, ptr);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vec_xl_s8x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+    res.val[2] = vec_xl(32, ptr);
+    res.val[3] = vec_xl(48, ptr);
+
+    return res;
+}
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vec_xl_s16x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vec_xl( 0, ptr);
+    res.val[1] = vec_xl(16, ptr);
+
+    return res;
+}
+
+/*
+    ! WARNING: Very slow. Use vec_perm if possible. Refer to iq4_xs
+    ! or iq4_nl for example implementation.
+*/
+inline static int8x16_t ggml_vec_tbl(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
+    const uchar8x16_t v_maske = { 0, 1, 4, 5, 8, 9, 12, 13,
+                                  16, 17, 20, 21, 24, 25, 28, 29 };
+
+    const int16x8_t v_abo = vec_pack((int32x4_t)a, (int32x4_t)b);
+    const int16x8_t v_abe = vec_perm(a, b, v_maske);
+    return v_abo + v_abe;
+}
+
+inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
+    return acc + (vec_unpackh(p) + vec_unpackl(p));
+}
+
+#endif
+
 #if defined(__loongarch_asx)
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(const float val) {