whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -31,6 +31,8 @@
31
31
  #include <mutex>
32
32
  #include <queue>
33
33
  #include <chrono>
34
+ #include <unordered_set>
35
+ #include <optional>
34
36
 
35
37
  #include "ggml-impl.h"
36
38
  #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
93
95
  return id;
94
96
  }
95
97
 
98
+ /**
99
+ * @brief Get the value of the specified environment variable (name).
100
+ * Returns std::nullopt if the variable is unset; otherwise returns its value converted to lowercase.
101
+ */
102
+ std::optional<std::string> get_env(const std::string& name) {
103
+ const char* val = std::getenv(name.c_str());
104
+ if (!val) return std::nullopt;
105
+ std::string res = std::string(val);
106
+ std::transform(res.begin(), res.end(), res.begin(), ::tolower);
107
+ return res;
108
+ }
109
+
110
+ /**
111
+ * @brief Check whether the given string is one of the accepted truthy values ("on", "1", "yes", "y", "enable", "true"); the match is exact and case-sensitive.
112
+ */
113
+ bool parse_bool(const std::string& value) {
114
+ std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
115
+ return valid_values.find(value) != valid_values.end();
116
+ }
117
+
96
118
  /**
97
119
  * @brief Initialize the CANN device information.
98
120
  *
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
214
236
  * @param device The device ID to associate with this buffer pool.
215
237
  */
216
238
  explicit ggml_cann_pool_buf_prio(int device) : device(device) {
217
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
239
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
218
240
  }
219
241
 
220
242
  /**
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
410
432
  * @param device The device ID to associate with this buffer pool.
411
433
  */
412
434
  explicit ggml_cann_pool_buf(int device) : device(device) {
413
- disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
435
+ disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
414
436
  }
415
437
 
416
438
  /**
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
731
753
  */
732
754
  std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
733
755
  int device) {
734
- bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
735
- if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
736
- GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
737
- return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
738
- }
739
- bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
740
- if (enable_buf_prio) {
756
+ std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
757
+
758
+ if (mem_pool_type == "prio") {
741
759
  GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
742
760
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
743
761
  }
762
+
763
+ if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
764
+ GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
765
+ return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
766
+ }
767
+
744
768
  GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
745
769
  return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
746
770
  }
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
1074
1074
  0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
1075
1075
  GGML_TABLE_END()
1076
1076
 
1077
+ GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
1078
+ -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
1079
+ GGML_TABLE_END()
1080
+
1077
1081
  #define NGRID_IQ1S 2048
1078
1082
  #define IQ1S_DELTA 0.125f
1079
1083
  #define IQ1M_DELTA 0.125f
@@ -1,3 +1,17 @@
1
+ function(ggml_add_cpu_backend_features cpu_name arch)
2
+ # The feature detection code is compiled as a separate target so that
3
+ # it can be built without the architecture flags
4
+ # Since multiple variants of the CPU backend may be included in the same
5
+ # build, using set_source_files_properties() to set the arch flags is not possible
6
+ set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
7
+ add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
8
+ target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
9
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
10
+ target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
11
+ set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
12
+ target_link_libraries(${cpu_name} PRIVATE ${GGML_CPU_FEATS_NAME})
13
+ endfunction()
14
+
1
15
  function(ggml_add_cpu_backend_variant_impl tag_name)
2
16
  if (tag_name)
3
17
  set(GGML_CPU_NAME ggml-cpu-${tag_name})
@@ -10,14 +24,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
10
24
  list (APPEND GGML_CPU_SOURCES
11
25
  ggml-cpu/ggml-cpu.c
12
26
  ggml-cpu/ggml-cpu.cpp
13
- ggml-cpu/ggml-cpu-aarch64.cpp
14
- ggml-cpu/ggml-cpu-aarch64.h
15
- ggml-cpu/ggml-cpu-hbm.cpp
16
- ggml-cpu/ggml-cpu-hbm.h
17
- ggml-cpu/ggml-cpu-quants.c
18
- ggml-cpu/ggml-cpu-quants.h
19
- ggml-cpu/ggml-cpu-traits.cpp
20
- ggml-cpu/ggml-cpu-traits.h
27
+ ggml-cpu/repack.cpp
28
+ ggml-cpu/repack.h
29
+ ggml-cpu/hbm.cpp
30
+ ggml-cpu/hbm.h
31
+ ggml-cpu/quants.c
32
+ ggml-cpu/quants.h
33
+ ggml-cpu/traits.cpp
34
+ ggml-cpu/traits.h
21
35
  ggml-cpu/amx/amx.cpp
22
36
  ggml-cpu/amx/amx.h
23
37
  ggml-cpu/amx/mmq.cpp
@@ -82,12 +96,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
82
96
  target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind)
83
97
  endif()
84
98
 
85
- if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
86
- CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
87
- (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
88
- CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
89
-
99
+ if (GGML_SYSTEM_ARCH STREQUAL "ARM")
90
100
  message(STATUS "ARM detected")
101
+ list(APPEND GGML_CPU_SOURCES
102
+ ggml-cpu/arch/arm/quants.c
103
+ ggml-cpu/arch/arm/repack.cpp
104
+ )
91
105
 
92
106
  if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
93
107
  message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
@@ -143,6 +157,49 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
143
157
  else()
144
158
  if (GGML_CPU_ARM_ARCH)
145
159
  list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
160
+ elseif(GGML_CPU_ALL_VARIANTS)
161
+ # Begin with the lowest baseline
162
+ set(ARM_MCPU "armv8-a")
163
+ set(ARCH_TAGS "")
164
+ set(ARCH_DEFINITIONS "")
165
+
166
+ # When a feature is selected, bump the MCPU to the first
167
+ # version that supported it
168
+ if (GGML_INTERNAL_DOTPROD)
169
+ set(ARM_MCPU "armv8.2-a")
170
+ set(ARCH_TAGS "${ARCH_TAGS}+dotprod")
171
+ list(APPEND ARCH_DEFINITIONS GGML_USE_DOTPROD)
172
+ endif()
173
+ if (GGML_INTERNAL_FP16_VECTOR_ARITHMETIC)
174
+ set(ARM_MCPU "armv8.2-a")
175
+ set(ARCH_TAGS "${ARCH_TAGS}+fp16")
176
+ list(APPEND ARCH_DEFINITIONS GGML_USE_FP16_VECTOR_ARITHMETIC)
177
+ endif()
178
+ if (GGML_INTERNAL_SVE)
179
+ set(ARM_MCPU "armv8.2-a")
180
+ set(ARCH_TAGS "${ARCH_TAGS}+sve")
181
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SVE)
182
+ endif()
183
+ if (GGML_INTERNAL_MATMUL_INT8)
184
+ set(ARM_MCPU "armv8.6-a")
185
+ set(ARCH_TAGS "${ARCH_TAGS}+i8mm")
186
+ list(APPEND ARCH_DEFINITIONS GGML_USE_MATMUL_INT8)
187
+ endif()
188
+ if (GGML_INTERNAL_SVE2)
189
+ set(ARM_MCPU "armv8.6-a")
190
+ set(ARCH_TAGS "${ARCH_TAGS}+sve2")
191
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SVE2)
192
+ endif()
193
+ if (GGML_INTERNAL_NOSVE)
194
+ set(ARCH_TAGS "${ARCH_TAGS}+nosve")
195
+ endif()
196
+ if (GGML_INTERNAL_SME)
197
+ set(ARM_MCPU "armv9.2-a")
198
+ set(ARCH_TAGS "${ARCH_TAGS}+sme")
199
+ list(APPEND ARCH_DEFINITIONS GGML_USE_SME)
200
+ endif()
201
+ list(APPEND ARCH_FLAGS "-march=${ARM_MCPU}${ARCH_TAGS}")
202
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} arm ${ARCH_DEFINITIONS})
146
203
  endif()
147
204
  endif()
148
205
 
@@ -170,11 +227,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
170
227
  endforeach()
171
228
  endif()
172
229
  endif()
173
- elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
174
- (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
175
- CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
176
-
230
+ elseif (GGML_SYSTEM_ARCH STREQUAL "x86")
177
231
  message(STATUS "x86 detected")
232
+ list(APPEND GGML_CPU_SOURCES
233
+ ggml-cpu/arch/x86/quants.c
234
+ ggml-cpu/arch/x86/repack.cpp
235
+ )
178
236
 
179
237
  if (MSVC)
180
238
  # instruction set detection for MSVC only
@@ -305,21 +363,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
305
363
  # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
306
364
  message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
307
365
  endif()
308
-
309
- # The feature detection code is compiled as a separate target so that
310
- # it can be built without the architecture flags
311
- # Since multiple variants of the CPU backend may be included in the same
312
- # build, using set_source_files_properties() to set the arch flags is not possible
313
- set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
314
- add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
315
- target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
316
- target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
317
- target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
318
- set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
319
- target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
366
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} x86 ${ARCH_DEFINITIONS})
320
367
  endif()
321
- elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
368
+ elseif (GGML_SYSTEM_ARCH STREQUAL "PowerPC")
322
369
  message(STATUS "PowerPC detected")
370
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/powerpc/quants.c)
323
371
  if (GGML_NATIVE)
324
372
  if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
325
373
  file(READ "/proc/cpuinfo" POWER10_M)
@@ -327,7 +375,8 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
327
375
  execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
328
376
  endif()
329
377
 
330
- string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
378
+ string(TOUPPER "${POWER10_M}" POWER10_M_UPPER)
379
+ string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
331
380
  string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
332
381
 
333
382
  if (EXTRACTED_NUMBER GREATER_EQUAL 10)
@@ -339,13 +388,35 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
339
388
  else()
340
389
  list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
341
390
  endif()
391
+ elseif(GGML_CPU_ALL_VARIANTS)
392
+ # Begin with the lowest baseline
393
+ set(ARCH_DEFINITIONS "")
394
+
395
+ # When a feature is selected, bump the MCPU to the first
396
+ # version that supported it
397
+ foreach(PVER RANGE 7 11)
398
+ if(DEFINED GGML_INTERNAL_POWER${PVER})
399
+ set(POWERPC_MCPU "power${PVER}")
400
+ list(APPEND ARCH_DEFINITIONS GGML_USE_POWER${PVER})
401
+ endif()
402
+ endforeach()
403
+ if (GGML_INTERNAL_VSX)
404
+ list(APPEND ARCH_DEFINITIONS GGML_USE_VSX)
405
+ list(APPEND ARCH_FLAGS -mvsx)
406
+ endif()
407
+
408
+ if (DEFINED POWERPC_MCPU)
409
+ list(APPEND ARCH_FLAGS -mcpu=${POWERPC_MCPU})
410
+ endif()
411
+ ggml_add_cpu_backend_features(${GGML_CPU_NAME} powerpc ${ARCH_DEFINITIONS})
342
412
  else()
343
413
  if (GGML_CPU_POWERPC_CPUTYPE)
344
414
  list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
345
415
  endif()
346
416
  endif()
347
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
417
+ elseif (GGML_SYSTEM_ARCH STREQUAL "loongarch64")
348
418
  message(STATUS "loongarch64 detected")
419
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/loongarch/quants.c)
349
420
 
350
421
  list(APPEND ARCH_FLAGS -march=loongarch64)
351
422
  if (GGML_LASX)
@@ -354,8 +425,12 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
354
425
  if (GGML_LSX)
355
426
  list(APPEND ARCH_FLAGS -mlsx)
356
427
  endif()
357
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
358
- message(STATUS "RISC-V detected")
428
+ elseif (GGML_SYSTEM_ARCH STREQUAL "riscv64")
429
+ message(STATUS "riscv64 detected")
430
+ list(APPEND GGML_CPU_SOURCES
431
+ ggml-cpu/arch/riscv/quants.c
432
+ ggml-cpu/arch/riscv/repack.cpp
433
+ )
359
434
  if (GGML_RVV)
360
435
  if (GGML_XTHEADVECTOR)
361
436
  list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
@@ -365,13 +440,15 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
365
440
  list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
366
441
  endif()
367
442
  endif()
368
- elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
443
+ elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
369
444
  message(STATUS "s390x detected")
445
+ list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
370
446
  file(READ "/proc/cpuinfo" CPUINFO_CONTENTS)
371
447
  string(REGEX REPLACE "machine[ \t\r\n]*=[ \t\r\n]*([0-9]+)" "\\1" S390X_M ${CPUINFO_CONTENTS})
372
448
 
373
449
  # TODO: Separation to determine activation of VX/VXE/VXE2
374
450
  if (${S390X_M} MATCHES "8561|8562")
451
+ set(GGML_NNPA OFF)
375
452
  message(STATUS "z15 target")
376
453
  list(APPEND ARCH_FLAGS -march=z15)
377
454
  elseif (${S390X_M} MATCHES "3931")
@@ -388,14 +465,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
388
465
  endif()
389
466
 
390
467
  if (GGML_VXE)
468
+ message(STATUS "VX/VXE/VXE2 enabled")
391
469
  list(APPEND ARCH_FLAGS -mvx -mzvector)
470
+ list(APPEND ARCH_DEFINITIONS GGML_VXE)
471
+ endif()
472
+
473
+ if (GGML_NNPA)
474
+ message(STATUS "NNPA enabled")
475
+ list(APPEND ARCH_DEFINITIONS GGML_NNPA)
392
476
  endif()
477
+ elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
478
+ message(STATUS "Wasm detected")
479
+ list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)
393
480
  else()
394
- message(STATUS "Unknown architecture")
481
+ message(WARNING "Unknown CPU architecture. Falling back to generic implementations.")
482
+ list(APPEND ARCH_FLAGS -DGGML_CPU_GENERIC)
395
483
  endif()
396
484
 
397
- if (GGML_CPU_AARCH64)
398
- target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_AARCH64)
485
+ if (GGML_CPU_REPACK)
486
+ target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_USE_CPU_REPACK)
399
487
  endif()
400
488
 
401
489
  if (GGML_CPU_KLEIDIAI)
@@ -406,9 +494,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
406
494
 
407
495
  # Fetch KleidiAI sources:
408
496
  include(FetchContent)
409
- set(KLEIDIAI_COMMIT_TAG "v1.6.0")
497
+ set(KLEIDIAI_COMMIT_TAG "v1.9.0")
410
498
  set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
411
- set(KLEIDIAI_ARCHIVE_MD5 "75b4ad68f25ab673dcc01065e5a0b05f")
499
+ set(KLEIDIAI_ARCHIVE_MD5 "2a8e1bb55d201557553545536489a017")
412
500
 
413
501
  if (POLICY CMP0135)
414
502
  cmake_policy(SET CMP0135 NEW)
@@ -501,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
501
589
  if (EMSCRIPTEN)
502
590
  set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
503
591
  endif()
592
+
593
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
594
+ # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
595
+ target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
596
+ endif()
504
597
  endfunction()
@@ -5,7 +5,7 @@
5
5
  #include "ggml-backend.h"
6
6
  #include "ggml-impl.h"
7
7
  #include "ggml-cpu.h"
8
- #include "ggml-cpu-traits.h"
8
+ #include "traits.h"
9
9
 
10
10
  #if defined(__gnu_linux__)
11
11
  #include <sys/syscall.h>
@@ -8,7 +8,8 @@
8
8
  #include "mmq.h"
9
9
  #include "ggml-impl.h"
10
10
  #include "ggml-cpu-impl.h"
11
- #include "ggml-cpu-quants.h"
11
+ #include "simd-mappings.h"
12
+ #include "quants.h"
12
13
  #include "ggml-quants.h"
13
14
  #include <algorithm>
14
15
  #include <type_traits>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
453
454
 
454
455
  // Quantize these floats
455
456
  const float iscale = 127.f / amax;
456
- y[i].d = GGML_FP32_TO_FP16(1 / iscale);
457
+ y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale);
457
458
  const float id = ( amax != 0.0f ) ? iscale : 0.f;
458
459
  const __m512 vscale = _mm512_set1_ps(id);
459
460
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
1090
1091
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
1091
1092
 
1092
1093
  for (int m = 0; m < nr; ++m) {
1093
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1094
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1094
1095
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1095
1096
 
1096
1097
  __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
1113
1114
  const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half))));
1114
1115
 
1115
1116
  for (int m = 0; m < nr; ++m) {
1116
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1117
- const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s));
1117
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1118
+ const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s));
1118
1119
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1119
1120
 
1120
1121
  __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
1137
1138
  const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
1138
1139
 
1139
1140
  for (int m = 0; m < nr; ++m) {
1140
- const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d));
1141
+ const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d));
1141
1142
  const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
1142
1143
 
1143
1144
  __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
1437
1438
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1438
1439
  vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
1439
1440
  }
1440
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1441
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1441
1442
  }
1442
1443
 
1443
1444
  // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
1498
1499
  for (int k = 0; k < 8; ++k) {
1499
1500
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1500
1501
  }
1501
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1502
- vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s));
1502
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1503
+ vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
1503
1504
  }
1504
1505
 
1505
1506
  // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
1571
1572
  va[k] = _mm512_set1_epi32(a_ptr[k]);
1572
1573
  va[k] = _mm512_add_epi8(va[k], off);
1573
1574
  }
1574
- vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d));
1575
+ vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
1575
1576
  }
1576
1577
 
1577
1578
  // load b
@@ -0,0 +1,94 @@
1
+ #include "ggml-backend-impl.h"
2
+
3
+ #if defined(__aarch64__)
4
+
5
+ #if defined(__linux__)
6
+ #include <sys/auxv.h>
7
+ #elif defined(__APPLE__)
8
+ #include <sys/sysctl.h>
9
+ #endif
10
+
11
+ #if !defined(HWCAP2_I8MM) // fallback: only defined here when the included headers did not already provide it
12
+ #define HWCAP2_I8MM (1 << 13)
13
+ #endif
14
+
15
+ #if !defined(HWCAP2_SME) // fallback: only defined here when the included headers did not already provide it
16
+ #define HWCAP2_SME (1 << 23)
17
+ #endif
18
+
19
+ struct aarch64_features { // optional AArch64 CPU features; all flags default to false and are filled in by the constructor
20
+ // has_neon is not needed: aarch64 has NEON guaranteed
21
+ bool has_dotprod = false; // Linux: HWCAP_ASIMDDP, Apple: hw.optional.arm.FEAT_DotProd
22
+ bool has_fp16_va = false; // Linux: HWCAP_FPHP; NOTE(review): never set in the Apple branch below - confirm this is intended
23
+ bool has_sve = false; // Linux: HWCAP_SVE; left false in the Apple branch
24
+ bool has_sve2 = false; // Linux: HWCAP2_SVE2; left false in the Apple branch
25
+ bool has_i8mm = false; // Linux: HWCAP2_I8MM, Apple: hw.optional.arm.FEAT_I8MM
26
+ bool has_sme = false; // Linux: HWCAP2_SME, Apple: hw.optional.arm.FEAT_SME
27
+
28
+ aarch64_features() { // on platforms that are neither Linux nor Apple no flag is touched, so all stay false
29
+ #if defined(__linux__)
30
+ uint32_t hwcap = getauxval(AT_HWCAP); // stored as 32 bits; NOTE(review): any higher bits of the returned value are dropped - confirm none are needed
31
+ uint32_t hwcap2 = getauxval(AT_HWCAP2); // same 32-bit narrowing; the highest bit tested below via the fallback defines is bit 23
32
+
33
+ has_dotprod = !!(hwcap & HWCAP_ASIMDDP); // !! collapses the masked bit to a bool
34
+ has_fp16_va = !!(hwcap & HWCAP_FPHP);
35
+ has_sve = !!(hwcap & HWCAP_SVE);
36
+ has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
37
+ has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
38
+ has_sme = !!(hwcap2 & HWCAP2_SME);
39
+ #elif defined(__APPLE__)
40
+ int oldp = 0; // receives the value of each sysctl query; reused across the three calls
41
+ size_t size = sizeof(oldp); // passed by address to every query below
42
+
43
+ if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) { // flag is only updated when the call returns 0
44
+ has_dotprod = static_cast<bool>(oldp);
45
+ }
46
+
47
+ if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) { // on a non-zero return has_i8mm keeps its false default
48
+ has_i8mm = static_cast<bool>(oldp);
49
+ }
50
+
51
+ if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) { // on a non-zero return has_sme keeps its false default
52
+ has_sme = static_cast<bool>(oldp);
53
+ }
54
+
55
+ // Apple apparently does not implement SVE yet, so has_sve / has_sve2 are not queried here
56
+ #endif
57
+ }
58
+ };
59
+
60
+ static int ggml_backend_cpu_aarch64_score() { // returns 0 when a feature selected by a GGML_USE_* macro is not detected, otherwise 1 plus one distinct bit per selected feature
61
+ int score = 1; // base value; returned unchanged when none of the GGML_USE_* macros below is defined
62
+ aarch64_features af; // the constructor performs the runtime feature detection
63
+
64
+ #ifdef GGML_USE_DOTPROD
65
+ if (!af.has_dotprod) { return 0; } // selected at compile time but not detected at runtime: report 0
66
+ score += 1<<1;
67
+ #endif
68
+ #ifdef GGML_USE_FP16_VECTOR_ARITHMETIC
69
+ if (!af.has_fp16_va) { return 0; } // NOTE(review): has_fp16_va is never set in the Apple detection branch, so this returns 0 there - confirm
70
+ score += 1<<2;
71
+ #endif
72
+ #ifdef GGML_USE_SVE
73
+ if (!af.has_sve) { return 0; }
74
+ score += 1<<3;
75
+ #endif
76
+ #ifdef GGML_USE_MATMUL_INT8
77
+ if (!af.has_i8mm) { return 0; }
78
+ score += 1<<4;
79
+ #endif
80
+ #ifdef GGML_USE_SVE2
81
+ if (!af.has_sve2) { return 0; }
82
+ score += 1<<5;
83
+ #endif
84
+ #ifdef GGML_USE_SME
85
+ if (!af.has_sme) { return 0; }
86
+ score += 1<<6; // largest weight in this function
87
+ #endif
88
+
89
+ return score; // each feature adds a different power of two, so the sum identifies the exact set of selected features
90
+ }
91
+
92
+ GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_aarch64_score)
93
+
94
+ # endif // defined(__aarch64__)