whispercpp 1.3.2 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -3
  3. data/README.md +71 -14
  4. data/Rakefile +20 -7
  5. data/ext/.gitignore +4 -6
  6. data/ext/dependencies.rb +36 -24
  7. data/ext/extconf.rb +1 -1
  8. data/ext/options.rb +48 -184
  9. data/ext/ruby_whisper.c +18 -0
  10. data/ext/ruby_whisper_context.c +43 -12
  11. data/ext/ruby_whisper_model.c +1 -1
  12. data/ext/ruby_whisper_params.c +4 -2
  13. data/ext/ruby_whisper_segment.c +81 -4
  14. data/ext/ruby_whisper_transcribe.cpp +13 -7
  15. data/ext/ruby_whisper_vad_params.c +1 -1
  16. data/ext/sources/CMakeLists.txt +5 -1
  17. data/ext/sources/bindings/javascript/package.json +1 -1
  18. data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
  19. data/ext/sources/examples/addon.node/addon.cpp +150 -31
  20. data/ext/sources/examples/addon.node/index.js +3 -0
  21. data/ext/sources/examples/addon.node/vad-example.js +132 -0
  22. data/ext/sources/examples/bench/bench.cpp +3 -2
  23. data/ext/sources/examples/cli/cli.cpp +3 -2
  24. data/ext/sources/examples/command/command.cpp +32 -8
  25. data/ext/sources/examples/common-whisper.cpp +14 -7
  26. data/ext/sources/examples/lsp/lsp.cpp +2 -0
  27. data/ext/sources/examples/quantize/quantize.cpp +3 -0
  28. data/ext/sources/examples/server/CMakeLists.txt +3 -0
  29. data/ext/sources/examples/server/server.cpp +169 -22
  30. data/ext/sources/examples/stream/stream.cpp +6 -0
  31. data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
  32. data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
  33. data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
  34. data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
  35. data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
  36. data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
  37. data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
  38. data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
  39. data/ext/sources/examples/talk-llama/llama-context.h +38 -17
  40. data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
  41. data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
  42. data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
  43. data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
  44. data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
  45. data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
  46. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
  47. data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
  48. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
  49. data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
  50. data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
  51. data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
  52. data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
  53. data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
  54. data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
  55. data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
  56. data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
  57. data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
  58. data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
  59. data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
  60. data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
  61. data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
  62. data/ext/sources/examples/talk-llama/llama-model.h +27 -0
  63. data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
  64. data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
  65. data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
  66. data/ext/sources/examples/talk-llama/llama.cpp +11 -7
  67. data/ext/sources/examples/talk-llama/llama.h +147 -40
  68. data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
  69. data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
  70. data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
  71. data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
  72. data/ext/sources/ggml/CMakeLists.txt +48 -3
  73. data/ext/sources/ggml/cmake/common.cmake +24 -0
  74. data/ext/sources/ggml/include/ggml-backend.h +1 -1
  75. data/ext/sources/ggml/include/ggml-cpu.h +2 -0
  76. data/ext/sources/ggml/include/ggml.h +144 -5
  77. data/ext/sources/ggml/src/CMakeLists.txt +82 -24
  78. data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
  79. data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
  80. data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  81. data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  82. data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
  83. data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  84. data/ext/sources/ggml/src/ggml-common.h +4 -0
  85. data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
  86. data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  87. data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  88. data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  89. data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  90. data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  91. data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  92. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  93. data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  94. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  95. data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  96. data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  97. data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  98. data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  99. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  100. data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  101. data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
  102. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  103. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
  104. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  105. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  106. data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  107. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  108. data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  109. data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
  110. data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
  111. data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
  112. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  113. data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
  114. data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
  115. data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
  116. data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  117. data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
  118. data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
  119. data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
  120. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  121. data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  122. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  123. data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  124. data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
  125. data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
  126. data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
  127. data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  128. data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  129. data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
  130. data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
  131. data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
  132. data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
  133. data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
  134. data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  135. data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
  136. data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  137. data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
  138. data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
  139. data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  140. data/ext/sources/ggml/src/ggml-impl.h +127 -183
  141. data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  142. data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
  143. data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
  144. data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
  145. data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
  146. data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
  147. data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
  148. data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  149. data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  150. data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  151. data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
  152. data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  153. data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  154. data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  155. data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  156. data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  157. data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  158. data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  159. data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  160. data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  161. data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  162. data/ext/sources/ggml/src/ggml-quants.c +6 -8
  163. data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  164. data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  165. data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  166. data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
  167. data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
  168. data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
  169. data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
  170. data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
  171. data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  172. data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  173. data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  174. data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
  175. data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
  176. data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
  177. data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
  178. data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
  179. data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
  180. data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
  181. data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
  182. data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  183. data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
  184. data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
  185. data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
  186. data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
  187. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  188. data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  189. data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  190. data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  191. data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
  192. data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
  193. data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
  194. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  195. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  196. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
  197. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
  198. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
  199. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
  200. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
  201. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
  202. data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
  203. data/ext/sources/ggml/src/ggml.c +328 -48
  204. data/ext/sources/ggml/src/ggml.cpp +26 -0
  205. data/ext/sources/ggml/src/gguf.cpp +24 -3
  206. data/ext/sources/include/whisper.h +2 -0
  207. data/ext/sources/src/CMakeLists.txt +2 -0
  208. data/ext/sources/src/coreml/whisper-compat.h +10 -0
  209. data/ext/sources/src/coreml/whisper-compat.m +35 -0
  210. data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
  211. data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
  212. data/ext/sources/src/whisper.cpp +218 -169
  213. data/extsources.rb +15 -9
  214. data/lib/whisper/context.rb +15 -0
  215. data/lib/whisper/model/uri.rb +56 -1
  216. data/lib/whisper/segment.rb +58 -0
  217. data/sig/whisper.rbs +68 -38
  218. data/{tests → test}/helper.rb +1 -12
  219. data/{tests → test}/test_model.rb +9 -0
  220. data/test/test_package.rb +51 -0
  221. data/test/test_segment.rb +146 -0
  222. data/{tests → test}/test_whisper.rb +70 -0
  223. data/whispercpp.gemspec +2 -3
  224. metadata +91 -43
  225. data/ext/sources/.dockerignore +0 -3
  226. data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
  227. data/ext/sources/ci/run.sh +0 -336
  228. data/ext/sources/close-issue.yml +0 -28
  229. data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
  230. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  231. data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
  232. data/tests/test_package.rb +0 -46
  233. data/tests/test_segment.rb +0 -74
  234. /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  235. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  236. /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  237. /data/{tests → test}/jfk_reader/.gitignore +0 -0
  238. /data/{tests → test}/jfk_reader/extconf.rb +0 -0
  239. /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
  240. /data/{tests → test}/test_callback.rb +0 -0
  241. /data/{tests → test}/test_error.rb +0 -0
  242. /data/{tests → test}/test_params.rb +0 -0
  243. /data/{tests → test}/test_vad.rb +0 -0
  244. /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,98 @@
1
+ #pragma once
2
+
3
+ #define GGML_COMMON_DECL_CPP
4
+ #include "ggml-common.h"
5
+
6
+ #include "traits.h"
7
+ #include "ggml.h"
8
+
9
+ // GGML internal header
10
+
11
+ ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void);
12
+
13
+ template <int K> constexpr int QK_0() {
14
+ if constexpr (K == 4) {
15
+ return QK4_0;
16
+ }
17
+ if constexpr (K == 8) {
18
+ return QK8_0;
19
+ }
20
+ return -1;
21
+ }
22
+
23
+ template <int K, int N> struct block {
24
+ ggml_half d[N]; // deltas for N qK_0 blocks
25
+ int8_t qs[(QK_0<K>() * N * K) / 8]; // quants for N qK_0 blocks
26
+ };
27
+
28
+ // control size
29
+ static_assert(sizeof(block<4, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 2, "wrong block<4,4> size/padding");
30
+ static_assert(sizeof(block<4, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<4,8> size/padding");
31
+ static_assert(sizeof(block<8, 4>) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong block<8,4> size/padding");
32
+ static_assert(sizeof(block<8, 8>) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong block<8,8> size/padding");
33
+
34
+ using block_q4_0x4 = block<4, 4>;
35
+ using block_q4_0x8 = block<4, 8>;
36
+ using block_q8_0x4 = block<8, 4>;
37
+ using block_q8_0x8 = block<8, 8>;
38
+
39
+ struct block_q4_Kx8 {
40
+ ggml_half d[8]; // super-block scale for quantized scales
41
+ ggml_half dmin[8]; // super-block scale for quantized mins
42
+ uint8_t scales[96]; // scales and mins, quantized with 6 bits
43
+ uint8_t qs[1024]; // 4-bit quants
44
+ };
45
+
46
+ static_assert(sizeof(block_q4_Kx8) == sizeof(ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
47
+
48
+ struct block_q8_Kx4 {
49
+ float d[4]; // delta
50
+ int8_t qs[QK_K * 4]; // quants
51
+ int16_t bsums[QK_K / 4]; // sum of quants in groups of 16
52
+ };
53
+
54
+ static_assert(sizeof(block_q8_Kx4) == sizeof(float) * 4 + QK_K * 4 + (QK_K / 4) * sizeof(int16_t), "wrong q8_K block size/padding");
55
+
56
+ struct block_iq4_nlx4 {
57
+ ggml_half d[4]; // deltas for 4 iq4_nl blocks
58
+ uint8_t qs[QK4_NL * 2]; // nibbles / quants for 4 iq4_nl blocks
59
+ };
60
+
61
+ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
62
+
63
+ #if defined(__cplusplus)
64
+ extern "C" {
65
+ #endif
66
+
67
+ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
68
+ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
69
+ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
70
+ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
71
+ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
72
+ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
73
+ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
74
+ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
75
+ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
76
+ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
77
+ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
78
+ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
79
+ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
80
+
81
+ // Native implementations
82
+ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
83
+ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
84
+ void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k);
85
+ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
86
+ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
87
+ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
88
+ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
89
+ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
90
+ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
91
+ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
92
+ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
93
+ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
94
+ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
95
+
96
+ #if defined(__cplusplus)
97
+ } // extern "C"
98
+ #endif
@@ -2,10 +2,167 @@
2
2
 
3
3
  #include "ggml-cpu-impl.h"
4
4
 
5
+ #ifdef __ARM_FEATURE_SVE
6
+ #include <arm_sve.h>
7
+ #endif // __ARM_FEATURE_SVE
8
+
9
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
10
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
11
+ //
12
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
13
+ //
14
+ #include <arm_neon.h>
15
+ #endif
16
+
17
+ #if defined(__F16C__)
18
+ #include <immintrin.h>
19
+ #endif
20
+
21
+ #ifdef __cplusplus
22
+ extern "C" {
23
+ #endif
24
+
5
25
  //
6
26
  // simd mappings
7
27
  //
8
28
 
29
+ // FP16 to FP32 conversion
30
+
31
+ // 16-bit float
32
+ // on Arm, we use __fp16
33
+ // on x86, we use uint16_t
34
+ //
35
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
36
+ // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
37
+ //
38
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
39
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
40
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
41
+
42
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
43
+
44
+ static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
45
+ __fp16 tmp;
46
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
47
+ return (float)tmp;
48
+ }
49
+
50
+ static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
51
+ ggml_fp16_t res;
52
+ __fp16 tmp = f;
53
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
54
+ return res;
55
+ }
56
+ #elif defined(__F16C__)
57
+ #ifdef _MSC_VER
58
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
59
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
60
+ #else
61
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
62
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
63
+ #endif
64
+ #elif defined(__POWER9_VECTOR__)
65
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
66
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
67
+ /* the inline asm below is about 12% faster than the lookup method */
68
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
69
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
70
+
71
+ static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
72
+ float f;
73
+ double d;
74
+ __asm__(
75
+ "mtfprd %0,%2\n"
76
+ "xscvhpdp %0,%0\n"
77
+ "frsp %1,%0\n" :
78
+ /* temp */ "=d"(d),
79
+ /* out */ "=f"(f):
80
+ /* in */ "r"(h));
81
+ return f;
82
+ }
83
+
84
+ static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
85
+ double d;
86
+ ggml_fp16_t r;
87
+ __asm__( /* xscvdphp can work on double or single precision */
88
+ "xscvdphp %0,%2\n"
89
+ "mffprd %1,%0\n" :
90
+ /* temp */ "=d"(d),
91
+ /* out */ "=r"(r):
92
+ /* in */ "f"(f));
93
+ return r;
94
+ }
95
+ #elif defined(__riscv) && defined(__riscv_zfhmin)
96
+ static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
97
+ float f;
98
+ __asm__(
99
+ "fmv.h.x %[f], %[h]\n\t"
100
+ "fcvt.s.h %[f], %[f]"
101
+ : [f] "=&f" (f)
102
+ : [h] "r" (h)
103
+ );
104
+ return f;
105
+ }
106
+
107
+ static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
108
+ ggml_fp16_t res;
109
+ __asm__(
110
+ "fcvt.h.s %[f], %[f]\n\t"
111
+ "fmv.x.h %[h], %[f]"
112
+ : [h] "=&r" (res)
113
+ : [f] "f" (f)
114
+ );
115
+ return res;
116
+ }
117
+
118
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
119
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
120
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
121
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
122
+ #elif defined(__NNPA__)
123
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
124
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
125
+
126
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
127
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
128
+
129
+ static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
130
+ uint16x8_t v_h = vec_splats(h);
131
+ uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
132
+ return vec_extend_to_fp32_hi(v_hd, 0)[0];
133
+ }
134
+
135
+ static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
136
+ float32x4_t v_f = vec_splats(f);
137
+ float32x4_t v_zero = vec_splats(0.0f);
138
+ uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
139
+ uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
140
+ return vec_extract(v_h, 0);
141
+ }
142
+ #endif
143
+
144
+ // precomputed f32 table for f16 (256 KB)
145
+ // defined in ggml-cpu.c, initialized in ggml_cpu_init()
146
+ extern float ggml_table_f32_f16[1 << 16];
147
+
148
+ // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
149
+ // so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
150
+ // This is also true for POWER9.
151
+ #if !defined(GGML_CPU_FP16_TO_FP32)
152
+ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
153
+ uint16_t s;
154
+ memcpy(&s, &f, sizeof(uint16_t));
155
+ return ggml_table_f32_f16[s];
156
+ }
157
+
158
+ #define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
159
+ #endif
160
+
161
+ #if !defined(GGML_CPU_FP32_TO_FP16)
162
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
163
+ #endif
164
+
165
+
9
166
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
10
167
  // we then implement the fundamental computation operations below using only these macros
11
168
  // adding support for new architectures requires to define the corresponding SIMD macros
@@ -17,7 +174,123 @@
17
174
  // number of elements to fit in a single register
18
175
  //
19
176
 
20
- #if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
177
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
178
+
179
+ #define GGML_SIMD
180
+
181
+ // F32 SVE
182
+ #define GGML_F32_EPR 8
183
+ #define DEFAULT_PG svptrue_b32()
184
+
185
+ #define GGML_F32xt svfloat32_t
186
+ #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
187
+ #define GGML_F32xt_SET1(x) svdup_n_f32(x)
188
+ #define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
189
+ #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
190
+ #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
191
+ #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
192
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
193
+ #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
194
+ #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
195
+ #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
196
+ #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
197
+ #define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
198
+ #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
199
+ #define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
200
+ #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
201
+ { \
202
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
203
+ sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
204
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
205
+ sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
206
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
207
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
208
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
209
+ (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
210
+ }
211
+ #define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
212
+
213
+ #define GGML_F32_VEC GGML_F32xt
214
+ #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
215
+ #define GGML_F32_VEC_SET1 GGML_F32xt_SET1
216
+ #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
217
+ #define GGML_F32_VEC_STORE GGML_F32xt_STORE
218
+ #define GGML_F32_VEC_FMA GGML_F32xt_FMA
219
+ #define GGML_F32_VEC_ADD GGML_F32xt_ADD
220
+ #define GGML_F32_VEC_MUL GGML_F32xt_MUL
221
+ #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
222
+
223
+ // F16 NEON
224
+
225
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
226
+ #define GGML_F16_STEP 32
227
+ #define GGML_F16_EPR 8
228
+
229
+ #define GGML_F16x8 float16x8_t
230
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
231
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
232
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
233
+ #define GGML_F16x8_STORE vst1q_f16
234
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
235
+ #define GGML_F16x8_ADD vaddq_f16
236
+ #define GGML_F16x8_MUL vmulq_f16
237
+ #define GGML_F16x8_REDUCE(res, x) \
238
+ do { \
239
+ int offset = GGML_F16_ARR >> 1; \
240
+ for (int i = 0; i < offset; ++i) { \
241
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
242
+ } \
243
+ offset >>= 1; \
244
+ for (int i = 0; i < offset; ++i) { \
245
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
246
+ } \
247
+ offset >>= 1; \
248
+ for (int i = 0; i < offset; ++i) { \
249
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
250
+ } \
251
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
252
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
253
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
254
+ } while (0)
255
+
256
+ #define GGML_F16_VEC GGML_F16x8
257
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
258
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
259
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
260
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
261
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
262
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
263
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
264
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
265
+ #else
266
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
267
+ // and take advantage of the vcvt_ functions to convert to/from FP16
268
+
269
+ #define GGML_F16_STEP 16
270
+ #define GGML_F16_EPR 4
271
+
272
+ #define GGML_F32Cx4 float32x4_t
273
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
274
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
275
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
276
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
277
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
278
+ #define GGML_F32Cx4_ADD vaddq_f32
279
+ #define GGML_F32Cx4_MUL vmulq_f32
280
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
281
+
282
+ #define GGML_F16_VEC GGML_F32Cx4
283
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
284
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
285
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
286
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
287
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
288
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
289
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
290
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
291
+ #endif
292
+
293
+ #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
21
294
 
22
295
  #define GGML_SIMD
23
296
 
@@ -299,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
299
572
  float tmp[8];
300
573
 
301
574
  for (int i = 0; i < 8; i++) {
302
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
575
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
303
576
  }
304
577
 
305
578
  return _mm256_loadu_ps(tmp);
@@ -310,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
310
583
  _mm256_storeu_ps(arr, y);
311
584
 
312
585
  for (int i = 0; i < 8; i++)
313
- x[i] = GGML_FP32_TO_FP16(arr[i]);
586
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
314
587
  }
315
588
  #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
316
589
  #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -458,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) {
458
731
  inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
459
732
  float tmp[4];
460
733
 
461
- tmp[0] = GGML_FP16_TO_FP32(p[0]);
462
- tmp[1] = GGML_FP16_TO_FP32(p[1]);
463
- tmp[2] = GGML_FP16_TO_FP32(p[2]);
464
- tmp[3] = GGML_FP16_TO_FP32(p[3]);
734
+ tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
735
+ tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
736
+ tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
737
+ tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
465
738
 
466
739
  return wasm_v128_load(tmp);
467
740
  }
@@ -471,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
471
744
 
472
745
  wasm_v128_store(tmp, x);
473
746
 
474
- p[0] = GGML_FP32_TO_FP16(tmp[0]);
475
- p[1] = GGML_FP32_TO_FP16(tmp[1]);
476
- p[2] = GGML_FP32_TO_FP16(tmp[2]);
477
- p[3] = GGML_FP32_TO_FP16(tmp[3]);
747
+ p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
748
+ p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
749
+ p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
750
+ p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
478
751
  }
479
752
 
480
753
  #define GGML_F16x4 v128_t
@@ -574,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
574
847
  static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
575
848
  float tmp[4];
576
849
 
577
- tmp[0] = GGML_FP16_TO_FP32(x[0]);
578
- tmp[1] = GGML_FP16_TO_FP32(x[1]);
579
- tmp[2] = GGML_FP16_TO_FP32(x[2]);
580
- tmp[3] = GGML_FP16_TO_FP32(x[3]);
850
+ tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
851
+ tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
852
+ tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
853
+ tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
581
854
 
582
855
  return _mm_loadu_ps(tmp);
583
856
  }
@@ -587,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
587
860
 
588
861
  _mm_storeu_ps(arr, y);
589
862
 
590
- x[0] = GGML_FP32_TO_FP16(arr[0]);
591
- x[1] = GGML_FP32_TO_FP16(arr[1]);
592
- x[2] = GGML_FP32_TO_FP16(arr[2]);
593
- x[3] = GGML_FP32_TO_FP16(arr[3]);
863
+ x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
864
+ x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
865
+ x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
866
+ x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
594
867
  }
595
868
 
596
869
  #define GGML_F32Cx4 __m128
@@ -712,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
712
985
  #define GGML_F32x4_ZERO __lsx_vldi(0)
713
986
  #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
714
987
  #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
715
- #define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0)
988
+ #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
716
989
  #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
717
990
  #define GGML_F32x4_ADD __lsx_vfadd_s
718
991
  #define GGML_F32x4_MUL __lsx_vfmul_s
@@ -758,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
758
1031
  static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
759
1032
  float tmp[4];
760
1033
 
761
- tmp[0] = GGML_FP16_TO_FP32(x[0]);
762
- tmp[1] = GGML_FP16_TO_FP32(x[1]);
763
- tmp[2] = GGML_FP16_TO_FP32(x[2]);
764
- tmp[3] = GGML_FP16_TO_FP32(x[3]);
1034
+ tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
1035
+ tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
1036
+ tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
1037
+ tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
765
1038
 
766
1039
  return __lsx_vld(tmp, 0);
767
1040
  }
@@ -771,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
771
1044
 
772
1045
  __lsx_vst(y, arr, 0);
773
1046
 
774
- x[0] = GGML_FP32_TO_FP16(arr[0]);
775
- x[1] = GGML_FP32_TO_FP16(arr[1]);
776
- x[2] = GGML_FP32_TO_FP16(arr[2]);
777
- x[3] = GGML_FP32_TO_FP16(arr[3]);
1047
+ x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
1048
+ x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
1049
+ x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
1050
+ x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
778
1051
  }
779
1052
 
780
1053
  #define GGML_F32Cx4 __m128
@@ -806,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
806
1079
  #define GGML_F32_STEP 32
807
1080
  #define GGML_F32_EPR 4
808
1081
 
809
- #define GGML_F32x4 __vector float
1082
+ #define GGML_F32x4 float32x4_t
810
1083
  #define GGML_F32x4_ZERO vec_splats(0.0f)
811
1084
  #define GGML_F32x4_SET1 vec_splats
812
1085
  #define GGML_F32x4_LOAD(p) vec_xl(0, p)
@@ -828,10 +1101,8 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
828
1101
  for (int i = 0; i < offset; ++i) { \
829
1102
  x[i] = vec_add(x[i], x[offset + i]); \
830
1103
  } \
831
- res = vec_extract(x[0], 0) + \
832
- vec_extract(x[0], 1) + \
833
- vec_extract(x[0], 2) + \
834
- vec_extract(x[0], 3); \
1104
+ float32x4_t tmp = x[0] + vec_reve(x[0]); \
1105
+ res = tmp[0] + tmp[1]; \
835
1106
  }
836
1107
 
837
1108
  #define GGML_F32_VEC GGML_F32x4
@@ -848,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
848
1119
  #define GGML_F16_STEP GGML_F32_STEP
849
1120
  #define GGML_F16_EPR GGML_F32_EPR
850
1121
 
851
- static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
1122
+ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
1123
+ #if defined(__NNPA__)
1124
+ uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
1125
+ uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
1126
+ return vec_extend_to_fp32_hi(v_xd, 0);
1127
+ #else
852
1128
  float tmp[4];
853
1129
 
854
1130
  for (int i = 0; i < 4; i++) {
855
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
1131
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
856
1132
  }
857
1133
 
858
1134
  // note: keep type-cast here to prevent compiler bugs
859
1135
  // see: https://github.com/ggml-org/llama.cpp/issues/12846
860
1136
  return vec_xl(0, (const float *)(tmp));
1137
+ #endif
861
1138
  }
862
1139
 
863
- static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
1140
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
1141
+ #if defined(__NNPA__)
1142
+ float32x4_t v_zero = vec_splats(0.0f);
1143
+ uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
1144
+ uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
1145
+
1146
+ x[0] = vec_extract(v_x, 0);
1147
+ x[1] = vec_extract(v_x, 1);
1148
+ x[2] = vec_extract(v_x, 2);
1149
+ x[3] = vec_extract(v_x, 3);
1150
+ #else
864
1151
  float arr[4];
865
1152
 
866
1153
  // note: keep type-cast here to prevent compiler bugs
867
1154
  // see: https://github.com/ggml-org/llama.cpp/issues/12846
868
- vec_xst(y, 0, (float *)(arr));
1155
+ vec_xst(v_y, 0, (float *)(arr));
869
1156
 
870
1157
  for (int i = 0; i < 4; i++) {
871
- x[i] = GGML_FP32_TO_FP16(arr[i]);
1158
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
872
1159
  }
1160
+ #endif
873
1161
  }
874
1162
 
875
1163
  #define GGML_F16_VEC GGML_F32x4
@@ -890,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
890
1178
  #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
891
1179
  #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
892
1180
  #endif
1181
+
1182
+ #ifdef __cplusplus
1183
+ }
1184
+ #endif
@@ -1,4 +1,4 @@
1
- #include "ggml-cpu-traits.h"
1
+ #include "traits.h"
2
2
 
3
3
  #include "ggml-backend-impl.h"
4
4
  #include "ggml-backend.h"