local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1279 @@
1
+ #pragma once
2
+
3
+ #include "ggml-cpu-impl.h"
4
+
5
+ #ifdef __ARM_FEATURE_SVE
6
+ #include <arm_sve.h>
7
+ #endif // __ARM_FEATURE_SVE
8
+
9
+ #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
10
+ // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
11
+ //
12
+ // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
13
+ //
14
+ #include <arm_neon.h>
15
+ #endif
16
+
17
+ #if defined(__riscv_v_intrinsic)
18
+ #include <riscv_vector.h>
19
+ #endif
20
+
21
+ #ifdef __cplusplus
22
+ extern "C" {
23
+ #endif
24
+
25
+ //
26
+ // simd mappings
27
+ //
28
+
29
+ // FP16 to FP32 conversion
30
+
31
+ // 16-bit float
32
+ // on Arm, we use __fp16
33
+ // on x86, we use uint16_t
34
+ //
35
+ // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
36
+ // for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
37
+ //
38
+ #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
39
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
40
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
41
+
42
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
43
+
44
+ static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
45
+ __fp16 tmp;
46
+ memcpy(&tmp, &h, sizeof(ggml_fp16_t));
47
+ return (float)tmp;
48
+ }
49
+
50
+ static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
51
+ ggml_fp16_t res;
52
+ __fp16 tmp = f;
53
+ memcpy(&res, &tmp, sizeof(ggml_fp16_t));
54
+ return res;
55
+ }
56
+ #elif defined(__F16C__)
57
+ #ifdef _MSC_VER
58
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
59
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
60
+ #else
61
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
62
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
63
+ #endif
64
+ #elif defined(__POWER9_VECTOR__)
65
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
66
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
67
+ /* the inline asm below is about 12% faster than the lookup method */
68
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
69
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
70
+
71
+ static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
72
+ float f;
73
+ double d;
74
+ __asm__(
75
+ "mtfprd %0,%2\n"
76
+ "xscvhpdp %0,%0\n"
77
+ "frsp %1,%0\n" :
78
+ /* temp */ "=d"(d),
79
+ /* out */ "=f"(f):
80
+ /* in */ "r"(h));
81
+ return f;
82
+ }
83
+
84
+ static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
85
+ double d;
86
+ ggml_fp16_t r;
87
+ __asm__( /* xscvdphp can work on double or single precision */
88
+ "xscvdphp %0,%2\n"
89
+ "mffprd %1,%0\n" :
90
+ /* temp */ "=d"(d),
91
+ /* out */ "=r"(r):
92
+ /* in */ "f"(f));
93
+ return r;
94
+ }
95
+ #elif defined(__riscv) && defined(__riscv_zfhmin)
96
+ static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
97
+ _Float16 hf;
98
+ memcpy(&hf, &h, sizeof(ggml_fp16_t));
99
+ return hf;
100
+ }
101
+
102
+ static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
103
+ ggml_fp16_t res;
104
+ _Float16 hf = (_Float16)f;
105
+ memcpy(&res, &hf, sizeof(ggml_fp16_t));
106
+ return res;
107
+ }
108
+
109
+ #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
110
+ #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
111
+ #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
112
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
113
+ #endif
114
+
115
+ // precomputed f32 table for f16 (256 KB)
116
+ // defined in ggml-cpu.c, initialized in ggml_cpu_init()
117
+ extern float ggml_table_f32_f16[1 << 16];
118
+
119
+ // precomputed f32 table for e8m0 half (1 KB)
120
+ // defined in ggml-cpu.c, initialized in ggml_cpu_init()
121
+ extern float ggml_table_f32_e8m0_half[1 << 8];
122
+
123
+ // Use lookup table for E8M0 on x86 (faster than bit manipulation)
124
+ #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
125
+ #define GGML_CPU_E8M0_TO_FP32_HALF(x) ggml_table_f32_e8m0_half[(uint8_t)(x)]
126
+ #else
127
+ #define GGML_CPU_E8M0_TO_FP32_HALF(x) GGML_E8M0_TO_FP32_HALF(x)
128
+ #endif
129
+
130
+ // On ARM NEON, it's quicker to convert directly in hardware than to call into ggml_lookup_fp16_to_fp32,
131
+ // so GGML_CPU_FP16_TO_FP32 is already defined above for NEON and this table-lookup fallback is skipped.
132
+ // This is also true for POWER9; RISC-V (zfhmin) likewise defines its own conversion macros above.
133
+ #if !defined(GGML_CPU_FP16_TO_FP32)
134
+ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
135
+ uint16_t s;
136
+ memcpy(&s, &f, sizeof(uint16_t));
137
+ return ggml_table_f32_f16[s];
138
+ }
139
+
140
+ #define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
141
+ #endif
142
+
143
+ #if !defined(GGML_CPU_FP32_TO_FP16)
144
+ #define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
145
+ #endif
146
+
147
+
148
+ // we define a common set of C macros which map to specific intrinsics based on the current architecture
149
+ // we then implement the fundamental computation operations below using only these macros
150
+ // adding support for a new architecture requires defining the corresponding SIMD macros
151
+ //
152
+ // GGML_F32_STEP / GGML_F16_STEP
153
+ // number of elements to process in a single step
154
+ //
155
+ // GGML_F32_EPR / GGML_F16_EPR
156
+ // number of elements to fit in a single register
157
+ //
158
+
159
+ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
160
+
161
+ #define GGML_SIMD
162
+
163
+ // F32 SVE
164
+ #define GGML_F32_EPR 8
165
+ #define DEFAULT_PG svptrue_b32()
166
+
167
+ #define GGML_F32xt svfloat32_t
168
+ #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
169
+ #define GGML_F32xt_SET1(x) svdup_n_f32(x)
170
+ #define GGML_F32xt_LOAD_IMPL(pg, a) svld1_f32(pg, a)
171
+ #define GGML_F32xt_LOAD(a) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, a)
172
+ #define GGML_F32xt_STORE_IMPL(pg, a, b) svst1_f32(pg, a, b)
173
+ #define GGML_F32xt_STORE(a, b) GGML_F32xt_STORE_IMPL(DEFAULT_PG, a, b)
174
+ #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
175
+ #define GGML_F32xt_FMA(a, b, c) GGML_F32xt_FMA_IMPL(DEFAULT_PG, a, b, c)
176
+ #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
177
+ #define GGML_F32xt_ADD(a, b) GGML_F32xt_ADD_IMPL(DEFAULT_PG, a, b)
178
+ #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
179
+ #define GGML_F32xt_MUL(a, b) GGML_F32xt_MUL_IMPL(DEFAULT_PG, a, b)
180
+ #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
181
+ #define GGML_F32xt_REDUCE_ONE(a) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, a)
182
+ #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
183
+ { \
184
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
185
+ sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
186
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
187
+ sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
188
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
189
+ sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
190
+ sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
191
+ (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
192
+ }
193
+ #define GGML_F32xt_REDUCE(res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
194
+ GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8)
195
+
196
+ #define GGML_F32_VEC GGML_F32xt
197
+ #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
198
+ #define GGML_F32_VEC_SET1 GGML_F32xt_SET1
199
+ #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
200
+ #define GGML_F32_VEC_STORE GGML_F32xt_STORE
201
+ #define GGML_F32_VEC_FMA GGML_F32xt_FMA
202
+ #define GGML_F32_VEC_ADD GGML_F32xt_ADD
203
+ #define GGML_F32_VEC_MUL GGML_F32xt_MUL
204
+ #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
205
+
206
+ // F16 SVE
207
+ #define DEFAULT_PG32 svptrue_b32()
208
+ #define DEFAULT_PG16 svptrue_b16()
209
+
210
+ #define GGML_F32Cxt svfloat16_t
211
+ #define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
212
+ #define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
213
+ #define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
214
+ #define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
215
+
216
+ #define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
217
+ #define GGML_F32Cxt_FMA(a, b, c) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, a, b, c)
218
+ #define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
219
+ #define GGML_F32Cxt_ADD(a, b) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, a, b)
220
+ #define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
221
+ #define GGML_F32Cxt_MUL(a, b) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, a, b)
222
+ #define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
223
+
224
+ #define GGML_F16x_VEC GGML_F32Cxt
225
+ #define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
226
+ #define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
227
+ #define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
228
+ #define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
229
+ #define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
230
+ #define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
231
+ #define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
232
+ #define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
233
+
234
+ #define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
235
+ #define GGML_F16xt_REDUCE_ONE(a) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, a)
236
+
237
+ #define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
238
+ { \
239
+ sum1 = svadd_f16_x(pg16, sum1, sum2); \
240
+ sum3 = svadd_f16_x(pg16, sum3, sum4); \
241
+ sum1 = svadd_f16_x(pg16, sum1, sum3); \
242
+ __fp16 sum_f16 = svaddv_f16(pg16, sum1); \
243
+ (res) = (ggml_float) sum_f16; \
244
+ }
245
+ #define GGML_F16xt_REDUCE_MIXED(res, sum1, sum2, sum3, sum4) \
246
+ GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, res, sum1, sum2, sum3, sum4)
247
+
248
+ // F16 NEON (in this SVE branch the generic GGML_F16_VEC_* macros are mapped onto NEON intrinsics)
249
+
250
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
251
+ #define GGML_F16_STEP 32
252
+ #define GGML_F16_EPR 8
253
+
254
+ #define GGML_F16x8 float16x8_t
255
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
256
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
257
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
258
+ #define GGML_F16x8_STORE vst1q_f16
259
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
260
+ #define GGML_F16x8_ADD vaddq_f16
261
+ #define GGML_F16x8_MUL vmulq_f16
262
+ #define GGML_F16x8_REDUCE(res, x) \
263
+ do { \
264
+ int offset = GGML_F16_ARR >> 1; \
265
+ for (int i = 0; i < offset; ++i) { \
266
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
267
+ } \
268
+ offset >>= 1; \
269
+ for (int i = 0; i < offset; ++i) { \
270
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
271
+ } \
272
+ offset >>= 1; \
273
+ for (int i = 0; i < offset; ++i) { \
274
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
275
+ } \
276
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
277
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
278
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
279
+ } while (0)
280
+
281
+ #define GGML_F16_VEC GGML_F16x8
282
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
283
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
284
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
285
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
286
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
287
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
288
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
289
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
290
+ #else
291
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
292
+ // and take advantage of the vcvt_ functions to convert to/from FP16
293
+
294
+ #define GGML_F16_STEP 16
295
+ #define GGML_F16_EPR 4
296
+
297
+ #define GGML_F32Cx4 float32x4_t
298
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
299
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
300
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
301
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
302
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
303
+ #define GGML_F32Cx4_ADD vaddq_f32
304
+ #define GGML_F32Cx4_MUL vmulq_f32
305
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
306
+
307
+ #define GGML_F16_VEC GGML_F32Cx4
308
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
309
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
310
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
311
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
312
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
313
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
314
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
315
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
316
+ #endif
317
+
318
+ #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
319
+
320
+ #define GGML_SIMD
321
+
322
+ // F32 NEON
323
+
324
+ #define GGML_F32_STEP 16
325
+ #define GGML_F32_EPR 4
326
+
327
+ #define GGML_F32x4 float32x4_t
328
+ #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
329
+ #define GGML_F32x4_SET1(x) vdupq_n_f32(x)
330
+ #define GGML_F32x4_LOAD vld1q_f32
331
+ #define GGML_F32x4_STORE vst1q_f32
332
+ #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
333
+ #define GGML_F32x4_ADD vaddq_f32
334
+ #define GGML_F32x4_MUL vmulq_f32
335
+ #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
336
+ #define GGML_F32x4_REDUCE(res, x) \
337
+ { \
338
+ int offset = GGML_F32_ARR >> 1; \
339
+ for (int i = 0; i < offset; ++i) { \
340
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
341
+ } \
342
+ offset >>= 1; \
343
+ for (int i = 0; i < offset; ++i) { \
344
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
345
+ } \
346
+ offset >>= 1; \
347
+ for (int i = 0; i < offset; ++i) { \
348
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
349
+ } \
350
+ (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
351
+ }
352
+
353
+ #define GGML_F32_VEC GGML_F32x4
354
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
355
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
356
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
357
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
358
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
359
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
360
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
361
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
362
+
363
+ // F16 NEON
364
+
365
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
366
+ #define GGML_F16_STEP 32
367
+ #define GGML_F16_EPR 8
368
+
369
+ #define GGML_F16x8 float16x8_t
370
+ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
371
+ #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
372
+ #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
373
+ #define GGML_F16x8_STORE vst1q_f16
374
+ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
375
+ #define GGML_F16x8_ADD vaddq_f16
376
+ #define GGML_F16x8_MUL vmulq_f16
377
+ #define GGML_F16x8_REDUCE(res, x) \
378
+ do { \
379
+ int offset = GGML_F16_ARR >> 1; \
380
+ for (int i = 0; i < offset; ++i) { \
381
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
382
+ } \
383
+ offset >>= 1; \
384
+ for (int i = 0; i < offset; ++i) { \
385
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
386
+ } \
387
+ offset >>= 1; \
388
+ for (int i = 0; i < offset; ++i) { \
389
+ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
390
+ } \
391
+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
392
+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
393
+ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
394
+ } while (0)
395
+
396
+ #define GGML_F16_VEC GGML_F16x8
397
+ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
398
+ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
399
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
400
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
401
+ #define GGML_F16_VEC_FMA GGML_F16x8_FMA
402
+ #define GGML_F16_VEC_ADD GGML_F16x8_ADD
403
+ #define GGML_F16_VEC_MUL GGML_F16x8_MUL
404
+ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
405
+ #else
406
+ // if FP16 vector arithmetic is not supported, we use FP32 instead
407
+ // and take advantage of the vcvt_ functions to convert to/from FP16
408
+
409
+ #define GGML_F16_STEP 16
410
+ #define GGML_F16_EPR 4
411
+
412
+ #define GGML_F32Cx4 float32x4_t
413
+ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
414
+ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
415
+ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
416
+ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
417
+ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
418
+ #define GGML_F32Cx4_ADD vaddq_f32
419
+ #define GGML_F32Cx4_MUL vmulq_f32
420
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
421
+
422
+ #define GGML_F16_VEC GGML_F32Cx4
423
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
424
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
425
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
426
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
427
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
428
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
429
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
430
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
431
+ #endif
432
+
433
+ #elif defined(__AVX512F__)
434
+
435
+ #define GGML_SIMD
436
+
437
+ // F32 AVX512
438
+
439
+ #define GGML_F32_STEP 64
440
+ #define GGML_F32_EPR 16
441
+
442
+ #define GGML_F32x16 __m512
443
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
444
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
445
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
446
+ #define GGML_F32x16_STORE _mm512_storeu_ps
447
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
448
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
449
+ #define GGML_F32x16_ADD _mm512_add_ps
450
+ #define GGML_F32x16_MUL _mm512_mul_ps
451
+ #define GGML_F32x16_REDUCE(res, x) \
452
+ do { \
453
+ int offset = GGML_F32_ARR >> 1; \
454
+ for (int i = 0; i < offset; ++i) { \
455
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
456
+ } \
457
+ offset >>= 1; \
458
+ for (int i = 0; i < offset; ++i) { \
459
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
460
+ } \
461
+ offset >>= 1; \
462
+ for (int i = 0; i < offset; ++i) { \
463
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
464
+ } \
465
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
466
+ } while (0)
467
+
468
+ // TODO: is this optimal ?
469
+
470
+ #define GGML_F32_VEC GGML_F32x16
471
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
472
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
473
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
474
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
475
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
476
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
477
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
478
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
479
+
480
+ // F16 AVX512
481
+
482
+ // (F16 data is converted to F32 and processed in __m512 registers, see GGML_F32Cx16 below)
483
+
484
+ #define GGML_F16_STEP 64
485
+ #define GGML_F16_EPR 16
486
+
487
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
488
+
489
+ #define GGML_F32Cx16 __m512
490
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
491
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
492
+
493
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
494
+ // so F16C guard isn't required
495
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
496
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
497
+
498
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
499
+ #define GGML_F32Cx16_ADD _mm512_add_ps
500
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
501
+ #define GGML_F32Cx16_REDUCE(res, x) \
502
+ do { \
503
+ int offset = GGML_F32_ARR >> 1; \
504
+ for (int i = 0; i < offset; ++i) { \
505
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
506
+ } \
507
+ offset >>= 1; \
508
+ for (int i = 0; i < offset; ++i) { \
509
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
510
+ } \
511
+ offset >>= 1; \
512
+ for (int i = 0; i < offset; ++i) { \
513
+ x[i] = _mm512_add_ps(x[i], x[offset+i]); \
514
+ } \
515
+ res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
516
+ } while (0)
517
+
518
+ #define GGML_F16_VEC GGML_F32Cx16
519
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
520
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
521
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
522
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
523
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
524
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
525
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
526
+
527
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
528
+ #elif defined(__AVX__)
529
+
530
+ #define GGML_SIMD
531
+
532
+ // F32 AVX
533
+
534
+ #define GGML_F32_STEP 32
535
+ #define GGML_F32_EPR 8
536
+
537
+ #define GGML_F32x8 __m256
538
+ #define GGML_F32x8_ZERO _mm256_setzero_ps()
539
+ #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
540
+ #define GGML_F32x8_LOAD _mm256_loadu_ps
541
+ #define GGML_F32x8_STORE _mm256_storeu_ps
542
+ #if defined(__FMA__)
543
+ #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
544
+ #else
545
+ #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
546
+ #endif
547
+ #define GGML_F32x8_ADD _mm256_add_ps
548
+ #define GGML_F32x8_MUL _mm256_mul_ps
549
+ #define GGML_F32x8_REDUCE(res, x) \
550
+ do { \
551
+ int offset = GGML_F32_ARR >> 1; \
552
+ for (int i = 0; i < offset; ++i) { \
553
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
554
+ } \
555
+ offset >>= 1; \
556
+ for (int i = 0; i < offset; ++i) { \
557
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
558
+ } \
559
+ offset >>= 1; \
560
+ for (int i = 0; i < offset; ++i) { \
561
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
562
+ } \
563
+ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
564
+ _mm256_extractf128_ps(x[0], 1)); \
565
+ const __m128 t1 = _mm_hadd_ps(t0, t0); \
566
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
567
+ } while (0)
568
+ // TODO: is this optimal ?
569
+
570
+ #define GGML_F32_VEC GGML_F32x8
571
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
572
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
573
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
574
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
575
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
576
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
577
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
578
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
579
+
580
+ // F16 AVX
581
+
582
+ #define GGML_F16_STEP 32
583
+ #define GGML_F16_EPR 8
584
+
585
+ // F16 arithmetic is not supported by AVX, so we use F32 instead
586
+
587
+ #define GGML_F32Cx8 __m256
588
+ #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
589
+ #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
590
+
591
+ #if defined(__F16C__)
592
+ // the _mm256_cvt intrinsics require F16C
593
+ #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
594
+ #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
595
+ #else
596
+ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
597
+ float tmp[8];
598
+
599
+ for (int i = 0; i < 8; i++) {
600
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
601
+ }
602
+
603
+ return _mm256_loadu_ps(tmp);
604
+ }
605
+ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
606
+ float arr[8];
607
+
608
+ _mm256_storeu_ps(arr, y);
609
+
610
+ for (int i = 0; i < 8; i++)
611
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
612
+ }
613
+ #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
614
+ #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
615
+ #endif
616
+
617
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
618
+ #define GGML_F32Cx8_ADD _mm256_add_ps
619
+ #define GGML_F32Cx8_MUL _mm256_mul_ps
620
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
621
+
622
+ #define GGML_F16_VEC GGML_F32Cx8
623
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
624
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
625
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
626
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
627
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
628
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
629
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
630
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
631
+
632
+ #elif defined(__POWER9_VECTOR__)
633
+
634
+ #define GGML_SIMD
635
+
636
+ // F32 POWER9
637
+
638
+ #define GGML_F32_STEP 32
639
+ #define GGML_F32_EPR 4
640
+
641
+ #define GGML_F32x4 vector float
642
+ #define GGML_F32x4_ZERO {0.0f}
643
+ #define GGML_F32x4_SET1 vec_splats
644
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
645
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
646
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
647
+ #define GGML_F32x4_ADD vec_add
648
+ #define GGML_F32x4_MUL vec_mul
649
+ #define GGML_F32x4_REDUCE(res, x) \
650
+ { \
651
+ int offset = GGML_F32_ARR >> 1; \
652
+ for (int i = 0; i < offset; ++i) { \
653
+ x[i] = vec_add(x[i], x[offset+i]); \
654
+ } \
655
+ offset >>= 1; \
656
+ for (int i = 0; i < offset; ++i) { \
657
+ x[i] = vec_add(x[i], x[offset+i]); \
658
+ } \
659
+ offset >>= 1; \
660
+ for (int i = 0; i < offset; ++i) { \
661
+ x[i] = vec_add(x[i], x[offset+i]); \
662
+ } \
663
+ res = vec_extract(x[0], 0) + \
664
+ vec_extract(x[0], 1) + \
665
+ vec_extract(x[0], 2) + \
666
+ vec_extract(x[0], 3); \
667
+ }
668
+ #define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
669
+ { \
670
+ vector float v = vec_add(vec_add(s0, s1), \
671
+ vec_add(s2, s3)); \
672
+ v = vec_add(v, vec_sld(v, v, 8)); \
673
+ v = vec_add(v, vec_sld(v, v, 4)); \
674
+ res += (ggml_float) vec_extract(v, 0); \
675
+ }
676
+
677
+ #define GGML_F32_VEC GGML_F32x4
678
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
679
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
680
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
681
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
682
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
683
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
684
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
685
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
686
+
687
+ // F16 POWER9
688
+ #define GGML_F16_STEP GGML_F32_STEP
689
+ #define GGML_F16_EPR GGML_F32_EPR
690
+ #define GGML_F16_VEC GGML_F32x4
691
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
692
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
693
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
694
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
695
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
696
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
697
+ // Use vec_xl, not vec_ld, in case the load address is not aligned.
698
+ #define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
699
+ vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
700
+ vec_extract_fp32_from_shortl(vec_xl(0, p))
701
+ static inline unsigned char ggml_endian_byte(int i) {
702
+ uint16_t tmp_val = 1;
703
+ return ((unsigned char *)&tmp_val)[i];
704
+ }
705
+ #define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
706
+ #define GGML_F16_VEC_STORE(p, r, i) \
707
+ if (i & 0x1) \
708
+ vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
709
+ r[i - GGML_ENDIAN_BYTE(0)]), \
710
+ 0, p - GGML_F16_EPR)
711
+
712
+ //BF16 POWER9
713
+ #define GGML_BF16_STEP 16
714
+ #define GGML_BF16_EPR 8
715
+
716
+ #define GGML_BF16x8 vector unsigned short
717
+ #define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
718
+ #define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
719
+
720
+ #define GGML_BF16_VEC GGML_BF16x8
721
+ #define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
722
+ #define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
723
+ #if defined(__LITTLE_ENDIAN__) // bf16 -> f32 widening: interleave each 16-bit element of v with a zero halfword and reinterpret as float; the merge operand order depends on endianness
724
+ #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel(GGML_BF16_VEC_ZERO, (v))) // little-endian: zero vector is the first merge operand
725
+ #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh(GGML_BF16_VEC_ZERO, (v)))
726
+ #else
727
+ #define GGML_BF16_TO_F32_LO(v) ((vector float) vec_mergel((v), GGML_BF16_VEC_ZERO)) // big-endian: v is the first merge operand
728
+ #define GGML_BF16_TO_F32_HI(v) ((vector float) vec_mergeh((v), GGML_BF16_VEC_ZERO))
729
+ #endif
730
+ #define GGML_BF16_FMA_LO(acc, x, y) /* acc = GGML_F32x4_FMA(acc, LO(x), LO(y)); the expansion is an assignment, so use it as a statement */ \
731
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
732
+ #define GGML_BF16_FMA_HI(acc, x, y) /* same as GGML_BF16_FMA_LO, using the HI widening of x and y */ \
733
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
734
+
735
+ #elif defined(__wasm_simd128__)
736
+
737
+ #define GGML_SIMD
738
+
739
+ // F32 WASM
740
+
741
+ #define GGML_F32_STEP 16
742
+ #define GGML_F32_EPR 4
743
+
744
+ #define GGML_F32x4 v128_t
745
+ #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
746
+ #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
747
+ #define GGML_F32x4_LOAD wasm_v128_load
748
+ #define GGML_F32x4_STORE wasm_v128_store
749
+ #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
750
+ #define GGML_F32x4_ADD wasm_f32x4_add
751
+ #define GGML_F32x4_MUL wasm_f32x4_mul
752
+ #define GGML_F32x4_REDUCE(res, x) \
753
+ { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the sum of its four lanes to res. */ \
754
+ int offset = GGML_F32_ARR >> 1; \
755
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
756
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
757
+ } \
758
+ offset >>= 1; \
759
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
760
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
761
+ } \
762
+ offset >>= 1; \
763
+ for (int i = 0; i < offset; ++i) { /* pass 3: a no-op when offset has reached 0 (GGML_F32_ARR is 16/4 = 4 in this branch) */ \
764
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
765
+ } \
766
+ res = wasm_f32x4_extract_lane(x[0], 0) + /* scalar sum of the four lanes of x[0] */ \
767
+ wasm_f32x4_extract_lane(x[0], 1) + \
768
+ wasm_f32x4_extract_lane(x[0], 2) + \
769
+ wasm_f32x4_extract_lane(x[0], 3); \
770
+ }
771
+
772
+ #define GGML_F32_VEC GGML_F32x4
773
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
774
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
775
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
776
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
777
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
778
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
779
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
780
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
781
+
782
+ // F16 WASM
783
+
784
+ #define GGML_F16_STEP 16
785
+ #define GGML_F16_EPR 4
786
+
787
+ inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
788
+ float widened[4];
789
+ // Widen the four fp16 inputs to f32 one element at a time.
790
+ for (int lane = 0; lane < 4; ++lane) {
791
+ widened[lane] = GGML_CPU_FP16_TO_FP32(p[lane]);
792
+ }
793
+
794
+ // Load the scratch array as a single 128-bit vector.
795
+ return wasm_v128_load(widened);
796
+ }
797
+
798
+ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
799
+ float lanes[4];
800
+ // Spill the vector to a scratch array so each lane can be narrowed on its own.
801
+ wasm_v128_store(lanes, x);
802
+
803
+ // Narrow every f32 lane to fp16 and write it to p.
804
+ for (int lane = 0; lane < 4; ++lane) {
805
+ p[lane] = GGML_CPU_FP32_TO_FP16(lanes[lane]);
806
+ }
807
+ }
808
+
809
+ #define GGML_F16x4 v128_t
810
+ #define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
811
+ #define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
812
+ #define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
813
+ #define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
814
+ #define GGML_F16x4_FMA GGML_F32x4_FMA
815
+ #define GGML_F16x4_ADD wasm_f32x4_add
816
+ #define GGML_F16x4_MUL wasm_f32x4_mul
817
+ #define GGML_F16x4_REDUCE(res, x) \
818
+ { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the lane sum, cast to ggml_float, to res. */ \
819
+ int offset = GGML_F16_ARR >> 1; \
820
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
821
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
822
+ } \
823
+ offset >>= 1; \
824
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
825
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
826
+ } \
827
+ offset >>= 1; \
828
+ for (int i = 0; i < offset; ++i) { /* pass 3: a no-op when offset has reached 0 (GGML_F16_ARR is 16/4 = 4 in this branch) */ \
829
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
830
+ } \
831
+ res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + /* lanes are added in float; the cast applies to the finished sum */ \
832
+ wasm_f32x4_extract_lane(x[0], 1) + \
833
+ wasm_f32x4_extract_lane(x[0], 2) + \
834
+ wasm_f32x4_extract_lane(x[0], 3)); \
835
+ }
836
+
837
+ #define GGML_F16_VEC GGML_F16x4
838
+ #define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
839
+ #define GGML_F16_VEC_SET1 GGML_F16x4_SET1
840
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
841
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
842
+ #define GGML_F16_VEC_FMA GGML_F16x4_FMA
843
+ #define GGML_F16_VEC_ADD GGML_F16x4_ADD
844
+ #define GGML_F16_VEC_MUL GGML_F16x4_MUL
845
+ #define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
846
+
847
+ #elif defined(__SSE3__)
848
+
849
+ #define GGML_SIMD
850
+
851
+ // F32 SSE
852
+
853
+ #define GGML_F32_STEP 32
854
+ #define GGML_F32_EPR 4
855
+
856
+ #define GGML_F32x4 __m128
857
+ #define GGML_F32x4_ZERO _mm_setzero_ps()
858
+ #define GGML_F32x4_SET1(x) _mm_set1_ps(x)
859
+ #define GGML_F32x4_LOAD _mm_loadu_ps
860
+ #define GGML_F32x4_STORE _mm_storeu_ps
861
+ #if defined(__FMA__)
862
+ // TODO: Does this work?
863
+ #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
864
+ #else
865
+ #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
866
+ #endif
867
+ #define GGML_F32x4_ADD _mm_add_ps
868
+ #define GGML_F32x4_MUL _mm_mul_ps
869
+ #define GGML_F32x4_REDUCE(res, x) \
870
+ { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the sum of its four lanes to res. */ \
871
+ int offset = GGML_F32_ARR >> 1; \
872
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
873
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
874
+ } \
875
+ offset >>= 1; \
876
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
877
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
878
+ } \
879
+ offset >>= 1; \
880
+ for (int i = 0; i < offset; ++i) { /* pass 3: last halving (GGML_F32_ARR is 32/4 = 8 in this branch, so offsets run 4, 2, 1) */ \
881
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
882
+ } \
883
+ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); /* first of two horizontal adds over x[0] */ \
884
+ res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); /* second horizontal add; the lowest lane is extracted as the total */ \
885
+ }
886
+ // TODO: is this optimal ?
887
+
888
+ #define GGML_F32_VEC GGML_F32x4
889
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
890
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
891
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
892
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
893
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
894
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
895
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
896
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
897
+
898
+ // F16 SSE
899
+
900
+ #define GGML_F16_STEP 32
901
+ #define GGML_F16_EPR 4
902
+
903
+ static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
904
+ float widened[4];
905
+ // Widen the four fp16 inputs to f32 one element at a time.
906
+ for (int lane = 0; lane < 4; ++lane) {
907
+ widened[lane] = GGML_CPU_FP16_TO_FP32(x[lane]);
908
+ }
909
+
910
+ // Unaligned load of the four converted floats.
911
+ return _mm_loadu_ps(widened);
912
+ }
913
+
914
+ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
915
+ float lanes[4];
916
+ // Spill the vector to a scratch array so each lane can be narrowed on its own.
917
+ _mm_storeu_ps(lanes, y);
918
+
919
+ // Narrow every f32 lane to fp16 and write it to x.
920
+ for (int lane = 0; lane < 4; ++lane) {
921
+ x[lane] = GGML_CPU_FP32_TO_FP16(lanes[lane]);
922
+ }
923
+ }
924
+
925
+ #define GGML_F32Cx4 __m128
926
+ #define GGML_F32Cx4_ZERO _mm_setzero_ps()
927
+ #define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
928
+ #define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
929
+ #define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
930
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
931
+ #define GGML_F32Cx4_ADD _mm_add_ps
932
+ #define GGML_F32Cx4_MUL _mm_mul_ps
933
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
934
+
935
+ #define GGML_F16_VEC GGML_F32Cx4
936
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
937
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
938
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
939
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
940
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
941
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
942
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
943
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
944
+
945
+ #elif defined(__loongarch_asx)
946
+
947
+ #define GGML_SIMD
948
+
949
+ // F32 LASX
950
+ #define GGML_F32_STEP 32
951
+ #define GGML_F32_EPR 8
952
+
953
+ #define GGML_F32x8 __m256
954
+ #define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
955
+ #define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
956
+ #define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
957
+ #define GGML_F32x8_STORE(x,y) __lasx_xvst((y), (x), 0)
958
+ #define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
959
+ #define GGML_F32x8_ADD __lasx_xvfadd_s
960
+ #define GGML_F32x8_MUL __lasx_xvfmul_s
961
+ #define GGML_F32x8_REDUCE(res, x) \
962
+ do { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the sum of its eight lanes to res. */ \
963
+ int offset = GGML_F32_ARR >> 1; \
964
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
965
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
966
+ } \
967
+ offset >>= 1; \
968
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
969
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
970
+ } \
971
+ offset >>= 1; \
972
+ for (int i = 0; i < offset; ++i) { /* pass 3: a no-op when offset has reached 0 (GGML_F32_ARR is 32/8 = 4 in this branch) */ \
973
+ x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
974
+ } \
975
+ float *tmp_p = (float *)&x[0]; /* view x[0] as eight consecutive floats */ \
976
+ res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
977
+ } while (0)
978
+ // TODO: is this optimal ?
979
+
980
+ #define GGML_F32_VEC GGML_F32x8
981
+ #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
982
+ #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
983
+ #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
984
+ #define GGML_F32_VEC_STORE GGML_F32x8_STORE
985
+ #define GGML_F32_VEC_FMA GGML_F32x8_FMA
986
+ #define GGML_F32_VEC_ADD GGML_F32x8_ADD
987
+ #define GGML_F32_VEC_MUL GGML_F32x8_MUL
988
+ #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
989
+
990
+ // F16 LASX
991
+
992
+ #define GGML_F16_STEP 32
993
+ #define GGML_F16_EPR 8
994
+
995
+ // F16 arithmetic is not supported by LASX, so we use F32 instead
996
+
997
+ #define GGML_F32Cx8 __m256
998
+ #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
999
+ #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
1000
+
1001
+ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) { // reads eight fp16 elements from x and returns them as an __m256 of f32
1002
+ __m256i a; // NOTE(review): the memcpy below writes only eight fp16 elements into a; the rest of a stays uninitialized — confirm the following intrinsics never consume those bytes
1003
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8); // byte copy of eight elements, so no alignment is required of x
1004
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4)); // presumably places the second 64-bit group (elements 4..7) at the start of the upper 128-bit lane — confirm against the LASX manual
1005
+ return __lasx_xvfcvtl_s_h(a); // presumably widens the low four fp16 of each 128-bit lane to f32 — confirm
1006
+ }
1007
+
1008
+ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { // converts the f32 lanes of y to fp16 and writes eight elements to x
1009
+ __m256i a = __lasx_xvfcvt_h_s(y, y); // f32 -> fp16 conversion with y passed as both operands; presumably converts within each 128-bit lane — confirm
1010
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2)); // presumably gathers the two lanes' converted halves into the low 128 bits — confirm against the LASX manual
1011
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8); // copies out only the first eight fp16 elements of a; no alignment is required of x
1012
+ }
1013
+ #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
1014
+ #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
1015
+
1016
+ #define GGML_F32Cx8_FMA GGML_F32x8_FMA
1017
+ #define GGML_F32Cx8_ADD __lasx_xvfadd_s
1018
+ #define GGML_F32Cx8_MUL __lasx_xvfmul_s
1019
+ #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
1020
+
1021
+ #define GGML_F16_VEC GGML_F32Cx8
1022
+ #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
1023
+ #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
1024
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
1025
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
1026
+ #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
1027
+ #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
1028
+ #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
1029
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
1030
+
1031
+ #elif defined(__loongarch_sx)
1032
+
1033
+ #define GGML_SIMD
1034
+
1035
+ // F32 LSX
1036
+
1037
+ #define GGML_F32_STEP 32
1038
+ #define GGML_F32_EPR 4
1039
+
1040
+ #define GGML_F32x4 __m128
1041
+ #define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
1042
+ #define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
1043
+ #define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
1044
+ #define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
1045
+ #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1046
+ #define GGML_F32x4_ADD __lsx_vfadd_s
1047
+ #define GGML_F32x4_MUL __lsx_vfmul_s
1048
+
1049
+ #define GGML_F32x4_REDUCE(res, x) \
1050
+ { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the sum of its four lanes to res. */ \
1051
+ int offset = GGML_F32_ARR >> 1; \
1052
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
1053
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1054
+ } \
1055
+ offset >>= 1; \
1056
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
1057
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1058
+ } \
1059
+ offset >>= 1; \
1060
+ for (int i = 0; i < offset; ++i) { /* pass 3: last halving (GGML_F32_ARR is 32/4 = 8 in this branch, so offsets run 4, 2, 1) */ \
1061
+ x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1062
+ } \
1063
+ __m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); /* presumably the even-indexed 32-bit lanes of x[0] — confirm against the LSX manual */ \
1064
+ __m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); /* presumably the odd-indexed lanes */ \
1065
+ __m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); /* first pairwise add */ \
1066
+ __m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); /* same even/odd split applied to the partial sums */ \
1067
+ __m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
1068
+ __m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); /* second pairwise add */ \
1069
+ res = (ggml_float) ((v4f32)t5)[0]; /* element 0 is read as the total */ \
1070
+ }
1071
+
1072
+ #define GGML_F32_VEC GGML_F32x4
1073
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1074
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1075
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1076
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
1077
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
1078
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
1079
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
1080
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1081
+
1082
+ // F16 LSX
1083
+
1084
+ #define GGML_F16_STEP 32
1085
+ #define GGML_F16_EPR 4
1086
+
1087
+ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
1088
+ float widened[4];
1089
+ // Widen the four fp16 inputs to f32 one element at a time.
1090
+ for (int lane = 0; lane < 4; ++lane) {
1091
+ widened[lane] = GGML_CPU_FP16_TO_FP32(x[lane]);
1092
+ }
1093
+
1094
+ // Load the scratch array as one 128-bit vector.
1095
+ return (__m128)__lsx_vld(widened, 0);
1096
+ }
1097
+
1098
+ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
1099
+ float lanes[4];
1100
+ // Spill the vector to a scratch array so each lane can be narrowed on its own.
1101
+ __lsx_vst(y, lanes, 0);
1102
+
1103
+ // Narrow every f32 lane to fp16 and write it to x.
1104
+ for (int lane = 0; lane < 4; ++lane) {
1105
+ x[lane] = GGML_CPU_FP32_TO_FP16(lanes[lane]);
1106
+ }
1107
+ }
1108
+
1109
+ #define GGML_F32Cx4 __m128
1110
+ #define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
1111
+ #define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
1112
+ #define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
1113
+ #define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
1114
+ #define GGML_F32Cx4_FMA GGML_F32x4_FMA
1115
+ #define GGML_F32Cx4_ADD __lsx_vfadd_s
1116
+ #define GGML_F32Cx4_MUL __lsx_vfmul_s
1117
+ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
1118
+
1119
+ #define GGML_F16_VEC GGML_F32Cx4
1120
+ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
1121
+ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
1122
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
1123
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
1124
+ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
1125
+ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
1126
+ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
1127
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
1128
+
1129
+ #elif defined(__VXE__) || defined(__VXE2__)
1130
+
1131
+ #define GGML_SIMD
1132
+
1133
+ // F32 s390x
1134
+
1135
+ #define GGML_F32_STEP 32
1136
+ #define GGML_F32_EPR 4
1137
+
1138
+ #define GGML_F32x4 float32x4_t
1139
+ #define GGML_F32x4_ZERO vec_splats(0.0f)
1140
+ #define GGML_F32x4_SET1 vec_splats
1141
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
1142
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
1143
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
1144
+ #define GGML_F32x4_ADD vec_add
1145
+ #define GGML_F32x4_MUL vec_mul
1146
+ #define GGML_F32x4_REDUCE(res, x) \
1147
+ { /* Horizontal sum: folds the accumulator array x[] into x[0] (x is overwritten), then assigns the sum of its four lanes to res. */ \
1148
+ int offset = GGML_F32_ARR >> 1; \
1149
+ for (int i = 0; i < offset; ++i) { /* pass 1: add the upper half of the array onto the lower half */ \
1150
+ x[i] = vec_add(x[i], x[offset + i]); \
1151
+ } \
1152
+ offset >>= 1; \
1153
+ for (int i = 0; i < offset; ++i) { /* pass 2: halve again */ \
1154
+ x[i] = vec_add(x[i], x[offset + i]); \
1155
+ } \
1156
+ offset >>= 1; \
1157
+ for (int i = 0; i < offset; ++i) { /* pass 3: last halving (GGML_F32_ARR is 32/4 = 8 in this branch, so offsets run 4, 2, 1) */ \
1158
+ x[i] = vec_add(x[i], x[offset + i]); \
1159
+ } \
1160
+ float32x4_t tmp = x[0] + vec_reve(x[0]); /* add x[0] to its element-reversed copy (vec_reve presumably reverses element order — confirm) */ \
1161
+ res = tmp[0] + tmp[1]; /* the first two elements of tmp then cover all four lanes */ \
1162
+ }
1163
+ #define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
1164
+ { /* Accumulates (+=, not =) the horizontal sum of s0+s1+s2+s3 into res. */ \
1165
+ float32x4_t v = vec_add(vec_add(s0, s1), \
1166
+ vec_add(s2, s3)); \
1167
+ v = vec_add(v, vec_sld(v, v, 8)); /* add v shifted by 8 bytes (vec_sld with both operands equal; presumably a rotation by two floats) */ \
1168
+ v = vec_add(v, vec_sld(v, v, 4)); /* then by 4 bytes (one float) */ \
1169
+ res += (ggml_float)vec_extract(v, 0); /* element 0 is read as the total after the two folds */ \
1170
+ }
1171
+
1172
+ #define GGML_F32_VEC GGML_F32x4
1173
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1174
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1175
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1176
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
1177
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
1178
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
1179
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
1180
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
1181
+
1182
+ // F16 s390x
1183
+ #define GGML_F16_STEP GGML_F32_STEP
1184
+ #define GGML_F16_EPR GGML_F32_EPR
1185
+
1186
+ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { // reads four fp16 elements from x and returns them widened to a float32x4_t
1187
+ float tmp[4]; // f32 scratch buffer for the converted elements
1188
+
1189
+ for (int i = 0; i < 4; i++) {
1190
+ tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); // scalar fp16 -> f32 conversion, one element per iteration
1191
+ }
1192
+
1193
+ // note: keep type-cast here to prevent compiler bugs
1194
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
1195
+ return vec_xl(0, (const float *)(tmp)); // vector load of the scratch buffer at byte offset 0
1196
+ }
1197
+
1198
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { // narrows the four f32 lanes of v_y to fp16 and writes them to x[0..3]
1199
+ float arr[4]; // f32 scratch buffer receiving the vector lanes
1200
+
1201
+ // note: keep type-cast here to prevent compiler bugs
1202
+ // see: https://github.com/ggml-org/llama.cpp/issues/12846
1203
+ vec_xst(v_y, 0, (float *)(arr)); // vector store of v_y into the scratch buffer at byte offset 0
1204
+
1205
+ for (int i = 0; i < 4; i++) {
1206
+ x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); // scalar f32 -> fp16 conversion, one element per iteration
1207
+ }
1208
+ }
1209
+
1210
+ #define GGML_F16_VEC GGML_F32x4
1211
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
1212
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
1213
+ #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
1214
+ #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
1215
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
1216
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
1217
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
1218
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
1219
+
1220
+ // BF16 s390x
1221
+ #define GGML_BF16_STEP 16
1222
+ #define GGML_BF16_EPR 8
1223
+
1224
+ #define GGML_BF16x8 __vector unsigned short
1225
+ #define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
1226
+ #define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
1227
+
1228
+ #define GGML_BF16_VEC GGML_BF16x8
1229
+ #define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
1230
+ #define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
1231
+ #define GGML_BF16_TO_F32_LO(v) ((float32x4_t) vec_mergel((v), GGML_BF16_VEC_ZERO)) // bf16 -> f32 widening: interleave 16-bit elements of v with zero halfwords (vec_mergel) and reinterpret as float
1232
+ #define GGML_BF16_TO_F32_HI(v) ((float32x4_t) vec_mergeh((v), GGML_BF16_VEC_ZERO)) // same widening, using vec_mergeh
1233
+ #define GGML_BF16_FMA_LO(acc, x, y) /* acc = b*c + a form: vec_madd(LO(x), LO(y), acc); the expansion is an assignment, so use it as a statement */ \
1234
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
1235
+ #define GGML_BF16_FMA_HI(acc, x, y) /* same as GGML_BF16_FMA_LO, using the HI widening of x and y */ \
1236
+ (acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
1237
+
1238
+ #elif defined(__riscv_v_intrinsic)
1239
+
1240
+ // compatible with vlen >= 128
1241
+
1242
+ #define GGML_SIMD
1243
+
1244
+ // F32
1245
+
1246
+ #define GGML_F32_STEP 16
1247
+ #define GGML_F32_EPR 4
1248
+
1249
+ #define GGML_F32x4 vfloat32m1_t
1250
+ #define GGML_F32x4_ZERO __riscv_vfmv_v_f_f32m1(0.0f, GGML_F32_EPR)
1251
+ #define GGML_F32x4_SET1(x) __riscv_vfmv_v_f_f32m1(x, GGML_F32_EPR)
1252
+ #define GGML_F32x4_LOAD(x) __riscv_vle32_v_f32m1(x, GGML_F32_EPR)
1253
+ #define GGML_F32x4_STORE(b, v) __riscv_vse32_v_f32m1(b, v, GGML_F32_EPR)
1254
+ #define GGML_F32x4_FMA(a, b, c) __riscv_vfmacc_vv_f32m1(a, b, c, GGML_F32_EPR)
1255
+ #define GGML_F32x4_ADD(a, b) __riscv_vfadd_vv_f32m1(a, b, GGML_F32_EPR)
1256
+ #define GGML_F32x4_MUL(a, b) __riscv_vfmul_vv_f32m1(a, b, GGML_F32_EPR)
1257
+
1258
+ #define GGML_F32_VEC GGML_F32x4
1259
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
1260
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
1261
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
1262
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
1263
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
1264
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
1265
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
1266
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE // NOTE(review): this RISC-V branch defines no GGML_F32x4_REDUCE in the visible section — confirm the alias is unused here or that the macro is provided elsewhere
1267
+
1268
+ #endif
1269
+
1270
+ // GGML_F32_ARR / GGML_F16_ARR
1271
+ // number of registers to use per step
1272
+ #ifdef GGML_SIMD
1273
+ #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
1274
+ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
1275
+ #endif
1276
+
1277
+ #ifdef __cplusplus
1278
+ }
1279
+ #endif