local-llm-rn 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (626)
  1. package/cpp/CMakeLists.txt +285 -0
  2. package/cpp/common/CMakeLists.txt +149 -0
  3. package/cpp/common/arg.cpp +3799 -0
  4. package/cpp/common/arg.h +131 -0
  5. package/cpp/common/base64.hpp +392 -0
  6. package/cpp/common/build-info.cpp.in +4 -0
  7. package/cpp/common/chat-parser-xml-toolcall.cpp +879 -0
  8. package/cpp/common/chat-parser-xml-toolcall.h +45 -0
  9. package/cpp/common/chat-parser.cpp +1649 -0
  10. package/cpp/common/chat-parser.h +133 -0
  11. package/cpp/common/chat-peg-parser.cpp +124 -0
  12. package/cpp/common/chat-peg-parser.h +105 -0
  13. package/cpp/common/chat.cpp +3355 -0
  14. package/cpp/common/chat.h +252 -0
  15. package/cpp/common/common.cpp +1824 -0
  16. package/cpp/common/common.h +930 -0
  17. package/cpp/common/console.cpp +1137 -0
  18. package/cpp/common/console.h +41 -0
  19. package/cpp/common/debug.cpp +167 -0
  20. package/cpp/common/debug.h +43 -0
  21. package/cpp/common/download.cpp +792 -0
  22. package/cpp/common/download.h +84 -0
  23. package/cpp/common/http.h +84 -0
  24. package/cpp/common/jinja/README.md +88 -0
  25. package/cpp/common/jinja/caps.cpp +285 -0
  26. package/cpp/common/jinja/caps.h +30 -0
  27. package/cpp/common/jinja/lexer.cpp +341 -0
  28. package/cpp/common/jinja/lexer.h +157 -0
  29. package/cpp/common/jinja/parser.cpp +591 -0
  30. package/cpp/common/jinja/parser.h +21 -0
  31. package/cpp/common/jinja/runtime.cpp +867 -0
  32. package/cpp/common/jinja/runtime.h +638 -0
  33. package/cpp/common/jinja/string.cpp +213 -0
  34. package/cpp/common/jinja/string.h +61 -0
  35. package/cpp/common/jinja/utils.h +149 -0
  36. package/cpp/common/jinja/value.cpp +1393 -0
  37. package/cpp/common/jinja/value.h +756 -0
  38. package/cpp/common/json-partial.cpp +324 -0
  39. package/cpp/common/json-partial.h +39 -0
  40. package/cpp/common/json-schema-to-grammar.cpp +1153 -0
  41. package/cpp/common/json-schema-to-grammar.h +43 -0
  42. package/cpp/common/llguidance.cpp +258 -0
  43. package/cpp/common/log.cpp +446 -0
  44. package/cpp/common/log.h +119 -0
  45. package/cpp/common/ngram-cache.cpp +285 -0
  46. package/cpp/common/ngram-cache.h +101 -0
  47. package/cpp/common/ngram-map.cpp +530 -0
  48. package/cpp/common/ngram-map.h +115 -0
  49. package/cpp/common/ngram-mod.cpp +60 -0
  50. package/cpp/common/ngram-mod.h +38 -0
  51. package/cpp/common/peg-parser.cpp +1712 -0
  52. package/cpp/common/peg-parser.h +459 -0
  53. package/cpp/common/preset.cpp +483 -0
  54. package/cpp/common/preset.h +83 -0
  55. package/cpp/common/regex-partial.cpp +204 -0
  56. package/cpp/common/regex-partial.h +56 -0
  57. package/cpp/common/sampling.cpp +745 -0
  58. package/cpp/common/sampling.h +119 -0
  59. package/cpp/common/speculative.cpp +1074 -0
  60. package/cpp/common/speculative.h +41 -0
  61. package/cpp/common/unicode.cpp +64 -0
  62. package/cpp/common/unicode.h +22 -0
  63. package/cpp/ggml/CMakeLists.txt +494 -0
  64. package/cpp/ggml/cmake/GitVars.cmake +22 -0
  65. package/cpp/ggml/cmake/common.cmake +50 -0
  66. package/cpp/ggml/cmake/ggml-config.cmake.in +191 -0
  67. package/cpp/ggml/include/ggml-alloc.h +85 -0
  68. package/cpp/ggml/include/ggml-backend.h +373 -0
  69. package/cpp/ggml/include/ggml-blas.h +25 -0
  70. package/cpp/ggml/include/ggml-cann.h +123 -0
  71. package/cpp/ggml/include/ggml-cpp.h +39 -0
  72. package/cpp/ggml/include/ggml-cpu.h +151 -0
  73. package/cpp/ggml/include/ggml-cuda.h +47 -0
  74. package/cpp/ggml/include/ggml-hexagon.h +19 -0
  75. package/cpp/ggml/include/ggml-metal.h +61 -0
  76. package/cpp/ggml/include/ggml-opencl.h +26 -0
  77. package/cpp/ggml/include/ggml-opt.h +256 -0
  78. package/cpp/ggml/include/ggml-rpc.h +30 -0
  79. package/cpp/ggml/include/ggml-sycl.h +49 -0
  80. package/cpp/ggml/include/ggml-virtgpu.h +14 -0
  81. package/cpp/ggml/include/ggml-vulkan.h +29 -0
  82. package/cpp/ggml/include/ggml-webgpu.h +19 -0
  83. package/cpp/ggml/include/ggml-zdnn.h +17 -0
  84. package/cpp/ggml/include/ggml-zendnn.h +22 -0
  85. package/cpp/ggml/include/ggml.h +2753 -0
  86. package/cpp/ggml/include/gguf.h +204 -0
  87. package/cpp/ggml/src/CMakeLists.txt +492 -0
  88. package/cpp/ggml/src/ggml-alloc.c +1244 -0
  89. package/cpp/ggml/src/ggml-backend-dl.cpp +48 -0
  90. package/cpp/ggml/src/ggml-backend-dl.h +45 -0
  91. package/cpp/ggml/src/ggml-backend-impl.h +255 -0
  92. package/cpp/ggml/src/ggml-backend-reg.cpp +566 -0
  93. package/cpp/ggml/src/ggml-backend.cpp +2270 -0
  94. package/cpp/ggml/src/ggml-blas/CMakeLists.txt +101 -0
  95. package/cpp/ggml/src/ggml-blas/ggml-blas.cpp +518 -0
  96. package/cpp/ggml/src/ggml-common.h +1878 -0
  97. package/cpp/ggml/src/ggml-cpu/CMakeLists.txt +691 -0
  98. package/cpp/ggml/src/ggml-cpu/amx/amx.cpp +247 -0
  99. package/cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  100. package/cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  101. package/cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2512 -0
  102. package/cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  103. package/cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +98 -0
  104. package/cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4052 -0
  105. package/cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +4935 -0
  106. package/cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2159 -0
  107. package/cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  108. package/cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2305 -0
  109. package/cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  110. package/cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2726 -0
  111. package/cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +342 -0
  112. package/cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
  113. package/cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1468 -0
  114. package/cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1221 -0
  115. package/cpp/ggml/src/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  116. package/cpp/ggml/src/ggml-cpu/arch/x86/quants.c +3820 -0
  117. package/cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +6307 -0
  118. package/cpp/ggml/src/ggml-cpu/arch-fallback.h +313 -0
  119. package/cpp/ggml/src/ggml-cpu/binary-ops.cpp +154 -0
  120. package/cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  121. package/cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +100 -0
  122. package/cpp/ggml/src/ggml-cpu/common.h +95 -0
  123. package/cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +529 -0
  124. package/cpp/ggml/src/ggml-cpu/ggml-cpu.c +3734 -0
  125. package/cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +701 -0
  126. package/cpp/ggml/src/ggml-cpu/hbm.cpp +55 -0
  127. package/cpp/ggml/src/ggml-cpu/hbm.h +8 -0
  128. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +938 -0
  129. package/cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +90 -0
  130. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +798 -0
  131. package/cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  132. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +4033 -0
  133. package/cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +25 -0
  134. package/cpp/ggml/src/ggml-cpu/ops.cpp +10978 -0
  135. package/cpp/ggml/src/ggml-cpu/ops.h +116 -0
  136. package/cpp/ggml/src/ggml-cpu/quants.c +1193 -0
  137. package/cpp/ggml/src/ggml-cpu/quants.h +97 -0
  138. package/cpp/ggml/src/ggml-cpu/repack.cpp +3316 -0
  139. package/cpp/ggml/src/ggml-cpu/repack.h +173 -0
  140. package/cpp/ggml/src/ggml-cpu/simd-gemm.h +136 -0
  141. package/cpp/ggml/src/ggml-cpu/simd-mappings.h +1279 -0
  142. package/cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
  143. package/cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
  144. package/cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
  145. package/cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
  146. package/cpp/ggml/src/ggml-cpu/traits.cpp +36 -0
  147. package/cpp/ggml/src/ggml-cpu/traits.h +38 -0
  148. package/cpp/ggml/src/ggml-cpu/unary-ops.cpp +337 -0
  149. package/cpp/ggml/src/ggml-cpu/unary-ops.h +35 -0
  150. package/cpp/ggml/src/ggml-cpu/vec.cpp +629 -0
  151. package/cpp/ggml/src/ggml-cpu/vec.h +1585 -0
  152. package/cpp/ggml/src/ggml-hexagon/CMakeLists.txt +117 -0
  153. package/cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3232 -0
  154. package/cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +45 -0
  155. package/cpp/ggml/src/ggml-hexagon/htp/act-ops.c +815 -0
  156. package/cpp/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
  157. package/cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +827 -0
  158. package/cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
  159. package/cpp/ggml/src/ggml-hexagon/htp/cpy-ops.c +251 -0
  160. package/cpp/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +666 -0
  161. package/cpp/ggml/src/ggml-hexagon/htp/get-rows-ops.c +111 -0
  162. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.c +63 -0
  163. package/cpp/ggml/src/ggml-hexagon/htp/hex-dma.h +182 -0
  164. package/cpp/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
  165. package/cpp/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
  166. package/cpp/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
  167. package/cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +35 -0
  168. package/cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +154 -0
  169. package/cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +65 -0
  170. package/cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
  171. package/cpp/ggml/src/ggml-hexagon/htp/hvx-arith.h +470 -0
  172. package/cpp/ggml/src/ggml-hexagon/htp/hvx-base.h +173 -0
  173. package/cpp/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
  174. package/cpp/ggml/src/ggml-hexagon/htp/hvx-div.h +116 -0
  175. package/cpp/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
  176. package/cpp/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
  177. package/cpp/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
  178. package/cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.h +176 -0
  179. package/cpp/ggml/src/ggml-hexagon/htp/hvx-reduce.h +266 -0
  180. package/cpp/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
  181. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
  182. package/cpp/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
  183. package/cpp/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
  184. package/cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +18 -0
  185. package/cpp/ggml/src/ggml-hexagon/htp/main.c +1150 -0
  186. package/cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2595 -0
  187. package/cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +498 -0
  188. package/cpp/ggml/src/ggml-hexagon/htp/set-rows-ops.c +167 -0
  189. package/cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +421 -0
  190. package/cpp/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +130 -0
  191. package/cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +384 -0
  192. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +293 -0
  193. package/cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
  194. package/cpp/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
  195. package/cpp/ggml/src/ggml-hexagon/htp-drv.h +121 -0
  196. package/cpp/ggml/src/ggml-hexagon/libdl.h +79 -0
  197. package/cpp/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
  198. package/cpp/ggml/src/ggml-hexagon/op-desc.h +153 -0
  199. package/cpp/ggml/src/ggml-impl.h +724 -0
  200. package/cpp/ggml/src/ggml-metal/CMakeLists.txt +124 -0
  201. package/cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +457 -0
  202. package/cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
  203. package/cpp/ggml/src/ggml-metal/ggml-metal-context.h +41 -0
  204. package/cpp/ggml/src/ggml-metal/ggml-metal-context.m +702 -0
  205. package/cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1890 -0
  206. package/cpp/ggml/src/ggml-metal/ggml-metal-device.h +290 -0
  207. package/cpp/ggml/src/ggml-metal/ggml-metal-device.m +1749 -0
  208. package/cpp/ggml/src/ggml-metal/ggml-metal-impl.h +1054 -0
  209. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +4370 -0
  210. package/cpp/ggml/src/ggml-metal/ggml-metal-ops.h +94 -0
  211. package/cpp/ggml/src/ggml-metal/ggml-metal.cpp +937 -0
  212. package/cpp/ggml/src/ggml-metal/ggml-metal.metal +9819 -0
  213. package/cpp/ggml/src/ggml-musa/CMakeLists.txt +125 -0
  214. package/cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  215. package/cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  216. package/cpp/ggml/src/ggml-opencl/CMakeLists.txt +150 -0
  217. package/cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +11553 -0
  218. package/cpp/ggml/src/ggml-opencl/kernels/add.cl +190 -0
  219. package/cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
  220. package/cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  221. package/cpp/ggml/src/ggml-opencl/kernels/clamp.cl +20 -0
  222. package/cpp/ggml/src/ggml-opencl/kernels/concat.cl +51 -0
  223. package/cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  224. package/cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  225. package/cpp/ggml/src/ggml-opencl/kernels/cpy.cl +184 -0
  226. package/cpp/ggml/src/ggml-opencl/kernels/cvt.cl +417 -0
  227. package/cpp/ggml/src/ggml-opencl/kernels/diag_mask_inf.cl +58 -0
  228. package/cpp/ggml/src/ggml-opencl/kernels/div.cl +138 -0
  229. package/cpp/ggml/src/ggml-opencl/kernels/embed_kernel.py +26 -0
  230. package/cpp/ggml/src/ggml-opencl/kernels/expm1.cl +113 -0
  231. package/cpp/ggml/src/ggml-opencl/kernels/fill.cl +17 -0
  232. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +370 -0
  233. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +371 -0
  234. package/cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +373 -0
  235. package/cpp/ggml/src/ggml-opencl/kernels/gelu.cl +89 -0
  236. package/cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
  237. package/cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
  238. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle.cl +268 -0
  239. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general.cl +274 -0
  240. package/cpp/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
  241. package/cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +187 -0
  242. package/cpp/ggml/src/ggml-opencl/kernels/glu.cl +378 -0
  243. package/cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +121 -0
  244. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +57 -0
  245. package/cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +57 -0
  246. package/cpp/ggml/src/ggml-opencl/kernels/mean.cl +140 -0
  247. package/cpp/ggml/src/ggml-opencl/kernels/mul.cl +152 -0
  248. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_Ab_Bi_8x4.cl +139 -0
  249. package/cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
  250. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
  251. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +146 -0
  252. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +147 -0
  253. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
  254. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
  255. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
  256. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
  257. package/cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
  258. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f16.cl +118 -0
  259. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32.cl +118 -0
  260. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_1row.cl +94 -0
  261. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +84 -0
  262. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_f32_f32.cl +118 -0
  263. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
  264. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
  265. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  266. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
  267. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
  268. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
  269. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
  270. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32.cl +192 -0
  271. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_16x_flat.cl +307 -0
  272. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_1d_8x_flat.cl +265 -0
  273. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_8x_flat.cl +272 -0
  274. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_0_f32_v.cl +254 -0
  275. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
  276. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
  277. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
  278. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32.cl +194 -0
  279. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
  280. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
  281. package/cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
  282. package/cpp/ggml/src/ggml-opencl/kernels/norm.cl +161 -0
  283. package/cpp/ggml/src/ggml-opencl/kernels/pad.cl +39 -0
  284. package/cpp/ggml/src/ggml-opencl/kernels/relu.cl +16 -0
  285. package/cpp/ggml/src/ggml-opencl/kernels/repeat.cl +38 -0
  286. package/cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +190 -0
  287. package/cpp/ggml/src/ggml-opencl/kernels/rope.cl +747 -0
  288. package/cpp/ggml/src/ggml-opencl/kernels/scale.cl +27 -0
  289. package/cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +208 -0
  290. package/cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  291. package/cpp/ggml/src/ggml-opencl/kernels/silu.cl +30 -0
  292. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +108 -0
  293. package/cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +108 -0
  294. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +107 -0
  295. package/cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +107 -0
  296. package/cpp/ggml/src/ggml-opencl/kernels/softplus.cl +116 -0
  297. package/cpp/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
  298. package/cpp/ggml/src/ggml-opencl/kernels/sqr.cl +53 -0
  299. package/cpp/ggml/src/ggml-opencl/kernels/sqrt.cl +53 -0
  300. package/cpp/ggml/src/ggml-opencl/kernels/ssm_conv.cl +77 -0
  301. package/cpp/ggml/src/ggml-opencl/kernels/sub.cl +138 -0
  302. package/cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +140 -0
  303. package/cpp/ggml/src/ggml-opencl/kernels/tanh.cl +109 -0
  304. package/cpp/ggml/src/ggml-opencl/kernels/transpose.cl +117 -0
  305. package/cpp/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
  306. package/cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  307. package/cpp/ggml/src/ggml-opencl/kernels/upscale.cl +120 -0
  308. package/cpp/ggml/src/ggml-opt.cpp +1093 -0
  309. package/cpp/ggml/src/ggml-quants.c +5325 -0
  310. package/cpp/ggml/src/ggml-quants.h +106 -0
  311. package/cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  312. package/cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2118 -0
  313. package/cpp/ggml/src/ggml-threading.cpp +12 -0
  314. package/cpp/ggml/src/ggml-threading.h +14 -0
  315. package/cpp/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
  316. package/cpp/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
  317. package/cpp/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
  318. package/cpp/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
  319. package/cpp/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
  320. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
  321. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
  322. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
  323. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
  324. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
  325. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
  326. package/cpp/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
  327. package/cpp/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
  328. package/cpp/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
  329. package/cpp/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
  330. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
  331. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
  332. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
  333. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
  334. package/cpp/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
  335. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
  336. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
  337. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
  338. package/cpp/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
  339. package/cpp/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
  340. package/cpp/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
  341. package/cpp/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
  342. package/cpp/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
  343. package/cpp/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
  344. package/cpp/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
  345. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
  346. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
  347. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
  348. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
  349. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
  350. package/cpp/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
  351. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
  352. package/cpp/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
  353. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
  354. package/cpp/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
  355. package/cpp/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
  356. package/cpp/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
  357. package/cpp/ggml/src/ggml-webgpu/CMakeLists.txt +80 -0
  358. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1231 -0
  359. package/cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +3150 -0
  360. package/cpp/ggml/src/ggml-webgpu/pre_wgsl.hpp +778 -0
  361. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
  362. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
  363. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
  364. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +107 -0
  365. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +923 -0
  366. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +107 -0
  367. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
  368. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +182 -0
  369. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +636 -0
  370. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.wgsl +668 -0
  371. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
  372. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  373. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +713 -0
  374. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +103 -0
  375. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.wgsl +138 -0
  376. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.wgsl +188 -0
  377. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +194 -0
  378. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
  379. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
  380. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
  381. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.wgsl +63 -0
  382. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +109 -0
  383. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
  384. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
  385. package/cpp/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
  386. package/cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
  387. package/cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
  388. package/cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +633 -0
  389. package/cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
  390. package/cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
  391. package/cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
  392. package/cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
  393. package/cpp/ggml/src/ggml-zendnn/CMakeLists.txt +92 -0
  394. package/cpp/ggml/src/ggml-zendnn/ggml-zendnn.cpp +469 -0
  395. package/cpp/ggml/src/ggml.c +7669 -0
  396. package/cpp/ggml/src/ggml.cpp +26 -0
  397. package/cpp/ggml/src/gguf.cpp +1699 -0
  398. package/cpp/include/llama-cpp.h +32 -0
  399. package/cpp/include/llama.h +1568 -0
  400. package/cpp/mtmd/CMakeLists.txt +98 -0
  401. package/cpp/mtmd/README.md +63 -0
  402. package/cpp/mtmd/clip-graph.h +117 -0
  403. package/cpp/mtmd/clip-impl.h +586 -0
  404. package/cpp/mtmd/clip-model.h +390 -0
  405. package/cpp/mtmd/clip.cpp +4154 -0
  406. package/cpp/mtmd/clip.h +121 -0
  407. package/cpp/mtmd/deprecation-warning.cpp +22 -0
  408. package/cpp/mtmd/legacy-models/convert_image_encoder_to_gguf.py +412 -0
  409. package/cpp/mtmd/legacy-models/glmedge-convert-image-encoder-to-gguf.py +280 -0
  410. package/cpp/mtmd/legacy-models/glmedge-surgery.py +33 -0
  411. package/cpp/mtmd/legacy-models/llava_surgery.py +38 -0
  412. package/cpp/mtmd/legacy-models/llava_surgery_v2.py +180 -0
  413. package/cpp/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +892 -0
  414. package/cpp/mtmd/legacy-models/minicpmv-surgery.py +47 -0
  415. package/cpp/mtmd/models/cogvlm.cpp +98 -0
  416. package/cpp/mtmd/models/conformer.cpp +216 -0
  417. package/cpp/mtmd/models/glm4v.cpp +122 -0
  418. package/cpp/mtmd/models/internvl.cpp +69 -0
  419. package/cpp/mtmd/models/kimik25.cpp +101 -0
  420. package/cpp/mtmd/models/kimivl.cpp +63 -0
  421. package/cpp/mtmd/models/llama4.cpp +96 -0
  422. package/cpp/mtmd/models/llava.cpp +374 -0
  423. package/cpp/mtmd/models/minicpmv.cpp +114 -0
  424. package/cpp/mtmd/models/mobilenetv5.cpp +451 -0
  425. package/cpp/mtmd/models/models.h +128 -0
  426. package/cpp/mtmd/models/nemotron-v2-vl.cpp +35 -0
  427. package/cpp/mtmd/models/paddleocr.cpp +52 -0
  428. package/cpp/mtmd/models/pixtral.cpp +86 -0
  429. package/cpp/mtmd/models/qwen2vl.cpp +183 -0
  430. package/cpp/mtmd/models/qwen3vl.cpp +193 -0
  431. package/cpp/mtmd/models/siglip.cpp +86 -0
  432. package/cpp/mtmd/models/whisper-enc.cpp +115 -0
  433. package/cpp/mtmd/models/youtuvl.cpp +179 -0
  434. package/cpp/mtmd/mtmd-audio.cpp +730 -0
  435. package/cpp/mtmd/mtmd-audio.h +113 -0
  436. package/cpp/mtmd/mtmd-cli.cpp +437 -0
  437. package/cpp/mtmd/mtmd-helper.cpp +521 -0
  438. package/cpp/mtmd/mtmd-helper.h +96 -0
  439. package/cpp/mtmd/mtmd.cpp +1156 -0
  440. package/cpp/mtmd/mtmd.h +319 -0
  441. package/cpp/mtmd/requirements.txt +5 -0
  442. package/cpp/mtmd/test-1.jpeg +0 -0
  443. package/cpp/mtmd/test-2.mp3 +0 -0
  444. package/cpp/mtmd/tests.sh +192 -0
  445. package/cpp/src/CMakeLists.txt +169 -0
  446. package/cpp/src/llama-adapter.cpp +488 -0
  447. package/cpp/src/llama-adapter.h +89 -0
  448. package/cpp/src/llama-arch.cpp +2855 -0
  449. package/cpp/src/llama-arch.h +619 -0
  450. package/cpp/src/llama-batch.cpp +917 -0
  451. package/cpp/src/llama-batch.h +173 -0
  452. package/cpp/src/llama-chat.cpp +896 -0
  453. package/cpp/src/llama-chat.h +71 -0
  454. package/cpp/src/llama-context.cpp +3512 -0
  455. package/cpp/src/llama-context.h +359 -0
  456. package/cpp/src/llama-cparams.cpp +5 -0
  457. package/cpp/src/llama-cparams.h +44 -0
  458. package/cpp/src/llama-grammar.cpp +1464 -0
  459. package/cpp/src/llama-grammar.h +194 -0
  460. package/cpp/src/llama-graph.cpp +2685 -0
  461. package/cpp/src/llama-graph.h +1026 -0
  462. package/cpp/src/llama-hparams.cpp +234 -0
  463. package/cpp/src/llama-hparams.h +339 -0
  464. package/cpp/src/llama-impl.cpp +171 -0
  465. package/cpp/src/llama-impl.h +73 -0
  466. package/cpp/src/llama-io.cpp +15 -0
  467. package/cpp/src/llama-io.h +35 -0
  468. package/cpp/src/llama-kv-cache-iswa.cpp +330 -0
  469. package/cpp/src/llama-kv-cache-iswa.h +137 -0
  470. package/cpp/src/llama-kv-cache.cpp +2271 -0
  471. package/cpp/src/llama-kv-cache.h +388 -0
  472. package/cpp/src/llama-kv-cells.h +533 -0
  473. package/cpp/src/llama-memory-hybrid-iswa.cpp +275 -0
  474. package/cpp/src/llama-memory-hybrid-iswa.h +140 -0
  475. package/cpp/src/llama-memory-hybrid.cpp +268 -0
  476. package/cpp/src/llama-memory-hybrid.h +139 -0
  477. package/cpp/src/llama-memory-recurrent.cpp +1165 -0
  478. package/cpp/src/llama-memory-recurrent.h +182 -0
  479. package/cpp/src/llama-memory.cpp +59 -0
  480. package/cpp/src/llama-memory.h +122 -0
  481. package/cpp/src/llama-mmap.cpp +785 -0
  482. package/cpp/src/llama-mmap.h +92 -0
  483. package/cpp/src/llama-model-loader.cpp +1414 -0
  484. package/cpp/src/llama-model-loader.h +203 -0
  485. package/cpp/src/llama-model-saver.cpp +286 -0
  486. package/cpp/src/llama-model-saver.h +37 -0
  487. package/cpp/src/llama-model.cpp +9253 -0
  488. package/cpp/src/llama-model.h +576 -0
  489. package/cpp/src/llama-quant.cpp +1119 -0
  490. package/cpp/src/llama-quant.h +1 -0
  491. package/cpp/src/llama-sampler.cpp +3885 -0
  492. package/cpp/src/llama-sampler.h +42 -0
  493. package/cpp/src/llama-vocab.cpp +3970 -0
  494. package/cpp/src/llama-vocab.h +187 -0
  495. package/cpp/src/llama.cpp +1313 -0
  496. package/cpp/src/models/afmoe.cpp +191 -0
  497. package/cpp/src/models/apertus.cpp +125 -0
  498. package/cpp/src/models/arcee.cpp +135 -0
  499. package/cpp/src/models/arctic.cpp +138 -0
  500. package/cpp/src/models/arwkv7.cpp +86 -0
  501. package/cpp/src/models/baichuan.cpp +122 -0
  502. package/cpp/src/models/bailingmoe.cpp +144 -0
  503. package/cpp/src/models/bailingmoe2.cpp +135 -0
  504. package/cpp/src/models/bert.cpp +178 -0
  505. package/cpp/src/models/bitnet.cpp +160 -0
  506. package/cpp/src/models/bloom.cpp +101 -0
  507. package/cpp/src/models/chameleon.cpp +178 -0
  508. package/cpp/src/models/chatglm.cpp +132 -0
  509. package/cpp/src/models/codeshell.cpp +111 -0
  510. package/cpp/src/models/cogvlm.cpp +102 -0
  511. package/cpp/src/models/cohere2-iswa.cpp +134 -0
  512. package/cpp/src/models/command-r.cpp +122 -0
  513. package/cpp/src/models/dbrx.cpp +123 -0
  514. package/cpp/src/models/deci.cpp +135 -0
  515. package/cpp/src/models/deepseek.cpp +144 -0
  516. package/cpp/src/models/deepseek2.cpp +262 -0
  517. package/cpp/src/models/delta-net-base.cpp +376 -0
  518. package/cpp/src/models/dots1.cpp +134 -0
  519. package/cpp/src/models/dream.cpp +105 -0
  520. package/cpp/src/models/ernie4-5-moe.cpp +150 -0
  521. package/cpp/src/models/ernie4-5.cpp +110 -0
  522. package/cpp/src/models/eurobert.cpp +97 -0
  523. package/cpp/src/models/exaone-moe.cpp +146 -0
  524. package/cpp/src/models/exaone.cpp +114 -0
  525. package/cpp/src/models/exaone4.cpp +123 -0
  526. package/cpp/src/models/falcon-h1.cpp +111 -0
  527. package/cpp/src/models/falcon.cpp +120 -0
  528. package/cpp/src/models/gemma-embedding.cpp +116 -0
  529. package/cpp/src/models/gemma.cpp +112 -0
  530. package/cpp/src/models/gemma2-iswa.cpp +128 -0
  531. package/cpp/src/models/gemma3.cpp +155 -0
  532. package/cpp/src/models/gemma3n-iswa.cpp +384 -0
  533. package/cpp/src/models/glm4-moe.cpp +170 -0
  534. package/cpp/src/models/glm4.cpp +157 -0
  535. package/cpp/src/models/gpt2.cpp +105 -0
  536. package/cpp/src/models/gptneox.cpp +144 -0
  537. package/cpp/src/models/granite-hybrid.cpp +196 -0
  538. package/cpp/src/models/granite.cpp +211 -0
  539. package/cpp/src/models/grok.cpp +159 -0
  540. package/cpp/src/models/grovemoe.cpp +141 -0
  541. package/cpp/src/models/hunyuan-dense.cpp +132 -0
  542. package/cpp/src/models/hunyuan-moe.cpp +154 -0
  543. package/cpp/src/models/internlm2.cpp +120 -0
  544. package/cpp/src/models/jais.cpp +86 -0
  545. package/cpp/src/models/jais2.cpp +123 -0
  546. package/cpp/src/models/jamba.cpp +106 -0
  547. package/cpp/src/models/kimi-linear.cpp +392 -0
  548. package/cpp/src/models/lfm2.cpp +190 -0
  549. package/cpp/src/models/llada-moe.cpp +122 -0
  550. package/cpp/src/models/llada.cpp +99 -0
  551. package/cpp/src/models/llama-iswa.cpp +178 -0
  552. package/cpp/src/models/llama.cpp +168 -0
  553. package/cpp/src/models/maincoder.cpp +117 -0
  554. package/cpp/src/models/mamba-base.cpp +285 -0
  555. package/cpp/src/models/mamba.cpp +54 -0
  556. package/cpp/src/models/mimo2-iswa.cpp +123 -0
  557. package/cpp/src/models/minicpm3.cpp +200 -0
  558. package/cpp/src/models/minimax-m2.cpp +124 -0
  559. package/cpp/src/models/mistral3.cpp +160 -0
  560. package/cpp/src/models/models.h +684 -0
  561. package/cpp/src/models/modern-bert.cpp +109 -0
  562. package/cpp/src/models/mpt.cpp +126 -0
  563. package/cpp/src/models/nemotron-h.cpp +148 -0
  564. package/cpp/src/models/nemotron.cpp +122 -0
  565. package/cpp/src/models/neo-bert.cpp +104 -0
  566. package/cpp/src/models/olmo.cpp +121 -0
  567. package/cpp/src/models/olmo2.cpp +150 -0
  568. package/cpp/src/models/olmoe.cpp +124 -0
  569. package/cpp/src/models/openai-moe-iswa.cpp +127 -0
  570. package/cpp/src/models/openelm.cpp +124 -0
  571. package/cpp/src/models/orion.cpp +123 -0
  572. package/cpp/src/models/paddleocr.cpp +122 -0
  573. package/cpp/src/models/pangu-embedded.cpp +121 -0
  574. package/cpp/src/models/phi2.cpp +121 -0
  575. package/cpp/src/models/phi3.cpp +152 -0
  576. package/cpp/src/models/plamo.cpp +110 -0
  577. package/cpp/src/models/plamo2.cpp +318 -0
  578. package/cpp/src/models/plamo3.cpp +128 -0
  579. package/cpp/src/models/plm.cpp +169 -0
  580. package/cpp/src/models/qwen.cpp +108 -0
  581. package/cpp/src/models/qwen2.cpp +126 -0
  582. package/cpp/src/models/qwen2moe.cpp +151 -0
  583. package/cpp/src/models/qwen2vl.cpp +117 -0
  584. package/cpp/src/models/qwen3.cpp +117 -0
  585. package/cpp/src/models/qwen35.cpp +386 -0
  586. package/cpp/src/models/qwen35moe.cpp +420 -0
  587. package/cpp/src/models/qwen3moe.cpp +124 -0
  588. package/cpp/src/models/qwen3next.cpp +525 -0
  589. package/cpp/src/models/qwen3vl-moe.cpp +140 -0
  590. package/cpp/src/models/qwen3vl.cpp +132 -0
  591. package/cpp/src/models/refact.cpp +94 -0
  592. package/cpp/src/models/rnd1.cpp +126 -0
  593. package/cpp/src/models/rwkv6-base.cpp +164 -0
  594. package/cpp/src/models/rwkv6.cpp +94 -0
  595. package/cpp/src/models/rwkv6qwen2.cpp +86 -0
  596. package/cpp/src/models/rwkv7-base.cpp +137 -0
  597. package/cpp/src/models/rwkv7.cpp +90 -0
  598. package/cpp/src/models/seed-oss.cpp +124 -0
  599. package/cpp/src/models/smallthinker.cpp +126 -0
  600. package/cpp/src/models/smollm3.cpp +128 -0
  601. package/cpp/src/models/stablelm.cpp +146 -0
  602. package/cpp/src/models/starcoder.cpp +100 -0
  603. package/cpp/src/models/starcoder2.cpp +121 -0
  604. package/cpp/src/models/step35-iswa.cpp +168 -0
  605. package/cpp/src/models/t5-dec.cpp +166 -0
  606. package/cpp/src/models/t5-enc.cpp +96 -0
  607. package/cpp/src/models/wavtokenizer-dec.cpp +149 -0
  608. package/cpp/src/models/xverse.cpp +108 -0
  609. package/cpp/src/unicode-data.cpp +7034 -0
  610. package/cpp/src/unicode-data.h +20 -0
  611. package/cpp/src/unicode.cpp +1103 -0
  612. package/cpp/src/unicode.h +111 -0
  613. package/cpp/vendor/nlohmann/json.hpp +25526 -0
  614. package/cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  615. package/cpp/vendor/stb/stb_image.h +7988 -0
  616. package/ios/LocalLLM-Bridging-Header.h +2 -0
  617. package/ios/LocalLLM.h +5 -0
  618. package/ios/LocalLLM.mm +1267 -0
  619. package/local-llm-rn.podspec +60 -0
  620. package/package.json +35 -0
  621. package/src/NativeLocalLLM.ts +73 -0
  622. package/src/device.ts +50 -0
  623. package/src/download-adapter.ts +17 -0
  624. package/src/index.ts +21 -0
  625. package/src/native-bridge.ts +142 -0
  626. package/src/rn-downloader.ts +37 -0
@@ -0,0 +1,1585 @@
1
+ // Vectorized functions for fundamental operations
2
+
3
+ #pragma once
4
+
5
+ #include "ggml-impl.h"
6
+ #include "simd-mappings.h"
7
+ #include "ggml.h"
8
+ #include "ggml-cpu.h"
9
+
10
+ #if defined(GGML_USE_ACCELERATE)
11
+ #include <Accelerate/Accelerate.h>
12
+ #endif
13
+
14
+ // floating point type used to accumulate sums
15
+ typedef double ggml_float;
16
+
17
+ #define GGML_GELU_FP16
18
+ #define GGML_GELU_QUICK_FP16
19
+
20
+ #define GGML_SOFT_MAX_UNROLL 4
21
+ #define GGML_VEC_DOT_UNROLL 2
22
+ #define GGML_VEC_MAD_UNROLL 32
23
+
24
+ #ifdef __cplusplus
25
+ extern "C" {
26
+ #endif
27
+
28
+ //
29
+ // global data
30
+ //
31
+
32
+ // precomputed gelu table for f16 (128 KB)
33
+ extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
34
+
35
+ // precomputed quick gelu table for f16 (128 KB)
36
+ extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
37
+
38
+ //
39
+ // fundamental operations
40
+ //
41
+
42
+ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
43
+ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
44
+ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
45
+
46
+ void ggml_vec_silu_f32(const int n, float * y, const float * x);
47
+ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); //it will also center y ( y = y - mean )
48
+ ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
49
+ ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
50
+
51
// Fill the first n elements of x with the value v (no-op when n <= 0).
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) {
    int8_t * dst = x;
    for (int left = n; left > 0; --left) {
        *dst++ = v;
    }
}
52
// Fill the first n elements of x with the value v (no-op when n <= 0).
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) {
    int16_t * dst = x;
    for (int left = n; left > 0; --left) {
        *dst++ = v;
    }
}
53
+
54
// Fill the first n elements of x with the value v (no-op when n <= 0).
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) {
    int32_t * dst = x;
    for (int left = n; left > 0; --left) {
        *dst++ = v;
    }
}
55
// Copy the first n elements of x into y, front to back.
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) {
    const int32_t * src = x;
    int32_t       * dst = y;
    for (int left = n; left > 0; --left) {
        *dst++ = *src++;
    }
}
56
+
57
+ inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
58
+ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
59
+
60
// Element-wise sum: z[i] = x[i] + y[i] for i in [0, n).
inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int pos = 0;
#if defined(__AVX2__)
    // 8 floats per iteration using unaligned 256-bit loads and stores
    while (pos + 7 < n) {
        const __m256 sum8 = _mm256_add_ps(_mm256_loadu_ps(x + pos), _mm256_loadu_ps(y + pos));
        _mm256_storeu_ps(z + pos, sum8);
        pos += 8;
    }
#endif
    // scalar tail (or the whole range when the AVX2 path is not compiled in)
    while (pos < n) {
        z[pos] = x[pos] + y[pos];
        ++pos;
    }
}
74
+
75
+ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
76
+ for (int i = 0; i < n; ++i) {
77
+ z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
78
+ }
79
+ }
80
// Add the scalar v to every element: z[i] = x[i] + v for i in [0, n).
inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) {
    int k = 0;
    while (k < n) {
        z[k] = x[k] + v;
        ++k;
    }
}
81
// Accumulate x into y in place: y[i] += x[i] for i in [0, n).
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] += x[k];
        ++k;
    }
}
82
// Add the scalar v to every element of y in place: y[i] += v for i in [0, n).
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) {
    int k = 0;
    while (k < n) {
        y[k] += v;
        ++k;
    }
}
83
// Element-wise difference: z[i] = x[i] - y[i] for i in [0, n).
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        z[k] = x[k] - y[k];
        ++k;
    }
}
84
+ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
85
+ for (int i = 0; i < n; ++i) {
86
+ z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
87
+ }
88
+ }
89
// Fill the first n elements of x with the value v (no-op when n <= 0).
inline static void ggml_vec_set_f32 (const int n, float * x, const float v) {
    float * dst = x;
    for (int left = n; left > 0; --left) {
        *dst++ = v;
    }
}
90
// Copy the first n elements of x into y, front to back.
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) {
    const float * src = x;
    float       * dst = y;
    for (int left = n; left > 0; --left) {
        *dst++ = *src++;
    }
}
91
// Element-wise negation: y[i] = -x[i] for i in [0, n).
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) {
    int k = 0;
    while (k < n) {
        y[k] = -x[k];
        ++k;
    }
}
92
+ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
93
+ for (int i = 0; i < n; ++i) {
94
+ y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
95
+ }
96
+ }
97
+
98
// Element-wise product: z[i] = x[i]*y[i] for i in [0, n).
inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        z[k] = x[k]*y[k];
        ++k;
    }
}
99
+ inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
100
+ for (int i = 0; i < n; ++i) {
101
+ z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
102
+ }
103
+ }
104
// Element-wise quotient: z[i] = x[i]/y[i] for i in [0, n).
inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) {
    int k = 0;
    while (k < n) {
        z[k] = x[k]/y[k];
        ++k;
    }
}
105
+ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
106
+ for (int i = 0; i < n; ++i) {
107
+ z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
108
+ }
109
+ }
110
+
111
+ // compute GGML_VEC_DOT_UNROLL dot products at once
112
+ // xs - x row stride in bytes
113
+ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
114
+ ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
115
+
116
+ ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
117
+
118
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
119
+ x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
120
+ }
121
+
122
+ #if defined(GGML_SIMD)
123
+ #if defined(__ARM_FEATURE_SVE)
124
+
125
+ const int sve_register_length = svcntb() * 8;
126
+ const int ggml_f16_epr = sve_register_length / 16; // running when 16
127
+ const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
128
+
129
+ const int np = (n & ~(ggml_f16_step - 1));
130
+
131
+ svfloat16_t sum_00 = svdup_n_f16(0.0f);
132
+ svfloat16_t sum_01 = svdup_n_f16(0.0f);
133
+ svfloat16_t sum_02 = svdup_n_f16(0.0f);
134
+ svfloat16_t sum_03 = svdup_n_f16(0.0f);
135
+
136
+ svfloat16_t sum_10 = svdup_n_f16(0.0f);
137
+ svfloat16_t sum_11 = svdup_n_f16(0.0f);
138
+ svfloat16_t sum_12 = svdup_n_f16(0.0f);
139
+ svfloat16_t sum_13 = svdup_n_f16(0.0f);
140
+
141
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
142
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
143
+
144
+ for (int i = 0; i < np; i += ggml_f16_step) {
145
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
146
+
147
+ ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elements
148
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
149
+ ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
150
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
151
+
152
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
153
+
154
+ ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 elements
155
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
156
+ ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
157
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
158
+
159
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
160
+
161
+ ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
162
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
163
+ ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
164
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
165
+
166
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
167
+
168
+ ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
169
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
170
+ ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
171
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
172
+
173
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
174
+
175
+ ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
176
+
177
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
178
+ ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
179
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
180
+
181
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
182
+
183
+ ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
184
+
185
+ sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
186
+ ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
187
+ sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
188
+
189
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
190
+
191
+ ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
192
+
193
+ sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
194
+ ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
195
+ sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
196
+
197
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
198
+
199
+ ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
200
+
201
+ sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
202
+ ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
203
+ sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
204
+ }
205
+
206
+ const int np2 = (n & ~(ggml_f16_epr - 1));
207
+ for (int k = np; k < np2; k += ggml_f16_epr) {
208
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
209
+
210
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
211
+ sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
212
+ rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
213
+ sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
214
+ }
215
+
216
+ if (np2 < n) {
217
+ svbool_t pg = svwhilelt_b16(np2, n);
218
+ svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
219
+ svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
220
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
221
+
222
+ sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
223
+ sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
224
+ }
225
+ GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
226
+ GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
227
+
228
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
229
+ size_t vl = __riscv_vsetvlmax_e32m4();
230
+
231
+ // initialize accumulators to all zeroes
232
+ vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
233
+ vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
234
+ vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
235
+ vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
236
+
237
+ // calculate step size
238
+ const size_t epr = __riscv_vsetvlmax_e16m2();
239
+ const size_t step = epr * 2;
240
+ const int np = (n & ~(step - 1));
241
+
242
+ // unroll by 2 along the row dimension
243
+ for (int i = 0; i < np; i += step) {
244
+ vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
245
+ vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
246
+ vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
247
+ vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
248
+ vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
249
+
250
+ vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
251
+ vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
252
+ vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
253
+ vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
254
+ vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
255
+ }
256
+
257
+ vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
258
+ vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
259
+
260
+ // leftovers
261
+ for (int i = np; i < n; i += vl) {
262
+ vl = __riscv_vsetvl_e16m2(n - i);
263
+ vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
264
+ vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
265
+ vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
266
+
267
+ vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
268
+ vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
269
+ }
270
+
271
+ // reduce
272
+ vl = __riscv_vsetvlmax_e32m2();
273
+ vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
274
+ __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
275
+ vl = __riscv_vsetvlmax_e32m1();
276
+ vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
277
+ __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
278
+ vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
279
+ acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
280
+
281
+ vl = __riscv_vsetvlmax_e32m2();
282
+ vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
283
+ __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
284
+ vl = __riscv_vsetvlmax_e32m1();
285
+ vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
286
+ __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
287
+ vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
288
+ acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
289
+ sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
290
+ sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
291
+
292
+ #else
293
+ const int np = (n & ~(GGML_F16_STEP - 1));
294
+
295
+ GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
296
+
297
+ GGML_F16_VEC ax[GGML_F16_ARR];
298
+ GGML_F16_VEC ay[GGML_F16_ARR];
299
+
300
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
301
+ for (int j = 0; j < GGML_F16_ARR; j++) {
302
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
303
+
304
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
305
+ ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
306
+
307
+ sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
308
+ }
309
+ }
310
+ }
311
+
312
+ // reduce sum0..sum3 to sum0
313
+ for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
314
+ GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
315
+ }
316
+
317
+ // leftovers
318
+ for (int i = np; i < n; ++i) {
319
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
320
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
321
+ }
322
+ }
323
+ #endif
324
+ #else
325
+ for (int i = 0; i < n; ++i) {
326
+ for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
327
+ sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
328
+ }
329
+ }
330
+ #endif
331
+
332
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
333
+ s[i] = (float)sumf[i];
334
+ }
335
+ }
336
+
337
+ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
338
+ #if defined(GGML_SIMD)
339
+ #if defined(__ARM_FEATURE_SVE)
340
+
341
+ const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
342
+ const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
343
+ const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
344
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
345
+
346
+ const int np = (n & ~(ggml_f32_step - 1));
347
+ svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
348
+ svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
349
+ for (int i = 0; i < np; i += ggml_f32_step) {
350
+
351
+ ax1 = GGML_F32_VEC_LOAD(x + i);
352
+ ay1 = GGML_F32_VEC_LOAD(y + i);
353
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
354
+
355
+ GGML_F32_VEC_STORE(y + i, ay1);
356
+
357
+ ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
358
+ ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
359
+ ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
360
+
361
+ GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
362
+
363
+ ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
364
+ ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
365
+ ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
366
+
367
+ GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
368
+
369
+ ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
370
+ ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
371
+ ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
372
+
373
+ GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
374
+
375
+ ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
376
+ ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
377
+ ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
378
+
379
+ GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
380
+
381
+ ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
382
+ ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
383
+ ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
384
+
385
+ GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
386
+
387
+ ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
388
+ ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
389
+ ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
390
+
391
+ GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
392
+
393
+ ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
394
+ ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
395
+ ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
396
+
397
+ GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
398
+ }
399
+ // leftovers
400
+ // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
401
+ const int np2 = (n & ~(ggml_f32_epr - 1));
402
+ for (int i = np; i < np2; i += ggml_f32_epr) {
403
+ ax1 = GGML_F32_VEC_LOAD(x + i);
404
+ ay1 = GGML_F32_VEC_LOAD(y + i);
405
+ ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
406
+
407
+ GGML_F32_VEC_STORE(y + i, ay1);
408
+ }
409
+ // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
410
+ if (np2 < n) {
411
+ svbool_t pg =svwhilelt_b32(np2, n);
412
+ ax1 = svld1_f32(pg, x + np2);
413
+ ay1 = svld1_f32(pg, y + np2);
414
+ ay1 = svmad_f32_m(pg, ax1, vx, ay1);
415
+
416
+ svst1_f32(pg, y + np2, ay1);
417
+ }
418
+ #elif defined(__riscv_v_intrinsic)
419
+ for (int i = 0, avl; i < n; i += avl) {
420
+ avl = __riscv_vsetvl_e32m8(n - i);
421
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
422
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
423
+ vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
424
+ __riscv_vse32_v_f32m8(&y[i], ny, avl);
425
+ }
426
+ #else
427
+ const int np = (n & ~(GGML_F32_STEP - 1));
428
+
429
+ GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
430
+
431
+ GGML_F32_VEC ax[GGML_F32_ARR];
432
+ GGML_F32_VEC ay[GGML_F32_ARR];
433
+
434
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
435
+ for (int j = 0; j < GGML_F32_ARR; j++) {
436
+ ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
437
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
438
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
439
+
440
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
441
+ }
442
+ }
443
+
444
+ // leftovers
445
+ for (int i = np; i < n; ++i) {
446
+ y[i] += x[i]*v;
447
+ }
448
+ #endif
449
+ #else
450
+ // scalar
451
+ for (int i = 0; i < n; ++i) {
452
+ y[i] += x[i]*v;
453
+ }
454
+ #endif
455
+ }
456
+
457
+ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
458
+ #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
459
+ const int sve_register_length = svcntb() * 8;
460
+ const int ggml_f16_epr = sve_register_length / 16;
461
+ const int ggml_f16_step = 8 * ggml_f16_epr;
462
+
463
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
464
+
465
+ int np = (n & ~(ggml_f16_step - 1));
466
+
467
+ svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
468
+ svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
469
+ for (int i = 0; i < np; i += ggml_f16_step) {
470
+ ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
471
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
472
+ ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
473
+
474
+ GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
475
+
476
+ ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
477
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
478
+ ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
479
+
480
+ GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
481
+
482
+ ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
483
+ ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
484
+ ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
485
+
486
+ GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
487
+
488
+ ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
489
+ ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
490
+ ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
491
+
492
+ GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
493
+
494
+ ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
495
+ ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
496
+ ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
497
+
498
+ GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
499
+
500
+ ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
501
+ ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
502
+ ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
503
+
504
+ GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
505
+
506
+ ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
507
+ ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
508
+ ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
509
+
510
+ GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
511
+
512
+ ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
513
+ ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
514
+ ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
515
+
516
+ GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
517
+ }
518
+ const int np2 = (n & ~(ggml_f16_epr - 1));
519
+ for (int k = np; k < np2; k += ggml_f16_epr) {
520
+ svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
521
+ svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
522
+ ry = GGML_F16x_VEC_FMA(ry, rx, vx);
523
+
524
+ GGML_F16x_VEC_STORE(y + k, ry, 0);
525
+ }
526
+
527
+ if (np2 < n) {
528
+ svbool_t pg = svwhilelt_b16(np2, n);
529
+ svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
530
+ svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
531
+ hy = svmad_f16_x(pg, hx, vx, hy);
532
+ svst1_f16(pg, (__fp16 *)(y + np2), hy);
533
+ }
534
+ np = n;
535
+ #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
536
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
537
+ const _Float16 scale = *(const _Float16*)(&s);
538
+
539
+ // calculate step size
540
+ const int epr = __riscv_vsetvlmax_e16m4();
541
+ const int step = epr * 2;
542
+ int np = (n & ~(step - 1));
543
+
544
+ // unroll by 2
545
+ for (int i = 0; i < np; i += step) {
546
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
547
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
548
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
549
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
550
+ __asm__ __volatile__ ("" ::: "memory");
551
+
552
+ vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
553
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
554
+ ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
555
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
556
+ __asm__ __volatile__ ("" ::: "memory");
557
+ }
558
+
559
+ // leftovers
560
+ int vl;
561
+ for (int i = np; i < n; i += vl) {
562
+ vl = __riscv_vsetvl_e16m4(n - i);
563
+ vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
564
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
565
+ ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
566
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
567
+ }
568
+ np = n;
569
+ #elif defined(GGML_SIMD)
570
+ const int np = (n & ~(GGML_F16_STEP - 1));
571
+
572
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
573
+
574
+ GGML_F16_VEC ax[GGML_F16_ARR];
575
+ GGML_F16_VEC ay[GGML_F16_ARR];
576
+
577
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
578
+ for (int j = 0; j < GGML_F16_ARR; j++) {
579
+ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
580
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
581
+ ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
582
+
583
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
584
+ }
585
+ }
586
+ #else
587
+ const int np = 0;
588
+ #endif
589
+
590
+ // leftovers
591
+ for (int i = np; i < n; ++i) {
592
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
593
+ }
594
+ }
595
+
596
+ // xs and vs are byte strides of x and v
597
+ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
598
+
599
+ const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
600
+ const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
601
+
602
+ for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
603
+ x[i] = (const float *) ((const char *) xv + i*xs);
604
+ v[i] = (const float *) ((const char *) vv + i*vs);
605
+ }
606
+
607
+ #if defined(GGML_SIMD)
608
+ #if defined(__ARM_FEATURE_SVE)
609
+ // scalar Route to scalar implementation //TODO: Write SVE code
610
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
611
+ for (int i = 0; i < n; ++i) {
612
+ y[i] += x[k][i]*v[k][0];
613
+ }
614
+ }
615
+ #elif defined(__riscv_v_intrinsic)
616
+ for (int i = 0, avl; i < n; i += avl) {
617
+ avl = __riscv_vsetvl_e32m8(n - i);
618
+ vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
619
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
620
+ vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
621
+ ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
622
+ }
623
+ __riscv_vse32_v_f32m8(&y[i], ay, avl);
624
+ }
625
+ #else
626
+ const int np = (n & ~(GGML_F32_STEP - 1));
627
+
628
+ GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
629
+
630
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
631
+ vx[k] = GGML_F32_VEC_SET1(v[k][0]);
632
+ }
633
+
634
+ GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
635
+ GGML_F32_VEC ay[GGML_F32_ARR];
636
+
637
+ for (int i = 0; i < np; i += GGML_F32_STEP) {
638
+ for (int j = 0; j < GGML_F32_ARR; j++) {
639
+ ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
640
+
641
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
642
+ ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
643
+ ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
644
+ }
645
+
646
+ GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
647
+ }
648
+ }
649
+
650
+ // leftovers
651
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
652
+ for (int i = np; i < n; ++i) {
653
+ y[i] += x[k][i]*v[k][0];
654
+ }
655
+ }
656
+ #endif
657
+ #else
658
+ // scalar
659
+ for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
660
+ for (int i = 0; i < n; ++i) {
661
+ y[i] += x[k][i]*v[k][0];
662
+ }
663
+ }
664
+ #endif
665
+ }
666
+
667
// Scale and shift: y[i] = x[i]*s + b for i in [0, n).
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        // scalar ; TODO: Write SVE code
        for (int k = 0; k < n; ++k) {
            y[k] = x[k]*s + b;
        }
    #elif defined(__riscv_v_intrinsic)
        // the vector length is re-negotiated each iteration, so no separate tail is needed
        for (int pos = 0, chunk; pos < n; pos += chunk) {
            chunk = __riscv_vsetvl_e32m8(n - pos);
            vfloat32m8_t xr   = __riscv_vle32_v_f32m8(&x[pos], chunk);
            vfloat32m8_t bias = __riscv_vfmv_v_f_f32m8(b, chunk);
            vfloat32m8_t res  = __riscv_vfmadd_vf_f32m8(xr, s, bias, chunk);
            __riscv_vse32_v_f32m8(&y[pos], res, chunk);
        }
    #else
        const int n_simd = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
        GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

        for (int base = 0; base < n_simd; base += GGML_F32_STEP) {
            for (int r = 0; r < GGML_F32_ARR; r++) {
                GGML_F32_VEC reg = GGML_F32_VEC_LOAD(x + base + r*GGML_F32_EPR);
                reg = GGML_F32_VEC_FMA(vb, reg, vs);

                GGML_F32_VEC_STORE(y + base + r*GGML_F32_EPR, reg);
            }
        }

        // scalar tail
        for (int k = n_simd; k < n; ++k) {
            y[k] = x[k]*s + b;
        }
    #endif
#else
    // no SIMD support: plain scalar loop
    for (int k = 0; k < n; ++k) {
        y[k] = x[k]*s + b;
    }
#endif
}
713
+
714
+ //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
715
/*
 * In-place scaling of a float vector: y[i] *= v for i in [0, n).
 *
 *   n  number of elements
 *   y  vector that is read and overwritten
 *   v  scale factor
 *
 * Backend chosen at compile time: Accelerate (vDSP), SVE, RISC-V vector,
 * the generic GGML_F32_VEC macros, or a scalar loop.
 */
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
    #if defined(__ARM_FEATURE_SVE)
        const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
        const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
        const int ggml_f32_step = 2 * ggml_f32_epr;

        GGML_F32_VEC factor = GGML_F32_VEC_SET1(v);
        const int np = (n & ~(ggml_f32_step - 1));
        svfloat32_t lo;
        svfloat32_t hi;
        // main loop: two full vectors per iteration
        for (int base = 0; base < np; base += ggml_f32_step) {
            lo = GGML_F32_VEC_LOAD(y + base);
            lo = GGML_F32_VEC_MUL(lo, factor);
            GGML_F32_VEC_STORE(y + base, lo);

            hi = GGML_F32_VEC_LOAD(y + base + 1*ggml_f32_epr);
            hi = GGML_F32_VEC_MUL(hi, factor);
            GGML_F32_VEC_STORE(y + base + 1*ggml_f32_epr, hi);
        }
        // leftovers
        // fewer than ggml_f32_step elements remain; walk them one vector at a
        // time with a predicated multiply that only touches lanes below n
        for (int base = np; base < n; base += ggml_f32_epr) {
            svbool_t pg = svwhilelt_b32(base, n);
            lo = svld1_f32(pg, y + base);
            lo = svmul_f32_m(pg, lo, factor);
            svst1_f32(pg, y + base, lo);
        }
    #elif defined(__riscv_v_intrinsic)
        // strip-mined loop: vsetvl decides how many elements each pass handles
        int done = 0;
        while (done < n) {
            const int avl = __riscv_vsetvl_e32m8(n - done);
            vfloat32m8_t cur    = __riscv_vle32_v_f32m8(&y[done], avl);
            vfloat32m8_t scaled = __riscv_vfmul_vf_f32m8(cur, v, avl);
            __riscv_vse32_v_f32m8(&y[done], scaled, avl);
            done += avl;
        }
    #else
        // number of elements that fit in whole GGML_F32_STEP chunks
        const int n_vec = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC factor = GGML_F32_VEC_SET1(v);

        GGML_F32_VEC reg[GGML_F32_ARR];

        for (int base = 0; base < n_vec; base += GGML_F32_STEP) {
            for (int r = 0; r < GGML_F32_ARR; r++) {
                float * p = y + base + r*GGML_F32_EPR;

                reg[r] = GGML_F32_VEC_LOAD(p);
                reg[r] = GGML_F32_VEC_MUL(reg[r], factor);

                GGML_F32_VEC_STORE(p, reg[r]);
            }
        }

        // tail that does not fill a whole step
        for (int idx = n_vec; idx < n; ++idx) {
            y[idx] *= v;
        }
    #endif
#else
    // portable scalar fallback
    for (int idx = 0; idx < n; ++idx) {
        y[idx] *= v;
    }
#endif
}
780
+
781
+ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
782
+ #if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
783
+ const int sve_register_length = svcntb() * 8;
784
+ const int ggml_f16_epr = sve_register_length / 16;
785
+ const int ggml_f16_step = 2 * ggml_f16_epr;
786
+
787
+ GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
788
+ const int np = (n & ~(ggml_f16_step - 1));
789
+ svfloat16_t ay1, ay2;
790
+
791
+ for (int i = 0; i < np; i += ggml_f16_step) {
792
+ ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
793
+ ay1 = GGML_F16x_VEC_MUL(ay1, vx);
794
+ GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
795
+
796
+ ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
797
+ ay2 = GGML_F16x_VEC_MUL(ay2, vx);
798
+ GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
799
+ }
800
+ // leftovers
801
+ // maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
802
+ if (np < n) {
803
+ svbool_t pg = svwhilelt_b16(np, n);
804
+ svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
805
+ svfloat16_t out = svmul_f16_m(pg, hy, vx);
806
+ svst1_f16(pg, (__fp16 *)(y + np), out);
807
+ }
808
+ #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
809
+ const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
810
+ const _Float16 scale = *(const _Float16*)(&s);
811
+
812
+ // calculate step size
813
+ const int epr = __riscv_vsetvlmax_e16m4();
814
+ const int step = epr * 2;
815
+ const int np = (n & ~(step - 1));
816
+
817
+ // unroll by 2
818
+ for (int i = 0; i < np; i += step) {
819
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
820
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
821
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
822
+ __asm__ __volatile__ ("" ::: "memory");
823
+
824
+ vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
825
+ ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
826
+ __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
827
+ __asm__ __volatile__ ("" ::: "memory");
828
+ }
829
+
830
+ // leftovers
831
+ int vl;
832
+ for (int i = np; i < n; i += vl) {
833
+ vl = __riscv_vsetvl_e16m4(n - i);
834
+ vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
835
+ ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
836
+ __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
837
+ }
838
+ #elif defined(GGML_SIMD)
839
+ const int np = (n & ~(GGML_F16_STEP - 1));
840
+
841
+ GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
842
+
843
+ GGML_F16_VEC ay[GGML_F16_ARR];
844
+
845
+ for (int i = 0; i < np; i += GGML_F16_STEP) {
846
+ for (int j = 0; j < GGML_F16_ARR; j++) {
847
+ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
848
+ ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
849
+
850
+ GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
851
+ }
852
+ }
853
+
854
+ // leftovers
855
+ for (int i = np; i < n; ++i) {
856
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
857
+ }
858
+ #else
859
+ // scalar
860
+ for (int i = 0; i < n; ++i) {
861
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
862
+ }
863
+ #endif
864
+ }
865
+
866
// Euclidean norm: *s = sqrt(x . x), with the dot product delegated to ggml_vec_dot_f32.
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) {
    ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1);
    *s = sqrtf(*s);
}
867
// Element-wise square: y[i] = x[i]*x[i] for i in [0, n).
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        y[idx] = val*val;
    }
}
868
+ inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
869
+ for (int i = 0; i < n; ++i) {
870
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
871
+ y[i] = GGML_CPU_FP32_TO_FP16(v*v);
872
+ }
873
+ }
874
+ inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
875
+ inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
876
+ for (int i = 0; i < n; ++i) {
877
+ y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
878
+ }
879
+ }
880
+ inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
881
+ inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
882
+ for (int i = 0; i < n; ++i) {
883
+ y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
884
+ }
885
+ }
886
+ inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
887
+ inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
888
+ for (int i = 0; i < n; ++i) {
889
+ y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
890
+ }
891
+ }
892
+ inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
893
+ inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
894
+ for (int i = 0; i < n; ++i) {
895
+ y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
896
+ }
897
+ }
898
// Element-wise absolute value: y[i] = fabsf(x[i]) for i in [0, n).
inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        y[idx] = fabsf(val);
    }
}
899
+ inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
900
+ for (int i = 0; i < n; ++i) {
901
+ y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
902
+ }
903
+ }
904
// Element-wise sign: 1 for positive input, -1 for negative input, 0 otherwise.
inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        float sign;
        if (val > 0.f) {
            sign = 1.f;
        } else if (val < 0.f) {
            sign = -1.f;
        } else {
            sign = 0.f;
        }
        y[idx] = sign;
    }
}
905
+ inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
906
+ for (int i = 0; i < n; ++i) {
907
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
908
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
909
+ }
910
+ }
911
// Element-wise step function: 1 where x[i] > 0, otherwise 0.
inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        if (x[idx] > 0.f) {
            y[idx] = 1.f;
        } else {
            y[idx] = 0.f;
        }
    }
}
912
+ inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
913
+ for (int i = 0; i < n; ++i) {
914
+ y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
915
+ }
916
+ }
917
+ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
918
+ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
919
+ for (int i = 0; i < n; ++i) {
920
+ y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
921
+ }
922
+ }
923
+ inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
924
+ inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
925
+ for (int i = 0; i < n; ++i) {
926
+ const float v = GGML_CPU_FP16_TO_FP32(x[i]);
927
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
928
+ }
929
+ }
930
// Element-wise ReLU: y[i] = x[i] when x[i] > 0, otherwise 0.
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        if (val > 0.f) {
            y[idx] = val;
        } else {
            y[idx] = 0.f;
        }
    }
}
931
+ inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
932
+ for (int i = 0; i < n; ++i) {
933
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
934
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
935
+ }
936
+ }
937
// Element-wise leaky ReLU: positive part of x[i] plus ns times its negative part.
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        const float pos = (val > 0.f)  ? val : 0.f;
        const float neg = (val < 0.0f) ? val : 0.f;
        y[idx] = pos + ns * neg;
    }
}
938
+ inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
939
+ for (int i = 0; i < n; ++i) {
940
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
941
+ y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
942
+ }
943
+ }
944
+ inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
945
+ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
946
+ for (int i = 0; i < n; ++i) {
947
+ y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
948
+ }
949
+ }
950
+ // TODO: optimize performance
951
+ inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
952
+ inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
953
+ for (int i = 0; i < n; ++i) {
954
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
955
+ y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
956
+ }
957
+ }
958
+ inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
959
+ inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
960
+ for (int i = 0; i < n; ++i) {
961
+ y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
962
+ }
963
+ }
964
+ inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
965
+ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
966
+ for (int i = 0; i < n; ++i) {
967
+ y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
968
+ }
969
+ }
970
+
971
// Constants shared by the GELU variants in this file.
static const float GELU_COEF_A = 0.044715f; // cubic coefficient inside the tanh-based GELU (ggml_gelu_f32)
static const float GELU_QUICK_COEF = -1.702f; // exponent scale of the "quick" GELU (ggml_gelu_quick_f32)
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; // sqrt(2/pi)
static const float SQRT_2_INV = 0.70710678118654752440084436210484f; // 1/sqrt(2), used by the erf-based GELU
975
+
976
+ inline static float ggml_gelu_f32(float x) {
977
+ return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
978
+ }
979
+
980
+ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
981
+ const uint16_t * i16 = (const uint16_t *) x;
982
+ for (int i = 0; i < n; ++i) {
983
+ y[i] = ggml_table_gelu_f16[i16[i]];
984
+ }
985
+ }
986
+
987
+ inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
988
+ for (int i = 0; i < n; ++i) {
989
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
990
+ float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
991
+ y[i] = GGML_CPU_FP32_TO_FP16(res);
992
+ }
993
+ }
994
+
995
#ifdef GGML_GELU_FP16
// GELU over a float vector using the fp16 lookup table.
// Inputs <= -10 map to 0 and inputs >= 10 map to themselves; everything else
// is converted to fp16 and its bit pattern indexes ggml_table_gelu_f16.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        if (val <= -10.0f) {
            y[idx] = 0.0f;
            continue;
        }
        if (val >= 10.0f) {
            y[idx] = val;
            continue;
        }
        // copy the fp16 bits into an integer to use them as a table index
        const ggml_fp16_t half = GGML_CPU_FP32_TO_FP16(val);
        uint16_t key;
        memcpy(&key, &half, sizeof(uint16_t));
        y[idx] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[key]);
    }
}
#else
// GELU over a float vector, computed directly with ggml_gelu_f32.
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        y[idx] = ggml_gelu_f32(val);
    }
}
#endif
1017
+
1018
+ inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
1019
+ for (int i = 0; i < n; ++i) {
1020
+ float xi = x[i];
1021
+ y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
1022
+ }
1023
+ }
1024
+
1025
+ inline static float ggml_gelu_quick_f32(float x) {
1026
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
1027
+ }
1028
+
1029
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1030
+ // const uint16_t * i16 = (const uint16_t *) x;
1031
+ // for (int i = 0; i < n; ++i) {
1032
+ // y[i] = ggml_table_gelu_quick_f16[i16[i]];
1033
+ // }
1034
+ //}
1035
+
1036
#ifdef GGML_GELU_QUICK_FP16
// Quick GELU over a float vector using the fp16 lookup table: each input is
// converted to fp16 and its bit pattern indexes ggml_table_gelu_quick_f16.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const ggml_fp16_t half = GGML_CPU_FP32_TO_FP16(x[idx]);
        uint16_t key;
        memcpy(&key, &half, sizeof(uint16_t));
        y[idx] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[key]);
    }
}
#else
// Quick GELU over a float vector, computed directly with ggml_gelu_quick_f32.
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int idx = 0; idx < n; ++idx) {
        const float val = x[idx];
        y[idx] = ggml_gelu_quick_f32(val);
    }
}
#endif
1052
+
1053
+ inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1054
+ for (int i = 0; i < n; ++i) {
1055
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
1056
+ y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
1057
+ }
1058
+ }
1059
+
1060
+ // Sigmoid Linear Unit (SiLU) function
1061
+ inline static float ggml_silu_f32(float x) {
1062
+ return x/(1.0f + expf(-x));
1063
+ }
1064
+ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
1065
+ float v = GGML_CPU_FP16_TO_FP32(x);
1066
+ return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
1067
+ }
1068
+
1069
// Build-time guard: stop compilation when the compiler reports finite-math-only
// mode (__FINITE_MATH_ONLY__ non-zero). The vector exp routines below return
// INFINITY for large inputs, so they need non-finite arithmetic to be honoured.
#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif
1073
+
1074
/* Below function was borrowed from the GitHub repository:
   https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp

   Vectorised single-precision exp for SVE.
     pg   governing predicate for the lane-wise operations
     src  input vector
   Returns the approximate exp(src); the scheme splits x*log2(e) into an
   integer part n and a fractional part, uses the FEXPA instruction
   (svexpa_f32) for a coarse value and a short polynomial correction in z.
   NOTE(review): accuracy and the valid input range are not stated in this
   file -- confirm against the upstream source before relying on them. */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1)); // clears the low 17 bits
    const svfloat32_t one = svdup_n_f32(1.0f);
    const svfloat32_t inactive1 = svdup_n_f32(0.0f); // value for inactive lanes of the rounding step
    const svint32_t inactive2 = svdup_n_s32(0);      // value for inactive lanes of the conversion step

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);          // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);      // round to int (float)
    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);      // n

    t1 = svsub_f32_m(pg, t0, t1);                           // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one);                          // b = a + 1

    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3);                        // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2);                         // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    // t5 = b with its low 17 bits cleared; b - t5 is the remainder z
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5);                           // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);             // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0);                      // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4);                           // Final result

    return t0;
}
#endif
1109
+
1110
+ #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
1111
+
1112
// SVE vector expf.
//   pg  governing predicate for the lane-wise operations
//   x   input vector
// Uses the same constants and three-way result selection as the NEON variant
// of ggml_v_expf later in this file (fast path, |n| > 126, |n| > 192).
// NOTE(review): error bound / flush thresholds are documented only on the
// other variants -- presumably identical here, confirm.
inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
    const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
    // z = r + x * 0x1.715476p+0 (constant is ~log2(e)); n = z - r
    const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
    const svfloat32_t n = svsub_f32_x(pg, z, r);
    // b = x - n*c_hi - n*c_lo : residual after removing n times a two-part constant
    const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
    // e = bits of z shifted into the exponent field; k = bits(1.0f) + e
    const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
    const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
    // c: lanes with |n| > 126 need the special-case path
    const svbool_t c = svacgt_n_f32(pg, n, 126);
    const svfloat32_t u = svmul_f32_x(pg, b, b);
    // j: polynomial in b (without the leading 1)
    const svfloat32_t j = svmla_f32_x(pg,
        svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
        svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
        svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
    // d: exponent adjustment applied only to lanes with n <= 0
    const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
    const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
    const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
    // |n| > 192 -> s1*s1 ; |n| > 126 -> (s2 + s2*j)*s1 ; otherwise k + k*j
    return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
        svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
}
1131
+
1132
+ // computes silu x/(1+exp(-x)) in single precision vector
1133
+ inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
1134
+ const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
1135
+ const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
1136
+ const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
1137
+ const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
1138
+ const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
1139
+ return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
1140
+ }
1141
+
1142
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
1143
+
1144
// NEON vector expf (4 lanes).
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    // z = r + x * 0x1.715476p+0 (constant is ~log2(e)); n = z - r
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    // b = x - n*c_hi - n*c_lo : residual after removing n times a two-part constant
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    // e = bits of z shifted into the exponent field; k = bits(1.0f) + e
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    // c: lanes with |n| > 126 need the special-case path
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    // j: polynomial in b (without the leading 1)
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    // fast path: no lane flagged in c
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    // d: exponent adjustment applied only to lanes with n <= 0
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    // |n| > 192 -> s1*s1 ; |n| > 126 -> (s2 + s2*j)*s1 ; otherwise k + k*j
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}
1170
+
1171
+ // computes silu x/(1+exp(-x)) in single precision vector
1172
+ inline static float32x4_t ggml_v_silu(float32x4_t x) {
1173
+ const float32x4_t one = vdupq_n_f32(1.0f);
1174
+ const float32x4_t zero = vdupq_n_f32(0.0f);
1175
+ const float32x4_t neg_x = vsubq_f32(zero, x);
1176
+ const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
1177
+ const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
1178
+ return vdivq_f32(x, one_plus_exp_neg_x);
1179
+ }
1180
+
1181
+ #elif defined(__AVX512F__) && defined(__AVX512DQ__)
1182
+
1183
// AVX-512 vector expf (16 lanes).
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
  const __m512 r = _mm512_set1_ps(0x1.8p23f);
  // z = x * 0x1.715476p+0 + r (constant is ~log2(e)); n = z - r
  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
  const __m512 n = _mm512_sub_ps(z, r);
  // b = x - n*c_hi - n*c_lo : residual after removing n times a two-part constant
  const __m512 b =
      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  // d: lanes with |n| > 192, replaced by 0 or +inf below
  const __mmask16 d =
      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
  const __m512 u = _mm512_mul_ps(b, b);
  // j: polynomial in b, including the leading 1
  const __m512 j = _mm512_fmadd_ps(
      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                      _mm512_set1_ps(0x1.573e2ep-5f)),
                      u,
                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
      u,
      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
  // scale the polynomial by 2^n
  const __m512 res = _mm512_scalef_ps(j, n);
  // fast path: no lane flagged in d
  if (_mm512_kortestz(d, d))
    return res;
  const __m512 zero = _mm512_setzero_ps();
  // alt: 0 for lanes with n <= 0, +inf for the others
  const __m512 alt = _mm512_mask_blend_ps(
      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
  // flagged lanes take alt, the rest keep res
  return _mm512_mask_blend_ps(d, res, alt);
}
1213
+
1214
+ // computes silu x/(1+exp(-x)) in single precision vector
1215
+ inline static __m512 ggml_v_silu(__m512 x) {
1216
+ const __m512 one = _mm512_set1_ps(1);
1217
+ const __m512 zero = _mm512_setzero_ps();
1218
+ const __m512 neg_x = _mm512_sub_ps(zero, x);
1219
+ const __m512 exp_neg_x = ggml_v_expf(neg_x);
1220
+ const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
1221
+ return _mm512_div_ps(x, one_plus_exp_neg_x);
1222
+ }
1223
+
1224
+ #elif defined(__AVX2__) && defined(__FMA__)
1225
+
1226
// AVX2 + FMA vector expf (8 lanes).
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
  const __m256 r = _mm256_set1_ps(0x1.8p23f);
  // z = x * 0x1.715476p+0 + r (constant is ~log2(e)); n = z - r
  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
  const __m256 n = _mm256_sub_ps(z, r);
  // b = x - n*c_hi - n*c_lo : residual after removing n times a two-part constant
  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  // e = bits of z shifted into the exponent field; k = bits(1.0f) + e
  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
  const __m256 k = _mm256_castsi256_ps(
      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
  // c: lanes with |n| > 126 (sign bit cleared via andnot with -0.f)
  const __m256i c = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(126), _CMP_GT_OQ));
  const __m256 u = _mm256_mul_ps(b, b);
  // j: polynomial in b (without the leading 1)
  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  // fast path: no lane flagged in c
  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
    return _mm256_fmadd_ps(j, k, k);
  // g: exponent adjustment applied only to lanes with n <= 0
  const __m256i g = _mm256_and_si256(
      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
      _mm256_set1_epi32(0x82000000u));
  const __m256 s1 =
      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  // d: lanes with |n| > 192
  const __m256i d = _mm256_castps_si256(
      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                    _mm256_set1_ps(192), _CMP_GT_OQ));
  // mask-select: d -> s1*s1 ; c -> (s2*j + s2)*s1 ; otherwise k*j + k
  return _mm256_or_ps(
      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
      _mm256_andnot_ps(
          _mm256_castsi256_ps(d),
          _mm256_or_ps(
              _mm256_and_ps(_mm256_castsi256_ps(c),
                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}
1268
+
1269
+ // computes silu x/(1+exp(-x)) in single precision vector
1270
+ inline static __m256 ggml_v_silu(__m256 x) {
1271
+ const __m256 one = _mm256_set1_ps(1);
1272
+ const __m256 zero = _mm256_setzero_ps();
1273
+ const __m256 neg_x = _mm256_sub_ps(zero, x);
1274
+ const __m256 exp_neg_x = ggml_v_expf(neg_x);
1275
+ const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
1276
+ return _mm256_div_ps(x, one_plus_exp_neg_x);
1277
+ }
1278
+
1279
+ #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
1280
+
1281
// 128-bit multiply-add helpers used by the SSE2 ggml_v_expf below:
//   MADD128(x, y, z)  ->  x*y + z
//   NMADD128(x, y, z) ->  z - x*y
// With __FMA__ they map to the fused instructions; otherwise they are built
// from a separate multiply and add/subtract.
#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif
1288
+
1289
// SSE2 vector expf (4 lanes); multiply-adds go through MADD128 / NMADD128.
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    // z = x * 0x1.715476p+0 + r (constant is ~log2(e)); n = z - r
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    // b = x - n*c_hi - n*c_lo : residual after removing n times a two-part constant
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    // e = bits of z shifted into the exponent field; k = bits(1.0f) + e
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    // c: lanes with |n| > 126 (sign bit cleared via andnot with -0.f)
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    // j: polynomial in b (without the leading 1)
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    // fast path: no lane flagged in c
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    // g: exponent adjustment applied only to lanes with n <= 0
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    // d: lanes with |n| > 192
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    // mask-select: d -> s1*s1 ; c -> (s2*j + s2)*s1 ; otherwise k*j + k
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}
1322
+
1323
+ // computes silu x/(1+exp(-x)) in single precision vector
1324
+ inline static __m128 ggml_v_silu(__m128 x) {
1325
+ const __m128 one = _mm_set1_ps(1);
1326
+ const __m128 zero = _mm_setzero_ps();
1327
+ const __m128 neg_x = _mm_sub_ps(zero, x);
1328
+ const __m128 exp_neg_x = ggml_v_expf(neg_x);
1329
+ const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
1330
+ return _mm_div_ps(x, one_plus_exp_neg_x);
1331
+ }
1332
+
1333
+ #elif defined(__riscv_v_intrinsic)
1334
+
1335
// RISC-V vector expf on an LMUL=2 float32 register group.
//   x   input vector
//   vl  number of active elements
// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
    // z = r + 0x1.715476p+0 * x (constant is ~log2(e))
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    // copy r through an add with 0 so the accumulate does not need a register move
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    // b = x - c_hi*n - c_lo*n : residual after removing n times a two-part constant
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    // e = bits of z shifted into the exponent field; k = bits(1.0f) + e
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    // c: lanes with |n| > 126 need the special-case path
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    // j: polynomial in b (without the leading 1)
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    // fast path: no lane flagged in c
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    // d: exponent adjustment applied only to lanes with n <= 0
    const vbool16_t  dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    // r1: lanes in c take (s2 + s2*j)*s1, the others k + k*j
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    // lanes with |n| > 192 are overridden with s1*s1
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}
1376
+
1377
+ // computes silu x/(1+exp(-x)) in single precision vector
1378
+ inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
1379
+ const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
1380
+ const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
1381
+ const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
1382
+ return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
1383
+ }
1384
+
1385
+ #endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
1386
+
1387
+ inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
1388
+ for (int i = 0; i < n; ++i) {
1389
+ y[i] = ggml_silu_f16(x[i]);
1390
+ }
1391
+ }
1392
+
1393
+ inline static float ggml_silu_backward_f32(float x, float dy) {
1394
+ const float s = 1.0f/(1.0f + expf(-x));
1395
+ return dy*s*(1.0f + x*(1.0f - s));
1396
+ }
1397
+
1398
+ inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
1399
+ const float v = GGML_CPU_FP16_TO_FP32(x);
1400
+ const float s = 1.0f/(1.0f + expf(-v));
1401
+ return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
1402
+ }
1403
+
1404
// Element-wise SiLU backward pass: dx[i] = ggml_silu_backward_f32(x[i], dy[i]) for i in [0, n).
inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    int i = 0;
    while (i < n) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
        ++i;
    }
}
1409
+
1410
+ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
1411
+ for (int i = 0; i < n; ++i) {
1412
+ dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
1413
+ }
1414
+ }
1415
+
1416
// ReLU-gated product: y[i] = x[i] * g[i] where x[i] > 0, otherwise 0.
inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float xi = x[i];
        if (xi > 0.f) {
            y[i] = xi * g[i];
        } else {
            y[i] = 0.f;
        }
    }
}
1421
+
1422
+ inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1423
+ for (int i = 0; i < n; ++i) {
1424
+ float v = GGML_CPU_FP16_TO_FP32(x[i]);
1425
+ y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
1426
+ }
1427
+ }
1428
+
1429
#ifdef GGML_GELU_FP16
// GELU-gated product using the fp16 lookup table ggml_table_gelu_f16.
// Inputs at or below -10 produce 0 and inputs at or above 10 pass x[i] through
// unchanged before gating; everything else is rounded to fp16 and its raw
// 16-bit pattern is used as the table index.
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const float xi = x[i];
        if (xi <= -10.0f) {
            y[i] = 0.0f;
            continue;
        }
        if (xi >= 10.0f) {
            y[i] = xi * g[i];
            continue;
        }
        const ggml_fp16_t half = GGML_CPU_FP32_TO_FP16(xi);
        uint16_t bits;
        memcpy(&bits, &half, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[bits]) * g[i];
    }
}
#else
// GELU-gated product: y[i] = ggml_gelu_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    int i = 0;
    while (i < n) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
        ++i;
    }
}
#endif
1451
+
1452
+ inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1453
+ const uint16_t * i16 = (const uint16_t *) x;
1454
+ for (int i = 0; i < n; ++i) {
1455
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1456
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
1457
+ }
1458
+ }
1459
+
1460
+ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);
1461
+
1462
+ inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1463
+ for (int i = 0; i < n; ++i) {
1464
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1465
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1466
+ y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1467
+ }
1468
+ }
1469
+
1470
+ inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1471
+ for (int i = 0; i < n; ++i) {
1472
+ float xi = x[i];
1473
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1474
+ }
1475
+ }
1476
+
1477
+ inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1478
+ for (int i = 0; i < n; ++i) {
1479
+ float xi = GGML_CPU_FP16_TO_FP32(x[i]);
1480
+ float gi = GGML_CPU_FP16_TO_FP32(g[i]);
1481
+ y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1482
+ }
1483
+ }
1484
+
1485
#ifdef GGML_GELU_QUICK_FP16
// Quick-GELU-gated product using the fp16 lookup table ggml_table_gelu_quick_f16.
// x[i] is rounded to fp16 and its raw 16-bit pattern is used as the table index.
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        const ggml_fp16_t half = GGML_CPU_FP32_TO_FP16(x[i]);
        uint16_t bits;
        memcpy(&bits, &half, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[bits]) * g[i];
    }
}
#else
// Quick-GELU-gated product: y[i] = ggml_gelu_quick_f32(x[i]) * g[i].
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    int i = 0;
    while (i < n) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
        ++i;
    }
}
#endif
1501
+
1502
+ inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
1503
+ const uint16_t * i16 = (const uint16_t *) x;
1504
+ for (int i = 0; i < n; ++i) {
1505
+ float v = GGML_CPU_FP16_TO_FP32(g[i]);
1506
+ y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
1507
+ }
1508
+ }
1509
+
1510
+ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
1511
+ #ifndef GGML_USE_ACCELERATE
1512
+ ggml_float sum = 0.0;
1513
+ for (int i = 0; i < n; ++i) {
1514
+ sum += (ggml_float)x[i];
1515
+ }
1516
+ *s = (float)sum;
1517
+ #else
1518
+ vDSP_sve(x, 1, s, n);
1519
+ #endif
1520
+ }
1521
+
1522
// Inclusive prefix sum: y[0] = x[0] and y[i] = y[i - 1] + x[i] for 0 < i < n.
// Nothing is written when n <= 0.
inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
    if (n <= 0) {
        return;
    }
    // the first element is copied as-is; every later one extends the previous output
    y[0] = x[0];
    for (int i = 1; i < n; ++i) {
        y[i] = y[i - 1] + x[i];
    }
}
1531
+
1532
+ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
1533
+ ggml_float sum = 0.0;
1534
+ for (int i = 0; i < n; ++i) {
1535
+ sum += (ggml_float)x[i];
1536
+ }
1537
+ *s = sum;
1538
+ }
1539
+
1540
+ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
1541
+ float sum = 0.0f;
1542
+ for (int i = 0; i < n; ++i) {
1543
+ sum += GGML_CPU_FP16_TO_FP32(x[i]);
1544
+ }
1545
+ *s = sum;
1546
+ }
1547
+
1548
+ inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
1549
+ float sum = 0.0f;
1550
+ for (int i = 0; i < n; ++i) {
1551
+ sum += GGML_BF16_TO_FP32(x[i]);
1552
+ }
1553
+ *s = sum;
1554
+ }
1555
+
1556
// Stores the maximum of the n floats in x into *s.
// With GGML_USE_ACCELERATE the work is delegated to vDSP_maxv; otherwise the
// scan starts from -INFINITY, which is also the result when n <= 0.
inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifdef GGML_USE_ACCELERATE
    vDSP_maxv(x, 1, s, n);
#else
    float best = -INFINITY;
    int i = 0;
    while (i < n) {
        best = MAX(best, x[i]);
        ++i;
    }
    *s = best;
#endif
}
1567
+
1568
// Calls ggml_vec_norm_f32(n, s, x) and then replaces *s with its reciprocal.
inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    const float norm = *s;
    *s = 1.f/norm;
}
1572
+
1573
// Stores in *s an index of the largest of the n floats in x (0 when n <= 0).
// The index is refreshed whenever the running maximum equals x[i], so among
// equal maxima the last position is reported.
inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float best     = -INFINITY;
    int   best_idx = 0;
    for (int i = 0; i < n; ++i) {
        const float xi = x[i];
        best = MAX(best, xi);
        if (best == xi) {
            best_idx = i;
        }
    }
    *s = best_idx;
}
1582
+
1583
+ #ifdef __cplusplus
1584
+ }
1585
+ #endif